xref: /illumos-gate/usr/src/uts/common/os/devcache.c (revision bb57d1f5164aca913cbd286ae1b61c896167cfa7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/note.h>
29 #include <sys/t_lock.h>
30 #include <sys/cmn_err.h>
31 #include <sys/instance.h>
32 #include <sys/conf.h>
33 #include <sys/stat.h>
34 #include <sys/ddi.h>
35 #include <sys/hwconf.h>
36 #include <sys/sunddi.h>
37 #include <sys/sunndi.h>
38 #include <sys/ddi_impldefs.h>
39 #include <sys/ndi_impldefs.h>
40 #include <sys/modctl.h>
41 #include <sys/dacf.h>
42 #include <sys/promif.h>
43 #include <sys/cpuvar.h>
44 #include <sys/pathname.h>
45 #include <sys/kobj.h>
46 #include <sys/devcache.h>
47 #include <sys/devcache_impl.h>
48 #include <sys/sysmacros.h>
49 #include <sys/varargs.h>
50 #include <sys/callb.h>
51 
52 /*
53  * This facility provides interfaces to clients to register,
54  * read and update cache data in persisted backing store files,
55  * usually in /etc/devices.  The data persisted through this
56  * mechanism should be stateless data, functioning in the sense
57  * of a cache.  Writes are performed by a background daemon
58  * thread, permitting a client to schedule an update without
59  * blocking, then continue updating the data state in
60  * parallel.  The data is only locked by the daemon thread
61  * to pack the data in preparation for the write.
62  *
63  * Data persisted through this mechanism should be capable
64  * of being regenerated through normal system operation,
65  * for example attaching all disk devices would cause all
66  * devids to be registered for those devices.  By caching
67  * a devid-device tuple, the system can operate in a
68  * more optimal way, directly attaching the device mapped
69  * to a devid, rather than burdensomely driving attach of
70  * the entire device tree to discover a single device.
71  *
72  * Note that a client should only need to include
73  * <sys/devcache.h> for the supported interfaces.
74  *
75  * The data per client is entirely within the control of
76  * the client.  When reading, data unpacked from the backing
77  * store should be inserted in the list.  The pointer to
78  * the list can be retrieved via nvf_list().  When writing,
79  * the data on the list is to be packed and returned to the
80  * nvpdaemon as an nvlist.
81  *
82  * Obvious restrictions are imposed by the limits of the
83  * nvlist format.  The data cannot be read or written
84  * piecemeal, and large amounts of data aren't recommended.
85  * However, nvlists do allow that data be named and typed
86  * and can be size-of-int invariant, and the cached data
87  * can be versioned conveniently.
88  *
89  * The registration involves two steps: a handle is
90  * allocated by calling the registration function.
91  * This sets up the data referenced by the handle and
92  * initializes the lock.  Following registration, the
93  * client must initialize the data list.  The list
94  * interfaces require that the list element with offset
95  * to the node link be provided.  The format of the
96  * list element is under the control of the client.
97  *
98  * Locking: the address of the data list r/w lock provided
99  * can be accessed with nvf_lock().  The lock must be held
100  * as reader when traversing the list or checking state,
101  * such as nvf_is_dirty().  The lock must be held as
102  * writer when updating the list or marking it dirty.
103  * The lock must not be held when waking the daemon.
104  *
105  * The data r/w lock is held as writer when the pack,
106  * unpack and free list handlers are called.  The
107  * lock should not be dropped and must be still held
108  * upon return.  The client should also hold the lock
109  * as reader when checking if the list is dirty, and
110  * as writer when marking the list dirty or initiating
111  * a read.
112  *
113  * The asynchronous nature of updates allows for the
114  * possibility that the data may continue to be updated
115  * once the daemon has been notified that an update is
116  * desired.  The data only needs to be locked against
117  * updates when packing the data into the form to be
118  * written.  When the write of the packed data has
119  * completed, the daemon will automatically reschedule
120  * an update if the data was marked dirty after the
121  * point at which it was packed.  Before beginning an
122  * update, the daemon attempts to lock the data as
123  * writer; if the writer lock is already held, it
124  * backs off and retries later.  The model is to give
125  * priority to the kernel processes generating the
126  * data, and that the nature of the data is that
127  * it does not change often, can be re-generated when
128  * needed, so updates should not happen often and
129  * can be delayed until the data stops changing.
130  * The client may update the list or mark it dirty
131  * any time it is able to acquire the lock as
132  * writer first.
133  *
134  * A failed write will be retried after some delay,
135  * in the hope that the cause of the error will be
136  * transient, for example a filesystem with no space
137  * available.  An update on a read-only filesystem
138  * is failed silently and not retried; this would be
139  * the case when booted off install media.
140  *
141  * There is no unregister mechanism as of yet, as it
142  * hasn't been needed so far.
143  */
144 
145 /*
146  * Global list of files registered and updated by the nvpflush
147  * daemon, protected by the nvf_cache_mutex.  While an
148  * update is taking place, a file is temporarily moved to
149  * the dirty list to avoid locking the primary list for
150  * the duration of the update.
151  */
152 list_t		nvf_cache_files;
153 list_t		nvf_dirty_files;
154 kmutex_t	nvf_cache_mutex;
155 
156 
157 /*
158  * Allow some delay from an update of the data before flushing
159  * to permit simultaneous updates of multiple changes.
160  * Changes in the data are expected to be bursty, ie
161  * reconfig or hot-plug of a new adapter.
162  *
163  * kfio_report_error (default 0)
164  *	Set to 1 to enable some error messages related to low-level
165  *	kernel file i/o operations.
166  *
167  * nvpflush_delay (default 10)
168  *	The number of seconds after data is marked dirty before the
169  *	flush daemon is triggered to flush the data.  A longer period
170  *	of time permits more data updates per write.  Note that
171  *	every update resets the timer so no repository write will
172  *	occur while data is being updated continuously.
173  *
174  * nvpdaemon_idle_time (default 60)
175  *	The number of seconds the daemon will sleep idle before exiting.
176  *
177  */
178 #define	NVPFLUSH_DELAY		10
179 #define	NVPDAEMON_IDLE_TIME	60
180 
181 #define	TICKS_PER_SECOND	(drv_usectohz(1000000))
182 
183 /*
184  * Tunables
185  */
186 int kfio_report_error = 0;		/* kernel file i/o operations */
187 int kfio_disable_read = 0;		/* disable all reads */
188 int kfio_disable_write = 0;		/* disable all writes */
189 
190 int nvpflush_delay	= NVPFLUSH_DELAY;
191 int nvpdaemon_idle_time	= NVPDAEMON_IDLE_TIME;
192 
193 static timeout_id_t	nvpflush_id = 0;
194 static int		nvpflush_timer_busy = 0;
195 static int		nvpflush_daemon_active = 0;
196 static kthread_t	*nvpflush_thr_id = 0;
197 
198 static int		do_nvpflush = 0;
199 static int		nvpbusy = 0;
200 static kmutex_t		nvpflush_lock;
201 static kcondvar_t	nvpflush_cv;
202 static kthread_id_t	nvpflush_thread;
203 static clock_t		nvpticks;
204 
205 static void nvpflush_daemon(void);
206 
207 #ifdef	DEBUG
208 int nvpdaemon_debug = 0;
209 int kfio_debug = 0;
210 #endif	/* DEBUG */
211 
212 extern int modrootloaded;
213 extern void mdi_read_devices_files(void);
214 extern void mdi_clean_vhcache(void);
215 
216 /*
217  * Initialize the overall cache file management
218  */
219 void
220 i_ddi_devices_init(void)
221 {
222 	list_create(&nvf_cache_files, sizeof (nvfd_t),
223 	    offsetof(nvfd_t, nvf_link));
224 	list_create(&nvf_dirty_files, sizeof (nvfd_t),
225 	    offsetof(nvfd_t, nvf_link));
226 	mutex_init(&nvf_cache_mutex, NULL, MUTEX_DEFAULT, NULL);
227 	retire_store_init();
228 	devid_cache_init();
229 }
230 
231 /*
232  * Read cache files
233  * The files read here should be restricted to those
234  * that may be required to mount root.
235  */
236 void
237 i_ddi_read_devices_files(void)
238 {
239 	/*
240 	 * The retire store should be the first file read as it
241 	 * may need to offline devices. kfio_disable_read is not
242 	 * used for retire. For the rationale see the tunable
243 	 * ddi_retire_store_bypass and comments in:
244 	 *	uts/common/os/retire_store.c
245 	 */
246 
247 	retire_store_read();
248 
249 	if (!kfio_disable_read) {
250 		mdi_read_devices_files();
251 		devid_cache_read();
252 	}
253 }
254 
255 void
256 i_ddi_start_flush_daemon(void)
257 {
258 	nvfd_t	*nvfdp;
259 
260 	ASSERT(i_ddi_io_initialized());
261 
262 	mutex_init(&nvpflush_lock, NULL, MUTEX_DRIVER, NULL);
263 	cv_init(&nvpflush_cv, NULL, CV_DRIVER, NULL);
264 
265 	mutex_enter(&nvf_cache_mutex);
266 	for (nvfdp = list_head(&nvf_cache_files); nvfdp;
267 	    nvfdp = list_next(&nvf_cache_files, nvfdp)) {
268 		if (NVF_IS_DIRTY(nvfdp)) {
269 			nvf_wake_daemon();
270 			break;
271 		}
272 	}
273 	mutex_exit(&nvf_cache_mutex);
274 }
275 
/*
 * Purge stale entries from the devid cache and the
 * mdi vHCI cache.
 */
void
i_ddi_clean_devices_files(void)
{
	devid_cache_cleanup();
	mdi_clean_vhcache();
}
282 
283 /*
284  * Register a cache file to be managed and updated by the nvpflush daemon.
285  * All operations are performed through the returned handle.
286  * There is no unregister mechanism for now.
287  */
288 nvf_handle_t
289 nvf_register_file(nvf_ops_t *ops)
290 {
291 	nvfd_t *nvfdp;
292 
293 	nvfdp = kmem_zalloc(sizeof (*nvfdp), KM_SLEEP);
294 
295 	nvfdp->nvf_ops = ops;
296 	nvfdp->nvf_flags = 0;
297 	rw_init(&nvfdp->nvf_lock, NULL, RW_DRIVER, NULL);
298 
299 	mutex_enter(&nvf_cache_mutex);
300 	list_insert_tail(&nvf_cache_files, nvfdp);
301 	mutex_exit(&nvf_cache_mutex);
302 
303 	return ((nvf_handle_t)nvfdp);
304 }
305 
306 /*PRINTFLIKE1*/
307 void
308 nvf_error(const char *fmt, ...)
309 {
310 	va_list ap;
311 
312 	if (kfio_report_error) {
313 		va_start(ap, fmt);
314 		vcmn_err(CE_NOTE, fmt, ap);
315 		va_end(ap);
316 	}
317 }
318 
319 /*
320  * Some operations clients may use to manage the data
321  * to be persisted in a cache file.
322  */
323 char *
324 nvf_cache_name(nvf_handle_t handle)
325 {
326 	return (((nvfd_t *)handle)->nvf_cache_path);
327 }
328 
329 krwlock_t *
330 nvf_lock(nvf_handle_t handle)
331 {
332 	return (&(((nvfd_t *)handle)->nvf_lock));
333 }
334 
335 list_t *
336 nvf_list(nvf_handle_t handle)
337 {
338 	return (&(((nvfd_t *)handle)->nvf_data_list));
339 }
340 
341 void
342 nvf_mark_dirty(nvf_handle_t handle)
343 {
344 	ASSERT(RW_WRITE_HELD(&(((nvfd_t *)handle)->nvf_lock)));
345 	NVF_MARK_DIRTY((nvfd_t *)handle);
346 }
347 
348 int
349 nvf_is_dirty(nvf_handle_t handle)
350 {
351 	ASSERT(RW_LOCK_HELD(&(((nvfd_t *)handle)->nvf_lock)));
352 	return (NVF_IS_DIRTY((nvfd_t *)handle));
353 }
354 
355 static uint16_t
356 nvp_cksum(uchar_t *buf, int64_t buflen)
357 {
358 	uint16_t cksum = 0;
359 	uint16_t *p = (uint16_t *)buf;
360 	int64_t n;
361 
362 	if ((buflen & 0x01) != 0) {
363 		buflen--;
364 		cksum = buf[buflen];
365 	}
366 	n = buflen / 2;
367 	while (n-- > 0)
368 		cksum ^= *p++;
369 	return (cksum);
370 }
371 
/*
 * Read a packed nvlist from a backing store file.
 *
 * The file layout is a fixed-size header (nvpf_hdr_t) followed
 * by the packed nvlist payload.  The header carries a magic
 * number, version, payload size and two checksums (one over
 * the header, one over the payload), all verified before the
 * payload is unpacked.
 *
 * On success returns 0 with the unpacked nvlist in *ret_nvlist;
 * the caller must eventually nvlist_free() it.  Returns ENOENT
 * if the file cannot be opened, EIO on a header read error and
 * EINVAL if the contents fail any validation check.
 */
int
fread_nvlist(char *filename, nvlist_t **ret_nvlist)
{
	struct _buf	*file;
	nvpf_hdr_t	hdr;
	char		*buf;
	nvlist_t	*nvl;
	int		rval;
	uint_t		offset;
	int		n;
	char		c;
	uint16_t	cksum, hdrsum;

	*ret_nvlist = NULL;

	file = kobj_open_file(filename);
	if (file == (struct _buf *)-1) {
		KFDEBUG((CE_CONT, "cannot open file: %s\n", filename));
		return (ENOENT);
	}

	offset = 0;
	n = kobj_read_file(file, (char *)&hdr, sizeof (hdr), offset);
	if (n != sizeof (hdr)) {
		kobj_close_file(file);
		if (n < 0) {
			nvf_error("error reading header: %s\n", filename);
			return (EIO);
		} else if (n == 0) {
			KFDEBUG((CE_CONT, "file empty: %s\n", filename));
		} else {
			nvf_error("header size incorrect: %s\n", filename);
		}
		/* Empty and short files are both treated as corrupt. */
		return (EINVAL);
	}
	offset += n;

	KFDEBUG2((CE_CONT, "nvpf_magic: 0x%x\n", hdr.nvpf_magic));
	KFDEBUG2((CE_CONT, "nvpf_version: %d\n", hdr.nvpf_version));
	KFDEBUG2((CE_CONT, "nvpf_size: %lld\n",
		(longlong_t)hdr.nvpf_size));
	KFDEBUG2((CE_CONT, "nvpf_hdr_chksum: 0x%x\n",
		hdr.nvpf_hdr_chksum));
	KFDEBUG2((CE_CONT, "nvpf_chksum: 0x%x\n", hdr.nvpf_chksum));

	/*
	 * The stored header checksum was computed with the checksum
	 * field itself zeroed; recompute it the same way.
	 */
	cksum = hdr.nvpf_hdr_chksum;
	hdr.nvpf_hdr_chksum = 0;
	hdrsum = nvp_cksum((uchar_t *)&hdr, sizeof (hdr));

	if (hdr.nvpf_magic != NVPF_HDR_MAGIC ||
	    hdr.nvpf_version != NVPF_HDR_VERSION || hdrsum != cksum) {
		kobj_close_file(file);
		if (hdrsum != cksum) {
			nvf_error("%s: checksum error "
			    "(actual 0x%x, expected 0x%x)\n",
			    filename, hdrsum, cksum);
		}
		/* NOTE(review): unlike the others, no trailing newline */
		nvf_error("%s: header information incorrect", filename);
		return (EINVAL);
	}

	ASSERT(hdr.nvpf_size >= 0);

	/* Read the payload; it must match the size recorded in the header. */
	buf = kmem_alloc(hdr.nvpf_size, KM_SLEEP);
	n = kobj_read_file(file, buf, hdr.nvpf_size, offset);
	if (n != hdr.nvpf_size) {
		kmem_free(buf, hdr.nvpf_size);
		kobj_close_file(file);
		if (n < 0) {
			nvf_error("%s: read error %d", filename, n);
		} else {
			nvf_error("%s: incomplete read %d/%lld",
				filename, n, (longlong_t)hdr.nvpf_size);
		}
		return (EINVAL);
	}
	offset += n;

	/*
	 * Probe one byte past the payload: any data there means the
	 * file is larger than the header claims, i.e. corrupt.
	 */
	rval = kobj_read_file(file, &c, 1, offset);
	kobj_close_file(file);
	if (rval > 0) {
		nvf_error("%s is larger than %lld\n",
			filename, (longlong_t)hdr.nvpf_size);
		kmem_free(buf, hdr.nvpf_size);
		return (EINVAL);
	}

	cksum = nvp_cksum((uchar_t *)buf, hdr.nvpf_size);
	if (hdr.nvpf_chksum != cksum) {
		nvf_error("%s: checksum error (actual 0x%x, expected 0x%x)\n",
		    filename, hdr.nvpf_chksum, cksum);
		kmem_free(buf, hdr.nvpf_size);
		return (EINVAL);
	}

	nvl = NULL;
	rval = nvlist_unpack(buf, hdr.nvpf_size, &nvl, 0);
	if (rval != 0) {
		nvf_error("%s: error %d unpacking nvlist\n",
			filename, rval);
		kmem_free(buf, hdr.nvpf_size);
		return (EINVAL);
	}

	kmem_free(buf, hdr.nvpf_size);
	*ret_nvlist = nvl;
	return (0);
}
480 
481 static int
482 kfcreate(char *filename, kfile_t **kfilep)
483 {
484 	kfile_t	*fp;
485 	int	rval;
486 
487 	ASSERT(modrootloaded);
488 
489 	fp = kmem_alloc(sizeof (kfile_t), KM_SLEEP);
490 
491 	fp->kf_vnflags = FCREAT | FWRITE | FTRUNC;
492 	fp->kf_fname = filename;
493 	fp->kf_fpos = 0;
494 	fp->kf_state = 0;
495 
496 	KFDEBUG((CE_CONT, "create: %s flags 0x%x\n",
497 		filename, fp->kf_vnflags));
498 	rval = vn_open(filename, UIO_SYSSPACE, fp->kf_vnflags,
499 	    0444, &fp->kf_vp, CRCREAT, 0);
500 	if (rval != 0) {
501 		kmem_free(fp, sizeof (kfile_t));
502 		KFDEBUG((CE_CONT, "%s: create error %d\n",
503 			filename, rval));
504 		return (rval);
505 	}
506 
507 	*kfilep = fp;
508 	return (0);
509 }
510 
511 static int
512 kfremove(char *filename)
513 {
514 	int rval;
515 
516 	KFDEBUG((CE_CONT, "remove: %s\n", filename));
517 	rval = vn_remove(filename, UIO_SYSSPACE, RMFILE);
518 	if (rval != 0) {
519 		KFDEBUG((CE_CONT, "%s: remove error %d\n",
520 			filename, rval));
521 	}
522 	return (rval);
523 }
524 
525 static int
526 kfread(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
527 {
528 	ssize_t		resid;
529 	int		err;
530 	ssize_t		n;
531 
532 	ASSERT(modrootloaded);
533 
534 	if (fp->kf_state != 0)
535 		return (fp->kf_state);
536 
537 	err = vn_rdwr(UIO_READ, fp->kf_vp, buf, bufsiz, fp->kf_fpos,
538 		UIO_SYSSPACE, 0, (rlim64_t)0, kcred, &resid);
539 	if (err != 0) {
540 		KFDEBUG((CE_CONT, "%s: read error %d\n",
541 			fp->kf_fname, err));
542 		fp->kf_state = err;
543 		return (err);
544 	}
545 
546 	ASSERT(resid >= 0 && resid <= bufsiz);
547 	n = bufsiz - resid;
548 
549 	KFDEBUG1((CE_CONT, "%s: read %ld bytes ok %ld bufsiz, %ld resid\n",
550 		fp->kf_fname, n, bufsiz, resid));
551 
552 	fp->kf_fpos += n;
553 	*ret_n = n;
554 	return (0);
555 }
556 
/*
 * Write bufsiz bytes to the file at the current position,
 * looping on partial writes until everything is written.
 * On success *ret_n is set to bufsiz.  A prior error on the
 * handle is sticky; a write that makes no progress at all
 * is reported as ENOSPC.  Errors latch into kf_state.
 */
static int
kfwrite(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
{
	rlim64_t	rlimit;
	ssize_t		resid;
	int		err;
	ssize_t		len;
	ssize_t		n = 0;

	ASSERT(modrootloaded);

	if (fp->kf_state != 0)
		return (fp->kf_state);

	len = bufsiz;
	/* Allow the file to grow just past the data being written. */
	rlimit = bufsiz + 1;
	for (;;) {
		err = vn_rdwr(UIO_WRITE, fp->kf_vp, buf, len, fp->kf_fpos,
			UIO_SYSSPACE, FSYNC, rlimit, kcred, &resid);
		if (err) {
			KFDEBUG((CE_CONT, "%s: write error %d\n",
				fp->kf_fname, err));
			fp->kf_state = err;
			return (err);
		}

		KFDEBUG1((CE_CONT, "%s: write %ld bytes ok %ld resid\n",
			fp->kf_fname, len-resid, resid));

		ASSERT(resid >= 0 && resid <= len);

		n += (len - resid);
		if (resid == 0)
			break;

		/* No progress at all: most likely the filesystem is full. */
		if (resid == len) {
			KFDEBUG((CE_CONT, "%s: filesystem full?\n",
				fp->kf_fname));
			fp->kf_state = ENOSPC;
			return (ENOSPC);
		}

		/* Partial write: step past what was written and retry. */
		len -= resid;
		buf += len;
		fp->kf_fpos += len;
		len = resid;
	}

	ASSERT(n == bufsiz);
	KFDEBUG1((CE_CONT, "%s: wrote %ld bytes ok\n", fp->kf_fname, n));

	*ret_n = n;
	return (0);
}
611 
612 
613 static int
614 kfclose(kfile_t *fp)
615 {
616 	int		rval;
617 
618 	KFDEBUG((CE_CONT, "close: %s\n", fp->kf_fname));
619 
620 	if ((fp->kf_vnflags & FWRITE) && fp->kf_state == 0) {
621 		rval = VOP_FSYNC(fp->kf_vp, FSYNC, kcred, NULL);
622 		if (rval != 0) {
623 			nvf_error("%s: sync error %d\n",
624 				fp->kf_fname, rval);
625 		}
626 		KFDEBUG((CE_CONT, "%s: sync ok\n", fp->kf_fname));
627 	}
628 
629 	rval = VOP_CLOSE(fp->kf_vp, fp->kf_vnflags, 1, (offset_t)0, kcred,
630 		NULL);
631 	if (rval != 0) {
632 		if (fp->kf_state == 0) {
633 			nvf_error("%s: close error %d\n",
634 				fp->kf_fname, rval);
635 		}
636 	} else {
637 		if (fp->kf_state == 0)
638 			KFDEBUG((CE_CONT, "%s: close ok\n", fp->kf_fname));
639 	}
640 
641 	VN_RELE(fp->kf_vp);
642 	kmem_free(fp, sizeof (kfile_t));
643 	return (rval);
644 }
645 
646 static int
647 kfrename(char *oldname, char *newname)
648 {
649 	int rval;
650 
651 	ASSERT(modrootloaded);
652 
653 	KFDEBUG((CE_CONT, "renaming %s to %s\n", oldname, newname));
654 
655 	if ((rval = vn_rename(oldname, newname, UIO_SYSSPACE)) != 0) {
656 		KFDEBUG((CE_CONT, "rename %s to %s: %d\n",
657 			oldname, newname, rval));
658 	}
659 
660 	return (rval);
661 }
662 
/*
 * Write an nvlist to a backing store file.
 *
 * The nvlist is packed, prefixed with an nvpf_hdr_t carrying
 * its size and checksums, written to a temporary file
 * ("<filename>.<NEW_FILENAME_SUFFIX>") and, only on a fully
 * successful write, renamed over the target so the previous
 * contents are never left half-replaced.  On any failure the
 * temporary file is removed.  Returns 0 or an errno value.
 */
int
fwrite_nvlist(char *filename, nvlist_t *nvl)
{
	char	*buf;
	char	*nvbuf;
	kfile_t	*fp;
	char	*newname;
	int	len, err, err1;
	size_t	buflen;
	ssize_t	n;

	ASSERT(modrootloaded);

	nvbuf = NULL;
	err = nvlist_pack(nvl, &nvbuf, &buflen, NV_ENCODE_NATIVE, 0);
	if (err != 0) {
		nvf_error("%s: error %d packing nvlist\n",
			filename, err);
		return (err);
	}

	/* Build header + payload in one contiguous buffer. */
	buf = kmem_alloc(sizeof (nvpf_hdr_t) + buflen, KM_SLEEP);
	bzero(buf, sizeof (nvpf_hdr_t));

	((nvpf_hdr_t *)buf)->nvpf_magic = NVPF_HDR_MAGIC;
	((nvpf_hdr_t *)buf)->nvpf_version = NVPF_HDR_VERSION;
	((nvpf_hdr_t *)buf)->nvpf_size = buflen;
	((nvpf_hdr_t *)buf)->nvpf_chksum = nvp_cksum((uchar_t *)nvbuf, buflen);
	/* Header checksum is computed with its own field still zero. */
	((nvpf_hdr_t *)buf)->nvpf_hdr_chksum =
		nvp_cksum((uchar_t *)buf, sizeof (nvpf_hdr_t));

	bcopy(nvbuf, buf + sizeof (nvpf_hdr_t), buflen);
	kmem_free(nvbuf, buflen);
	buflen += sizeof (nvpf_hdr_t);

	len = strlen(filename) + MAX_SUFFIX_LEN + 2;
	newname = kmem_alloc(len, KM_SLEEP);


	(void) sprintf(newname, "%s.%s",
		filename, NEW_FILENAME_SUFFIX);

	/*
	 * To make it unlikely we suffer data loss, write
	 * data to the new temporary file.  Once successful
	 * complete the transaction by renaming the new file
	 * to replace the previous.
	 */

	if ((err = kfcreate(newname, &fp)) == 0) {
		err = kfwrite(fp, buf, buflen, &n);
		if (err) {
			nvf_error("%s: write error - %d\n",
				newname, err);
		} else {
			if (n != buflen) {
				nvf_error(
				    "%s: partial write %ld of %ld bytes\n",
				    newname, n, buflen);
				nvf_error("%s: filesystem may be full?\n",
				    newname);
				err = EIO;
			}
		}
		/* Close regardless; a close failure also fails the write. */
		if ((err1 = kfclose(fp)) != 0) {
			nvf_error("%s: close error\n", newname);
			if (err == 0)
				err = err1;
		}
		/* On any failure, don't leave the temporary file behind. */
		if (err != 0) {
			if (kfremove(newname) != 0) {
				nvf_error("%s: remove failed\n",
				    newname);
			}
		}
	} else {
		/*
		 * NOTE(review): reports the target name although it was
		 * the temporary file that failed to be created.
		 */
		nvf_error("%s: create failed - %d\n", filename, err);
	}

	if (err == 0) {
		if ((err = kfrename(newname, filename)) != 0) {
			nvf_error("%s: rename from %s failed\n",
				newname, filename);
		}
	}

	kmem_free(newname, len);
	kmem_free(buf, buflen);

	return (err);
}
754 
755 static int
756 e_fwrite_nvlist(nvfd_t *nvfd, nvlist_t *nvl)
757 {
758 	int err;
759 
760 	if ((err = fwrite_nvlist(nvfd->nvf_cache_path, nvl)) == 0)
761 		return (DDI_SUCCESS);
762 	else {
763 		if (err == EROFS)
764 			NVF_MARK_READONLY(nvfd);
765 		return (DDI_FAILURE);
766 	}
767 }
768 
769 static void
770 nvp_list_free(nvfd_t *nvf)
771 {
772 	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
773 	(nvf->nvf_list_free)((nvf_handle_t)nvf);
774 	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
775 }
776 
777 /*
778  * Read a file in the nvlist format
779  *	EIO - i/o error during read
780  *	ENOENT - file not found
781  *	EINVAL - file contents corrupted
782  */
783 static int
784 fread_nvp_list(nvfd_t *nvfd)
785 {
786 	nvlist_t	*nvl;
787 	nvpair_t	*nvp;
788 	char		*name;
789 	nvlist_t	*sublist;
790 	int		rval;
791 	int		rv;
792 
793 	ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
794 
795 	rval = fread_nvlist(nvfd->nvf_cache_path, &nvl);
796 	if (rval != 0)
797 		return (rval);
798 	ASSERT(nvl != NULL);
799 
800 	nvp = NULL;
801 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
802 		name = nvpair_name(nvp);
803 		ASSERT(strlen(name) > 0);
804 
805 		switch (nvpair_type(nvp)) {
806 		case DATA_TYPE_NVLIST:
807 			rval = nvpair_value_nvlist(nvp, &sublist);
808 			if (rval != 0) {
809 				nvf_error(
810 				    "nvpair_value_nvlist error %s %d\n",
811 				    name, rval);
812 				goto error;
813 			}
814 
815 			/*
816 			 * unpack nvlist for this device and
817 			 * add elements to data list.
818 			 */
819 			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
820 			rv = (nvfd->nvf_unpack_nvlist)
821 			    ((nvf_handle_t)nvfd, sublist, name);
822 			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
823 			if (rv != 0) {
824 				nvf_error(
825 				    "%s: %s invalid list element\n",
826 				    nvfd->nvf_cache_path, name);
827 				rval = EINVAL;
828 				goto error;
829 			}
830 			break;
831 
832 		default:
833 			nvf_error("%s: %s unsupported data type %d\n",
834 				nvfd->nvf_cache_path, name, nvpair_type(nvp));
835 			rval = EINVAL;
836 			goto error;
837 		}
838 	}
839 
840 	nvlist_free(nvl);
841 
842 	return (0);
843 
844 error:
845 	nvlist_free(nvl);
846 	nvp_list_free(nvfd);
847 	return (rval);
848 }
849 
850 
851 int
852 nvf_read_file(nvf_handle_t nvf_handle)
853 {
854 	nvfd_t *nvfd = (nvfd_t *)nvf_handle;
855 	int rval;
856 
857 	ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
858 
859 	if (kfio_disable_read)
860 		return (0);
861 
862 	KFDEBUG((CE_CONT, "reading %s\n", nvfd->nvf_cache_path));
863 
864 	rval = fread_nvp_list(nvfd);
865 	if (rval) {
866 		switch (rval) {
867 		case EIO:
868 			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
869 			cmn_err(CE_WARN, "%s: I/O error",
870 				nvfd->nvf_cache_path);
871 			break;
872 		case ENOENT:
873 			nvfd->nvf_flags |= NVF_F_CREATE_MSG;
874 			nvf_error("%s: not found\n",
875 				nvfd->nvf_cache_path);
876 			break;
877 		case EINVAL:
878 		default:
879 			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
880 			cmn_err(CE_WARN, "%s: data file corrupted",
881 				nvfd->nvf_cache_path);
882 			break;
883 		}
884 	}
885 	return (rval);
886 }
887 
888 static void
889 nvf_write_is_complete(nvfd_t *fd)
890 {
891 	if (fd->nvf_write_complete) {
892 		(fd->nvf_write_complete)((nvf_handle_t)fd);
893 	}
894 }
895 
/*ARGSUSED*/
/*
 * Flush timer expiration.  nvpticks is pushed out by every
 * nvf_wake_daemon() call, so if updates have kept arriving
 * the timeout is simply re-armed rather than signaling the
 * daemon — continuous updates never cause continuous writes.
 */
static void
nvpflush_timeout(void *arg)
{
	clock_t nticks;

	mutex_enter(&nvpflush_lock);
	nticks = nvpticks - ddi_get_lbolt();
	if (nticks > 4) {
		/* Target time still in the future; retry then. */
		nvpflush_timer_busy = 1;
		mutex_exit(&nvpflush_lock);
		/* NOTE(review): nvpflush_id stored after dropping the lock */
		nvpflush_id = timeout(nvpflush_timeout, NULL, nticks);
	} else {
		do_nvpflush = 1;
		NVPDAEMON_DEBUG((CE_CONT, "signal nvpdaemon\n"));
		cv_signal(&nvpflush_cv);
		nvpflush_id = 0;
		nvpflush_timer_busy = 0;
		mutex_exit(&nvpflush_lock);
	}
}
917 
918 /*
919  * After marking a list as dirty, wake the nvpflush daemon
920  * to perform the update.
921  */
922 void
923 nvf_wake_daemon(void)
924 {
925 	clock_t nticks;
926 
927 	/*
928 	 * If the system isn't up yet
929 	 * don't even think about starting a flush.
930 	 */
931 	if (!i_ddi_io_initialized())
932 		return;
933 
934 	mutex_enter(&nvpflush_lock);
935 
936 	if (nvpflush_daemon_active == 0) {
937 		nvpflush_daemon_active = 1;
938 		mutex_exit(&nvpflush_lock);
939 		NVPDAEMON_DEBUG((CE_CONT, "starting nvpdaemon thread\n"));
940 		nvpflush_thr_id = thread_create(NULL, 0,
941 		    (void (*)())nvpflush_daemon,
942 		    NULL, 0, &p0, TS_RUN, minclsyspri);
943 		mutex_enter(&nvpflush_lock);
944 	}
945 
946 	nticks = nvpflush_delay * TICKS_PER_SECOND;
947 	nvpticks = ddi_get_lbolt() + nticks;
948 	if (nvpflush_timer_busy == 0) {
949 		nvpflush_timer_busy = 1;
950 		mutex_exit(&nvpflush_lock);
951 		nvpflush_id = timeout(nvpflush_timeout, NULL, nticks + 4);
952 	} else
953 		mutex_exit(&nvpflush_lock);
954 }
955 
956 static int
957 nvpflush_one(nvfd_t *nvfd)
958 {
959 	int rval = DDI_SUCCESS;
960 	nvlist_t *nvl;
961 
962 	rw_enter(&nvfd->nvf_lock, RW_READER);
963 
964 	ASSERT((nvfd->nvf_flags & NVF_F_FLUSHING) == 0);
965 
966 	if (!NVF_IS_DIRTY(nvfd) ||
967 	    NVF_IS_READONLY(nvfd) || kfio_disable_write) {
968 		NVF_CLEAR_DIRTY(nvfd);
969 		rw_exit(&nvfd->nvf_lock);
970 		return (DDI_SUCCESS);
971 	}
972 
973 	if (rw_tryupgrade(&nvfd->nvf_lock) == 0) {
974 		nvf_error("nvpflush: "
975 		    "%s rw upgrade failed\n", nvfd->nvf_cache_path);
976 		rw_exit(&nvfd->nvf_lock);
977 		return (DDI_FAILURE);
978 	}
979 	if (((nvfd->nvf_pack_list)
980 	    ((nvf_handle_t)nvfd, &nvl)) != DDI_SUCCESS) {
981 		nvf_error("nvpflush: "
982 		    "%s nvlist construction failed\n", nvfd->nvf_cache_path);
983 		ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
984 		rw_exit(&nvfd->nvf_lock);
985 		return (DDI_FAILURE);
986 	}
987 	ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
988 
989 	NVF_CLEAR_DIRTY(nvfd);
990 	nvfd->nvf_flags |= NVF_F_FLUSHING;
991 	rw_exit(&nvfd->nvf_lock);
992 
993 	rval = e_fwrite_nvlist(nvfd, nvl);
994 	nvlist_free(nvl);
995 
996 	rw_enter(&nvfd->nvf_lock, RW_WRITER);
997 	nvfd->nvf_flags &= ~NVF_F_FLUSHING;
998 	if (rval == DDI_FAILURE) {
999 		if (NVF_IS_READONLY(nvfd)) {
1000 			rval = DDI_SUCCESS;
1001 			nvfd->nvf_flags &= ~(NVF_F_ERROR | NVF_F_DIRTY);
1002 		} else if ((nvfd->nvf_flags & NVF_F_ERROR) == 0) {
1003 			cmn_err(CE_CONT,
1004 			    "%s: updated failed\n", nvfd->nvf_cache_path);
1005 			nvfd->nvf_flags |= NVF_F_ERROR | NVF_F_DIRTY;
1006 		}
1007 	} else {
1008 		if (nvfd->nvf_flags & NVF_F_CREATE_MSG) {
1009 			cmn_err(CE_CONT,
1010 			    "!Creating %s\n", nvfd->nvf_cache_path);
1011 			nvfd->nvf_flags &= ~NVF_F_CREATE_MSG;
1012 		}
1013 		if (nvfd->nvf_flags & NVF_F_REBUILD_MSG) {
1014 			cmn_err(CE_CONT,
1015 			    "!Rebuilding %s\n", nvfd->nvf_cache_path);
1016 			nvfd->nvf_flags &= ~NVF_F_REBUILD_MSG;
1017 		}
1018 		if (nvfd->nvf_flags & NVF_F_ERROR) {
1019 			cmn_err(CE_CONT,
1020 			    "%s: update now ok\n", nvfd->nvf_cache_path);
1021 			nvfd->nvf_flags &= ~NVF_F_ERROR;
1022 		}
1023 		/*
1024 		 * The file may need to be flushed again if the cached
1025 		 * data was touched while writing the earlier contents.
1026 		 */
1027 		if (NVF_IS_DIRTY(nvfd))
1028 			rval = DDI_FAILURE;
1029 	}
1030 
1031 	rw_exit(&nvfd->nvf_lock);
1032 	return (rval);
1033 }
1034 
1035 
/*
 * The nvpflush daemon thread.  Sleeps until signaled by the
 * flush timer, then writes out every dirty registered file,
 * rescheduling itself on failure or if data is re-dirtied
 * during the write.  Exits after nvpdaemon_idle_time seconds
 * with no work; nvf_wake_daemon() recreates it on demand.
 * CPR (suspend/resume) safe via the callb framework.
 */
static void
nvpflush_daemon(void)
{
	callb_cpr_t cprinfo;
	nvfd_t *nvfdp, *nextfdp;
	clock_t clk;
	int rval;
	int want_wakeup;
	int is_now_clean;

	ASSERT(modrootloaded);

	nvpflush_thread = curthread;
	NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: init\n"));

	CALLB_CPR_INIT(&cprinfo, &nvpflush_lock, callb_generic_cpr, "nvp");
	mutex_enter(&nvpflush_lock);
	for (;;) {

		/* Idle-wait for the timer to signal a flush. */
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		while (do_nvpflush == 0) {
			clk = cv_timedwait(&nvpflush_cv, &nvpflush_lock,
			    ddi_get_lbolt() +
				(nvpdaemon_idle_time * TICKS_PER_SECOND));
			/*
			 * Timed out with no flush pending and no timer
			 * armed: the daemon exits; nvf_wake_daemon()
			 * will recreate it when next needed.
			 */
			if (clk == -1 &&
			    do_nvpflush == 0 && nvpflush_timer_busy == 0) {
				/*
				 * Note that CALLB_CPR_EXIT calls mutex_exit()
				 * on the lock passed in to CALLB_CPR_INIT,
				 * so the lock must be held when invoking it.
				 */
				CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);
				NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: exit\n"));
				ASSERT(mutex_owned(&nvpflush_lock));
				nvpflush_thr_id = NULL;
				nvpflush_daemon_active = 0;
				CALLB_CPR_EXIT(&cprinfo);
				thread_exit();
			}
		}
		CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);

		nvpbusy = 1;
		want_wakeup = 0;
		do_nvpflush = 0;
		mutex_exit(&nvpflush_lock);

		/*
		 * Try flushing what's dirty, reschedule if there's
		 * a failure or data gets marked as dirty again.
		 * First move each file marked dirty to the dirty
		 * list to avoid locking the list across the write.
		 */
		mutex_enter(&nvf_cache_mutex);
		for (nvfdp = list_head(&nvf_cache_files);
		    nvfdp; nvfdp = nextfdp) {
			nextfdp = list_next(&nvf_cache_files, nvfdp);
			rw_enter(&nvfdp->nvf_lock, RW_READER);
			if (NVF_IS_DIRTY(nvfdp)) {
				list_remove(&nvf_cache_files, nvfdp);
				list_insert_tail(&nvf_dirty_files, nvfdp);
				rw_exit(&nvfdp->nvf_lock);
			} else {
				NVPDAEMON_DEBUG((CE_CONT,
				    "nvpdaemon: not dirty %s\n",
				    nvfdp->nvf_cache_path));
				rw_exit(&nvfdp->nvf_lock);
			}
		}
		mutex_exit(&nvf_cache_mutex);

		/*
		 * Now go through the dirty list
		 */
		for (nvfdp = list_head(&nvf_dirty_files);
		    nvfdp; nvfdp = nextfdp) {
			nextfdp = list_next(&nvf_dirty_files, nvfdp);

			is_now_clean = 0;
			rw_enter(&nvfdp->nvf_lock, RW_READER);
			if (NVF_IS_DIRTY(nvfdp)) {
				NVPDAEMON_DEBUG((CE_CONT,
				    "nvpdaemon: flush %s\n",
				    nvfdp->nvf_cache_path));
				rw_exit(&nvfdp->nvf_lock);
				rval = nvpflush_one(nvfdp);
				rw_enter(&nvfdp->nvf_lock, RW_READER);
				/*
				 * Flush failed or the data was dirtied
				 * again while writing: leave the file on
				 * the dirty list and schedule a retry.
				 */
				if (rval != DDI_SUCCESS ||
				    NVF_IS_DIRTY(nvfdp)) {
					rw_exit(&nvfdp->nvf_lock);
					NVPDAEMON_DEBUG((CE_CONT,
					    "nvpdaemon: %s dirty again\n",
					    nvfdp->nvf_cache_path));
					want_wakeup = 1;
				} else {
					rw_exit(&nvfdp->nvf_lock);
					nvf_write_is_complete(nvfdp);
					is_now_clean = 1;
				}
			} else {
				NVPDAEMON_DEBUG((CE_CONT,
				    "nvpdaemon: not dirty %s\n",
				    nvfdp->nvf_cache_path));
				rw_exit(&nvfdp->nvf_lock);
				is_now_clean = 1;
			}

			/* Clean files migrate back to the primary list. */
			if (is_now_clean) {
				mutex_enter(&nvf_cache_mutex);
				list_remove(&nvf_dirty_files, nvfdp);
				list_insert_tail(&nvf_cache_files,
				    nvfdp);
				mutex_exit(&nvf_cache_mutex);
			}
		}

		if (want_wakeup)
			nvf_wake_daemon();

		mutex_enter(&nvpflush_lock);
		nvpbusy = 0;
	}
}
1159