xref: /titanic_52/usr/src/uts/common/os/devcache.c (revision 4f364e7c95ee7fd9d5bbeddc1940e92405bb0e72)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/note.h>
27 #include <sys/t_lock.h>
28 #include <sys/cmn_err.h>
29 #include <sys/instance.h>
30 #include <sys/conf.h>
31 #include <sys/stat.h>
32 #include <sys/ddi.h>
33 #include <sys/hwconf.h>
34 #include <sys/sunddi.h>
35 #include <sys/sunndi.h>
36 #include <sys/ddi_impldefs.h>
37 #include <sys/ndi_impldefs.h>
38 #include <sys/modctl.h>
39 #include <sys/dacf.h>
40 #include <sys/promif.h>
41 #include <sys/cpuvar.h>
42 #include <sys/pathname.h>
43 #include <sys/kobj.h>
44 #include <sys/devcache.h>
45 #include <sys/devcache_impl.h>
46 #include <sys/sysmacros.h>
47 #include <sys/varargs.h>
48 #include <sys/callb.h>
49 
50 /*
51  * This facility provides interfaces to clients to register,
52  * read and update cache data in persisted backing store files,
53  * usually in /etc/devices.  The data persisted through this
54  * mechanism should be stateless data, functioning in the sense
55  * of a cache.  Writes are performed by a background daemon
56  * thread, permitting a client to schedule an update without
57  * blocking, then continue updating the data state in
58  * parallel.  The data is only locked by the daemon thread
59  * to pack the data in preparation for the write.
60  *
61  * Data persisted through this mechanism should be capable
62  * of being regenerated through normal system operation,
63  * for example attaching all disk devices would cause all
64  * devids to be registered for those devices.  By caching
65  * a devid-device tuple, the system can operate in a
66  * more optimal way, directly attaching the device mapped
67  * to a devid, rather than burdensomely driving attach of
68  * the entire device tree to discover a single device.
69  *
70  * Note that a client should only need to include
71  * <sys/devcache.h> for the supported interfaces.
72  *
73  * The data per client is entirely within the control of
74  * the client.  When reading, data unpacked from the backing
75  * store should be inserted in the list.  The pointer to
76  * the list can be retrieved via nvf_list().  When writing,
77  * the data on the list is to be packed and returned to the
78  * nvpdaemon as an nvlist.
79  *
80  * Obvious restrictions are imposed by the limits of the
81  * nvlist format.  The data cannot be read or written
82  * piecemeal, and large amounts of data aren't recommended.
83  * However, nvlists do allow that data be named and typed
84  * and can be size-of-int invariant, and the cached data
85  * can be versioned conveniently.
86  *
87  * The registration involves two steps: a handle is
88  * allocated by calling the registration function.
89  * This sets up the data referenced by the handle and
90  * initializes the lock.  Following registration, the
91  * client must initialize the data list.  The list
92  * interfaces require that the list element with offset
93  * to the node link be provided.  The format of the
94  * list element is under the control of the client.
95  *
96  * Locking: the address of the data list r/w lock provided
97  * can be accessed with nvf_lock().  The lock must be held
98  * as reader when traversing the list or checking state,
99  * such as nvf_is_dirty().  The lock must be held as
100  * writer when updating the list or marking it dirty.
101  * The lock must not be held when waking the daemon.
102  *
103  * The data r/w lock is held as writer when the pack,
104  * unpack and free list handlers are called.  The
105  * lock should not be dropped and must be still held
106  * upon return.  The client should also hold the lock
107  * as reader when checking if the list is dirty, and
108  * as writer when marking the list dirty or initiating
109  * a read.
110  *
111  * The asynchronous nature of updates allows for the
112  * possibility that the data may continue to be updated
113  * once the daemon has been notified that an update is
114  * desired.  The data only needs to be locked against
115  * updates when packing the data into the form to be
116  * written.  When the write of the packed data has
117  * completed, the daemon will automatically reschedule
118  * an update if the data was marked dirty after the
119  * point at which it was packed.  Before beginning an
120  * update, the daemon attempts to lock the data as
121  * writer; if the writer lock is already held, it
122  * backs off and retries later.  The model is to give
123  * priority to the kernel processes generating the
124  * data, and that the nature of the data is that
125  * it does not change often, can be re-generated when
126  * needed, so updates should not happen often and
127  * can be delayed until the data stops changing.
128  * The client may update the list or mark it dirty
129  * any time it is able to acquire the lock as
130  * writer first.
131  *
132  * A failed write will be retried after some delay,
133  * in the hope that the cause of the error will be
134  * transient, for example a filesystem with no space
135  * available.  An update on a read-only filesystem
136  * is failed silently and not retried; this would be
137  * the case when booted off install media.
138  *
139  * There is no unregister mechanism as of yet, as it
140  * hasn't been needed so far.
141  */
142 
143 /*
144  * Global list of files registered and updated by the nvpflush
145  * daemon, protected by the nvf_cache_mutex.  While an
146  * update is taking place, a file is temporarily moved to
147  * the dirty list to avoid locking the primary list for
148  * the duration of the update.
149  */
/* Registered cache files that are not currently queued for a flush. */
list_t		nvf_cache_files;
/* Files the flush daemon has moved aside while it writes them out. */
list_t		nvf_dirty_files;
/* Held around every insert into, and removal from, nvf_cache_files. */
kmutex_t	nvf_cache_mutex;
153 
154 
155 /*
156  * Allow some delay from an update of the data before flushing
157  * to permit simultaneous updates of multiple changes.
 * Changes in the data are expected to be bursty, e.g. a
 * reconfig or the hot-plug of a new adapter.
160  *
161  * kfio_report_error (default 0)
162  *	Set to 1 to enable some error messages related to low-level
163  *	kernel file i/o operations.
164  *
165  * nvpflush_delay (default 10)
166  *	The number of seconds after data is marked dirty before the
167  *	flush daemon is triggered to flush the data.  A longer period
168  *	of time permits more data updates per write.  Note that
169  *	every update resets the timer so no repository write will
170  *	occur while data is being updated continuously.
171  *
172  * nvpdaemon_idle_time (default 60)
173  *	The number of seconds the daemon will sleep idle before exiting.
174  *
175  */
/* Default values, in seconds, for nvpflush_delay and nvpdaemon_idle_time. */
#define	NVPFLUSH_DELAY		10
#define	NVPDAEMON_IDLE_TIME	60

/* Number of clock ticks in one second (1000000 usec). */
#define	TICKS_PER_SECOND	(drv_usectohz(1000000))

/*
 * Tunables
 */
int kfio_report_error = 0;		/* nonzero: nvf_error() emits notes */
int kfio_disable_read = 0;		/* disable all reads */
int kfio_disable_write = 0;		/* disable all writes */

int nvpflush_delay	= NVPFLUSH_DELAY;
int nvpdaemon_idle_time	= NVPDAEMON_IDLE_TIME;

/*
 * Flush timer and daemon thread bookkeeping.
 *	nvpflush_id		id returned by the latest timeout() that
 *				armed nvpflush_timeout(); reset to 0 when
 *				the timer signals the daemon
 *	nvpflush_timer_busy	nonzero while a flush timeout is pending
 *	nvpflush_daemon_active	nonzero from the decision to create the
 *				daemon thread until the daemon exits
 *	nvpflush_thr_id		thread_create() result for the daemon,
 *				cleared when the daemon exits
 */
static timeout_id_t	nvpflush_id = 0;
static int		nvpflush_timer_busy = 0;
static int		nvpflush_daemon_active = 0;
static kthread_t	*nvpflush_thr_id = 0;

/*
 * Daemon wakeup state.
 *	do_nvpflush	set by nvpflush_timeout() to request a flush pass,
 *			cleared by the daemon when it starts one
 *	nvpbusy		nonzero while the daemon is inside a flush pass
 *	nvpflush_lock	mutex taken around updates to the timer/daemon
 *			flags above; also the mutex paired with nvpflush_cv
 *	nvpflush_cv	condition variable the daemon sleeps on
 *	nvpflush_thread	the daemon's own curthread, recorded at startup
 *	nvpticks	lbolt value at which the next flush is due
 */
static int		do_nvpflush = 0;
static int		nvpbusy = 0;
static kmutex_t		nvpflush_lock;
static kcondvar_t	nvpflush_cv;
static kthread_id_t	nvpflush_thread;
static clock_t		nvpticks;

static void nvpflush_daemon(void);

#ifdef	DEBUG
/* Debug-only switches; presumably consulted by the debug macros - confirm. */
int nvpdaemon_debug = 0;
int kfio_debug = 0;
#endif	/* DEBUG */

/* Defined outside this file. */
extern int modrootloaded;
extern void mdi_read_devices_files(void);
extern void mdi_clean_vhcache(void);
extern int sys_shutdown;
214 
215 /*
216  * Initialize the overall cache file management
217  */
218 void
219 i_ddi_devices_init(void)
220 {
221 	list_create(&nvf_cache_files, sizeof (nvfd_t),
222 	    offsetof(nvfd_t, nvf_link));
223 	list_create(&nvf_dirty_files, sizeof (nvfd_t),
224 	    offsetof(nvfd_t, nvf_link));
225 	mutex_init(&nvf_cache_mutex, NULL, MUTEX_DEFAULT, NULL);
226 	retire_store_init();
227 	devid_cache_init();
228 }
229 
230 /*
231  * Read cache files
232  * The files read here should be restricted to those
233  * that may be required to mount root.
234  */
235 void
236 i_ddi_read_devices_files(void)
237 {
238 	/*
239 	 * The retire store should be the first file read as it
240 	 * may need to offline devices. kfio_disable_read is not
241 	 * used for retire. For the rationale see the tunable
242 	 * ddi_retire_store_bypass and comments in:
243 	 *	uts/common/os/retire_store.c
244 	 */
245 
246 	retire_store_read();
247 
248 	if (!kfio_disable_read) {
249 		mdi_read_devices_files();
250 		devid_cache_read();
251 	}
252 }
253 
254 void
255 i_ddi_start_flush_daemon(void)
256 {
257 	nvfd_t	*nvfdp;
258 
259 	ASSERT(i_ddi_io_initialized());
260 
261 	mutex_init(&nvpflush_lock, NULL, MUTEX_DRIVER, NULL);
262 	cv_init(&nvpflush_cv, NULL, CV_DRIVER, NULL);
263 
264 	mutex_enter(&nvf_cache_mutex);
265 	for (nvfdp = list_head(&nvf_cache_files); nvfdp;
266 	    nvfdp = list_next(&nvf_cache_files, nvfdp)) {
267 		if (NVF_IS_DIRTY(nvfdp)) {
268 			nvf_wake_daemon();
269 			break;
270 		}
271 	}
272 	mutex_exit(&nvf_cache_mutex);
273 }
274 
/*
 * Run the cleanup entry points of the cache clients: first
 * devid_cache_cleanup(), then mdi_clean_vhcache().
 */
void
i_ddi_clean_devices_files(void)
{
	devid_cache_cleanup();
	mdi_clean_vhcache();
}
281 
282 /*
283  * Register a cache file to be managed and updated by the nvpflush daemon.
284  * All operations are performed through the returned handle.
285  * There is no unregister mechanism for now.
286  */
287 nvf_handle_t
288 nvf_register_file(nvf_ops_t *ops)
289 {
290 	nvfd_t *nvfdp;
291 
292 	nvfdp = kmem_zalloc(sizeof (*nvfdp), KM_SLEEP);
293 
294 	nvfdp->nvf_ops = ops;
295 	nvfdp->nvf_flags = 0;
296 	rw_init(&nvfdp->nvf_lock, NULL, RW_DRIVER, NULL);
297 
298 	mutex_enter(&nvf_cache_mutex);
299 	list_insert_tail(&nvf_cache_files, nvfdp);
300 	mutex_exit(&nvf_cache_mutex);
301 
302 	return ((nvf_handle_t)nvfdp);
303 }
304 
305 /*PRINTFLIKE1*/
306 void
307 nvf_error(const char *fmt, ...)
308 {
309 	va_list ap;
310 
311 	if (kfio_report_error) {
312 		va_start(ap, fmt);
313 		vcmn_err(CE_NOTE, fmt, ap);
314 		va_end(ap);
315 	}
316 }
317 
318 /*
319  * Some operations clients may use to manage the data
320  * to be persisted in a cache file.
321  */
322 char *
323 nvf_cache_name(nvf_handle_t handle)
324 {
325 	return (((nvfd_t *)handle)->nvf_cache_path);
326 }
327 
328 krwlock_t *
329 nvf_lock(nvf_handle_t handle)
330 {
331 	return (&(((nvfd_t *)handle)->nvf_lock));
332 }
333 
334 list_t *
335 nvf_list(nvf_handle_t handle)
336 {
337 	return (&(((nvfd_t *)handle)->nvf_data_list));
338 }
339 
340 void
341 nvf_mark_dirty(nvf_handle_t handle)
342 {
343 	ASSERT(RW_WRITE_HELD(&(((nvfd_t *)handle)->nvf_lock)));
344 	NVF_MARK_DIRTY((nvfd_t *)handle);
345 }
346 
347 int
348 nvf_is_dirty(nvf_handle_t handle)
349 {
350 	ASSERT(RW_LOCK_HELD(&(((nvfd_t *)handle)->nvf_lock)));
351 	return (NVF_IS_DIRTY((nvfd_t *)handle));
352 }
353 
/*
 * Compute a 16-bit XOR checksum over buflen bytes at buf.
 * The buffer is read as native 16-bit words; when buflen is odd
 * the last byte is folded in on its own as the initial value.
 */
static uint16_t
nvp_cksum(uchar_t *buf, int64_t buflen)
{
	uint16_t	*words = (uint16_t *)buf;
	uint16_t	sum = 0;
	int64_t		nwords;
	int64_t		i;

	/* Peel off an odd trailing byte so the rest is whole words. */
	if (buflen & 0x01)
		sum = buf[--buflen];

	nwords = buflen / 2;
	for (i = 0; i < nwords; i++)
		sum ^= words[i];

	return (sum);
}
370 
371 int
372 fread_nvlist(char *filename, nvlist_t **ret_nvlist)
373 {
374 	struct _buf	*file;
375 	nvpf_hdr_t	hdr;
376 	char		*buf;
377 	nvlist_t	*nvl;
378 	int		rval;
379 	uint_t		offset;
380 	int		n;
381 	char		c;
382 	uint16_t	cksum, hdrsum;
383 
384 	*ret_nvlist = NULL;
385 
386 	file = kobj_open_file(filename);
387 	if (file == (struct _buf *)-1) {
388 		KFDEBUG((CE_CONT, "cannot open file: %s\n", filename));
389 		return (ENOENT);
390 	}
391 
392 	offset = 0;
393 	n = kobj_read_file(file, (char *)&hdr, sizeof (hdr), offset);
394 	if (n != sizeof (hdr)) {
395 		kobj_close_file(file);
396 		if (n < 0) {
397 			nvf_error("error reading header: %s\n", filename);
398 			return (EIO);
399 		} else if (n == 0) {
400 			KFDEBUG((CE_CONT, "file empty: %s\n", filename));
401 		} else {
402 			nvf_error("header size incorrect: %s\n", filename);
403 		}
404 		return (EINVAL);
405 	}
406 	offset += n;
407 
408 	KFDEBUG2((CE_CONT, "nvpf_magic: 0x%x\n", hdr.nvpf_magic));
409 	KFDEBUG2((CE_CONT, "nvpf_version: %d\n", hdr.nvpf_version));
410 	KFDEBUG2((CE_CONT, "nvpf_size: %lld\n",
411 	    (longlong_t)hdr.nvpf_size));
412 	KFDEBUG2((CE_CONT, "nvpf_hdr_chksum: 0x%x\n",
413 	    hdr.nvpf_hdr_chksum));
414 	KFDEBUG2((CE_CONT, "nvpf_chksum: 0x%x\n", hdr.nvpf_chksum));
415 
416 	cksum = hdr.nvpf_hdr_chksum;
417 	hdr.nvpf_hdr_chksum = 0;
418 	hdrsum = nvp_cksum((uchar_t *)&hdr, sizeof (hdr));
419 
420 	if (hdr.nvpf_magic != NVPF_HDR_MAGIC ||
421 	    hdr.nvpf_version != NVPF_HDR_VERSION || hdrsum != cksum) {
422 		kobj_close_file(file);
423 		if (hdrsum != cksum) {
424 			nvf_error("%s: checksum error "
425 			    "(actual 0x%x, expected 0x%x)\n",
426 			    filename, hdrsum, cksum);
427 		}
428 		nvf_error("%s: header information incorrect", filename);
429 		return (EINVAL);
430 	}
431 
432 	ASSERT(hdr.nvpf_size >= 0);
433 
434 	buf = kmem_alloc(hdr.nvpf_size, KM_SLEEP);
435 	n = kobj_read_file(file, buf, hdr.nvpf_size, offset);
436 	if (n != hdr.nvpf_size) {
437 		kmem_free(buf, hdr.nvpf_size);
438 		kobj_close_file(file);
439 		if (n < 0) {
440 			nvf_error("%s: read error %d", filename, n);
441 		} else {
442 			nvf_error("%s: incomplete read %d/%lld",
443 			    filename, n, (longlong_t)hdr.nvpf_size);
444 		}
445 		return (EINVAL);
446 	}
447 	offset += n;
448 
449 	rval = kobj_read_file(file, &c, 1, offset);
450 	kobj_close_file(file);
451 	if (rval > 0) {
452 		nvf_error("%s is larger than %lld\n",
453 		    filename, (longlong_t)hdr.nvpf_size);
454 		kmem_free(buf, hdr.nvpf_size);
455 		return (EINVAL);
456 	}
457 
458 	cksum = nvp_cksum((uchar_t *)buf, hdr.nvpf_size);
459 	if (hdr.nvpf_chksum != cksum) {
460 		nvf_error("%s: checksum error (actual 0x%x, expected 0x%x)\n",
461 		    filename, hdr.nvpf_chksum, cksum);
462 		kmem_free(buf, hdr.nvpf_size);
463 		return (EINVAL);
464 	}
465 
466 	nvl = NULL;
467 	rval = nvlist_unpack(buf, hdr.nvpf_size, &nvl, 0);
468 	if (rval != 0) {
469 		nvf_error("%s: error %d unpacking nvlist\n",
470 		    filename, rval);
471 		kmem_free(buf, hdr.nvpf_size);
472 		return (EINVAL);
473 	}
474 
475 	kmem_free(buf, hdr.nvpf_size);
476 	*ret_nvlist = nvl;
477 	return (0);
478 }
479 
480 static int
481 kfcreate(char *filename, kfile_t **kfilep)
482 {
483 	kfile_t	*fp;
484 	int	rval;
485 
486 	ASSERT(modrootloaded);
487 
488 	fp = kmem_alloc(sizeof (kfile_t), KM_SLEEP);
489 
490 	fp->kf_vnflags = FCREAT | FWRITE | FTRUNC;
491 	fp->kf_fname = filename;
492 	fp->kf_fpos = 0;
493 	fp->kf_state = 0;
494 
495 	KFDEBUG((CE_CONT, "create: %s flags 0x%x\n",
496 	    filename, fp->kf_vnflags));
497 	rval = vn_open(filename, UIO_SYSSPACE, fp->kf_vnflags,
498 	    0444, &fp->kf_vp, CRCREAT, 0);
499 	if (rval != 0) {
500 		kmem_free(fp, sizeof (kfile_t));
501 		KFDEBUG((CE_CONT, "%s: create error %d\n",
502 		    filename, rval));
503 		return (rval);
504 	}
505 
506 	*kfilep = fp;
507 	return (0);
508 }
509 
510 static int
511 kfremove(char *filename)
512 {
513 	int rval;
514 
515 	KFDEBUG((CE_CONT, "remove: %s\n", filename));
516 	rval = vn_remove(filename, UIO_SYSSPACE, RMFILE);
517 	if (rval != 0) {
518 		KFDEBUG((CE_CONT, "%s: remove error %d\n",
519 		    filename, rval));
520 	}
521 	return (rval);
522 }
523 
524 static int
525 kfread(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
526 {
527 	ssize_t		resid;
528 	int		err;
529 	ssize_t		n;
530 
531 	ASSERT(modrootloaded);
532 
533 	if (fp->kf_state != 0)
534 		return (fp->kf_state);
535 
536 	err = vn_rdwr(UIO_READ, fp->kf_vp, buf, bufsiz, fp->kf_fpos,
537 	    UIO_SYSSPACE, 0, (rlim64_t)0, kcred, &resid);
538 	if (err != 0) {
539 		KFDEBUG((CE_CONT, "%s: read error %d\n",
540 		    fp->kf_fname, err));
541 		fp->kf_state = err;
542 		return (err);
543 	}
544 
545 	ASSERT(resid >= 0 && resid <= bufsiz);
546 	n = bufsiz - resid;
547 
548 	KFDEBUG1((CE_CONT, "%s: read %ld bytes ok %ld bufsiz, %ld resid\n",
549 	    fp->kf_fname, n, bufsiz, resid));
550 
551 	fp->kf_fpos += n;
552 	*ret_n = n;
553 	return (0);
554 }
555 
556 static int
557 kfwrite(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
558 {
559 	rlim64_t	rlimit;
560 	ssize_t		resid;
561 	int		err;
562 	ssize_t		len;
563 	ssize_t		n = 0;
564 
565 	ASSERT(modrootloaded);
566 
567 	if (fp->kf_state != 0)
568 		return (fp->kf_state);
569 
570 	len = bufsiz;
571 	rlimit = bufsiz + 1;
572 	for (;;) {
573 		err = vn_rdwr(UIO_WRITE, fp->kf_vp, buf, len, fp->kf_fpos,
574 		    UIO_SYSSPACE, FSYNC, rlimit, kcred, &resid);
575 		if (err) {
576 			KFDEBUG((CE_CONT, "%s: write error %d\n",
577 			    fp->kf_fname, err));
578 			fp->kf_state = err;
579 			return (err);
580 		}
581 
582 		KFDEBUG1((CE_CONT, "%s: write %ld bytes ok %ld resid\n",
583 		    fp->kf_fname, len-resid, resid));
584 
585 		ASSERT(resid >= 0 && resid <= len);
586 
587 		n += (len - resid);
588 		if (resid == 0)
589 			break;
590 
591 		if (resid == len) {
592 			KFDEBUG((CE_CONT, "%s: filesystem full?\n",
593 			    fp->kf_fname));
594 			fp->kf_state = ENOSPC;
595 			return (ENOSPC);
596 		}
597 
598 		len -= resid;
599 		buf += len;
600 		fp->kf_fpos += len;
601 		len = resid;
602 	}
603 
604 	ASSERT(n == bufsiz);
605 	KFDEBUG1((CE_CONT, "%s: wrote %ld bytes ok\n", fp->kf_fname, n));
606 
607 	*ret_n = n;
608 	return (0);
609 }
610 
611 
612 static int
613 kfclose(kfile_t *fp)
614 {
615 	int		rval;
616 
617 	KFDEBUG((CE_CONT, "close: %s\n", fp->kf_fname));
618 
619 	if ((fp->kf_vnflags & FWRITE) && fp->kf_state == 0) {
620 		rval = VOP_FSYNC(fp->kf_vp, FSYNC, kcred, NULL);
621 		if (rval != 0) {
622 			nvf_error("%s: sync error %d\n",
623 			    fp->kf_fname, rval);
624 		}
625 		KFDEBUG((CE_CONT, "%s: sync ok\n", fp->kf_fname));
626 	}
627 
628 	rval = VOP_CLOSE(fp->kf_vp, fp->kf_vnflags, 1,
629 	    (offset_t)0, kcred, NULL);
630 	if (rval != 0) {
631 		if (fp->kf_state == 0) {
632 			nvf_error("%s: close error %d\n",
633 			    fp->kf_fname, rval);
634 		}
635 	} else {
636 		if (fp->kf_state == 0)
637 			KFDEBUG((CE_CONT, "%s: close ok\n", fp->kf_fname));
638 	}
639 
640 	VN_RELE(fp->kf_vp);
641 	kmem_free(fp, sizeof (kfile_t));
642 	return (rval);
643 }
644 
645 static int
646 kfrename(char *oldname, char *newname)
647 {
648 	int rval;
649 
650 	ASSERT(modrootloaded);
651 
652 	KFDEBUG((CE_CONT, "renaming %s to %s\n", oldname, newname));
653 
654 	if ((rval = vn_rename(oldname, newname, UIO_SYSSPACE)) != 0) {
655 		KFDEBUG((CE_CONT, "rename %s to %s: %d\n",
656 		    oldname, newname, rval));
657 	}
658 
659 	return (rval);
660 }
661 
662 int
663 fwrite_nvlist(char *filename, nvlist_t *nvl)
664 {
665 	char	*buf;
666 	char	*nvbuf;
667 	kfile_t	*fp;
668 	char	*newname;
669 	int	len, err, err1;
670 	size_t	buflen;
671 	ssize_t	n;
672 
673 	ASSERT(modrootloaded);
674 
675 	nvbuf = NULL;
676 	err = nvlist_pack(nvl, &nvbuf, &buflen, NV_ENCODE_NATIVE, 0);
677 	if (err != 0) {
678 		nvf_error("%s: error %d packing nvlist\n",
679 		    filename, err);
680 		return (err);
681 	}
682 
683 	buf = kmem_alloc(sizeof (nvpf_hdr_t) + buflen, KM_SLEEP);
684 	bzero(buf, sizeof (nvpf_hdr_t));
685 
686 	((nvpf_hdr_t *)buf)->nvpf_magic = NVPF_HDR_MAGIC;
687 	((nvpf_hdr_t *)buf)->nvpf_version = NVPF_HDR_VERSION;
688 	((nvpf_hdr_t *)buf)->nvpf_size = buflen;
689 	((nvpf_hdr_t *)buf)->nvpf_chksum = nvp_cksum((uchar_t *)nvbuf, buflen);
690 	((nvpf_hdr_t *)buf)->nvpf_hdr_chksum =
691 	    nvp_cksum((uchar_t *)buf, sizeof (nvpf_hdr_t));
692 
693 	bcopy(nvbuf, buf + sizeof (nvpf_hdr_t), buflen);
694 	kmem_free(nvbuf, buflen);
695 	buflen += sizeof (nvpf_hdr_t);
696 
697 	len = strlen(filename) + MAX_SUFFIX_LEN + 2;
698 	newname = kmem_alloc(len, KM_SLEEP);
699 
700 
701 	(void) sprintf(newname, "%s.%s", filename, NEW_FILENAME_SUFFIX);
702 
703 	/*
704 	 * To make it unlikely we suffer data loss, write
705 	 * data to the new temporary file.  Once successful
706 	 * complete the transaction by renaming the new file
707 	 * to replace the previous.
708 	 */
709 
710 	if ((err = kfcreate(newname, &fp)) == 0) {
711 		err = kfwrite(fp, buf, buflen, &n);
712 		if (err) {
713 			nvf_error("%s: write error - %d\n",
714 			    newname, err);
715 		} else {
716 			if (n != buflen) {
717 				nvf_error(
718 				    "%s: partial write %ld of %ld bytes\n",
719 				    newname, n, buflen);
720 				nvf_error("%s: filesystem may be full?\n",
721 				    newname);
722 				err = EIO;
723 			}
724 		}
725 		if ((err1 = kfclose(fp)) != 0) {
726 			nvf_error("%s: close error\n", newname);
727 			if (err == 0)
728 				err = err1;
729 		}
730 		if (err != 0) {
731 			if (kfremove(newname) != 0) {
732 				nvf_error("%s: remove failed\n",
733 				    newname);
734 			}
735 		}
736 	} else {
737 		nvf_error("%s: create failed - %d\n", filename, err);
738 	}
739 
740 	if (err == 0) {
741 		if ((err = kfrename(newname, filename)) != 0) {
742 			nvf_error("%s: rename from %s failed\n",
743 			    newname, filename);
744 		}
745 	}
746 
747 	kmem_free(newname, len);
748 	kmem_free(buf, buflen);
749 
750 	return (err);
751 }
752 
753 static int
754 e_fwrite_nvlist(nvfd_t *nvfd, nvlist_t *nvl)
755 {
756 	int err;
757 
758 	if ((err = fwrite_nvlist(nvfd->nvf_cache_path, nvl)) == 0)
759 		return (DDI_SUCCESS);
760 	else {
761 		if (err == EROFS)
762 			NVF_MARK_READONLY(nvfd);
763 		return (DDI_FAILURE);
764 	}
765 }
766 
767 static void
768 nvp_list_free(nvfd_t *nvf)
769 {
770 	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
771 	(nvf->nvf_list_free)((nvf_handle_t)nvf);
772 	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
773 }
774 
775 /*
776  * Read a file in the nvlist format
777  *	EIO - i/o error during read
778  *	ENOENT - file not found
779  *	EINVAL - file contents corrupted
780  */
781 static int
782 fread_nvp_list(nvfd_t *nvfd)
783 {
784 	nvlist_t	*nvl;
785 	nvpair_t	*nvp;
786 	char		*name;
787 	nvlist_t	*sublist;
788 	int		rval;
789 	int		rv;
790 
791 	ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
792 
793 	rval = fread_nvlist(nvfd->nvf_cache_path, &nvl);
794 	if (rval != 0)
795 		return (rval);
796 	ASSERT(nvl != NULL);
797 
798 	nvp = NULL;
799 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
800 		name = nvpair_name(nvp);
801 		ASSERT(strlen(name) > 0);
802 
803 		switch (nvpair_type(nvp)) {
804 		case DATA_TYPE_NVLIST:
805 			rval = nvpair_value_nvlist(nvp, &sublist);
806 			if (rval != 0) {
807 				nvf_error(
808 				    "nvpair_value_nvlist error %s %d\n",
809 				    name, rval);
810 				goto error;
811 			}
812 
813 			/*
814 			 * unpack nvlist for this device and
815 			 * add elements to data list.
816 			 */
817 			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
818 			rv = (nvfd->nvf_unpack_nvlist)
819 			    ((nvf_handle_t)nvfd, sublist, name);
820 			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
821 			if (rv != 0) {
822 				nvf_error(
823 				    "%s: %s invalid list element\n",
824 				    nvfd->nvf_cache_path, name);
825 				rval = EINVAL;
826 				goto error;
827 			}
828 			break;
829 
830 		default:
831 			nvf_error("%s: %s unsupported data type %d\n",
832 			    nvfd->nvf_cache_path, name, nvpair_type(nvp));
833 			rval = EINVAL;
834 			goto error;
835 		}
836 	}
837 
838 	nvlist_free(nvl);
839 
840 	return (0);
841 
842 error:
843 	nvlist_free(nvl);
844 	nvp_list_free(nvfd);
845 	return (rval);
846 }
847 
848 
849 int
850 nvf_read_file(nvf_handle_t nvf_handle)
851 {
852 	nvfd_t *nvfd = (nvfd_t *)nvf_handle;
853 	int rval;
854 
855 	ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
856 
857 	if (kfio_disable_read)
858 		return (0);
859 
860 	KFDEBUG((CE_CONT, "reading %s\n", nvfd->nvf_cache_path));
861 
862 	rval = fread_nvp_list(nvfd);
863 	if (rval) {
864 		switch (rval) {
865 		case EIO:
866 			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
867 			cmn_err(CE_WARN, "%s: I/O error",
868 			    nvfd->nvf_cache_path);
869 			break;
870 		case ENOENT:
871 			nvfd->nvf_flags |= NVF_F_CREATE_MSG;
872 			nvf_error("%s: not found\n",
873 			    nvfd->nvf_cache_path);
874 			break;
875 		case EINVAL:
876 		default:
877 			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
878 			cmn_err(CE_WARN, "%s: data file corrupted",
879 			    nvfd->nvf_cache_path);
880 			break;
881 		}
882 	}
883 	return (rval);
884 }
885 
886 static void
887 nvf_write_is_complete(nvfd_t *fd)
888 {
889 	if (fd->nvf_write_complete) {
890 		(fd->nvf_write_complete)((nvf_handle_t)fd);
891 	}
892 }
893 
894 /*ARGSUSED*/
895 static void
896 nvpflush_timeout(void *arg)
897 {
898 	clock_t nticks;
899 
900 	mutex_enter(&nvpflush_lock);
901 	nticks = nvpticks - ddi_get_lbolt();
902 	if (nticks > 4) {
903 		nvpflush_timer_busy = 1;
904 		mutex_exit(&nvpflush_lock);
905 		nvpflush_id = timeout(nvpflush_timeout, NULL, nticks);
906 	} else {
907 		do_nvpflush = 1;
908 		NVPDAEMON_DEBUG((CE_CONT, "signal nvpdaemon\n"));
909 		cv_signal(&nvpflush_cv);
910 		nvpflush_id = 0;
911 		nvpflush_timer_busy = 0;
912 		mutex_exit(&nvpflush_lock);
913 	}
914 }
915 
916 /*
917  * After marking a list as dirty, wake the nvpflush daemon
918  * to perform the update.
919  */
920 void
921 nvf_wake_daemon(void)
922 {
923 	clock_t nticks;
924 
925 	/*
926 	 * If the system isn't up yet or is shutting down,
927 	 * don't even think about starting a flush.
928 	 */
929 	if (!i_ddi_io_initialized() || sys_shutdown)
930 		return;
931 
932 	mutex_enter(&nvpflush_lock);
933 
934 	if (nvpflush_daemon_active == 0) {
935 		nvpflush_daemon_active = 1;
936 		mutex_exit(&nvpflush_lock);
937 		NVPDAEMON_DEBUG((CE_CONT, "starting nvpdaemon thread\n"));
938 		nvpflush_thr_id = thread_create(NULL, 0,
939 		    (void (*)())nvpflush_daemon,
940 		    NULL, 0, &p0, TS_RUN, minclsyspri);
941 		mutex_enter(&nvpflush_lock);
942 	}
943 
944 	nticks = nvpflush_delay * TICKS_PER_SECOND;
945 	nvpticks = ddi_get_lbolt() + nticks;
946 	if (nvpflush_timer_busy == 0) {
947 		nvpflush_timer_busy = 1;
948 		mutex_exit(&nvpflush_lock);
949 		nvpflush_id = timeout(nvpflush_timeout, NULL, nticks + 4);
950 	} else
951 		mutex_exit(&nvpflush_lock);
952 }
953 
954 static int
955 nvpflush_one(nvfd_t *nvfd)
956 {
957 	int rval = DDI_SUCCESS;
958 	nvlist_t *nvl;
959 
960 	rw_enter(&nvfd->nvf_lock, RW_READER);
961 
962 	ASSERT((nvfd->nvf_flags & NVF_F_FLUSHING) == 0);
963 
964 	if (!NVF_IS_DIRTY(nvfd) ||
965 	    NVF_IS_READONLY(nvfd) || kfio_disable_write || sys_shutdown) {
966 		NVF_CLEAR_DIRTY(nvfd);
967 		rw_exit(&nvfd->nvf_lock);
968 		return (DDI_SUCCESS);
969 	}
970 
971 	if (rw_tryupgrade(&nvfd->nvf_lock) == 0) {
972 		nvf_error("nvpflush: "
973 		    "%s rw upgrade failed\n", nvfd->nvf_cache_path);
974 		rw_exit(&nvfd->nvf_lock);
975 		return (DDI_FAILURE);
976 	}
977 	if (((nvfd->nvf_pack_list)
978 	    ((nvf_handle_t)nvfd, &nvl)) != DDI_SUCCESS) {
979 		nvf_error("nvpflush: "
980 		    "%s nvlist construction failed\n", nvfd->nvf_cache_path);
981 		ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
982 		rw_exit(&nvfd->nvf_lock);
983 		return (DDI_FAILURE);
984 	}
985 	ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
986 
987 	NVF_CLEAR_DIRTY(nvfd);
988 	nvfd->nvf_flags |= NVF_F_FLUSHING;
989 	rw_exit(&nvfd->nvf_lock);
990 
991 	rval = e_fwrite_nvlist(nvfd, nvl);
992 	nvlist_free(nvl);
993 
994 	rw_enter(&nvfd->nvf_lock, RW_WRITER);
995 	nvfd->nvf_flags &= ~NVF_F_FLUSHING;
996 	if (rval == DDI_FAILURE) {
997 		if (NVF_IS_READONLY(nvfd)) {
998 			rval = DDI_SUCCESS;
999 			nvfd->nvf_flags &= ~(NVF_F_ERROR | NVF_F_DIRTY);
1000 		} else if ((nvfd->nvf_flags & NVF_F_ERROR) == 0) {
1001 			cmn_err(CE_CONT,
1002 			    "%s: update failed\n", nvfd->nvf_cache_path);
1003 			nvfd->nvf_flags |= NVF_F_ERROR | NVF_F_DIRTY;
1004 		}
1005 	} else {
1006 		if (nvfd->nvf_flags & NVF_F_CREATE_MSG) {
1007 			cmn_err(CE_CONT,
1008 			    "!Creating %s\n", nvfd->nvf_cache_path);
1009 			nvfd->nvf_flags &= ~NVF_F_CREATE_MSG;
1010 		}
1011 		if (nvfd->nvf_flags & NVF_F_REBUILD_MSG) {
1012 			cmn_err(CE_CONT,
1013 			    "!Rebuilding %s\n", nvfd->nvf_cache_path);
1014 			nvfd->nvf_flags &= ~NVF_F_REBUILD_MSG;
1015 		}
1016 		if (nvfd->nvf_flags & NVF_F_ERROR) {
1017 			cmn_err(CE_CONT,
1018 			    "%s: update now ok\n", nvfd->nvf_cache_path);
1019 			nvfd->nvf_flags &= ~NVF_F_ERROR;
1020 		}
1021 		/*
1022 		 * The file may need to be flushed again if the cached
1023 		 * data was touched while writing the earlier contents.
1024 		 */
1025 		if (NVF_IS_DIRTY(nvfd))
1026 			rval = DDI_FAILURE;
1027 	}
1028 
1029 	rw_exit(&nvfd->nvf_lock);
1030 	return (rval);
1031 }
1032 
1033 
1034 static void
1035 nvpflush_daemon(void)
1036 {
1037 	callb_cpr_t cprinfo;
1038 	nvfd_t *nvfdp, *nextfdp;
1039 	clock_t clk;
1040 	int rval;
1041 	int want_wakeup;
1042 	int is_now_clean;
1043 
1044 	ASSERT(modrootloaded);
1045 
1046 	nvpflush_thread = curthread;
1047 	NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: init\n"));
1048 
1049 	CALLB_CPR_INIT(&cprinfo, &nvpflush_lock, callb_generic_cpr, "nvp");
1050 	mutex_enter(&nvpflush_lock);
1051 	for (;;) {
1052 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1053 		while (do_nvpflush == 0) {
1054 			clk = cv_reltimedwait(&nvpflush_cv, &nvpflush_lock,
1055 			    (nvpdaemon_idle_time * TICKS_PER_SECOND),
1056 			    TR_CLOCK_TICK);
1057 			if ((clk == -1 && do_nvpflush == 0 &&
1058 			    nvpflush_timer_busy == 0) || sys_shutdown) {
1059 				/*
1060 				 * Note that CALLB_CPR_EXIT calls mutex_exit()
1061 				 * on the lock passed in to CALLB_CPR_INIT,
1062 				 * so the lock must be held when invoking it.
1063 				 */
1064 				CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);
1065 				NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: exit\n"));
1066 				ASSERT(mutex_owned(&nvpflush_lock));
1067 				nvpflush_thr_id = NULL;
1068 				nvpflush_daemon_active = 0;
1069 				CALLB_CPR_EXIT(&cprinfo);
1070 				thread_exit();
1071 			}
1072 		}
1073 		CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);
1074 
1075 		nvpbusy = 1;
1076 		want_wakeup = 0;
1077 		do_nvpflush = 0;
1078 		mutex_exit(&nvpflush_lock);
1079 
1080 		/*
1081 		 * Try flushing what's dirty, reschedule if there's
1082 		 * a failure or data gets marked as dirty again.
1083 		 * First move each file marked dirty to the dirty
1084 		 * list to avoid locking the list across the write.
1085 		 */
1086 		mutex_enter(&nvf_cache_mutex);
1087 		for (nvfdp = list_head(&nvf_cache_files);
1088 		    nvfdp; nvfdp = nextfdp) {
1089 			nextfdp = list_next(&nvf_cache_files, nvfdp);
1090 			rw_enter(&nvfdp->nvf_lock, RW_READER);
1091 			if (NVF_IS_DIRTY(nvfdp)) {
1092 				list_remove(&nvf_cache_files, nvfdp);
1093 				list_insert_tail(&nvf_dirty_files, nvfdp);
1094 				rw_exit(&nvfdp->nvf_lock);
1095 			} else {
1096 				NVPDAEMON_DEBUG((CE_CONT,
1097 				    "nvpdaemon: not dirty %s\n",
1098 				    nvfdp->nvf_cache_path));
1099 				rw_exit(&nvfdp->nvf_lock);
1100 			}
1101 		}
1102 		mutex_exit(&nvf_cache_mutex);
1103 
1104 		/*
1105 		 * Now go through the dirty list
1106 		 */
1107 		for (nvfdp = list_head(&nvf_dirty_files);
1108 		    nvfdp; nvfdp = nextfdp) {
1109 			nextfdp = list_next(&nvf_dirty_files, nvfdp);
1110 
1111 			is_now_clean = 0;
1112 			rw_enter(&nvfdp->nvf_lock, RW_READER);
1113 			if (NVF_IS_DIRTY(nvfdp)) {
1114 				NVPDAEMON_DEBUG((CE_CONT,
1115 				    "nvpdaemon: flush %s\n",
1116 				    nvfdp->nvf_cache_path));
1117 				rw_exit(&nvfdp->nvf_lock);
1118 				rval = nvpflush_one(nvfdp);
1119 				rw_enter(&nvfdp->nvf_lock, RW_READER);
1120 				if (rval != DDI_SUCCESS ||
1121 				    NVF_IS_DIRTY(nvfdp)) {
1122 					rw_exit(&nvfdp->nvf_lock);
1123 					NVPDAEMON_DEBUG((CE_CONT,
1124 					    "nvpdaemon: %s dirty again\n",
1125 					    nvfdp->nvf_cache_path));
1126 					want_wakeup = 1;
1127 				} else {
1128 					rw_exit(&nvfdp->nvf_lock);
1129 					nvf_write_is_complete(nvfdp);
1130 					is_now_clean = 1;
1131 				}
1132 			} else {
1133 				NVPDAEMON_DEBUG((CE_CONT,
1134 				    "nvpdaemon: not dirty %s\n",
1135 				    nvfdp->nvf_cache_path));
1136 				rw_exit(&nvfdp->nvf_lock);
1137 				is_now_clean = 1;
1138 			}
1139 
1140 			if (is_now_clean) {
1141 				mutex_enter(&nvf_cache_mutex);
1142 				list_remove(&nvf_dirty_files, nvfdp);
1143 				list_insert_tail(&nvf_cache_files,
1144 				    nvfdp);
1145 				mutex_exit(&nvf_cache_mutex);
1146 			}
1147 		}
1148 
1149 		if (want_wakeup)
1150 			nvf_wake_daemon();
1151 
1152 		mutex_enter(&nvpflush_lock);
1153 		nvpbusy = 0;
1154 	}
1155 }
1156