1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <sys/note.h>
27 #include <sys/t_lock.h>
28 #include <sys/cmn_err.h>
29 #include <sys/instance.h>
30 #include <sys/conf.h>
31 #include <sys/stat.h>
32 #include <sys/ddi.h>
33 #include <sys/hwconf.h>
34 #include <sys/sunddi.h>
35 #include <sys/sunndi.h>
36 #include <sys/ddi_impldefs.h>
37 #include <sys/ndi_impldefs.h>
38 #include <sys/modctl.h>
39 #include <sys/dacf.h>
40 #include <sys/promif.h>
41 #include <sys/cpuvar.h>
42 #include <sys/pathname.h>
43 #include <sys/kobj.h>
44 #include <sys/devcache.h>
45 #include <sys/devcache_impl.h>
46 #include <sys/sysmacros.h>
47 #include <sys/varargs.h>
48 #include <sys/callb.h>
49
50 /*
51 * This facility provides interfaces to clients to register,
52 * read and update cache data in persisted backing store files,
53 * usually in /etc/devices. The data persisted through this
54 * mechanism should be stateless data, functioning in the sense
55 * of a cache. Writes are performed by a background daemon
56 * thread, permitting a client to schedule an update without
57 * blocking, then continue updating the data state in
58 * parallel. The data is only locked by the daemon thread
59 * to pack the data in preparation for the write.
60 *
61 * Data persisted through this mechanism should be capable
62 * of being regenerated through normal system operation,
63 * for example attaching all disk devices would cause all
64 * devids to be registered for those devices. By caching
65 * a devid-device tuple, the system can operate in a
66 * more optimal way, directly attaching the device mapped
67 * to a devid, rather than burdensomely driving attach of
68 * the entire device tree to discover a single device.
69 *
70 * Note that a client should only need to include
71 * <sys/devcache.h> for the supported interfaces.
72 *
73 * The data per client is entirely within the control of
74 * the client. When reading, data unpacked from the backing
75 * store should be inserted in the list. The pointer to
76 * the list can be retrieved via nvf_list(). When writing,
77 * the data on the list is to be packed and returned to the
78 * nvpdaemon as an nvlist.
79 *
80 * Obvious restrictions are imposed by the limits of the
81 * nvlist format. The data cannot be read or written
82 * piecemeal, and large amounts of data aren't recommended.
83 * However, nvlists do allow that data be named and typed
84 * and can be size-of-int invariant, and the cached data
85 * can be versioned conveniently.
86 *
87 * The registration involves two steps: a handle is
88 * allocated by calling the registration function.
89 * This sets up the data referenced by the handle and
90 * initializes the lock. Following registration, the
91 * client must initialize the data list. The list
92 * interfaces require that the list element with offset
93 * to the node link be provided. The format of the
94 * list element is under the control of the client.
95 *
96 * Locking: the address of the data list r/w lock provided
97 * can be accessed with nvf_lock(). The lock must be held
98 * as reader when traversing the list or checking state,
99 * such as nvf_is_dirty(). The lock must be held as
100 * writer when updating the list or marking it dirty.
101 * The lock must not be held when waking the daemon.
102 *
103 * The data r/w lock is held as writer when the pack,
104 * unpack and free list handlers are called. The
105 * lock should not be dropped and must be still held
106 * upon return. The client should also hold the lock
107 * as reader when checking if the list is dirty, and
108 * as writer when marking the list dirty or initiating
109 * a read.
110 *
111 * The asynchronous nature of updates allows for the
112 * possibility that the data may continue to be updated
113 * once the daemon has been notified that an update is
114 * desired. The data only needs to be locked against
115 * updates when packing the data into the form to be
116 * written. When the write of the packed data has
117 * completed, the daemon will automatically reschedule
118 * an update if the data was marked dirty after the
119 * point at which it was packed. Before beginning an
120 * update, the daemon attempts to lock the data as
121 * writer; if the writer lock is already held, it
122 * backs off and retries later. The model is to give
123 * priority to the kernel processes generating the
124 * data, and that the nature of the data is that
125 * it does not change often, can be re-generated when
126 * needed, so updates should not happen often and
127 * can be delayed until the data stops changing.
128 * The client may update the list or mark it dirty
129 * any time it is able to acquire the lock as
130 * writer first.
131 *
132 * A failed write will be retried after some delay,
133 * in the hope that the cause of the error will be
134 * transient, for example a filesystem with no space
135 * available. An update on a read-only filesystem
136 * is failed silently and not retried; this would be
137 * the case when booted off install media.
138 *
139 * There is no unregister mechanism as of yet, as it
140 * hasn't been needed so far.
141 */
142
143 /*
144 * Global list of files registered and updated by the nvpflush
145 * daemon, protected by the nvf_cache_mutex. While an
146 * update is taking place, a file is temporarily moved to
147 * the dirty list to avoid locking the primary list for
148 * the duration of the update.
149 */
150 list_t nvf_cache_files;
151 list_t nvf_dirty_files;
152 kmutex_t nvf_cache_mutex;
153
154
155 /*
156 * Allow some delay from an update of the data before flushing
157 * to permit simultaneous updates of multiple changes.
158 * Changes in the data are expected to be bursty, ie
159 * reconfig or hot-plug of a new adapter.
160 *
161 * kfio_report_error (default 0)
162 * Set to 1 to enable some error messages related to low-level
163 * kernel file i/o operations.
164 *
165 * nvpflush_delay (default 10)
166 * The number of seconds after data is marked dirty before the
167 * flush daemon is triggered to flush the data. A longer period
168 * of time permits more data updates per write. Note that
169 * every update resets the timer so no repository write will
170 * occur while data is being updated continuously.
171 *
172 * nvpdaemon_idle_time (default 60)
173 * The number of seconds the daemon will sleep idle before exiting.
174 *
175 */
176 #define NVPFLUSH_DELAY 10
177 #define NVPDAEMON_IDLE_TIME 60
178
179 #define TICKS_PER_SECOND (drv_usectohz(1000000))
180
181 /*
182 * Tunables
183 */
184 int kfio_report_error = 0; /* kernel file i/o operations */
185 int kfio_disable_read = 0; /* disable all reads */
186 int kfio_disable_write = 0; /* disable all writes */
187
188 int nvpflush_delay = NVPFLUSH_DELAY;
189 int nvpdaemon_idle_time = NVPDAEMON_IDLE_TIME;
190
191 static timeout_id_t nvpflush_id = 0;
192 static int nvpflush_timer_busy = 0;
193 static int nvpflush_daemon_active = 0;
194 static kthread_t *nvpflush_thr_id = 0;
195
196 static int do_nvpflush = 0;
197 static int nvpbusy = 0;
198 static kmutex_t nvpflush_lock;
199 static kcondvar_t nvpflush_cv;
200 static kthread_id_t nvpflush_thread;
201 static clock_t nvpticks;
202
203 static void nvpflush_daemon(void);
204
205 #ifdef DEBUG
206 int nvpdaemon_debug = 0;
207 int kfio_debug = 0;
208 #endif /* DEBUG */
209
210 extern int modrootloaded;
211 extern void mdi_read_devices_files(void);
212 extern void mdi_clean_vhcache(void);
213 extern int sys_shutdown;
214
215 /*
216 * Initialize the overall cache file management
217 */
218 void
i_ddi_devices_init(void)219 i_ddi_devices_init(void)
220 {
221 list_create(&nvf_cache_files, sizeof (nvfd_t),
222 offsetof(nvfd_t, nvf_link));
223 list_create(&nvf_dirty_files, sizeof (nvfd_t),
224 offsetof(nvfd_t, nvf_link));
225 mutex_init(&nvf_cache_mutex, NULL, MUTEX_DEFAULT, NULL);
226 retire_store_init();
227 devid_cache_init();
228 }
229
230 /*
231 * Read cache files
232 * The files read here should be restricted to those
233 * that may be required to mount root.
234 */
235 void
i_ddi_read_devices_files(void)236 i_ddi_read_devices_files(void)
237 {
238 /*
239 * The retire store should be the first file read as it
240 * may need to offline devices. kfio_disable_read is not
241 * used for retire. For the rationale see the tunable
242 * ddi_retire_store_bypass and comments in:
243 * uts/common/os/retire_store.c
244 */
245
246 retire_store_read();
247
248 if (!kfio_disable_read) {
249 mdi_read_devices_files();
250 devid_cache_read();
251 }
252 }
253
254 void
i_ddi_start_flush_daemon(void)255 i_ddi_start_flush_daemon(void)
256 {
257 nvfd_t *nvfdp;
258
259 ASSERT(i_ddi_io_initialized());
260
261 mutex_init(&nvpflush_lock, NULL, MUTEX_DRIVER, NULL);
262 cv_init(&nvpflush_cv, NULL, CV_DRIVER, NULL);
263
264 mutex_enter(&nvf_cache_mutex);
265 for (nvfdp = list_head(&nvf_cache_files); nvfdp;
266 nvfdp = list_next(&nvf_cache_files, nvfdp)) {
267 if (NVF_IS_DIRTY(nvfdp)) {
268 nvf_wake_daemon();
269 break;
270 }
271 }
272 mutex_exit(&nvf_cache_mutex);
273 }
274
/*
 * Ask the cache clients to clean up their cached data: first the
 * devid cache, then the mdi vhci cache.
 */
void
i_ddi_clean_devices_files(void)
{
	devid_cache_cleanup();
	mdi_clean_vhcache();
}
281
282 /*
283 * Register a cache file to be managed and updated by the nvpflush daemon.
284 * All operations are performed through the returned handle.
285 * There is no unregister mechanism for now.
286 */
287 nvf_handle_t
nvf_register_file(nvf_ops_t * ops)288 nvf_register_file(nvf_ops_t *ops)
289 {
290 nvfd_t *nvfdp;
291
292 nvfdp = kmem_zalloc(sizeof (*nvfdp), KM_SLEEP);
293
294 nvfdp->nvf_ops = ops;
295 nvfdp->nvf_flags = 0;
296 rw_init(&nvfdp->nvf_lock, NULL, RW_DRIVER, NULL);
297
298 mutex_enter(&nvf_cache_mutex);
299 list_insert_tail(&nvf_cache_files, nvfdp);
300 mutex_exit(&nvf_cache_mutex);
301
302 return ((nvf_handle_t)nvfdp);
303 }
304
305 /*PRINTFLIKE1*/
306 void
nvf_error(const char * fmt,...)307 nvf_error(const char *fmt, ...)
308 {
309 va_list ap;
310
311 if (kfio_report_error) {
312 va_start(ap, fmt);
313 vcmn_err(CE_NOTE, fmt, ap);
314 va_end(ap);
315 }
316 }
317
318 /*
319 * Some operations clients may use to manage the data
320 * to be persisted in a cache file.
321 */
322 char *
nvf_cache_name(nvf_handle_t handle)323 nvf_cache_name(nvf_handle_t handle)
324 {
325 return (((nvfd_t *)handle)->nvf_cache_path);
326 }
327
328 krwlock_t *
nvf_lock(nvf_handle_t handle)329 nvf_lock(nvf_handle_t handle)
330 {
331 return (&(((nvfd_t *)handle)->nvf_lock));
332 }
333
334 list_t *
nvf_list(nvf_handle_t handle)335 nvf_list(nvf_handle_t handle)
336 {
337 return (&(((nvfd_t *)handle)->nvf_data_list));
338 }
339
340 void
nvf_mark_dirty(nvf_handle_t handle)341 nvf_mark_dirty(nvf_handle_t handle)
342 {
343 ASSERT(RW_WRITE_HELD(&(((nvfd_t *)handle)->nvf_lock)));
344 NVF_MARK_DIRTY((nvfd_t *)handle);
345 }
346
347 int
nvf_is_dirty(nvf_handle_t handle)348 nvf_is_dirty(nvf_handle_t handle)
349 {
350 ASSERT(RW_LOCK_HELD(&(((nvfd_t *)handle)->nvf_lock)));
351 return (NVF_IS_DIRTY((nvfd_t *)handle));
352 }
353
/*
 * Compute a 16-bit checksum over buflen bytes of buf by XOR-ing the
 * buffer together as native 16-bit words.  When buflen is odd, the
 * final byte is folded in on its own as the initial checksum value.
 *
 * buf is accessed through a uint16_t pointer, so it is expected to be
 * suitably aligned for that.
 */
static uint16_t
nvp_cksum(uchar_t *buf, int64_t buflen)
{
	uint16_t	*words = (uint16_t *)buf;
	uint16_t	sum = 0;
	int64_t		nwords;
	int64_t		i;

	/* Peel off an odd trailing byte so the rest is whole words. */
	if (buflen & 0x01) {
		buflen--;
		sum = buf[buflen];
	}

	nwords = buflen / 2;
	for (i = 0; i < nwords; i++)
		sum ^= words[i];

	return (sum);
}
370
371 int
fread_nvlist(char * filename,nvlist_t ** ret_nvlist)372 fread_nvlist(char *filename, nvlist_t **ret_nvlist)
373 {
374 struct _buf *file;
375 nvpf_hdr_t hdr;
376 char *buf;
377 nvlist_t *nvl;
378 int rval;
379 uint_t offset;
380 int n;
381 char c;
382 uint16_t cksum, hdrsum;
383
384 *ret_nvlist = NULL;
385
386 file = kobj_open_file(filename);
387 if (file == (struct _buf *)-1) {
388 KFDEBUG((CE_CONT, "cannot open file: %s\n", filename));
389 return (ENOENT);
390 }
391
392 offset = 0;
393 n = kobj_read_file(file, (char *)&hdr, sizeof (hdr), offset);
394 if (n != sizeof (hdr)) {
395 kobj_close_file(file);
396 if (n < 0) {
397 nvf_error("error reading header: %s\n", filename);
398 return (EIO);
399 } else if (n == 0) {
400 KFDEBUG((CE_CONT, "file empty: %s\n", filename));
401 } else {
402 nvf_error("header size incorrect: %s\n", filename);
403 }
404 return (EINVAL);
405 }
406 offset += n;
407
408 KFDEBUG2((CE_CONT, "nvpf_magic: 0x%x\n", hdr.nvpf_magic));
409 KFDEBUG2((CE_CONT, "nvpf_version: %d\n", hdr.nvpf_version));
410 KFDEBUG2((CE_CONT, "nvpf_size: %lld\n",
411 (longlong_t)hdr.nvpf_size));
412 KFDEBUG2((CE_CONT, "nvpf_hdr_chksum: 0x%x\n",
413 hdr.nvpf_hdr_chksum));
414 KFDEBUG2((CE_CONT, "nvpf_chksum: 0x%x\n", hdr.nvpf_chksum));
415
416 cksum = hdr.nvpf_hdr_chksum;
417 hdr.nvpf_hdr_chksum = 0;
418 hdrsum = nvp_cksum((uchar_t *)&hdr, sizeof (hdr));
419
420 if (hdr.nvpf_magic != NVPF_HDR_MAGIC ||
421 hdr.nvpf_version != NVPF_HDR_VERSION || hdrsum != cksum) {
422 kobj_close_file(file);
423 if (hdrsum != cksum) {
424 nvf_error("%s: checksum error "
425 "(actual 0x%x, expected 0x%x)\n",
426 filename, hdrsum, cksum);
427 }
428 nvf_error("%s: header information incorrect", filename);
429 return (EINVAL);
430 }
431
432 ASSERT(hdr.nvpf_size >= 0);
433
434 buf = kmem_alloc(hdr.nvpf_size, KM_SLEEP);
435 n = kobj_read_file(file, buf, hdr.nvpf_size, offset);
436 if (n != hdr.nvpf_size) {
437 kmem_free(buf, hdr.nvpf_size);
438 kobj_close_file(file);
439 if (n < 0) {
440 nvf_error("%s: read error %d", filename, n);
441 } else {
442 nvf_error("%s: incomplete read %d/%lld",
443 filename, n, (longlong_t)hdr.nvpf_size);
444 }
445 return (EINVAL);
446 }
447 offset += n;
448
449 rval = kobj_read_file(file, &c, 1, offset);
450 kobj_close_file(file);
451 if (rval > 0) {
452 nvf_error("%s is larger than %lld\n",
453 filename, (longlong_t)hdr.nvpf_size);
454 kmem_free(buf, hdr.nvpf_size);
455 return (EINVAL);
456 }
457
458 cksum = nvp_cksum((uchar_t *)buf, hdr.nvpf_size);
459 if (hdr.nvpf_chksum != cksum) {
460 nvf_error("%s: checksum error (actual 0x%x, expected 0x%x)\n",
461 filename, hdr.nvpf_chksum, cksum);
462 kmem_free(buf, hdr.nvpf_size);
463 return (EINVAL);
464 }
465
466 nvl = NULL;
467 rval = nvlist_unpack(buf, hdr.nvpf_size, &nvl, 0);
468 if (rval != 0) {
469 nvf_error("%s: error %d unpacking nvlist\n",
470 filename, rval);
471 kmem_free(buf, hdr.nvpf_size);
472 return (EINVAL);
473 }
474
475 kmem_free(buf, hdr.nvpf_size);
476 *ret_nvlist = nvl;
477 return (0);
478 }
479
480 static int
kfcreate(char * filename,kfile_t ** kfilep)481 kfcreate(char *filename, kfile_t **kfilep)
482 {
483 kfile_t *fp;
484 int rval;
485
486 ASSERT(modrootloaded);
487
488 fp = kmem_alloc(sizeof (kfile_t), KM_SLEEP);
489
490 fp->kf_vnflags = FCREAT | FWRITE | FTRUNC;
491 fp->kf_fname = filename;
492 fp->kf_fpos = 0;
493 fp->kf_state = 0;
494
495 KFDEBUG((CE_CONT, "create: %s flags 0x%x\n",
496 filename, fp->kf_vnflags));
497 rval = vn_open(filename, UIO_SYSSPACE, fp->kf_vnflags,
498 0444, &fp->kf_vp, CRCREAT, 0);
499 if (rval != 0) {
500 kmem_free(fp, sizeof (kfile_t));
501 KFDEBUG((CE_CONT, "%s: create error %d\n",
502 filename, rval));
503 return (rval);
504 }
505
506 *kfilep = fp;
507 return (0);
508 }
509
510 static int
kfremove(char * filename)511 kfremove(char *filename)
512 {
513 int rval;
514
515 KFDEBUG((CE_CONT, "remove: %s\n", filename));
516 rval = vn_remove(filename, UIO_SYSSPACE, RMFILE);
517 if (rval != 0) {
518 KFDEBUG((CE_CONT, "%s: remove error %d\n",
519 filename, rval));
520 }
521 return (rval);
522 }
523
524 static int
kfread(kfile_t * fp,char * buf,ssize_t bufsiz,ssize_t * ret_n)525 kfread(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
526 {
527 ssize_t resid;
528 int err;
529 ssize_t n;
530
531 ASSERT(modrootloaded);
532
533 if (fp->kf_state != 0)
534 return (fp->kf_state);
535
536 err = vn_rdwr(UIO_READ, fp->kf_vp, buf, bufsiz, fp->kf_fpos,
537 UIO_SYSSPACE, 0, (rlim64_t)0, kcred, &resid);
538 if (err != 0) {
539 KFDEBUG((CE_CONT, "%s: read error %d\n",
540 fp->kf_fname, err));
541 fp->kf_state = err;
542 return (err);
543 }
544
545 ASSERT(resid >= 0 && resid <= bufsiz);
546 n = bufsiz - resid;
547
548 KFDEBUG1((CE_CONT, "%s: read %ld bytes ok %ld bufsiz, %ld resid\n",
549 fp->kf_fname, n, bufsiz, resid));
550
551 fp->kf_fpos += n;
552 *ret_n = n;
553 return (0);
554 }
555
556 static int
kfwrite(kfile_t * fp,char * buf,ssize_t bufsiz,ssize_t * ret_n)557 kfwrite(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
558 {
559 rlim64_t rlimit;
560 ssize_t resid;
561 int err;
562 ssize_t len;
563 ssize_t n = 0;
564
565 ASSERT(modrootloaded);
566
567 if (fp->kf_state != 0)
568 return (fp->kf_state);
569
570 len = bufsiz;
571 rlimit = bufsiz + 1;
572 for (;;) {
573 err = vn_rdwr(UIO_WRITE, fp->kf_vp, buf, len, fp->kf_fpos,
574 UIO_SYSSPACE, FSYNC, rlimit, kcred, &resid);
575 if (err) {
576 KFDEBUG((CE_CONT, "%s: write error %d\n",
577 fp->kf_fname, err));
578 fp->kf_state = err;
579 return (err);
580 }
581
582 KFDEBUG1((CE_CONT, "%s: write %ld bytes ok %ld resid\n",
583 fp->kf_fname, len-resid, resid));
584
585 ASSERT(resid >= 0 && resid <= len);
586
587 n += (len - resid);
588 if (resid == 0)
589 break;
590
591 if (resid == len) {
592 KFDEBUG((CE_CONT, "%s: filesystem full?\n",
593 fp->kf_fname));
594 fp->kf_state = ENOSPC;
595 return (ENOSPC);
596 }
597
598 len -= resid;
599 buf += len;
600 fp->kf_fpos += len;
601 len = resid;
602 }
603
604 ASSERT(n == bufsiz);
605 KFDEBUG1((CE_CONT, "%s: wrote %ld bytes ok\n", fp->kf_fname, n));
606
607 *ret_n = n;
608 return (0);
609 }
610
611
612 static int
kfclose(kfile_t * fp)613 kfclose(kfile_t *fp)
614 {
615 int rval;
616
617 KFDEBUG((CE_CONT, "close: %s\n", fp->kf_fname));
618
619 if ((fp->kf_vnflags & FWRITE) && fp->kf_state == 0) {
620 rval = VOP_FSYNC(fp->kf_vp, FSYNC, kcred, NULL);
621 if (rval != 0) {
622 nvf_error("%s: sync error %d\n",
623 fp->kf_fname, rval);
624 }
625 KFDEBUG((CE_CONT, "%s: sync ok\n", fp->kf_fname));
626 }
627
628 rval = VOP_CLOSE(fp->kf_vp, fp->kf_vnflags, 1,
629 (offset_t)0, kcred, NULL);
630 if (rval != 0) {
631 if (fp->kf_state == 0) {
632 nvf_error("%s: close error %d\n",
633 fp->kf_fname, rval);
634 }
635 } else {
636 if (fp->kf_state == 0)
637 KFDEBUG((CE_CONT, "%s: close ok\n", fp->kf_fname));
638 }
639
640 VN_RELE(fp->kf_vp);
641 kmem_free(fp, sizeof (kfile_t));
642 return (rval);
643 }
644
645 static int
kfrename(char * oldname,char * newname)646 kfrename(char *oldname, char *newname)
647 {
648 int rval;
649
650 ASSERT(modrootloaded);
651
652 KFDEBUG((CE_CONT, "renaming %s to %s\n", oldname, newname));
653
654 if ((rval = vn_rename(oldname, newname, UIO_SYSSPACE)) != 0) {
655 KFDEBUG((CE_CONT, "rename %s to %s: %d\n",
656 oldname, newname, rval));
657 }
658
659 return (rval);
660 }
661
662 int
fwrite_nvlist(char * filename,nvlist_t * nvl)663 fwrite_nvlist(char *filename, nvlist_t *nvl)
664 {
665 char *buf;
666 char *nvbuf;
667 kfile_t *fp;
668 char *newname;
669 int len, err, err1;
670 size_t buflen;
671 ssize_t n;
672
673 ASSERT(modrootloaded);
674
675 nvbuf = NULL;
676 err = nvlist_pack(nvl, &nvbuf, &buflen, NV_ENCODE_NATIVE, 0);
677 if (err != 0) {
678 nvf_error("%s: error %d packing nvlist\n",
679 filename, err);
680 return (err);
681 }
682
683 buf = kmem_alloc(sizeof (nvpf_hdr_t) + buflen, KM_SLEEP);
684 bzero(buf, sizeof (nvpf_hdr_t));
685
686 ((nvpf_hdr_t *)buf)->nvpf_magic = NVPF_HDR_MAGIC;
687 ((nvpf_hdr_t *)buf)->nvpf_version = NVPF_HDR_VERSION;
688 ((nvpf_hdr_t *)buf)->nvpf_size = buflen;
689 ((nvpf_hdr_t *)buf)->nvpf_chksum = nvp_cksum((uchar_t *)nvbuf, buflen);
690 ((nvpf_hdr_t *)buf)->nvpf_hdr_chksum =
691 nvp_cksum((uchar_t *)buf, sizeof (nvpf_hdr_t));
692
693 bcopy(nvbuf, buf + sizeof (nvpf_hdr_t), buflen);
694 kmem_free(nvbuf, buflen);
695 buflen += sizeof (nvpf_hdr_t);
696
697 len = strlen(filename) + MAX_SUFFIX_LEN + 2;
698 newname = kmem_alloc(len, KM_SLEEP);
699
700
701 (void) sprintf(newname, "%s.%s", filename, NEW_FILENAME_SUFFIX);
702
703 /*
704 * To make it unlikely we suffer data loss, write
705 * data to the new temporary file. Once successful
706 * complete the transaction by renaming the new file
707 * to replace the previous.
708 */
709
710 if ((err = kfcreate(newname, &fp)) == 0) {
711 err = kfwrite(fp, buf, buflen, &n);
712 if (err) {
713 nvf_error("%s: write error - %d\n",
714 newname, err);
715 } else {
716 if (n != buflen) {
717 nvf_error(
718 "%s: partial write %ld of %ld bytes\n",
719 newname, n, buflen);
720 nvf_error("%s: filesystem may be full?\n",
721 newname);
722 err = EIO;
723 }
724 }
725 if ((err1 = kfclose(fp)) != 0) {
726 nvf_error("%s: close error\n", newname);
727 if (err == 0)
728 err = err1;
729 }
730 if (err != 0) {
731 if (kfremove(newname) != 0) {
732 nvf_error("%s: remove failed\n",
733 newname);
734 }
735 }
736 } else {
737 nvf_error("%s: create failed - %d\n", filename, err);
738 }
739
740 if (err == 0) {
741 if ((err = kfrename(newname, filename)) != 0) {
742 nvf_error("%s: rename from %s failed\n",
743 newname, filename);
744 }
745 }
746
747 kmem_free(newname, len);
748 kmem_free(buf, buflen);
749
750 return (err);
751 }
752
753 static int
e_fwrite_nvlist(nvfd_t * nvfd,nvlist_t * nvl)754 e_fwrite_nvlist(nvfd_t *nvfd, nvlist_t *nvl)
755 {
756 int err;
757
758 if ((err = fwrite_nvlist(nvfd->nvf_cache_path, nvl)) == 0)
759 return (DDI_SUCCESS);
760 else {
761 if (err == EROFS)
762 NVF_MARK_READONLY(nvfd);
763 return (DDI_FAILURE);
764 }
765 }
766
767 static void
nvp_list_free(nvfd_t * nvf)768 nvp_list_free(nvfd_t *nvf)
769 {
770 ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
771 (nvf->nvf_list_free)((nvf_handle_t)nvf);
772 ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
773 }
774
775 /*
776 * Read a file in the nvlist format
777 * EIO - i/o error during read
778 * ENOENT - file not found
779 * EINVAL - file contents corrupted
780 */
781 static int
fread_nvp_list(nvfd_t * nvfd)782 fread_nvp_list(nvfd_t *nvfd)
783 {
784 nvlist_t *nvl;
785 nvpair_t *nvp;
786 char *name;
787 nvlist_t *sublist;
788 int rval;
789 int rv;
790
791 ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
792
793 rval = fread_nvlist(nvfd->nvf_cache_path, &nvl);
794 if (rval != 0)
795 return (rval);
796 ASSERT(nvl != NULL);
797
798 nvp = NULL;
799 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
800 name = nvpair_name(nvp);
801 ASSERT(strlen(name) > 0);
802
803 switch (nvpair_type(nvp)) {
804 case DATA_TYPE_NVLIST:
805 rval = nvpair_value_nvlist(nvp, &sublist);
806 if (rval != 0) {
807 nvf_error(
808 "nvpair_value_nvlist error %s %d\n",
809 name, rval);
810 goto error;
811 }
812
813 /*
814 * unpack nvlist for this device and
815 * add elements to data list.
816 */
817 ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
818 rv = (nvfd->nvf_unpack_nvlist)
819 ((nvf_handle_t)nvfd, sublist, name);
820 ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
821 if (rv != 0) {
822 nvf_error(
823 "%s: %s invalid list element\n",
824 nvfd->nvf_cache_path, name);
825 rval = EINVAL;
826 goto error;
827 }
828 break;
829
830 default:
831 nvf_error("%s: %s unsupported data type %d\n",
832 nvfd->nvf_cache_path, name, nvpair_type(nvp));
833 rval = EINVAL;
834 goto error;
835 }
836 }
837
838 nvlist_free(nvl);
839
840 return (0);
841
842 error:
843 nvlist_free(nvl);
844 nvp_list_free(nvfd);
845 return (rval);
846 }
847
848
849 int
nvf_read_file(nvf_handle_t nvf_handle)850 nvf_read_file(nvf_handle_t nvf_handle)
851 {
852 nvfd_t *nvfd = (nvfd_t *)nvf_handle;
853 int rval;
854
855 ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
856
857 if (kfio_disable_read)
858 return (0);
859
860 KFDEBUG((CE_CONT, "reading %s\n", nvfd->nvf_cache_path));
861
862 rval = fread_nvp_list(nvfd);
863 if (rval) {
864 switch (rval) {
865 case EIO:
866 nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
867 cmn_err(CE_WARN, "%s: I/O error",
868 nvfd->nvf_cache_path);
869 break;
870 case ENOENT:
871 nvfd->nvf_flags |= NVF_F_CREATE_MSG;
872 nvf_error("%s: not found\n",
873 nvfd->nvf_cache_path);
874 break;
875 case EINVAL:
876 default:
877 nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
878 cmn_err(CE_WARN, "%s: data file corrupted",
879 nvfd->nvf_cache_path);
880 break;
881 }
882 }
883 return (rval);
884 }
885
886 static void
nvf_write_is_complete(nvfd_t * fd)887 nvf_write_is_complete(nvfd_t *fd)
888 {
889 if (fd->nvf_write_complete) {
890 (fd->nvf_write_complete)((nvf_handle_t)fd);
891 }
892 }
893
894 /*ARGSUSED*/
895 static void
nvpflush_timeout(void * arg)896 nvpflush_timeout(void *arg)
897 {
898 clock_t nticks;
899
900 mutex_enter(&nvpflush_lock);
901 nticks = nvpticks - ddi_get_lbolt();
902 if (nticks > 4) {
903 nvpflush_timer_busy = 1;
904 mutex_exit(&nvpflush_lock);
905 nvpflush_id = timeout(nvpflush_timeout, NULL, nticks);
906 } else {
907 do_nvpflush = 1;
908 NVPDAEMON_DEBUG((CE_CONT, "signal nvpdaemon\n"));
909 cv_signal(&nvpflush_cv);
910 nvpflush_id = 0;
911 nvpflush_timer_busy = 0;
912 mutex_exit(&nvpflush_lock);
913 }
914 }
915
916 /*
917 * After marking a list as dirty, wake the nvpflush daemon
918 * to perform the update.
919 */
920 void
nvf_wake_daemon(void)921 nvf_wake_daemon(void)
922 {
923 clock_t nticks;
924
925 /*
926 * If the system isn't up yet or is shutting down,
927 * don't even think about starting a flush.
928 */
929 if (!i_ddi_io_initialized() || sys_shutdown)
930 return;
931
932 mutex_enter(&nvpflush_lock);
933
934 if (nvpflush_daemon_active == 0) {
935 nvpflush_daemon_active = 1;
936 mutex_exit(&nvpflush_lock);
937 NVPDAEMON_DEBUG((CE_CONT, "starting nvpdaemon thread\n"));
938 nvpflush_thr_id = thread_create(NULL, 0,
939 (void (*)())nvpflush_daemon,
940 NULL, 0, &p0, TS_RUN, minclsyspri);
941 mutex_enter(&nvpflush_lock);
942 }
943
944 nticks = nvpflush_delay * TICKS_PER_SECOND;
945 nvpticks = ddi_get_lbolt() + nticks;
946 if (nvpflush_timer_busy == 0) {
947 nvpflush_timer_busy = 1;
948 mutex_exit(&nvpflush_lock);
949 nvpflush_id = timeout(nvpflush_timeout, NULL, nticks + 4);
950 } else
951 mutex_exit(&nvpflush_lock);
952 }
953
954 static int
nvpflush_one(nvfd_t * nvfd)955 nvpflush_one(nvfd_t *nvfd)
956 {
957 int rval = DDI_SUCCESS;
958 nvlist_t *nvl;
959
960 rw_enter(&nvfd->nvf_lock, RW_READER);
961
962 ASSERT((nvfd->nvf_flags & NVF_F_FLUSHING) == 0);
963
964 if (!NVF_IS_DIRTY(nvfd) ||
965 NVF_IS_READONLY(nvfd) || kfio_disable_write || sys_shutdown) {
966 NVF_CLEAR_DIRTY(nvfd);
967 rw_exit(&nvfd->nvf_lock);
968 return (DDI_SUCCESS);
969 }
970
971 if (rw_tryupgrade(&nvfd->nvf_lock) == 0) {
972 nvf_error("nvpflush: "
973 "%s rw upgrade failed\n", nvfd->nvf_cache_path);
974 rw_exit(&nvfd->nvf_lock);
975 return (DDI_FAILURE);
976 }
977 if (((nvfd->nvf_pack_list)
978 ((nvf_handle_t)nvfd, &nvl)) != DDI_SUCCESS) {
979 nvf_error("nvpflush: "
980 "%s nvlist construction failed\n", nvfd->nvf_cache_path);
981 ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
982 rw_exit(&nvfd->nvf_lock);
983 return (DDI_FAILURE);
984 }
985 ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
986
987 NVF_CLEAR_DIRTY(nvfd);
988 nvfd->nvf_flags |= NVF_F_FLUSHING;
989 rw_exit(&nvfd->nvf_lock);
990
991 rval = e_fwrite_nvlist(nvfd, nvl);
992 nvlist_free(nvl);
993
994 rw_enter(&nvfd->nvf_lock, RW_WRITER);
995 nvfd->nvf_flags &= ~NVF_F_FLUSHING;
996 if (rval == DDI_FAILURE) {
997 if (NVF_IS_READONLY(nvfd)) {
998 rval = DDI_SUCCESS;
999 nvfd->nvf_flags &= ~(NVF_F_ERROR | NVF_F_DIRTY);
1000 } else if ((nvfd->nvf_flags & NVF_F_ERROR) == 0) {
1001 cmn_err(CE_CONT,
1002 "%s: update failed\n", nvfd->nvf_cache_path);
1003 nvfd->nvf_flags |= NVF_F_ERROR | NVF_F_DIRTY;
1004 }
1005 } else {
1006 if (nvfd->nvf_flags & NVF_F_CREATE_MSG) {
1007 cmn_err(CE_CONT,
1008 "!Creating %s\n", nvfd->nvf_cache_path);
1009 nvfd->nvf_flags &= ~NVF_F_CREATE_MSG;
1010 }
1011 if (nvfd->nvf_flags & NVF_F_REBUILD_MSG) {
1012 cmn_err(CE_CONT,
1013 "!Rebuilding %s\n", nvfd->nvf_cache_path);
1014 nvfd->nvf_flags &= ~NVF_F_REBUILD_MSG;
1015 }
1016 if (nvfd->nvf_flags & NVF_F_ERROR) {
1017 cmn_err(CE_CONT,
1018 "%s: update now ok\n", nvfd->nvf_cache_path);
1019 nvfd->nvf_flags &= ~NVF_F_ERROR;
1020 }
1021 /*
1022 * The file may need to be flushed again if the cached
1023 * data was touched while writing the earlier contents.
1024 */
1025 if (NVF_IS_DIRTY(nvfd))
1026 rval = DDI_FAILURE;
1027 }
1028
1029 rw_exit(&nvfd->nvf_lock);
1030 return (rval);
1031 }
1032
1033
1034 static void
nvpflush_daemon(void)1035 nvpflush_daemon(void)
1036 {
1037 callb_cpr_t cprinfo;
1038 nvfd_t *nvfdp, *nextfdp;
1039 clock_t clk;
1040 int rval;
1041 int want_wakeup;
1042 int is_now_clean;
1043
1044 ASSERT(modrootloaded);
1045
1046 nvpflush_thread = curthread;
1047 NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: init\n"));
1048
1049 CALLB_CPR_INIT(&cprinfo, &nvpflush_lock, callb_generic_cpr, "nvp");
1050 mutex_enter(&nvpflush_lock);
1051 for (;;) {
1052 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1053 while (do_nvpflush == 0) {
1054 clk = cv_reltimedwait(&nvpflush_cv, &nvpflush_lock,
1055 (nvpdaemon_idle_time * TICKS_PER_SECOND),
1056 TR_CLOCK_TICK);
1057 if ((clk == -1 && do_nvpflush == 0 &&
1058 nvpflush_timer_busy == 0) || sys_shutdown) {
1059 /*
1060 * Note that CALLB_CPR_EXIT calls mutex_exit()
1061 * on the lock passed in to CALLB_CPR_INIT,
1062 * so the lock must be held when invoking it.
1063 */
1064 CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);
1065 NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: exit\n"));
1066 ASSERT(mutex_owned(&nvpflush_lock));
1067 nvpflush_thr_id = NULL;
1068 nvpflush_daemon_active = 0;
1069 CALLB_CPR_EXIT(&cprinfo);
1070 thread_exit();
1071 }
1072 }
1073 CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);
1074
1075 nvpbusy = 1;
1076 want_wakeup = 0;
1077 do_nvpflush = 0;
1078 mutex_exit(&nvpflush_lock);
1079
1080 /*
1081 * Try flushing what's dirty, reschedule if there's
1082 * a failure or data gets marked as dirty again.
1083 * First move each file marked dirty to the dirty
1084 * list to avoid locking the list across the write.
1085 */
1086 mutex_enter(&nvf_cache_mutex);
1087 for (nvfdp = list_head(&nvf_cache_files);
1088 nvfdp; nvfdp = nextfdp) {
1089 nextfdp = list_next(&nvf_cache_files, nvfdp);
1090 rw_enter(&nvfdp->nvf_lock, RW_READER);
1091 if (NVF_IS_DIRTY(nvfdp)) {
1092 list_remove(&nvf_cache_files, nvfdp);
1093 list_insert_tail(&nvf_dirty_files, nvfdp);
1094 rw_exit(&nvfdp->nvf_lock);
1095 } else {
1096 NVPDAEMON_DEBUG((CE_CONT,
1097 "nvpdaemon: not dirty %s\n",
1098 nvfdp->nvf_cache_path));
1099 rw_exit(&nvfdp->nvf_lock);
1100 }
1101 }
1102 mutex_exit(&nvf_cache_mutex);
1103
1104 /*
1105 * Now go through the dirty list
1106 */
1107 for (nvfdp = list_head(&nvf_dirty_files);
1108 nvfdp; nvfdp = nextfdp) {
1109 nextfdp = list_next(&nvf_dirty_files, nvfdp);
1110
1111 is_now_clean = 0;
1112 rw_enter(&nvfdp->nvf_lock, RW_READER);
1113 if (NVF_IS_DIRTY(nvfdp)) {
1114 NVPDAEMON_DEBUG((CE_CONT,
1115 "nvpdaemon: flush %s\n",
1116 nvfdp->nvf_cache_path));
1117 rw_exit(&nvfdp->nvf_lock);
1118 rval = nvpflush_one(nvfdp);
1119 rw_enter(&nvfdp->nvf_lock, RW_READER);
1120 if (rval != DDI_SUCCESS ||
1121 NVF_IS_DIRTY(nvfdp)) {
1122 rw_exit(&nvfdp->nvf_lock);
1123 NVPDAEMON_DEBUG((CE_CONT,
1124 "nvpdaemon: %s dirty again\n",
1125 nvfdp->nvf_cache_path));
1126 want_wakeup = 1;
1127 } else {
1128 rw_exit(&nvfdp->nvf_lock);
1129 nvf_write_is_complete(nvfdp);
1130 is_now_clean = 1;
1131 }
1132 } else {
1133 NVPDAEMON_DEBUG((CE_CONT,
1134 "nvpdaemon: not dirty %s\n",
1135 nvfdp->nvf_cache_path));
1136 rw_exit(&nvfdp->nvf_lock);
1137 is_now_clean = 1;
1138 }
1139
1140 if (is_now_clean) {
1141 mutex_enter(&nvf_cache_mutex);
1142 list_remove(&nvf_dirty_files, nvfdp);
1143 list_insert_tail(&nvf_cache_files,
1144 nvfdp);
1145 mutex_exit(&nvf_cache_mutex);
1146 }
1147 }
1148
1149 if (want_wakeup)
1150 nvf_wake_daemon();
1151
1152 mutex_enter(&nvpflush_lock);
1153 nvpbusy = 0;
1154 }
1155 }
1156