1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/note.h> 27 #include <sys/t_lock.h> 28 #include <sys/cmn_err.h> 29 #include <sys/instance.h> 30 #include <sys/conf.h> 31 #include <sys/stat.h> 32 #include <sys/ddi.h> 33 #include <sys/hwconf.h> 34 #include <sys/sunddi.h> 35 #include <sys/sunndi.h> 36 #include <sys/ddi_impldefs.h> 37 #include <sys/ndi_impldefs.h> 38 #include <sys/modctl.h> 39 #include <sys/dacf.h> 40 #include <sys/promif.h> 41 #include <sys/cpuvar.h> 42 #include <sys/pathname.h> 43 #include <sys/kobj.h> 44 #include <sys/devcache.h> 45 #include <sys/devcache_impl.h> 46 #include <sys/sysmacros.h> 47 #include <sys/varargs.h> 48 #include <sys/callb.h> 49 50 /* 51 * This facility provides interfaces to clients to register, 52 * read and update cache data in persisted backing store files, 53 * usually in /etc/devices. The data persisted through this 54 * mechanism should be stateless data, functioning in the sense 55 * of a cache. Writes are performed by a background daemon 56 * thread, permitting a client to schedule an update without 57 * blocking, then continue updating the data state in 58 * parallel. The data is only locked by the daemon thread 59 * to pack the data in preparation for the write. 60 * 61 * Data persisted through this mechanism should be capable 62 * of being regenerated through normal system operation, 63 * for example attaching all disk devices would cause all 64 * devids to be registered for those devices. By caching 65 * a devid-device tuple, the system can operate in a 66 * more optimal way, directly attaching the device mapped 67 * to a devid, rather than burdensomely driving attach of 68 * the entire device tree to discover a single device. 69 * 70 * Note that a client should only need to include 71 * <sys/devcache.h> for the supported interfaces. 72 * 73 * The data per client is entirely within the control of 74 * the client. When reading, data unpacked from the backing 75 * store should be inserted in the list. The pointer to 76 * the list can be retrieved via nvf_list(). When writing, 77 * the data on the list is to be packed and returned to the 78 * nvpdaemon as an nvlist. 79 * 80 * Obvious restrictions are imposed by the limits of the 81 * nvlist format. The data cannot be read or written 82 * piecemeal, and large amounts of data aren't recommended. 83 * However, nvlists do allow that data be named and typed 84 * and can be size-of-int invariant, and the cached data 85 * can be versioned conveniently. 86 * 87 * The registration involves two steps: a handle is 88 * allocated by calling the registration function. 89 * This sets up the data referenced by the handle and 90 * initializes the lock. Following registration, the 91 * client must initialize the data list. The list 92 * interfaces require that the list element with offset 93 * to the node link be provided. The format of the 94 * list element is under the control of the client. 95 * 96 * Locking: the address of the data list r/w lock provided 97 * can be accessed with nvf_lock(). The lock must be held 98 * as reader when traversing the list or checking state, 99 * such as nvf_is_dirty(). The lock must be held as 100 * writer when updating the list or marking it dirty. 101 * The lock must not be held when waking the daemon. 102 * 103 * The data r/w lock is held as writer when the pack, 104 * unpack and free list handlers are called. The 105 * lock should not be dropped and must be still held 106 * upon return. The client should also hold the lock 107 * as reader when checking if the list is dirty, and 108 * as writer when marking the list dirty or initiating 109 * a read. 110 * 111 * The asynchronous nature of updates allows for the 112 * possibility that the data may continue to be updated 113 * once the daemon has been notified that an update is 114 * desired. The data only needs to be locked against 115 * updates when packing the data into the form to be 116 * written. When the write of the packed data has 117 * completed, the daemon will automatically reschedule 118 * an update if the data was marked dirty after the 119 * point at which it was packed. Before beginning an 120 * update, the daemon attempts to lock the data as 121 * writer; if the writer lock is already held, it 122 * backs off and retries later. The model is to give 123 * priority to the kernel processes generating the 124 * data, and that the nature of the data is that 125 * it does not change often, can be re-generated when 126 * needed, so updates should not happen often and 127 * can be delayed until the data stops changing. 128 * The client may update the list or mark it dirty 129 * any time it is able to acquire the lock as 130 * writer first. 131 * 132 * A failed write will be retried after some delay, 133 * in the hope that the cause of the error will be 134 * transient, for example a filesystem with no space 135 * available. An update on a read-only filesystem 136 * is failed silently and not retried; this would be 137 * the case when booted off install media. 138 * 139 * There is no unregister mechanism as of yet, as it 140 * hasn't been needed so far. 141 */ 142 143 /* 144 * Global list of files registered and updated by the nvpflush 145 * daemon, protected by the nvf_cache_mutex. While an 146 * update is taking place, a file is temporarily moved to 147 * the dirty list to avoid locking the primary list for 148 * the duration of the update. 149 */ 150 list_t nvf_cache_files; 151 list_t nvf_dirty_files; 152 kmutex_t nvf_cache_mutex; 153 154 155 /* 156 * Allow some delay from an update of the data before flushing 157 * to permit simultaneous updates of multiple changes. 158 * Changes in the data are expected to be bursty, ie 159 * reconfig or hot-plug of a new adapter. 160 * 161 * kfio_report_error (default 0) 162 * Set to 1 to enable some error messages related to low-level 163 * kernel file i/o operations. 164 * 165 * nvpflush_delay (default 10) 166 * The number of seconds after data is marked dirty before the 167 * flush daemon is triggered to flush the data. A longer period 168 * of time permits more data updates per write. Note that 169 * every update resets the timer so no repository write will 170 * occur while data is being updated continuously. 171 * 172 * nvpdaemon_idle_time (default 60) 173 * The number of seconds the daemon will sleep idle before exiting. 174 * 175 */ 176 #define NVPFLUSH_DELAY 10 177 #define NVPDAEMON_IDLE_TIME 60 178 179 #define TICKS_PER_SECOND (drv_usectohz(1000000)) 180 181 /* 182 * Tunables 183 */ 184 int kfio_report_error = 0; /* kernel file i/o operations */ 185 int kfio_disable_read = 0; /* disable all reads */ 186 int kfio_disable_write = 0; /* disable all writes */ 187 188 int nvpflush_delay = NVPFLUSH_DELAY; 189 int nvpdaemon_idle_time = NVPDAEMON_IDLE_TIME; 190 191 static timeout_id_t nvpflush_id = 0; 192 static int nvpflush_timer_busy = 0; 193 static int nvpflush_daemon_active = 0; 194 static kthread_t *nvpflush_thr_id = 0; 195 196 static int do_nvpflush = 0; 197 static int nvpbusy = 0; 198 static kmutex_t nvpflush_lock; 199 static kcondvar_t nvpflush_cv; 200 static kthread_id_t nvpflush_thread; 201 static clock_t nvpticks; 202 203 static void nvpflush_daemon(void); 204 205 #ifdef DEBUG 206 int nvpdaemon_debug = 0; 207 int kfio_debug = 0; 208 #endif /* DEBUG */ 209 210 extern int modrootloaded; 211 extern void mdi_read_devices_files(void); 212 extern void mdi_clean_vhcache(void); 213 extern int sys_shutdown; 214 215 /* 216 * Initialize the overall cache file management 217 */ 218 void 219 i_ddi_devices_init(void) 220 { 221 list_create(&nvf_cache_files, sizeof (nvfd_t), 222 offsetof(nvfd_t, nvf_link)); 223 list_create(&nvf_dirty_files, sizeof (nvfd_t), 224 offsetof(nvfd_t, nvf_link)); 225 mutex_init(&nvf_cache_mutex, NULL, MUTEX_DEFAULT, NULL); 226 retire_store_init(); 227 devid_cache_init(); 228 } 229 230 /* 231 * Read cache files 232 * The files read here should be restricted to those 233 * that may be required to mount root. 234 */ 235 void 236 i_ddi_read_devices_files(void) 237 { 238 /* 239 * The retire store should be the first file read as it 240 * may need to offline devices. kfio_disable_read is not 241 * used for retire. For the rationale see the tunable 242 * ddi_retire_store_bypass and comments in: 243 * uts/common/os/retire_store.c 244 */ 245 246 retire_store_read(); 247 248 if (!kfio_disable_read) { 249 mdi_read_devices_files(); 250 devid_cache_read(); 251 } 252 } 253 254 void 255 i_ddi_start_flush_daemon(void) 256 { 257 nvfd_t *nvfdp; 258 259 ASSERT(i_ddi_io_initialized()); 260 261 mutex_init(&nvpflush_lock, NULL, MUTEX_DRIVER, NULL); 262 cv_init(&nvpflush_cv, NULL, CV_DRIVER, NULL); 263 264 mutex_enter(&nvf_cache_mutex); 265 for (nvfdp = list_head(&nvf_cache_files); nvfdp; 266 nvfdp = list_next(&nvf_cache_files, nvfdp)) { 267 if (NVF_IS_DIRTY(nvfdp)) { 268 nvf_wake_daemon(); 269 break; 270 } 271 } 272 mutex_exit(&nvf_cache_mutex); 273 } 274 275 void 276 i_ddi_clean_devices_files(void) 277 { 278 devid_cache_cleanup(); 279 mdi_clean_vhcache(); 280 } 281 282 /* 283 * Register a cache file to be managed and updated by the nvpflush daemon. 284 * All operations are performed through the returned handle. 285 * There is no unregister mechanism for now. 286 */ 287 nvf_handle_t 288 nvf_register_file(nvf_ops_t *ops) 289 { 290 nvfd_t *nvfdp; 291 292 nvfdp = kmem_zalloc(sizeof (*nvfdp), KM_SLEEP); 293 294 nvfdp->nvf_ops = ops; 295 nvfdp->nvf_flags = 0; 296 rw_init(&nvfdp->nvf_lock, NULL, RW_DRIVER, NULL); 297 298 mutex_enter(&nvf_cache_mutex); 299 list_insert_tail(&nvf_cache_files, nvfdp); 300 mutex_exit(&nvf_cache_mutex); 301 302 return ((nvf_handle_t)nvfdp); 303 } 304 305 /*PRINTFLIKE1*/ 306 void 307 nvf_error(const char *fmt, ...) 308 { 309 va_list ap; 310 311 if (kfio_report_error) { 312 va_start(ap, fmt); 313 vcmn_err(CE_NOTE, fmt, ap); 314 va_end(ap); 315 } 316 } 317 318 /* 319 * Some operations clients may use to manage the data 320 * to be persisted in a cache file. 321 */ 322 char * 323 nvf_cache_name(nvf_handle_t handle) 324 { 325 return (((nvfd_t *)handle)->nvf_cache_path); 326 } 327 328 krwlock_t * 329 nvf_lock(nvf_handle_t handle) 330 { 331 return (&(((nvfd_t *)handle)->nvf_lock)); 332 } 333 334 list_t * 335 nvf_list(nvf_handle_t handle) 336 { 337 return (&(((nvfd_t *)handle)->nvf_data_list)); 338 } 339 340 void 341 nvf_mark_dirty(nvf_handle_t handle) 342 { 343 ASSERT(RW_WRITE_HELD(&(((nvfd_t *)handle)->nvf_lock))); 344 NVF_MARK_DIRTY((nvfd_t *)handle); 345 } 346 347 int 348 nvf_is_dirty(nvf_handle_t handle) 349 { 350 ASSERT(RW_LOCK_HELD(&(((nvfd_t *)handle)->nvf_lock))); 351 return (NVF_IS_DIRTY((nvfd_t *)handle)); 352 } 353 354 static uint16_t 355 nvp_cksum(uchar_t *buf, int64_t buflen) 356 { 357 uint16_t cksum = 0; 358 uint16_t *p = (uint16_t *)buf; 359 int64_t n; 360 361 if ((buflen & 0x01) != 0) { 362 buflen--; 363 cksum = buf[buflen]; 364 } 365 n = buflen / 2; 366 while (n-- > 0) 367 cksum ^= *p++; 368 return (cksum); 369 } 370 371 int 372 fread_nvlist(char *filename, nvlist_t **ret_nvlist) 373 { 374 struct _buf *file; 375 nvpf_hdr_t hdr; 376 char *buf; 377 nvlist_t *nvl; 378 int rval; 379 uint_t offset; 380 int n; 381 char c; 382 uint16_t cksum, hdrsum; 383 384 *ret_nvlist = NULL; 385 386 file = kobj_open_file(filename); 387 if (file == (struct _buf *)-1) { 388 KFDEBUG((CE_CONT, "cannot open file: %s\n", filename)); 389 return (ENOENT); 390 } 391 392 offset = 0; 393 n = kobj_read_file(file, (char *)&hdr, sizeof (hdr), offset); 394 if (n != sizeof (hdr)) { 395 kobj_close_file(file); 396 if (n < 0) { 397 nvf_error("error reading header: %s\n", filename); 398 return (EIO); 399 } else if (n == 0) { 400 KFDEBUG((CE_CONT, "file empty: %s\n", filename)); 401 } else { 402 nvf_error("header size incorrect: %s\n", filename); 403 } 404 return (EINVAL); 405 } 406 offset += n; 407 408 KFDEBUG2((CE_CONT, "nvpf_magic: 0x%x\n", hdr.nvpf_magic)); 409 KFDEBUG2((CE_CONT, "nvpf_version: %d\n", hdr.nvpf_version)); 410 KFDEBUG2((CE_CONT, "nvpf_size: %lld\n", 411 (longlong_t)hdr.nvpf_size)); 412 KFDEBUG2((CE_CONT, "nvpf_hdr_chksum: 0x%x\n", 413 hdr.nvpf_hdr_chksum)); 414 KFDEBUG2((CE_CONT, "nvpf_chksum: 0x%x\n", hdr.nvpf_chksum)); 415 416 cksum = hdr.nvpf_hdr_chksum; 417 hdr.nvpf_hdr_chksum = 0; 418 hdrsum = nvp_cksum((uchar_t *)&hdr, sizeof (hdr)); 419 420 if (hdr.nvpf_magic != NVPF_HDR_MAGIC || 421 hdr.nvpf_version != NVPF_HDR_VERSION || hdrsum != cksum) { 422 kobj_close_file(file); 423 if (hdrsum != cksum) { 424 nvf_error("%s: checksum error " 425 "(actual 0x%x, expected 0x%x)\n", 426 filename, hdrsum, cksum); 427 } 428 nvf_error("%s: header information incorrect", filename); 429 return (EINVAL); 430 } 431 432 ASSERT(hdr.nvpf_size >= 0); 433 434 buf = kmem_alloc(hdr.nvpf_size, KM_SLEEP); 435 n = kobj_read_file(file, buf, hdr.nvpf_size, offset); 436 if (n != hdr.nvpf_size) { 437 kmem_free(buf, hdr.nvpf_size); 438 kobj_close_file(file); 439 if (n < 0) { 440 nvf_error("%s: read error %d", filename, n); 441 } else { 442 nvf_error("%s: incomplete read %d/%lld", 443 filename, n, (longlong_t)hdr.nvpf_size); 444 } 445 return (EINVAL); 446 } 447 offset += n; 448 449 rval = kobj_read_file(file, &c, 1, offset); 450 kobj_close_file(file); 451 if (rval > 0) { 452 nvf_error("%s is larger than %lld\n", 453 filename, (longlong_t)hdr.nvpf_size); 454 kmem_free(buf, hdr.nvpf_size); 455 return (EINVAL); 456 } 457 458 cksum = nvp_cksum((uchar_t *)buf, hdr.nvpf_size); 459 if (hdr.nvpf_chksum != cksum) { 460 nvf_error("%s: checksum error (actual 0x%x, expected 0x%x)\n", 461 filename, hdr.nvpf_chksum, cksum); 462 kmem_free(buf, hdr.nvpf_size); 463 return (EINVAL); 464 } 465 466 nvl = NULL; 467 rval = nvlist_unpack(buf, hdr.nvpf_size, &nvl, 0); 468 if (rval != 0) { 469 nvf_error("%s: error %d unpacking nvlist\n", 470 filename, rval); 471 kmem_free(buf, hdr.nvpf_size); 472 return (EINVAL); 473 } 474 475 kmem_free(buf, hdr.nvpf_size); 476 *ret_nvlist = nvl; 477 return (0); 478 } 479 480 static int 481 kfcreate(char *filename, kfile_t **kfilep) 482 { 483 kfile_t *fp; 484 int rval; 485 486 ASSERT(modrootloaded); 487 488 fp = kmem_alloc(sizeof (kfile_t), KM_SLEEP); 489 490 fp->kf_vnflags = FCREAT | FWRITE | FTRUNC; 491 fp->kf_fname = filename; 492 fp->kf_fpos = 0; 493 fp->kf_state = 0; 494 495 KFDEBUG((CE_CONT, "create: %s flags 0x%x\n", 496 filename, fp->kf_vnflags)); 497 rval = vn_open(filename, UIO_SYSSPACE, fp->kf_vnflags, 498 0444, &fp->kf_vp, CRCREAT, 0); 499 if (rval != 0) { 500 kmem_free(fp, sizeof (kfile_t)); 501 KFDEBUG((CE_CONT, "%s: create error %d\n", 502 filename, rval)); 503 return (rval); 504 } 505 506 *kfilep = fp; 507 return (0); 508 } 509 510 static int 511 kfremove(char *filename) 512 { 513 int rval; 514 515 KFDEBUG((CE_CONT, "remove: %s\n", filename)); 516 rval = vn_remove(filename, UIO_SYSSPACE, RMFILE); 517 if (rval != 0) { 518 KFDEBUG((CE_CONT, "%s: remove error %d\n", 519 filename, rval)); 520 } 521 return (rval); 522 } 523 524 static int 525 kfread(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n) 526 { 527 ssize_t resid; 528 int err; 529 ssize_t n; 530 531 ASSERT(modrootloaded); 532 533 if (fp->kf_state != 0) 534 return (fp->kf_state); 535 536 err = vn_rdwr(UIO_READ, fp->kf_vp, buf, bufsiz, fp->kf_fpos, 537 UIO_SYSSPACE, 0, (rlim64_t)0, kcred, &resid); 538 if (err != 0) { 539 KFDEBUG((CE_CONT, "%s: read error %d\n", 540 fp->kf_fname, err)); 541 fp->kf_state = err; 542 return (err); 543 } 544 545 ASSERT(resid >= 0 && resid <= bufsiz); 546 n = bufsiz - resid; 547 548 KFDEBUG1((CE_CONT, "%s: read %ld bytes ok %ld bufsiz, %ld resid\n", 549 fp->kf_fname, n, bufsiz, resid)); 550 551 fp->kf_fpos += n; 552 *ret_n = n; 553 return (0); 554 } 555 556 static int 557 kfwrite(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n) 558 { 559 rlim64_t rlimit; 560 ssize_t resid; 561 int err; 562 ssize_t len; 563 ssize_t n = 0; 564 565 ASSERT(modrootloaded); 566 567 if (fp->kf_state != 0) 568 return (fp->kf_state); 569 570 len = bufsiz; 571 rlimit = bufsiz + 1; 572 for (;;) { 573 err = vn_rdwr(UIO_WRITE, fp->kf_vp, buf, len, fp->kf_fpos, 574 UIO_SYSSPACE, FSYNC, rlimit, kcred, &resid); 575 if (err) { 576 KFDEBUG((CE_CONT, "%s: write error %d\n", 577 fp->kf_fname, err)); 578 fp->kf_state = err; 579 return (err); 580 } 581 582 KFDEBUG1((CE_CONT, "%s: write %ld bytes ok %ld resid\n", 583 fp->kf_fname, len-resid, resid)); 584 585 ASSERT(resid >= 0 && resid <= len); 586 587 n += (len - resid); 588 if (resid == 0) 589 break; 590 591 if (resid == len) { 592 KFDEBUG((CE_CONT, "%s: filesystem full?\n", 593 fp->kf_fname)); 594 fp->kf_state = ENOSPC; 595 return (ENOSPC); 596 } 597 598 len -= resid; 599 buf += len; 600 fp->kf_fpos += len; 601 len = resid; 602 } 603 604 ASSERT(n == bufsiz); 605 KFDEBUG1((CE_CONT, "%s: wrote %ld bytes ok\n", fp->kf_fname, n)); 606 607 *ret_n = n; 608 return (0); 609 } 610 611 612 static int 613 kfclose(kfile_t *fp) 614 { 615 int rval; 616 617 KFDEBUG((CE_CONT, "close: %s\n", fp->kf_fname)); 618 619 if ((fp->kf_vnflags & FWRITE) && fp->kf_state == 0) { 620 rval = VOP_FSYNC(fp->kf_vp, FSYNC, kcred, NULL); 621 if (rval != 0) { 622 nvf_error("%s: sync error %d\n", 623 fp->kf_fname, rval); 624 } 625 KFDEBUG((CE_CONT, "%s: sync ok\n", fp->kf_fname)); 626 } 627 628 rval = VOP_CLOSE(fp->kf_vp, fp->kf_vnflags, 1, 629 (offset_t)0, kcred, NULL); 630 if (rval != 0) { 631 if (fp->kf_state == 0) { 632 nvf_error("%s: close error %d\n", 633 fp->kf_fname, rval); 634 } 635 } else { 636 if (fp->kf_state == 0) 637 KFDEBUG((CE_CONT, "%s: close ok\n", fp->kf_fname)); 638 } 639 640 VN_RELE(fp->kf_vp); 641 kmem_free(fp, sizeof (kfile_t)); 642 return (rval); 643 } 644 645 static int 646 kfrename(char *oldname, char *newname) 647 { 648 int rval; 649 650 ASSERT(modrootloaded); 651 652 KFDEBUG((CE_CONT, "renaming %s to %s\n", oldname, newname)); 653 654 if ((rval = vn_rename(oldname, newname, UIO_SYSSPACE)) != 0) { 655 KFDEBUG((CE_CONT, "rename %s to %s: %d\n", 656 oldname, newname, rval)); 657 } 658 659 return (rval); 660 } 661 662 int 663 fwrite_nvlist(char *filename, nvlist_t *nvl) 664 { 665 char *buf; 666 char *nvbuf; 667 kfile_t *fp; 668 char *newname; 669 int len, err, err1; 670 size_t buflen; 671 ssize_t n; 672 673 ASSERT(modrootloaded); 674 675 nvbuf = NULL; 676 err = nvlist_pack(nvl, &nvbuf, &buflen, NV_ENCODE_NATIVE, 0); 677 if (err != 0) { 678 nvf_error("%s: error %d packing nvlist\n", 679 filename, err); 680 return (err); 681 } 682 683 buf = kmem_alloc(sizeof (nvpf_hdr_t) + buflen, KM_SLEEP); 684 bzero(buf, sizeof (nvpf_hdr_t)); 685 686 ((nvpf_hdr_t *)buf)->nvpf_magic = NVPF_HDR_MAGIC; 687 ((nvpf_hdr_t *)buf)->nvpf_version = NVPF_HDR_VERSION; 688 ((nvpf_hdr_t *)buf)->nvpf_size = buflen; 689 ((nvpf_hdr_t *)buf)->nvpf_chksum = nvp_cksum((uchar_t *)nvbuf, buflen); 690 ((nvpf_hdr_t *)buf)->nvpf_hdr_chksum = 691 nvp_cksum((uchar_t *)buf, sizeof (nvpf_hdr_t)); 692 693 bcopy(nvbuf, buf + sizeof (nvpf_hdr_t), buflen); 694 kmem_free(nvbuf, buflen); 695 buflen += sizeof (nvpf_hdr_t); 696 697 len = strlen(filename) + MAX_SUFFIX_LEN + 2; 698 newname = kmem_alloc(len, KM_SLEEP); 699 700 701 (void) sprintf(newname, "%s.%s", filename, NEW_FILENAME_SUFFIX); 702 703 /* 704 * To make it unlikely we suffer data loss, write 705 * data to the new temporary file. Once successful 706 * complete the transaction by renaming the new file 707 * to replace the previous. 708 */ 709 710 if ((err = kfcreate(newname, &fp)) == 0) { 711 err = kfwrite(fp, buf, buflen, &n); 712 if (err) { 713 nvf_error("%s: write error - %d\n", 714 newname, err); 715 } else { 716 if (n != buflen) { 717 nvf_error( 718 "%s: partial write %ld of %ld bytes\n", 719 newname, n, buflen); 720 nvf_error("%s: filesystem may be full?\n", 721 newname); 722 err = EIO; 723 } 724 } 725 if ((err1 = kfclose(fp)) != 0) { 726 nvf_error("%s: close error\n", newname); 727 if (err == 0) 728 err = err1; 729 } 730 if (err != 0) { 731 if (kfremove(newname) != 0) { 732 nvf_error("%s: remove failed\n", 733 newname); 734 } 735 } 736 } else { 737 nvf_error("%s: create failed - %d\n", filename, err); 738 } 739 740 if (err == 0) { 741 if ((err = kfrename(newname, filename)) != 0) { 742 nvf_error("%s: rename from %s failed\n", 743 newname, filename); 744 } 745 } 746 747 kmem_free(newname, len); 748 kmem_free(buf, buflen); 749 750 return (err); 751 } 752 753 static int 754 e_fwrite_nvlist(nvfd_t *nvfd, nvlist_t *nvl) 755 { 756 int err; 757 758 if ((err = fwrite_nvlist(nvfd->nvf_cache_path, nvl)) == 0) 759 return (DDI_SUCCESS); 760 else { 761 if (err == EROFS) 762 NVF_MARK_READONLY(nvfd); 763 return (DDI_FAILURE); 764 } 765 } 766 767 static void 768 nvp_list_free(nvfd_t *nvf) 769 { 770 ASSERT(RW_WRITE_HELD(&nvf->nvf_lock)); 771 (nvf->nvf_list_free)((nvf_handle_t)nvf); 772 ASSERT(RW_WRITE_HELD(&nvf->nvf_lock)); 773 } 774 775 /* 776 * Read a file in the nvlist format 777 * EIO - i/o error during read 778 * ENOENT - file not found 779 * EINVAL - file contents corrupted 780 */ 781 static int 782 fread_nvp_list(nvfd_t *nvfd) 783 { 784 nvlist_t *nvl; 785 nvpair_t *nvp; 786 char *name; 787 nvlist_t *sublist; 788 int rval; 789 int rv; 790 791 ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock))); 792 793 rval = fread_nvlist(nvfd->nvf_cache_path, &nvl); 794 if (rval != 0) 795 return (rval); 796 ASSERT(nvl != NULL); 797 798 nvp = NULL; 799 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 800 name = nvpair_name(nvp); 801 ASSERT(strlen(name) > 0); 802 803 switch (nvpair_type(nvp)) { 804 case DATA_TYPE_NVLIST: 805 rval = nvpair_value_nvlist(nvp, &sublist); 806 if (rval != 0) { 807 nvf_error( 808 "nvpair_value_nvlist error %s %d\n", 809 name, rval); 810 goto error; 811 } 812 813 /* 814 * unpack nvlist for this device and 815 * add elements to data list. 816 */ 817 ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock))); 818 rv = (nvfd->nvf_unpack_nvlist) 819 ((nvf_handle_t)nvfd, sublist, name); 820 ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock))); 821 if (rv != 0) { 822 nvf_error( 823 "%s: %s invalid list element\n", 824 nvfd->nvf_cache_path, name); 825 rval = EINVAL; 826 goto error; 827 } 828 break; 829 830 default: 831 nvf_error("%s: %s unsupported data type %d\n", 832 nvfd->nvf_cache_path, name, nvpair_type(nvp)); 833 rval = EINVAL; 834 goto error; 835 } 836 } 837 838 nvlist_free(nvl); 839 840 return (0); 841 842 error: 843 nvlist_free(nvl); 844 nvp_list_free(nvfd); 845 return (rval); 846 } 847 848 849 int 850 nvf_read_file(nvf_handle_t nvf_handle) 851 { 852 nvfd_t *nvfd = (nvfd_t *)nvf_handle; 853 int rval; 854 855 ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock)); 856 857 if (kfio_disable_read) 858 return (0); 859 860 KFDEBUG((CE_CONT, "reading %s\n", nvfd->nvf_cache_path)); 861 862 rval = fread_nvp_list(nvfd); 863 if (rval) { 864 switch (rval) { 865 case EIO: 866 nvfd->nvf_flags |= NVF_F_REBUILD_MSG; 867 cmn_err(CE_WARN, "%s: I/O error", 868 nvfd->nvf_cache_path); 869 break; 870 case ENOENT: 871 nvfd->nvf_flags |= NVF_F_CREATE_MSG; 872 nvf_error("%s: not found\n", 873 nvfd->nvf_cache_path); 874 break; 875 case EINVAL: 876 default: 877 nvfd->nvf_flags |= NVF_F_REBUILD_MSG; 878 cmn_err(CE_WARN, "%s: data file corrupted", 879 nvfd->nvf_cache_path); 880 break; 881 } 882 } 883 return (rval); 884 } 885 886 static void 887 nvf_write_is_complete(nvfd_t *fd) 888 { 889 if (fd->nvf_write_complete) { 890 (fd->nvf_write_complete)((nvf_handle_t)fd); 891 } 892 } 893 894 /*ARGSUSED*/ 895 static void 896 nvpflush_timeout(void *arg) 897 { 898 clock_t nticks; 899 900 mutex_enter(&nvpflush_lock); 901 nticks = nvpticks - ddi_get_lbolt(); 902 if (nticks > 4) { 903 nvpflush_timer_busy = 1; 904 mutex_exit(&nvpflush_lock); 905 nvpflush_id = timeout(nvpflush_timeout, NULL, nticks); 906 } else { 907 do_nvpflush = 1; 908 NVPDAEMON_DEBUG((CE_CONT, "signal nvpdaemon\n")); 909 cv_signal(&nvpflush_cv); 910 nvpflush_id = 0; 911 nvpflush_timer_busy = 0; 912 mutex_exit(&nvpflush_lock); 913 } 914 } 915 916 /* 917 * After marking a list as dirty, wake the nvpflush daemon 918 * to perform the update. 919 */ 920 void 921 nvf_wake_daemon(void) 922 { 923 clock_t nticks; 924 925 /* 926 * If the system isn't up yet or is shutting down, 927 * don't even think about starting a flush. 928 */ 929 if (!i_ddi_io_initialized() || sys_shutdown) 930 return; 931 932 mutex_enter(&nvpflush_lock); 933 934 if (nvpflush_daemon_active == 0) { 935 nvpflush_daemon_active = 1; 936 mutex_exit(&nvpflush_lock); 937 NVPDAEMON_DEBUG((CE_CONT, "starting nvpdaemon thread\n")); 938 nvpflush_thr_id = thread_create(NULL, 0, 939 (void (*)())nvpflush_daemon, 940 NULL, 0, &p0, TS_RUN, minclsyspri); 941 mutex_enter(&nvpflush_lock); 942 } 943 944 nticks = nvpflush_delay * TICKS_PER_SECOND; 945 nvpticks = ddi_get_lbolt() + nticks; 946 if (nvpflush_timer_busy == 0) { 947 nvpflush_timer_busy = 1; 948 mutex_exit(&nvpflush_lock); 949 nvpflush_id = timeout(nvpflush_timeout, NULL, nticks + 4); 950 } else 951 mutex_exit(&nvpflush_lock); 952 } 953 954 static int 955 nvpflush_one(nvfd_t *nvfd) 956 { 957 int rval = DDI_SUCCESS; 958 nvlist_t *nvl; 959 960 rw_enter(&nvfd->nvf_lock, RW_READER); 961 962 ASSERT((nvfd->nvf_flags & NVF_F_FLUSHING) == 0); 963 964 if (!NVF_IS_DIRTY(nvfd) || 965 NVF_IS_READONLY(nvfd) || kfio_disable_write || sys_shutdown) { 966 NVF_CLEAR_DIRTY(nvfd); 967 rw_exit(&nvfd->nvf_lock); 968 return (DDI_SUCCESS); 969 } 970 971 if (rw_tryupgrade(&nvfd->nvf_lock) == 0) { 972 nvf_error("nvpflush: " 973 "%s rw upgrade failed\n", nvfd->nvf_cache_path); 974 rw_exit(&nvfd->nvf_lock); 975 return (DDI_FAILURE); 976 } 977 if (((nvfd->nvf_pack_list) 978 ((nvf_handle_t)nvfd, &nvl)) != DDI_SUCCESS) { 979 nvf_error("nvpflush: " 980 "%s nvlist construction failed\n", nvfd->nvf_cache_path); 981 ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock)); 982 rw_exit(&nvfd->nvf_lock); 983 return (DDI_FAILURE); 984 } 985 ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock)); 986 987 NVF_CLEAR_DIRTY(nvfd); 988 nvfd->nvf_flags |= NVF_F_FLUSHING; 989 rw_exit(&nvfd->nvf_lock); 990 991 rval = e_fwrite_nvlist(nvfd, nvl); 992 nvlist_free(nvl); 993 994 rw_enter(&nvfd->nvf_lock, RW_WRITER); 995 nvfd->nvf_flags &= ~NVF_F_FLUSHING; 996 if (rval == DDI_FAILURE) { 997 if (NVF_IS_READONLY(nvfd)) { 998 rval = DDI_SUCCESS; 999 nvfd->nvf_flags &= ~(NVF_F_ERROR | NVF_F_DIRTY); 1000 } else if ((nvfd->nvf_flags & NVF_F_ERROR) == 0) { 1001 cmn_err(CE_CONT, 1002 "%s: update failed\n", nvfd->nvf_cache_path); 1003 nvfd->nvf_flags |= NVF_F_ERROR | NVF_F_DIRTY; 1004 } 1005 } else { 1006 if (nvfd->nvf_flags & NVF_F_CREATE_MSG) { 1007 cmn_err(CE_CONT, 1008 "!Creating %s\n", nvfd->nvf_cache_path); 1009 nvfd->nvf_flags &= ~NVF_F_CREATE_MSG; 1010 } 1011 if (nvfd->nvf_flags & NVF_F_REBUILD_MSG) { 1012 cmn_err(CE_CONT, 1013 "!Rebuilding %s\n", nvfd->nvf_cache_path); 1014 nvfd->nvf_flags &= ~NVF_F_REBUILD_MSG; 1015 } 1016 if (nvfd->nvf_flags & NVF_F_ERROR) { 1017 cmn_err(CE_CONT, 1018 "%s: update now ok\n", nvfd->nvf_cache_path); 1019 nvfd->nvf_flags &= ~NVF_F_ERROR; 1020 } 1021 /* 1022 * The file may need to be flushed again if the cached 1023 * data was touched while writing the earlier contents. 1024 */ 1025 if (NVF_IS_DIRTY(nvfd)) 1026 rval = DDI_FAILURE; 1027 } 1028 1029 rw_exit(&nvfd->nvf_lock); 1030 return (rval); 1031 } 1032 1033 1034 static void 1035 nvpflush_daemon(void) 1036 { 1037 callb_cpr_t cprinfo; 1038 nvfd_t *nvfdp, *nextfdp; 1039 clock_t clk; 1040 int rval; 1041 int want_wakeup; 1042 int is_now_clean; 1043 1044 ASSERT(modrootloaded); 1045 1046 nvpflush_thread = curthread; 1047 NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: init\n")); 1048 1049 CALLB_CPR_INIT(&cprinfo, &nvpflush_lock, callb_generic_cpr, "nvp"); 1050 mutex_enter(&nvpflush_lock); 1051 for (;;) { 1052 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1053 while (do_nvpflush == 0) { 1054 clk = cv_timedwait(&nvpflush_cv, &nvpflush_lock, 1055 ddi_get_lbolt() + 1056 (nvpdaemon_idle_time * TICKS_PER_SECOND)); 1057 if ((clk == -1 && do_nvpflush == 0 && 1058 nvpflush_timer_busy == 0) || sys_shutdown) { 1059 /* 1060 * Note that CALLB_CPR_EXIT calls mutex_exit() 1061 * on the lock passed in to CALLB_CPR_INIT, 1062 * so the lock must be held when invoking it. 1063 */ 1064 CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock); 1065 NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: exit\n")); 1066 ASSERT(mutex_owned(&nvpflush_lock)); 1067 nvpflush_thr_id = NULL; 1068 nvpflush_daemon_active = 0; 1069 CALLB_CPR_EXIT(&cprinfo); 1070 thread_exit(); 1071 } 1072 } 1073 CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock); 1074 1075 nvpbusy = 1; 1076 want_wakeup = 0; 1077 do_nvpflush = 0; 1078 mutex_exit(&nvpflush_lock); 1079 1080 /* 1081 * Try flushing what's dirty, reschedule if there's 1082 * a failure or data gets marked as dirty again. 1083 * First move each file marked dirty to the dirty 1084 * list to avoid locking the list across the write. 1085 */ 1086 mutex_enter(&nvf_cache_mutex); 1087 for (nvfdp = list_head(&nvf_cache_files); 1088 nvfdp; nvfdp = nextfdp) { 1089 nextfdp = list_next(&nvf_cache_files, nvfdp); 1090 rw_enter(&nvfdp->nvf_lock, RW_READER); 1091 if (NVF_IS_DIRTY(nvfdp)) { 1092 list_remove(&nvf_cache_files, nvfdp); 1093 list_insert_tail(&nvf_dirty_files, nvfdp); 1094 rw_exit(&nvfdp->nvf_lock); 1095 } else { 1096 NVPDAEMON_DEBUG((CE_CONT, 1097 "nvpdaemon: not dirty %s\n", 1098 nvfdp->nvf_cache_path)); 1099 rw_exit(&nvfdp->nvf_lock); 1100 } 1101 } 1102 mutex_exit(&nvf_cache_mutex); 1103 1104 /* 1105 * Now go through the dirty list 1106 */ 1107 for (nvfdp = list_head(&nvf_dirty_files); 1108 nvfdp; nvfdp = nextfdp) { 1109 nextfdp = list_next(&nvf_dirty_files, nvfdp); 1110 1111 is_now_clean = 0; 1112 rw_enter(&nvfdp->nvf_lock, RW_READER); 1113 if (NVF_IS_DIRTY(nvfdp)) { 1114 NVPDAEMON_DEBUG((CE_CONT, 1115 "nvpdaemon: flush %s\n", 1116 nvfdp->nvf_cache_path)); 1117 rw_exit(&nvfdp->nvf_lock); 1118 rval = nvpflush_one(nvfdp); 1119 rw_enter(&nvfdp->nvf_lock, RW_READER); 1120 if (rval != DDI_SUCCESS || 1121 NVF_IS_DIRTY(nvfdp)) { 1122 rw_exit(&nvfdp->nvf_lock); 1123 NVPDAEMON_DEBUG((CE_CONT, 1124 "nvpdaemon: %s dirty again\n", 1125 nvfdp->nvf_cache_path)); 1126 want_wakeup = 1; 1127 } else { 1128 rw_exit(&nvfdp->nvf_lock); 1129 nvf_write_is_complete(nvfdp); 1130 is_now_clean = 1; 1131 } 1132 } else { 1133 NVPDAEMON_DEBUG((CE_CONT, 1134 "nvpdaemon: not dirty %s\n", 1135 nvfdp->nvf_cache_path)); 1136 rw_exit(&nvfdp->nvf_lock); 1137 is_now_clean = 1; 1138 } 1139 1140 if (is_now_clean) { 1141 mutex_enter(&nvf_cache_mutex); 1142 list_remove(&nvf_dirty_files, nvfdp); 1143 list_insert_tail(&nvf_cache_files, 1144 nvfdp); 1145 mutex_exit(&nvf_cache_mutex); 1146 } 1147 } 1148 1149 if (want_wakeup) 1150 nvf_wake_daemon(); 1151 1152 mutex_enter(&nvpflush_lock); 1153 nvpbusy = 0; 1154 } 1155 } 1156