1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/note.h> 29 #include <sys/t_lock.h> 30 #include <sys/cmn_err.h> 31 #include <sys/instance.h> 32 #include <sys/conf.h> 33 #include <sys/stat.h> 34 #include <sys/ddi.h> 35 #include <sys/hwconf.h> 36 #include <sys/sunddi.h> 37 #include <sys/sunndi.h> 38 #include <sys/ddi_impldefs.h> 39 #include <sys/ndi_impldefs.h> 40 #include <sys/modctl.h> 41 #include <sys/dacf.h> 42 #include <sys/promif.h> 43 #include <sys/cpuvar.h> 44 #include <sys/pathname.h> 45 #include <sys/kobj.h> 46 #include <sys/devcache.h> 47 #include <sys/devcache_impl.h> 48 #include <sys/sysmacros.h> 49 #include <sys/varargs.h> 50 #include <sys/callb.h> 51 52 /* 53 * This facility provides interfaces to clients to register, 54 * read and update cache data in persisted backing store files, 55 * usually in /etc/devices. The data persisted through this 56 * mechanism should be stateless data, functioning in the sense 57 * of a cache. 
Writes are performed by a background daemon 58 * thread, permitting a client to schedule an update without 59 * blocking, then continue updating the data state in 60 * parallel. The data is only locked by the daemon thread 61 * to pack the data in preparation for the write. 62 * 63 * Data persisted through this mechanism should be capable 64 * of being regenerated through normal system operation, 65 * for example attaching all disk devices would cause all 66 * devids to be registered for those devices. By caching 67 * a devid-device tuple, the system can operate in a 68 * more optimal way, directly attaching the device mapped 69 * to a devid, rather than burdensomely driving attach of 70 * the entire device tree to discover a single device. 71 * 72 * Note that a client should only need to include 73 * <sys/devcache.h> for the supported interfaces. 74 * 75 * The data per client is entirely within the control of 76 * the client. When reading, data unpacked from the backing 77 * store should be inserted in the list. The pointer to 78 * the list can be retrieved via nvf_list(). When writing, 79 * the data on the list is to be packed and returned to the 80 * nvpdaemon as an nvlist. 81 * 82 * Obvious restrictions are imposed by the limits of the 83 * nvlist format. The data cannot be read or written 84 * piecemeal, and large amounts of data aren't recommended. 85 * However, nvlists do allow that data be named and typed 86 * and can be size-of-int invariant, and the cached data 87 * can be versioned conveniently. 88 * 89 * The registration involves two steps: a handle is 90 * allocated by calling the registration function. 91 * This sets up the data referenced by the handle and 92 * initializes the lock. Following registration, the 93 * client must initialize the data list. The list 94 * interfaces require that the list element with offset 95 * to the node link be provided. The format of the 96 * list element is under the control of the client. 
97 * 98 * Locking: the address of the data list r/w lock provided 99 * can be accessed with nvf_lock(). The lock must be held 100 * as reader when traversing the list or checking state, 101 * such as nvf_is_dirty(). The lock must be held as 102 * writer when updating the list or marking it dirty. 103 * The lock must not be held when waking the daemon. 104 * 105 * The data r/w lock is held as writer when the pack, 106 * unpack and free list handlers are called. The 107 * lock should not be dropped and must be still held 108 * upon return. The client should also hold the lock 109 * as reader when checking if the list is dirty, and 110 * as writer when marking the list dirty or initiating 111 * a read. 112 * 113 * The asynchronous nature of updates allows for the 114 * possibility that the data may continue to be updated 115 * once the daemon has been notified that an update is 116 * desired. The data only needs to be locked against 117 * updates when packing the data into the form to be 118 * written. When the write of the packed data has 119 * completed, the daemon will automatically reschedule 120 * an update if the data was marked dirty after the 121 * point at which it was packed. Before beginning an 122 * update, the daemon attempts to lock the data as 123 * writer; if the writer lock is already held, it 124 * backs off and retries later. The model is to give 125 * priority to the kernel processes generating the 126 * data, and that the nature of the data is that 127 * it does not change often, can be re-generated when 128 * needed, so updates should not happen often and 129 * can be delayed until the data stops changing. 130 * The client may update the list or mark it dirty 131 * any time it is able to acquire the lock as 132 * writer first. 133 * 134 * A failed write will be retried after some delay, 135 * in the hope that the cause of the error will be 136 * transient, for example a filesystem with no space 137 * available. 
An update on a read-only filesystem 138 * is failed silently and not retried; this would be 139 * the case when booted off install media. 140 * 141 * There is no unregister mechanism as of yet, as it 142 * hasn't been needed so far. 143 */ 144 145 /* 146 * Global list of files registered and updated by the nvpflush 147 * daemon, protected by the nvf_cache_mutex. While an 148 * update is taking place, a file is temporarily moved to 149 * the dirty list to avoid locking the primary list for 150 * the duration of the update. 151 */ 152 list_t nvf_cache_files; 153 list_t nvf_dirty_files; 154 kmutex_t nvf_cache_mutex; 155 156 157 /* 158 * Allow some delay from an update of the data before flushing 159 * to permit simultaneous updates of multiple changes. 160 * Changes in the data are expected to be bursty, ie 161 * reconfig or hot-plug of a new adapter. 162 * 163 * kfio_report_error (default 0) 164 * Set to 1 to enable some error messages related to low-level 165 * kernel file i/o operations. 166 * 167 * nvpflush_delay (default 10) 168 * The number of seconds after data is marked dirty before the 169 * flush daemon is triggered to flush the data. A longer period 170 * of time permits more data updates per write. Note that 171 * every update resets the timer so no repository write will 172 * occur while data is being updated continuously. 173 * 174 * nvpdaemon_idle_time (default 60) 175 * The number of seconds the daemon will sleep idle before exiting. 
 *
 */
#define	NVPFLUSH_DELAY		10
#define	NVPDAEMON_IDLE_TIME	60

/* Conversion factor: seconds -> lbolt ticks */
#define	TICKS_PER_SECOND	(drv_usectohz(1000000))

/*
 * Tunables
 */
int kfio_report_error = 0;		/* kernel file i/o operations */
int kfio_disable_read = 0;		/* disable all reads */
int kfio_disable_write = 0;		/* disable all writes */

int nvpflush_delay = NVPFLUSH_DELAY;
int nvpdaemon_idle_time = NVPDAEMON_IDLE_TIME;

/* Daemon/timer state, all protected by nvpflush_lock */
static timeout_id_t nvpflush_id = 0;
static int nvpflush_timer_busy = 0;
static int nvpflush_daemon_active = 0;
static kthread_t *nvpflush_thr_id = 0;

static int do_nvpflush = 0;		/* set by timeout to trigger a flush */
static int nvpbusy = 0;			/* daemon currently flushing */
static kmutex_t nvpflush_lock;
static kcondvar_t nvpflush_cv;
static kthread_id_t nvpflush_thread;
static clock_t nvpticks;		/* lbolt time at which to flush */

static void nvpflush_daemon(void);

#ifdef	DEBUG
int nvpdaemon_debug = 0;
int kfio_debug = 0;
#endif	/* DEBUG */

extern int modrootloaded;
extern void mdi_read_devices_files(void);
extern void mdi_clean_vhcache(void);

/*
 * Initialize the overall cache file management:
 * create the global cache/dirty file lists and their mutex,
 * then initialize the retire store and the devid cache.
 */
void
i_ddi_devices_init(void)
{
	list_create(&nvf_cache_files, sizeof (nvfd_t),
	    offsetof(nvfd_t, nvf_link));
	list_create(&nvf_dirty_files, sizeof (nvfd_t),
	    offsetof(nvfd_t, nvf_link));
	mutex_init(&nvf_cache_mutex, NULL, MUTEX_DEFAULT, NULL);
	retire_store_init();
	devid_cache_init();
}

/*
 * Read cache files
 * The files read here should be restricted to those
 * that may be required to mount root.
 */
void
i_ddi_read_devices_files(void)
{
	/*
	 * The retire store should be the first file read as it
	 * may need to offline devices. kfio_disable_read is not
	 * used for retire. For the rationale see the tunable
	 * ddi_retire_store_bypass and comments in:
	 *	uts/common/os/retire_store.c
	 */
	retire_store_read();

	if (!kfio_disable_read) {
		mdi_read_devices_files();
		devid_cache_read();
	}
}

/*
 * Set up the flush daemon's lock and condvar, then wake the
 * daemon if any registered file is already marked dirty.
 */
void
i_ddi_start_flush_daemon(void)
{
	nvfd_t	*nvfdp;

	ASSERT(i_ddi_io_initialized());

	mutex_init(&nvpflush_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&nvpflush_cv, NULL, CV_DRIVER, NULL);

	mutex_enter(&nvf_cache_mutex);
	for (nvfdp = list_head(&nvf_cache_files); nvfdp;
	    nvfdp = list_next(&nvf_cache_files, nvfdp)) {
		if (NVF_IS_DIRTY(nvfdp)) {
			nvf_wake_daemon();
			break;
		}
	}
	mutex_exit(&nvf_cache_mutex);
}

void
i_ddi_clean_devices_files(void)
{
	devid_cache_cleanup();
	mdi_clean_vhcache();
}

/*
 * Register a cache file to be managed and updated by the nvpflush daemon.
 * All operations are performed through the returned handle.
 * There is no unregister mechanism for now.
 */
nvf_handle_t
nvf_register_file(nvf_ops_t *ops)
{
	nvfd_t *nvfdp;

	nvfdp = kmem_zalloc(sizeof (*nvfdp), KM_SLEEP);

	nvfdp->nvf_ops = ops;
	nvfdp->nvf_flags = 0;
	rw_init(&nvfdp->nvf_lock, NULL, RW_DRIVER, NULL);

	mutex_enter(&nvf_cache_mutex);
	list_insert_tail(&nvf_cache_files, nvfdp);
	mutex_exit(&nvf_cache_mutex);

	return ((nvf_handle_t)nvfdp);
}

/*
 * Report a file i/o related error, but only if the
 * kfio_report_error tunable is enabled.
 */
/*PRINTFLIKE1*/
void
nvf_error(const char *fmt, ...)
{
	va_list ap;

	if (kfio_report_error) {
		va_start(ap, fmt);
		vcmn_err(CE_NOTE, fmt, ap);
		va_end(ap);
	}
}

/*
 * Some operations clients may use to manage the data
 * to be persisted in a cache file.
 */
char *
nvf_cache_name(nvf_handle_t handle)
{
	return (((nvfd_t *)handle)->nvf_cache_path);
}

krwlock_t *
nvf_lock(nvf_handle_t handle)
{
	return (&(((nvfd_t *)handle)->nvf_lock));
}

list_t *
nvf_list(nvf_handle_t handle)
{
	return (&(((nvfd_t *)handle)->nvf_data_list));
}

void
nvf_mark_dirty(nvf_handle_t handle)
{
	ASSERT(RW_WRITE_HELD(&(((nvfd_t *)handle)->nvf_lock)));
	NVF_MARK_DIRTY((nvfd_t *)handle);
}

int
nvf_is_dirty(nvf_handle_t handle)
{
	ASSERT(RW_LOCK_HELD(&(((nvfd_t *)handle)->nvf_lock)));
	return (NVF_IS_DIRTY((nvfd_t *)handle));
}

/*
 * Simple 16-bit XOR checksum over the buffer; an odd trailing
 * byte is folded in first so the remainder is word-aligned.
 */
static uint16_t
nvp_cksum(uchar_t *buf, int64_t buflen)
{
	uint16_t cksum = 0;
	uint16_t *p = (uint16_t *)buf;
	int64_t n;

	if ((buflen & 0x01) != 0) {
		buflen--;
		cksum = buf[buflen];
	}
	n = buflen / 2;
	while (n-- > 0)
		cksum ^= *p++;
	return (cksum);
}

/*
 * Read a file in the nvpf on-disk format: validate the header
 * (magic, version, header checksum), read exactly nvpf_size bytes
 * of packed data, verify the data checksum and unpack the nvlist.
 * On success *ret_nvlist holds the unpacked list, which the
 * caller owns.  Returns ENOENT, EIO or EINVAL on failure.
 */
int
fread_nvlist(char *filename, nvlist_t **ret_nvlist)
{
	struct _buf	*file;
	nvpf_hdr_t	hdr;
	char		*buf;
	nvlist_t	*nvl;
	int		rval;
	uint_t		offset;
	int		n;
	char		c;
	uint16_t	cksum, hdrsum;

	*ret_nvlist = NULL;

	file = kobj_open_file(filename);
	if (file == (struct _buf *)-1) {
		KFDEBUG((CE_CONT, "cannot open file: %s\n", filename));
		return (ENOENT);
	}

	offset = 0;
	n = kobj_read_file(file, (char *)&hdr, sizeof (hdr), offset);
	if (n != sizeof (hdr)) {
		kobj_close_file(file);
		if (n < 0) {
			nvf_error("error reading header: %s\n", filename);
			return (EIO);
		} else if (n == 0) {
			KFDEBUG((CE_CONT, "file empty: %s\n", filename));
		} else {
			nvf_error("header size incorrect: %s\n", filename);
		}
		return (EINVAL);
	}
	offset += n;

	KFDEBUG2((CE_CONT, "nvpf_magic: 0x%x\n", hdr.nvpf_magic));
	KFDEBUG2((CE_CONT, "nvpf_version: %d\n", hdr.nvpf_version));
	KFDEBUG2((CE_CONT, "nvpf_size: %lld\n",
	    (longlong_t)hdr.nvpf_size));
	KFDEBUG2((CE_CONT, "nvpf_hdr_chksum: 0x%x\n",
	    hdr.nvpf_hdr_chksum));
	KFDEBUG2((CE_CONT, "nvpf_chksum: 0x%x\n", hdr.nvpf_chksum));

	/*
	 * The header checksum is computed with the checksum
	 * field itself zeroed, so clear it before re-computing.
	 */
	cksum = hdr.nvpf_hdr_chksum;
	hdr.nvpf_hdr_chksum = 0;
	hdrsum = nvp_cksum((uchar_t *)&hdr, sizeof (hdr));

	if (hdr.nvpf_magic != NVPF_HDR_MAGIC ||
	    hdr.nvpf_version != NVPF_HDR_VERSION || hdrsum != cksum) {
		kobj_close_file(file);
		if (hdrsum != cksum) {
			nvf_error("%s: checksum error "
			    "(actual 0x%x, expected 0x%x)\n",
			    filename, hdrsum, cksum);
		}
		nvf_error("%s: header information incorrect", filename);
		return (EINVAL);
	}

	ASSERT(hdr.nvpf_size >= 0);

	buf = kmem_alloc(hdr.nvpf_size, KM_SLEEP);
	n = kobj_read_file(file, buf, hdr.nvpf_size, offset);
	if (n != hdr.nvpf_size) {
		kmem_free(buf, hdr.nvpf_size);
		kobj_close_file(file);
		if (n < 0) {
			nvf_error("%s: read error %d", filename, n);
		} else {
			nvf_error("%s: incomplete read %d/%lld",
			    filename, n, (longlong_t)hdr.nvpf_size);
		}
		return (EINVAL);
	}
	offset += n;

	/*
	 * Probe one byte past the expected end of file; any data
	 * there means the file is larger than the header claims.
	 */
	rval = kobj_read_file(file, &c, 1, offset);
	kobj_close_file(file);
	if (rval > 0) {
		nvf_error("%s is larger than %lld\n",
		    filename, (longlong_t)hdr.nvpf_size);
		kmem_free(buf, hdr.nvpf_size);
		return (EINVAL);
	}

	cksum = nvp_cksum((uchar_t *)buf, hdr.nvpf_size);
	if (hdr.nvpf_chksum != cksum) {
		nvf_error("%s: checksum error (actual 0x%x, expected 0x%x)\n",
		    filename, hdr.nvpf_chksum, cksum);
		kmem_free(buf, hdr.nvpf_size);
		return (EINVAL);
	}

	nvl = NULL;
	rval = nvlist_unpack(buf, hdr.nvpf_size, &nvl, 0);
	if (rval != 0) {
		nvf_error("%s: error %d unpacking nvlist\n",
		    filename, rval);
		kmem_free(buf, hdr.nvpf_size);
		return (EINVAL);
	}

	kmem_free(buf, hdr.nvpf_size);
	*ret_nvlist = nvl;
	return (0);
}

static int
kfcreate(char *filename, kfile_t **kfilep)
{
	/*
	 * Create (or truncate) a file for writing and return a
	 * kfile_t wrapper tracking the vnode, position and
	 * sticky error state.  Caller frees via kfclose().
	 */
	kfile_t	*fp;
	int	rval;

	ASSERT(modrootloaded);

	fp = kmem_alloc(sizeof (kfile_t), KM_SLEEP);

	fp->kf_vnflags = FCREAT | FWRITE | FTRUNC;
	fp->kf_fname = filename;
	fp->kf_fpos = 0;
	fp->kf_state = 0;

	KFDEBUG((CE_CONT, "create: %s flags 0x%x\n",
	    filename, fp->kf_vnflags));
	rval = vn_open(filename, UIO_SYSSPACE, fp->kf_vnflags,
	    0444, &fp->kf_vp, CRCREAT, 0);
	if (rval != 0) {
		kmem_free(fp, sizeof (kfile_t));
		KFDEBUG((CE_CONT, "%s: create error %d\n",
		    filename, rval));
		return (rval);
	}

	*kfilep = fp;
	return (0);
}

/*
 * Remove a file by name; returns the vn_remove() result.
 */
static int
kfremove(char *filename)
{
	int rval;

	KFDEBUG((CE_CONT, "remove: %s\n", filename));
	rval = vn_remove(filename, UIO_SYSSPACE, RMFILE);
	if (rval != 0) {
		KFDEBUG((CE_CONT, "%s: remove error %d\n",
		    filename, rval));
	}
	return (rval);
}

/*
 * Read up to bufsiz bytes at the current file position.
 * The number of bytes actually read is returned in *ret_n.
 * A previous error on the file (kf_state) short-circuits.
 */
static int
kfread(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
{
	ssize_t		resid;
	int		err;
	ssize_t		n;

	ASSERT(modrootloaded);

	if (fp->kf_state != 0)
		return (fp->kf_state);

	err = vn_rdwr(UIO_READ, fp->kf_vp, buf, bufsiz, fp->kf_fpos,
	    UIO_SYSSPACE, 0, (rlim64_t)0, kcred, &resid);
	if (err != 0) {
		KFDEBUG((CE_CONT, "%s: read error %d\n",
		    fp->kf_fname, err));
		fp->kf_state = err;
		return (err);
	}

	ASSERT(resid >= 0 && resid <= bufsiz);
	n = bufsiz - resid;

	KFDEBUG1((CE_CONT, "%s: read %ld bytes ok %ld bufsiz, %ld resid\n",
	    fp->kf_fname, n, bufsiz, resid));

	fp->kf_fpos += n;
	*ret_n = n;
	return (0);
}

/*
 * Write bufsiz bytes at the current file position, looping on
 * partial writes until all data is out.  A write that makes no
 * progress at all is treated as ENOSPC.  The error, if any, is
 * latched in kf_state so subsequent operations fail fast.
 */
static int
kfwrite(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
{
	rlim64_t	rlimit;
	ssize_t		resid;
	int		err;
	ssize_t		len;
	ssize_t		n = 0;

	ASSERT(modrootloaded);

	if (fp->kf_state != 0)
		return (fp->kf_state);

	len = bufsiz;
	rlimit = bufsiz + 1;
	for (;;) {
		err = vn_rdwr(UIO_WRITE, fp->kf_vp, buf, len, fp->kf_fpos,
		    UIO_SYSSPACE, FSYNC, rlimit, kcred, &resid);
		if (err) {
			KFDEBUG((CE_CONT, "%s: write error %d\n",
			    fp->kf_fname, err));
			fp->kf_state = err;
			return (err);
		}

		KFDEBUG1((CE_CONT, "%s: write %ld bytes ok %ld resid\n",
		    fp->kf_fname, len-resid, resid));

		ASSERT(resid >= 0 && resid <= len);

		n += (len - resid);
		if (resid == 0)
			break;

		/* no progress at all: most likely out of space */
		if (resid == len) {
			KFDEBUG((CE_CONT, "%s: filesystem full?\n",
			    fp->kf_fname));
			fp->kf_state = ENOSPC;
			return (ENOSPC);
		}

		/* advance past the portion that was written and retry */
		len -= resid;
		buf += len;
		fp->kf_fpos += len;
		len = resid;
	}

	ASSERT(n == bufsiz);
	KFDEBUG1((CE_CONT, "%s: wrote %ld bytes ok\n", fp->kf_fname, n));

	*ret_n = n;
	return (0);
}


/*
 * Sync (for files opened for write with no prior error), close
 * and release the vnode, then free the kfile_t wrapper.
 */
static int
kfclose(kfile_t *fp)
{
	int	rval;

	KFDEBUG((CE_CONT, "close: %s\n", fp->kf_fname));

	if ((fp->kf_vnflags & FWRITE) && fp->kf_state == 0) {
		rval = VOP_FSYNC(fp->kf_vp, FSYNC, kcred, NULL);
		if (rval != 0) {
			nvf_error("%s: sync error %d\n",
			    fp->kf_fname, rval);
		}
		KFDEBUG((CE_CONT, "%s: sync ok\n", fp->kf_fname));
	}

	rval = VOP_CLOSE(fp->kf_vp, fp->kf_vnflags, 1, (offset_t)0, kcred,
	    NULL);
	if (rval != 0) {
		/* only report if no earlier error already latched */
		if (fp->kf_state == 0) {
			nvf_error("%s: close error %d\n",
			    fp->kf_fname, rval);
		}
	} else {
		if (fp->kf_state == 0)
			KFDEBUG((CE_CONT, "%s: close ok\n", fp->kf_fname));
	}

	VN_RELE(fp->kf_vp);
	kmem_free(fp, sizeof (kfile_t));
	return (rval);
}

/*
 * Rename a file; returns the vn_rename() result.
 */
static int
kfrename(char *oldname, char *newname)
{
	int rval;

	ASSERT(modrootloaded);

	KFDEBUG((CE_CONT, "renaming %s to %s\n", oldname, newname));

	if ((rval = vn_rename(oldname, newname, UIO_SYSSPACE)) != 0) {
		KFDEBUG((CE_CONT, "rename %s to %s: %d\n",
		    oldname, newname, rval));
	}

	return
	    (rval);
}

/*
 * Write an nvlist to the named file: pack the nvlist, prepend an
 * nvpf header (magic, version, size, data and header checksums),
 * write the result to a temporary "<filename>.new" file and, only
 * when the write fully succeeds, rename it over the target so a
 * crash or write failure cannot corrupt the existing file.
 */
int
fwrite_nvlist(char *filename, nvlist_t *nvl)
{
	char	*buf;
	char	*nvbuf;
	kfile_t	*fp;
	char	*newname;
	int	len, err, err1;
	size_t	buflen;
	ssize_t	n;

	ASSERT(modrootloaded);

	nvbuf = NULL;
	err = nvlist_pack(nvl, &nvbuf, &buflen, NV_ENCODE_NATIVE, 0);
	if (err != 0) {
		nvf_error("%s: error %d packing nvlist\n",
		    filename, err);
		return (err);
	}

	buf = kmem_alloc(sizeof (nvpf_hdr_t) + buflen, KM_SLEEP);
	bzero(buf, sizeof (nvpf_hdr_t));

	/*
	 * Fill in the header; the header checksum is computed
	 * last, over the header with its checksum field zeroed.
	 */
	((nvpf_hdr_t *)buf)->nvpf_magic = NVPF_HDR_MAGIC;
	((nvpf_hdr_t *)buf)->nvpf_version = NVPF_HDR_VERSION;
	((nvpf_hdr_t *)buf)->nvpf_size = buflen;
	((nvpf_hdr_t *)buf)->nvpf_chksum = nvp_cksum((uchar_t *)nvbuf, buflen);
	((nvpf_hdr_t *)buf)->nvpf_hdr_chksum =
	    nvp_cksum((uchar_t *)buf, sizeof (nvpf_hdr_t));

	bcopy(nvbuf, buf + sizeof (nvpf_hdr_t), buflen);
	kmem_free(nvbuf, buflen);
	buflen += sizeof (nvpf_hdr_t);

	len = strlen(filename) + MAX_SUFFIX_LEN + 2;
	newname = kmem_alloc(len, KM_SLEEP);

	(void) sprintf(newname, "%s.%s",
	    filename, NEW_FILENAME_SUFFIX);

	/*
	 * To make it unlikely we suffer data loss, write
	 * data to the new temporary file.  Once successful
	 * complete the transaction by renaming the new file
	 * to replace the previous.
	 */
	if ((err = kfcreate(newname, &fp)) == 0) {
		err = kfwrite(fp, buf, buflen, &n);
		if (err) {
			nvf_error("%s: write error - %d\n",
			    newname, err);
		} else {
			if (n != buflen) {
				nvf_error(
				    "%s: partial write %ld of %ld bytes\n",
				    newname, n, buflen);
				nvf_error("%s: filesystem may be full?\n",
				    newname);
				err = EIO;
			}
		}
		if ((err1 = kfclose(fp)) != 0) {
			nvf_error("%s: close error\n", newname);
			if (err == 0)
				err = err1;
		}
		/* on any failure, remove the partial temp file */
		if (err != 0) {
			if (kfremove(newname) != 0) {
				nvf_error("%s: remove failed\n",
				    newname);
			}
		}
	} else {
		nvf_error("%s: create failed - %d\n", filename, err);
	}

	if (err == 0) {
		if ((err = kfrename(newname, filename)) != 0) {
			nvf_error("%s: rename from %s failed\n",
			    newname, filename);
		}
	}

	kmem_free(newname, len);
	kmem_free(buf, buflen);

	return (err);
}

/*
 * Write wrapper used by the flush daemon; maps errors to
 * DDI_FAILURE and latches EROFS as a read-only filesystem so
 * the update is not retried (e.g. when booted off install media).
 */
static int
e_fwrite_nvlist(nvfd_t *nvfd, nvlist_t *nvl)
{
	int err;

	if ((err = fwrite_nvlist(nvfd->nvf_cache_path, nvl)) == 0)
		return (DDI_SUCCESS);
	else {
		if (err == EROFS)
			NVF_MARK_READONLY(nvfd);
		return (DDI_FAILURE);
	}
}

/*
 * Invoke the client's list-free handler; the data lock must be
 * held as writer across the call and still held on return.
 */
static void
nvp_list_free(nvfd_t *nvf)
{
	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
	(nvf->nvf_list_free)((nvf_handle_t)nvf);
	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
}

/*
 * Read a file in the nvlist format
 * EIO - i/o error during read
 * ENOENT - file not found
 * EINVAL - file contents corrupted
 */
static int
fread_nvp_list(nvfd_t *nvfd)
{
	nvlist_t	*nvl;
	nvpair_t	*nvp;
	char		*name;
	nvlist_t	*sublist;
	int		rval;
	int		rv;

	ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));

	rval = fread_nvlist(nvfd->nvf_cache_path, &nvl);
	if (rval != 0)
		return (rval);
	ASSERT(nvl != NULL);

	/* top-level list must consist solely of named sub-nvlists */
	nvp = NULL;
	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		name = nvpair_name(nvp);
		ASSERT(strlen(name) > 0);

		switch (nvpair_type(nvp)) {
		case DATA_TYPE_NVLIST:
			rval = nvpair_value_nvlist(nvp, &sublist);
			if (rval != 0) {
				nvf_error(
				    "nvpair_value_nvlist error %s %d\n",
				    name, rval);
				goto error;
			}

			/*
			 * unpack nvlist for this device and
			 * add elements to data list.
			 */
			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
			rv = (nvfd->nvf_unpack_nvlist)
			    ((nvf_handle_t)nvfd, sublist, name);
			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
			if (rv != 0) {
				nvf_error(
				    "%s: %s invalid list element\n",
				    nvfd->nvf_cache_path, name);
				rval = EINVAL;
				goto error;
			}
			break;

		default:
			nvf_error("%s: %s unsupported data type %d\n",
			    nvfd->nvf_cache_path, name, nvpair_type(nvp));
			rval = EINVAL;
			goto error;
		}
	}

	nvlist_free(nvl);

	return (0);

error:
	/* discard any partially-unpacked client data */
	nvlist_free(nvl);
	nvp_list_free(nvfd);
	return (rval);
}


/*
 * Read and unpack a registered cache file into the client's data
 * list.  The handle's lock must be held as writer.  On failure,
 * flags are set so the daemon reports the file's re-creation or
 * rebuild on the next successful write.
 */
int
nvf_read_file(nvf_handle_t nvf_handle)
{
	nvfd_t *nvfd = (nvfd_t *)nvf_handle;
	int rval;

	ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));

	if (kfio_disable_read)
		return (0);

	KFDEBUG((CE_CONT, "reading %s\n", nvfd->nvf_cache_path));

	rval = fread_nvp_list(nvfd);
	if (rval) {
		switch (rval) {
		case EIO:
			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
			cmn_err(CE_WARN, "%s: I/O error",
			    nvfd->nvf_cache_path);
			break;
		case ENOENT:
			nvfd->nvf_flags |= NVF_F_CREATE_MSG;
			nvf_error("%s: not found\n",
			    nvfd->nvf_cache_path);
			break;
		case EINVAL:
		default:
			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
			cmn_err(CE_WARN, "%s: data file corrupted",
			    nvfd->nvf_cache_path);
			break;
		}
	}
	return (rval);
}

/*
 * Notify the client, if it registered a completion handler,
 * that its file has been flushed to the backing store.
 */
static void
nvf_write_is_complete(nvfd_t *fd)
{
	if (fd->nvf_write_complete) {
		(fd->nvf_write_complete)((nvf_handle_t)fd);
	}
}

/*ARGSUSED*/
static void
nvpflush_timeout(void *arg) 899 { 900 clock_t nticks; 901 902 mutex_enter(&nvpflush_lock); 903 nticks = nvpticks - ddi_get_lbolt(); 904 if (nticks > 4) { 905 nvpflush_timer_busy = 1; 906 mutex_exit(&nvpflush_lock); 907 nvpflush_id = timeout(nvpflush_timeout, NULL, nticks); 908 } else { 909 do_nvpflush = 1; 910 NVPDAEMON_DEBUG((CE_CONT, "signal nvpdaemon\n")); 911 cv_signal(&nvpflush_cv); 912 nvpflush_id = 0; 913 nvpflush_timer_busy = 0; 914 mutex_exit(&nvpflush_lock); 915 } 916 } 917 918 /* 919 * After marking a list as dirty, wake the nvpflush daemon 920 * to perform the update. 921 */ 922 void 923 nvf_wake_daemon(void) 924 { 925 clock_t nticks; 926 927 /* 928 * If the system isn't up yet 929 * don't even think about starting a flush. 930 */ 931 if (!i_ddi_io_initialized()) 932 return; 933 934 mutex_enter(&nvpflush_lock); 935 936 if (nvpflush_daemon_active == 0) { 937 nvpflush_daemon_active = 1; 938 mutex_exit(&nvpflush_lock); 939 NVPDAEMON_DEBUG((CE_CONT, "starting nvpdaemon thread\n")); 940 nvpflush_thr_id = thread_create(NULL, 0, 941 (void (*)())nvpflush_daemon, 942 NULL, 0, &p0, TS_RUN, minclsyspri); 943 mutex_enter(&nvpflush_lock); 944 } 945 946 nticks = nvpflush_delay * TICKS_PER_SECOND; 947 nvpticks = ddi_get_lbolt() + nticks; 948 if (nvpflush_timer_busy == 0) { 949 nvpflush_timer_busy = 1; 950 mutex_exit(&nvpflush_lock); 951 nvpflush_id = timeout(nvpflush_timeout, NULL, nticks + 4); 952 } else 953 mutex_exit(&nvpflush_lock); 954 } 955 956 static int 957 nvpflush_one(nvfd_t *nvfd) 958 { 959 int rval = DDI_SUCCESS; 960 nvlist_t *nvl; 961 962 rw_enter(&nvfd->nvf_lock, RW_READER); 963 964 ASSERT((nvfd->nvf_flags & NVF_F_FLUSHING) == 0); 965 966 if (!NVF_IS_DIRTY(nvfd) || 967 NVF_IS_READONLY(nvfd) || kfio_disable_write) { 968 NVF_CLEAR_DIRTY(nvfd); 969 rw_exit(&nvfd->nvf_lock); 970 return (DDI_SUCCESS); 971 } 972 973 if (rw_tryupgrade(&nvfd->nvf_lock) == 0) { 974 nvf_error("nvpflush: " 975 "%s rw upgrade failed\n", nvfd->nvf_cache_path); 976 
rw_exit(&nvfd->nvf_lock); 977 return (DDI_FAILURE); 978 } 979 if (((nvfd->nvf_pack_list) 980 ((nvf_handle_t)nvfd, &nvl)) != DDI_SUCCESS) { 981 nvf_error("nvpflush: " 982 "%s nvlist construction failed\n", nvfd->nvf_cache_path); 983 ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock)); 984 rw_exit(&nvfd->nvf_lock); 985 return (DDI_FAILURE); 986 } 987 ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock)); 988 989 NVF_CLEAR_DIRTY(nvfd); 990 nvfd->nvf_flags |= NVF_F_FLUSHING; 991 rw_exit(&nvfd->nvf_lock); 992 993 rval = e_fwrite_nvlist(nvfd, nvl); 994 nvlist_free(nvl); 995 996 rw_enter(&nvfd->nvf_lock, RW_WRITER); 997 nvfd->nvf_flags &= ~NVF_F_FLUSHING; 998 if (rval == DDI_FAILURE) { 999 if (NVF_IS_READONLY(nvfd)) { 1000 rval = DDI_SUCCESS; 1001 nvfd->nvf_flags &= ~(NVF_F_ERROR | NVF_F_DIRTY); 1002 } else if ((nvfd->nvf_flags & NVF_F_ERROR) == 0) { 1003 cmn_err(CE_CONT, 1004 "%s: updated failed\n", nvfd->nvf_cache_path); 1005 nvfd->nvf_flags |= NVF_F_ERROR | NVF_F_DIRTY; 1006 } 1007 } else { 1008 if (nvfd->nvf_flags & NVF_F_CREATE_MSG) { 1009 cmn_err(CE_CONT, 1010 "!Creating %s\n", nvfd->nvf_cache_path); 1011 nvfd->nvf_flags &= ~NVF_F_CREATE_MSG; 1012 } 1013 if (nvfd->nvf_flags & NVF_F_REBUILD_MSG) { 1014 cmn_err(CE_CONT, 1015 "!Rebuilding %s\n", nvfd->nvf_cache_path); 1016 nvfd->nvf_flags &= ~NVF_F_REBUILD_MSG; 1017 } 1018 if (nvfd->nvf_flags & NVF_F_ERROR) { 1019 cmn_err(CE_CONT, 1020 "%s: update now ok\n", nvfd->nvf_cache_path); 1021 nvfd->nvf_flags &= ~NVF_F_ERROR; 1022 } 1023 /* 1024 * The file may need to be flushed again if the cached 1025 * data was touched while writing the earlier contents. 
1026 */ 1027 if (NVF_IS_DIRTY(nvfd)) 1028 rval = DDI_FAILURE; 1029 } 1030 1031 rw_exit(&nvfd->nvf_lock); 1032 return (rval); 1033 } 1034 1035 1036 static void 1037 nvpflush_daemon(void) 1038 { 1039 callb_cpr_t cprinfo; 1040 nvfd_t *nvfdp, *nextfdp; 1041 clock_t clk; 1042 int rval; 1043 int want_wakeup; 1044 int is_now_clean; 1045 1046 ASSERT(modrootloaded); 1047 1048 nvpflush_thread = curthread; 1049 NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: init\n")); 1050 1051 CALLB_CPR_INIT(&cprinfo, &nvpflush_lock, callb_generic_cpr, "nvp"); 1052 mutex_enter(&nvpflush_lock); 1053 for (;;) { 1054 1055 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1056 while (do_nvpflush == 0) { 1057 clk = cv_timedwait(&nvpflush_cv, &nvpflush_lock, 1058 ddi_get_lbolt() + 1059 (nvpdaemon_idle_time * TICKS_PER_SECOND)); 1060 if (clk == -1 && 1061 do_nvpflush == 0 && nvpflush_timer_busy == 0) { 1062 /* 1063 * Note that CALLB_CPR_EXIT calls mutex_exit() 1064 * on the lock passed in to CALLB_CPR_INIT, 1065 * so the lock must be held when invoking it. 1066 */ 1067 CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock); 1068 NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: exit\n")); 1069 ASSERT(mutex_owned(&nvpflush_lock)); 1070 nvpflush_thr_id = NULL; 1071 nvpflush_daemon_active = 0; 1072 CALLB_CPR_EXIT(&cprinfo); 1073 thread_exit(); 1074 } 1075 } 1076 CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock); 1077 1078 nvpbusy = 1; 1079 want_wakeup = 0; 1080 do_nvpflush = 0; 1081 mutex_exit(&nvpflush_lock); 1082 1083 /* 1084 * Try flushing what's dirty, reschedule if there's 1085 * a failure or data gets marked as dirty again. 1086 * First move each file marked dirty to the dirty 1087 * list to avoid locking the list across the write. 
1088 */ 1089 mutex_enter(&nvf_cache_mutex); 1090 for (nvfdp = list_head(&nvf_cache_files); 1091 nvfdp; nvfdp = nextfdp) { 1092 nextfdp = list_next(&nvf_cache_files, nvfdp); 1093 rw_enter(&nvfdp->nvf_lock, RW_READER); 1094 if (NVF_IS_DIRTY(nvfdp)) { 1095 list_remove(&nvf_cache_files, nvfdp); 1096 list_insert_tail(&nvf_dirty_files, nvfdp); 1097 rw_exit(&nvfdp->nvf_lock); 1098 } else { 1099 NVPDAEMON_DEBUG((CE_CONT, 1100 "nvpdaemon: not dirty %s\n", 1101 nvfdp->nvf_cache_path)); 1102 rw_exit(&nvfdp->nvf_lock); 1103 } 1104 } 1105 mutex_exit(&nvf_cache_mutex); 1106 1107 /* 1108 * Now go through the dirty list 1109 */ 1110 for (nvfdp = list_head(&nvf_dirty_files); 1111 nvfdp; nvfdp = nextfdp) { 1112 nextfdp = list_next(&nvf_dirty_files, nvfdp); 1113 1114 is_now_clean = 0; 1115 rw_enter(&nvfdp->nvf_lock, RW_READER); 1116 if (NVF_IS_DIRTY(nvfdp)) { 1117 NVPDAEMON_DEBUG((CE_CONT, 1118 "nvpdaemon: flush %s\n", 1119 nvfdp->nvf_cache_path)); 1120 rw_exit(&nvfdp->nvf_lock); 1121 rval = nvpflush_one(nvfdp); 1122 rw_enter(&nvfdp->nvf_lock, RW_READER); 1123 if (rval != DDI_SUCCESS || 1124 NVF_IS_DIRTY(nvfdp)) { 1125 rw_exit(&nvfdp->nvf_lock); 1126 NVPDAEMON_DEBUG((CE_CONT, 1127 "nvpdaemon: %s dirty again\n", 1128 nvfdp->nvf_cache_path)); 1129 want_wakeup = 1; 1130 } else { 1131 rw_exit(&nvfdp->nvf_lock); 1132 nvf_write_is_complete(nvfdp); 1133 is_now_clean = 1; 1134 } 1135 } else { 1136 NVPDAEMON_DEBUG((CE_CONT, 1137 "nvpdaemon: not dirty %s\n", 1138 nvfdp->nvf_cache_path)); 1139 rw_exit(&nvfdp->nvf_lock); 1140 is_now_clean = 1; 1141 } 1142 1143 if (is_now_clean) { 1144 mutex_enter(&nvf_cache_mutex); 1145 list_remove(&nvf_dirty_files, nvfdp); 1146 list_insert_tail(&nvf_cache_files, 1147 nvfdp); 1148 mutex_exit(&nvf_cache_mutex); 1149 } 1150 } 1151 1152 if (want_wakeup) 1153 nvf_wake_daemon(); 1154 1155 mutex_enter(&nvpflush_lock); 1156 nvpbusy = 0; 1157 } 1158 } 1159