1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/errno.h> 30 #include <sys/cpuvar.h> 31 #include <sys/vfs.h> 32 #include <sys/vnode.h> 33 #include <sys/pathname.h> 34 #include <sys/callb.h> 35 #include <sys/fs/ufs_inode.h> 36 #include <vm/anon.h> 37 #include <sys/fs/swapnode.h> /* for swapfs_minfree */ 38 #include <sys/kmem.h> 39 #include <sys/cpr.h> 40 #include <sys/conf.h> 41 42 /* 43 * CPR miscellaneous support routines 44 */ 45 #define cpr_open(path, mode, vpp) (vn_open(path, UIO_SYSSPACE, \ 46 mode, 0600, vpp, CRCREAT, 0)) 47 #define cpr_rdwr(rw, vp, basep, cnt) (vn_rdwr(rw, vp, (caddr_t)(basep), \ 48 cnt, 0LL, UIO_SYSSPACE, 0, (rlim64_t)MAXOFF_T, CRED(), \ 49 (ssize_t *)NULL)) 50 51 extern void clkset(time_t); 52 extern cpu_t *i_cpr_bootcpu(void); 53 extern caddr_t i_cpr_map_setup(void); 54 extern void i_cpr_free_memory_resources(void); 55 56 extern kmutex_t cpr_slock; 57 extern size_t cpr_buf_size; 58 extern char *cpr_buf; 59 extern size_t cpr_pagedata_size; 60 extern char *cpr_pagedata; 61 extern int cpr_bufs_allocated; 62 extern int cpr_bitmaps_allocated; 63 64 static struct cprconfig cprconfig; 65 static int cprconfig_loaded = 0; 66 static int cpr_statefile_ok(vnode_t *, int); 67 static int cpr_p_online(cpu_t *, int); 68 static void cpr_save_mp_state(void); 69 int cpr_is_ufs(struct vfs *); 70 71 char cpr_default_path[] = CPR_DEFAULT; 72 73 #define COMPRESS_PERCENT 40 /* approx compression ratio in percent */ 74 #define SIZE_RATE 115 /* increase size by 15% */ 75 #define INTEGRAL 100 /* for integer math */ 76 77 78 /* 79 * cmn_err() followed by a 1/4 second delay; this gives the 80 * logging service a chance to flush messages and helps avoid 81 * intermixing output from prom_printf(). 82 */ 83 /*PRINTFLIKE2*/ 84 void 85 cpr_err(int ce, const char *fmt, ...) 86 { 87 va_list adx; 88 89 va_start(adx, fmt); 90 vcmn_err(ce, fmt, adx); 91 va_end(adx); 92 drv_usecwait(MICROSEC >> 2); 93 } 94 95 96 int 97 cpr_init(int fcn) 98 { 99 /* 100 * Allow only one suspend/resume process. 101 */ 102 if (mutex_tryenter(&cpr_slock) == 0) 103 return (EBUSY); 104 105 CPR->c_flags = 0; 106 CPR->c_substate = 0; 107 CPR->c_cprboot_magic = 0; 108 CPR->c_alloc_cnt = 0; 109 110 CPR->c_fcn = fcn; 111 if (fcn == AD_CPR_REUSABLE) 112 CPR->c_flags |= C_REUSABLE; 113 else 114 CPR->c_flags |= C_SUSPENDING; 115 if (fcn != AD_CPR_NOCOMPRESS && fcn != AD_CPR_TESTNOZ) 116 CPR->c_flags |= C_COMPRESSING; 117 /* 118 * reserve CPR_MAXCONTIG virtual pages for cpr_dump() 119 */ 120 CPR->c_mapping_area = i_cpr_map_setup(); 121 if (CPR->c_mapping_area == 0) { /* no space in kernelmap */ 122 cpr_err(CE_CONT, "Unable to alloc from kernelmap.\n"); 123 mutex_exit(&cpr_slock); 124 return (EAGAIN); 125 } 126 if (cpr_debug & CPR_DEBUG3) 127 cpr_err(CE_CONT, "Reserved virtual range from 0x%p for writing " 128 "kas\n", (void *)CPR->c_mapping_area); 129 130 return (0); 131 } 132 133 /* 134 * This routine releases any resources used during the checkpoint. 135 */ 136 void 137 cpr_done(void) 138 { 139 cpr_stat_cleanup(); 140 i_cpr_bitmap_cleanup(); 141 142 /* 143 * Free pages used by cpr buffers. 144 */ 145 if (cpr_buf) { 146 kmem_free(cpr_buf, cpr_buf_size); 147 cpr_buf = NULL; 148 } 149 if (cpr_pagedata) { 150 kmem_free(cpr_pagedata, cpr_pagedata_size); 151 cpr_pagedata = NULL; 152 } 153 154 i_cpr_free_memory_resources(); 155 mutex_exit(&cpr_slock); 156 cpr_err(CE_CONT, "System has been resumed.\n"); 157 } 158 159 160 /* 161 * reads config data into cprconfig 162 */ 163 static int 164 cpr_get_config(void) 165 { 166 static char config_path[] = CPR_CONFIG; 167 struct cprconfig *cf = &cprconfig; 168 struct vnode *vp; 169 char *fmt; 170 int err; 171 172 if (cprconfig_loaded) 173 return (0); 174 175 fmt = "cannot %s config file \"%s\", error %d\n"; 176 if (err = vn_open(config_path, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0)) { 177 cpr_err(CE_CONT, fmt, "open", config_path, err); 178 return (err); 179 } 180 181 err = cpr_rdwr(UIO_READ, vp, cf, sizeof (*cf)); 182 (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED()); 183 VN_RELE(vp); 184 if (err) { 185 cpr_err(CE_CONT, fmt, "read", config_path, err); 186 return (err); 187 } 188 189 if (cf->cf_magic == CPR_CONFIG_MAGIC) 190 cprconfig_loaded = 1; 191 else { 192 cpr_err(CE_CONT, "invalid config file \"%s\", " 193 "rerun pmconfig(1M)\n", config_path); 194 err = EINVAL; 195 } 196 197 return (err); 198 } 199 200 201 /* 202 * concat fs and path fields of the cprconfig structure; 203 * returns pointer to the base of static data 204 */ 205 static char * 206 cpr_cprconfig_to_path(void) 207 { 208 static char full_path[MAXNAMELEN]; 209 struct cprconfig *cf = &cprconfig; 210 char *ptr; 211 212 /* 213 * build /fs/path without extra '/' 214 */ 215 (void) strcpy(full_path, cf->cf_fs); 216 if (strcmp(cf->cf_fs, "/")) 217 (void) strcat(full_path, "/"); 218 ptr = cf->cf_path; 219 if (*ptr == '/') 220 ptr++; 221 (void) strcat(full_path, ptr); 222 return (full_path); 223 } 224 225 226 /* 227 * Verify that the information in the configuration file regarding the 228 * location for the statefile is still valid, depending on cf_type. 229 * for CFT_UFS, cf_fs must still be a mounted filesystem, it must be 230 * mounted on the same device as when pmconfig was last run, 231 * and the translation of that device to a node in the prom's 232 * device tree must be the same as when pmconfig was last run. 233 * for CFT_SPEC, cf_path must be the path to a block special file, 234 * it must have no file system mounted on it, 235 * and the translation of that device to a node in the prom's 236 * device tree must be the same as when pmconfig was last run. 237 */ 238 static int 239 cpr_verify_statefile_path(void) 240 { 241 struct cprconfig *cf = &cprconfig; 242 static const char long_name[] = "Statefile pathname is too long.\n"; 243 static const char lookup_fmt[] = "Lookup failed for " 244 "cpr statefile device %s.\n"; 245 static const char path_chg_fmt[] = "Device path for statefile " 246 "has changed from %s to %s.\t%s\n"; 247 static const char rerun[] = "Please rerun pmconfig(1m)."; 248 struct vfs *vfsp = NULL, *vfsp_save = rootvfs; 249 ufsvfs_t *ufsvfsp = (ufsvfs_t *)rootvfs->vfs_data; 250 ufsvfs_t *ufsvfsp_save = ufsvfsp; 251 int error; 252 struct vnode *vp; 253 char *slash, *tail, *longest; 254 char *errstr; 255 int found = 0; 256 union { 257 char un_devpath[OBP_MAXPATHLEN]; 258 char un_sfpath[MAXNAMELEN]; 259 } un; 260 #define devpath un.un_devpath 261 #define sfpath un.un_sfpath 262 263 ASSERT(cprconfig_loaded); 264 /* 265 * We need not worry about locking or the timing of releasing 266 * the vnode, since we are single-threaded now. 267 */ 268 269 switch (cf->cf_type) { 270 case CFT_SPEC: 271 if (strlen(cf->cf_path) > sizeof (sfpath)) { 272 cpr_err(CE_CONT, long_name); 273 return (ENAMETOOLONG); 274 } 275 if ((error = lookupname(cf->cf_devfs, 276 UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) { 277 cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs); 278 return (error); 279 } 280 if (vp->v_type != VBLK) 281 errstr = "statefile must be a block device"; 282 else if (vfs_devismounted(vp->v_rdev)) 283 errstr = "statefile device must not " 284 "have a file system mounted on it"; 285 else if (IS_SWAPVP(vp)) 286 errstr = "statefile device must not " 287 "be configured as swap file"; 288 else 289 errstr = NULL; 290 291 VN_RELE(vp); 292 if (errstr) { 293 cpr_err(CE_CONT, "%s.\n", errstr); 294 return (ENOTSUP); 295 } 296 297 error = i_devname_to_promname(cf->cf_devfs, devpath, 298 OBP_MAXPATHLEN); 299 if (error || strcmp(devpath, cf->cf_dev_prom)) { 300 cpr_err(CE_CONT, path_chg_fmt, 301 cf->cf_dev_prom, devpath, rerun); 302 } 303 return (error); 304 case CFT_UFS: 305 break; /* don't indent all the original code */ 306 default: 307 cpr_err(CE_PANIC, "invalid cf_type"); 308 } 309 310 /* 311 * The original code for UFS statefile 312 */ 313 if (strlen(cf->cf_fs) + strlen(cf->cf_path) + 2 > sizeof (sfpath)) { 314 cpr_err(CE_CONT, long_name); 315 return (ENAMETOOLONG); 316 } 317 318 bzero(sfpath, sizeof (sfpath)); 319 (void) strcpy(sfpath, cpr_cprconfig_to_path()); 320 321 if (*sfpath != '/') { 322 cpr_err(CE_CONT, "Statefile pathname %s " 323 "must begin with a /\n", sfpath); 324 return (EINVAL); 325 } 326 327 /* 328 * Find the longest prefix of the statefile pathname which 329 * is the mountpoint of a filesystem. This string must 330 * match the cf_fs field we read from the config file. Other- 331 * wise the user has changed things without running pmconfig. 332 */ 333 tail = longest = sfpath + 1; /* pt beyond the leading "/" */ 334 while ((slash = strchr(tail, '/')) != NULL) { 335 *slash = '\0'; /* temporarily terminate the string */ 336 if ((error = lookupname(sfpath, 337 UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) { 338 *slash = '/'; 339 cpr_err(CE_CONT, "A directory in the " 340 "statefile path %s was not found.\n", sfpath); 341 VN_RELE(vp); 342 343 return (error); 344 } 345 346 vfs_list_read_lock(); 347 vfsp = rootvfs; 348 do { 349 ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 350 if (ufsvfsp != NULL && ufsvfsp->vfs_root == vp) { 351 found = 1; 352 break; 353 } 354 vfsp = vfsp->vfs_next; 355 } while (vfsp != rootvfs); 356 vfs_list_unlock(); 357 358 /* 359 * If we have found a filesystem mounted on the current 360 * path prefix, remember the end of the string in 361 * "longest". If it happens to be the the exact fs 362 * saved in the configuration file, save the current 363 * ufsvfsp so we can make additional checks further down. 364 */ 365 if (found) { 366 longest = slash; 367 if (strcmp(cf->cf_fs, sfpath) == 0) { 368 ufsvfsp_save = ufsvfsp; 369 vfsp_save = vfsp; 370 } 371 found = 0; 372 } 373 374 VN_RELE(vp); 375 *slash = '/'; 376 tail = slash + 1; 377 } 378 *longest = '\0'; 379 if (cpr_is_ufs(vfsp_save) == 0 || strcmp(cf->cf_fs, sfpath)) { 380 cpr_err(CE_CONT, "Filesystem containing " 381 "the statefile when pmconfig was run (%s) has " 382 "changed to %s. %s\n", cf->cf_fs, sfpath, rerun); 383 return (EINVAL); 384 } 385 386 if ((error = lookupname(cf->cf_devfs, 387 UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) { 388 cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs); 389 return (error); 390 } 391 392 if (ufsvfsp_save->vfs_devvp->v_rdev != vp->v_rdev) { 393 cpr_err(CE_CONT, "Filesystem containing " 394 "statefile no longer mounted on device %s. " 395 "See power.conf(4).", cf->cf_devfs); 396 VN_RELE(vp); 397 return (ENXIO); 398 } 399 VN_RELE(vp); 400 401 error = i_devname_to_promname(cf->cf_devfs, devpath, OBP_MAXPATHLEN); 402 if (error || strcmp(devpath, cf->cf_dev_prom)) { 403 cpr_err(CE_CONT, path_chg_fmt, 404 cf->cf_dev_prom, devpath, rerun); 405 return (error); 406 } 407 408 return (0); 409 } 410 411 /* 412 * Make sure that the statefile can be used as a block special statefile 413 * (meaning that is exists and has nothing mounted on it) 414 * Returns errno if not a valid statefile. 415 */ 416 int 417 cpr_check_spec_statefile(void) 418 { 419 int err; 420 421 if (err = cpr_get_config()) 422 return (err); 423 ASSERT(cprconfig.cf_type == CFT_SPEC); 424 425 if (cprconfig.cf_devfs == NULL) 426 return (ENXIO); 427 428 return (cpr_verify_statefile_path()); 429 430 } 431 432 int 433 cpr_alloc_statefile(int alloc_retry) 434 { 435 register int rc = 0; 436 char *str; 437 438 /* 439 * Statefile size validation. If checkpoint the first time, disk blocks 440 * allocation will be done; otherwise, just do file size check. 441 * if statefile allocation is being retried, C_VP will be inited 442 */ 443 if (alloc_retry) { 444 str = "\n-->Retrying statefile allocation..."; 445 if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7)) 446 prom_printf(str); 447 if (C_VP->v_type != VBLK) 448 (void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL); 449 } else { 450 /* 451 * Open an exiting file for writing, the state file needs to be 452 * pre-allocated since we can't and don't want to do allocation 453 * during checkpoint (too much of the OS is disabled). 454 * - do a preliminary size checking here, if it is too small, 455 * allocate more space internally and retry. 456 * - check the vp to make sure it's the right type. 457 */ 458 char *path = cpr_build_statefile_path(); 459 460 if (path == NULL) 461 return (ENXIO); 462 else if (rc = cpr_verify_statefile_path()) 463 return (rc); 464 465 if (rc = vn_open(path, UIO_SYSSPACE, 466 FCREAT|FWRITE, 0600, &C_VP, CRCREAT, 0)) { 467 cpr_err(CE_WARN, "cannot open statefile %s", path); 468 return (rc); 469 } 470 } 471 472 /* 473 * Only ufs and block special statefiles supported 474 */ 475 if (C_VP->v_type != VREG && C_VP->v_type != VBLK) { 476 cpr_err(CE_CONT, 477 "Statefile must be regular file or block special file."); 478 return (EACCES); 479 } 480 481 if (rc = cpr_statefile_ok(C_VP, alloc_retry)) 482 return (rc); 483 484 if (C_VP->v_type != VBLK) { 485 /* 486 * sync out the fs change due to the statefile reservation. 487 */ 488 (void) VFS_SYNC(C_VP->v_vfsp, 0, CRED()); 489 490 /* 491 * Validate disk blocks allocation for the state file. 492 * Ask the file system prepare itself for the dump operation. 493 */ 494 if (rc = VOP_DUMPCTL(C_VP, DUMP_ALLOC, NULL)) { 495 cpr_err(CE_CONT, "Error allocating " 496 "blocks for cpr statefile."); 497 return (rc); 498 } 499 } 500 return (0); 501 } 502 503 504 /* 505 * lookup device size in blocks, 506 * and return available space in bytes 507 */ 508 size_t 509 cpr_get_devsize(dev_t dev) 510 { 511 size_t bytes = 0; 512 int64_t Nblocks; 513 int nblocks; 514 515 if ((Nblocks = bdev_Size(dev)) != -1) 516 bytes = dbtob(Nblocks); 517 else if ((nblocks = bdev_size(dev)) != -1) 518 bytes = dbtob(nblocks); 519 520 if (bytes > CPR_SPEC_OFFSET) 521 bytes -= CPR_SPEC_OFFSET; 522 else 523 bytes = 0; 524 525 return (bytes); 526 } 527 528 529 /* 530 * increase statefile size 531 */ 532 static int 533 cpr_grow_statefile(vnode_t *vp, u_longlong_t newsize) 534 { 535 extern uchar_t cpr_pagecopy[]; 536 struct inode *ip = VTOI(vp); 537 u_longlong_t offset; 538 int error, increase; 539 ssize_t resid; 540 541 rw_enter(&ip->i_contents, RW_READER); 542 increase = (ip->i_size < newsize); 543 offset = ip->i_size; 544 rw_exit(&ip->i_contents); 545 546 if (increase == 0) 547 return (0); 548 549 /* 550 * write to each logical block to reserve disk space 551 */ 552 error = 0; 553 cpr_pagecopy[0] = '1'; 554 for (; offset < newsize; offset += ip->i_fs->fs_bsize) { 555 if (error = vn_rdwr(UIO_WRITE, vp, (caddr_t)cpr_pagecopy, 556 ip->i_fs->fs_bsize, (offset_t)offset, UIO_SYSSPACE, 0, 557 (rlim64_t)MAXOFF_T, CRED(), &resid)) { 558 if (error == ENOSPC) { 559 cpr_err(CE_WARN, "error %d while reserving " 560 "disk space for statefile %s\n" 561 "wanted %lld bytes, file is %lld short", 562 error, cpr_cprconfig_to_path(), 563 newsize, newsize - offset); 564 } 565 break; 566 } 567 } 568 return (error); 569 } 570 571 572 /* 573 * do a simple estimate of the space needed to hold the statefile 574 * taking compression into account, but be fairly conservative 575 * so we have a better chance of completing; when dump fails, 576 * the retry cost is fairly high. 577 * 578 * Do disk blocks allocation for the state file if no space has 579 * been allocated yet. Since the state file will not be removed, 580 * allocation should only be done once. 581 */ 582 static int 583 cpr_statefile_ok(vnode_t *vp, int alloc_retry) 584 { 585 extern size_t cpr_bitmap_size; 586 struct inode *ip = VTOI(vp); 587 const int UCOMP_RATE = 20; /* comp. ratio*10 for user pages */ 588 u_longlong_t size, isize, ksize, raw_data; 589 char *str, *est_fmt; 590 size_t space; 591 int error; 592 593 /* 594 * number of pages short for swapping. 595 */ 596 STAT->cs_nosw_pages = k_anoninfo.ani_mem_resv; 597 if (STAT->cs_nosw_pages < 0) 598 STAT->cs_nosw_pages = 0; 599 600 str = "cpr_statefile_ok:"; 601 602 CPR_DEBUG(CPR_DEBUG9, "Phys swap: max=%lu resv=%lu\n", 603 k_anoninfo.ani_max, k_anoninfo.ani_phys_resv); 604 CPR_DEBUG(CPR_DEBUG9, "Mem swap: max=%ld resv=%lu\n", 605 MAX(availrmem - swapfs_minfree, 0), 606 k_anoninfo.ani_mem_resv); 607 CPR_DEBUG(CPR_DEBUG9, "Total available swap: %ld\n", 608 CURRENT_TOTAL_AVAILABLE_SWAP); 609 610 /* 611 * try increasing filesize by 15% 612 */ 613 if (alloc_retry) { 614 /* 615 * block device doesn't get any bigger 616 */ 617 if (vp->v_type == VBLK) { 618 if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) 619 prom_printf( 620 "Retry statefile on special file\n"); 621 return (ENOMEM); 622 } else { 623 rw_enter(&ip->i_contents, RW_READER); 624 size = (ip->i_size * SIZE_RATE) / INTEGRAL; 625 rw_exit(&ip->i_contents); 626 } 627 if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) 628 prom_printf("Retry statefile size = %lld\n", size); 629 } else { 630 u_longlong_t cpd_size; 631 pgcnt_t npages, nback; 632 int ndvram; 633 634 ndvram = 0; 635 (void) callb_execute_class(CB_CL_CPR_FB, 636 (int)(uintptr_t)&ndvram); 637 if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) 638 prom_printf("ndvram size = %d\n", ndvram); 639 640 /* 641 * estimate 1 cpd_t for every (CPR_MAXCONTIG / 2) pages 642 */ 643 npages = cpr_count_kpages(REGULAR_BITMAP, cpr_nobit); 644 cpd_size = sizeof (cpd_t) * (npages / (CPR_MAXCONTIG / 2)); 645 raw_data = cpd_size + cpr_bitmap_size; 646 ksize = ndvram + mmu_ptob(npages); 647 648 est_fmt = "%s estimated size with " 649 "%scompression %lld, ksize %lld\n"; 650 nback = mmu_ptob(STAT->cs_nosw_pages); 651 if (CPR->c_flags & C_COMPRESSING) { 652 size = ((ksize * COMPRESS_PERCENT) / INTEGRAL) + 653 raw_data + ((nback * 10) / UCOMP_RATE); 654 CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "", size, ksize); 655 } else { 656 size = ksize + raw_data + nback; 657 CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "no ", 658 size, ksize); 659 } 660 } 661 662 /* 663 * All this is much simpler for a block device 664 */ 665 if (vp->v_type == VBLK) { 666 space = cpr_get_devsize(vp->v_rdev); 667 if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) 668 prom_printf("statefile dev size %lu\n", space); 669 670 /* 671 * Export the estimated filesize info, this value will be 672 * compared before dumping out the statefile in the case of 673 * no compression. 674 */ 675 STAT->cs_est_statefsz = size; 676 if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) 677 prom_printf("%s Estimated statefile size %llu, " 678 "space %lu\n", str, size, space); 679 if (size > space) { 680 cpr_err(CE_CONT, "Statefile partition too small."); 681 return (ENOMEM); 682 } 683 return (0); 684 } else { 685 if (CPR->c_alloc_cnt++ > C_MAX_ALLOC_RETRY) { 686 cpr_err(CE_CONT, "Statefile allocation retry failed\n"); 687 return (ENOMEM); 688 } 689 690 /* 691 * Estimate space needed for the state file. 692 * 693 * State file size in bytes: 694 * kernel size + non-cache pte seg + 695 * bitmap size + cpr state file headers size 696 * (round up to fs->fs_bsize) 697 */ 698 size = blkroundup(ip->i_fs, size); 699 700 /* 701 * Export the estimated filesize info, this value will be 702 * compared before dumping out the statefile in the case of 703 * no compression. 704 */ 705 STAT->cs_est_statefsz = size; 706 error = cpr_grow_statefile(vp, size); 707 if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) { 708 rw_enter(&ip->i_contents, RW_READER); 709 isize = ip->i_size; 710 rw_exit(&ip->i_contents); 711 prom_printf("%s Estimated statefile size %lld, " 712 "i_size %lld\n", str, size, isize); 713 } 714 715 return (error); 716 } 717 } 718 719 720 void 721 cpr_statef_close(void) 722 { 723 if (C_VP) { 724 if (!cpr_reusable_mode) 725 (void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL); 726 (void) VOP_CLOSE(C_VP, FWRITE, 1, (offset_t)0, CRED()); 727 VN_RELE(C_VP); 728 C_VP = 0; 729 } 730 } 731 732 733 /* 734 * open cpr default file and display error 735 */ 736 int 737 cpr_open_deffile(int mode, vnode_t **vpp) 738 { 739 int error; 740 741 if (error = cpr_open(cpr_default_path, mode, vpp)) 742 cpr_err(CE_CONT, "cannot open \"%s\", error %d\n", 743 cpr_default_path, error); 744 return (error); 745 } 746 747 748 /* 749 * write cdef_t to disk. This contains the original values of prom 750 * properties that we modify. We fill in the magic number of the file 751 * here as a signal to the booter code that the state file is valid. 752 * Be sure the file gets synced, since we may be shutting down the OS. 753 */ 754 int 755 cpr_write_deffile(cdef_t *cdef) 756 { 757 struct vnode *vp; 758 char *str; 759 int rc; 760 761 if (rc = cpr_open_deffile(FCREAT|FWRITE, &vp)) 762 return (rc); 763 764 if (rc = cpr_rdwr(UIO_WRITE, vp, cdef, sizeof (*cdef))) 765 str = "write"; 766 else if (rc = VOP_FSYNC(vp, FSYNC, CRED())) 767 str = "fsync"; 768 (void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED()); 769 VN_RELE(vp); 770 771 if (rc) { 772 cpr_err(CE_WARN, "%s error %d, file \"%s\"", 773 str, rc, cpr_default_path); 774 } 775 return (rc); 776 } 777 778 /* 779 * Clear the magic number in the defaults file. This tells the booter 780 * program that the state file is not current and thus prevents 781 * any attempt to restore from an obsolete state file. 782 */ 783 void 784 cpr_clear_definfo(void) 785 { 786 struct vnode *vp; 787 cmini_t mini; 788 789 if ((CPR->c_cprboot_magic != CPR_DEFAULT_MAGIC) || 790 cpr_open_deffile(FCREAT|FWRITE, &vp)) 791 return; 792 mini.magic = mini.reusable = 0; 793 (void) cpr_rdwr(UIO_WRITE, vp, &mini, sizeof (mini)); 794 (void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED()); 795 VN_RELE(vp); 796 } 797 798 /* 799 * If the cpr default file is invalid, then we must not be in reusable mode 800 * if it is valid, it tells us our mode 801 */ 802 int 803 cpr_get_reusable_mode(void) 804 { 805 struct vnode *vp; 806 cmini_t mini; 807 int rc; 808 809 if (cpr_open(cpr_default_path, FREAD, &vp)) 810 return (0); 811 812 rc = cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini)); 813 (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED()); 814 VN_RELE(vp); 815 if (rc == 0 && mini.magic == CPR_DEFAULT_MAGIC) 816 return (mini.reusable); 817 818 return (0); 819 } 820 821 /* 822 * clock/time related routines 823 */ 824 static time_t cpr_time_stamp; 825 826 827 void 828 cpr_tod_get(cpr_time_t *ctp) 829 { 830 timestruc_t ts; 831 832 mutex_enter(&tod_lock); 833 ts = tod_get(); 834 mutex_exit(&tod_lock); 835 ctp->tv_sec = (time32_t)ts.tv_sec; 836 ctp->tv_nsec = (int32_t)ts.tv_nsec; 837 } 838 839 void 840 cpr_tod_fault_reset(void) 841 { 842 mutex_enter(&tod_lock); 843 tod_fault_reset(); 844 mutex_exit(&tod_lock); 845 } 846 847 void 848 cpr_save_time(void) 849 { 850 cpr_time_stamp = gethrestime_sec(); 851 } 852 853 /* 854 * correct time based on saved time stamp or hardware clock 855 */ 856 void 857 cpr_restore_time(void) 858 { 859 clkset(cpr_time_stamp); 860 } 861 862 /* 863 * CPU ONLINE/OFFLINE CODE 864 */ 865 int 866 cpr_mp_offline(void) 867 { 868 cpu_t *cp, *bootcpu; 869 int rc = 0; 870 int brought_up_boot = 0; 871 872 /* 873 * Do nothing for UP. 874 */ 875 if (ncpus == 1) 876 return (0); 877 878 mutex_enter(&cpu_lock); 879 880 cpr_save_mp_state(); 881 882 bootcpu = i_cpr_bootcpu(); 883 if (!CPU_ACTIVE(bootcpu)) { 884 if ((rc = cpr_p_online(bootcpu, CPU_CPR_ONLINE))) { 885 mutex_exit(&cpu_lock); 886 return (rc); 887 } 888 brought_up_boot = 1; 889 } 890 891 cp = cpu_list; 892 do { 893 if (cp == bootcpu) 894 continue; 895 if (cp->cpu_flags & CPU_OFFLINE) 896 continue; 897 if ((rc = cpr_p_online(cp, CPU_CPR_OFFLINE))) { 898 mutex_exit(&cpu_lock); 899 return (rc); 900 } 901 } while ((cp = cp->cpu_next) != cpu_list); 902 if (brought_up_boot && (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))) 903 prom_printf("changed cpu %p to state %d\n", 904 bootcpu, CPU_CPR_ONLINE); 905 mutex_exit(&cpu_lock); 906 907 return (rc); 908 } 909 910 int 911 cpr_mp_online(void) 912 { 913 cpu_t *cp, *bootcpu = CPU; 914 int rc = 0; 915 916 /* 917 * Do nothing for UP. 918 */ 919 if (ncpus == 1) 920 return (0); 921 922 /* 923 * cpr_save_mp_state() sets CPU_CPR_ONLINE in cpu_cpr_flags 924 * to indicate a cpu was online at the time of cpr_suspend(); 925 * now restart those cpus that were marked as CPU_CPR_ONLINE 926 * and actually are offline. 927 */ 928 mutex_enter(&cpu_lock); 929 for (cp = bootcpu->cpu_next; cp != bootcpu; cp = cp->cpu_next) { 930 /* 931 * Clear the CPU_FROZEN flag in all cases. 932 */ 933 cp->cpu_flags &= ~CPU_FROZEN; 934 935 if (CPU_CPR_IS_OFFLINE(cp)) 936 continue; 937 if (CPU_ACTIVE(cp)) 938 continue; 939 if ((rc = cpr_p_online(cp, CPU_CPR_ONLINE))) { 940 mutex_exit(&cpu_lock); 941 return (rc); 942 } 943 } 944 945 /* 946 * turn off the boot cpu if it was offlined 947 */ 948 if (CPU_CPR_IS_OFFLINE(bootcpu)) { 949 if ((rc = cpr_p_online(bootcpu, CPU_CPR_OFFLINE))) { 950 mutex_exit(&cpu_lock); 951 return (rc); 952 } 953 } 954 mutex_exit(&cpu_lock); 955 return (0); 956 } 957 958 static void 959 cpr_save_mp_state(void) 960 { 961 cpu_t *cp; 962 963 ASSERT(MUTEX_HELD(&cpu_lock)); 964 965 cp = cpu_list; 966 do { 967 cp->cpu_cpr_flags &= ~CPU_CPR_ONLINE; 968 if (CPU_ACTIVE(cp)) 969 CPU_SET_CPR_FLAGS(cp, CPU_CPR_ONLINE); 970 } while ((cp = cp->cpu_next) != cpu_list); 971 } 972 973 /* 974 * change cpu to online/offline 975 */ 976 static int 977 cpr_p_online(cpu_t *cp, int state) 978 { 979 int rc; 980 981 ASSERT(MUTEX_HELD(&cpu_lock)); 982 983 switch (state) { 984 case CPU_CPR_ONLINE: 985 rc = cpu_online(cp); 986 break; 987 case CPU_CPR_OFFLINE: 988 rc = cpu_offline(cp, CPU_FORCED); 989 break; 990 } 991 if (rc) { 992 cpr_err(CE_WARN, "Failed to change processor %d to " 993 "state %d, (errno %d)", cp->cpu_id, state, rc); 994 } 995 return (rc); 996 } 997 998 /* 999 * Construct the pathname of the state file and return a pointer to 1000 * caller. Read the config file to get the mount point of the 1001 * filesystem and the pathname within fs. 1002 */ 1003 char * 1004 cpr_build_statefile_path(void) 1005 { 1006 struct cprconfig *cf = &cprconfig; 1007 1008 if (cpr_get_config()) 1009 return (NULL); 1010 1011 switch (cf->cf_type) { 1012 case CFT_UFS: 1013 if (strlen(cf->cf_path) + strlen(cf->cf_fs) >= MAXNAMELEN - 1) { 1014 cpr_err(CE_CONT, "Statefile path is too long.\n"); 1015 return (NULL); 1016 } 1017 return (cpr_cprconfig_to_path()); 1018 case CFT_SPEC: 1019 return (cf->cf_devfs); 1020 default: 1021 cpr_err(CE_PANIC, "invalid statefile type"); 1022 /*NOTREACHED*/ 1023 return (NULL); 1024 } 1025 } 1026 1027 int 1028 cpr_statefile_is_spec(void) 1029 { 1030 if (cpr_get_config()) 1031 return (0); 1032 return (cprconfig.cf_type == CFT_SPEC); 1033 } 1034 1035 char * 1036 cpr_get_statefile_prom_path(void) 1037 { 1038 struct cprconfig *cf = &cprconfig; 1039 1040 ASSERT(cprconfig_loaded); 1041 ASSERT(cf->cf_magic == CPR_CONFIG_MAGIC); 1042 ASSERT(cf->cf_type == CFT_SPEC); 1043 return (cf->cf_dev_prom); 1044 } 1045 1046 1047 /* 1048 * XXX The following routines need to be in the vfs source code. 1049 */ 1050 1051 int 1052 cpr_is_ufs(struct vfs *vfsp) 1053 { 1054 char *fsname; 1055 1056 fsname = vfssw[vfsp->vfs_fstype].vsw_name; 1057 return (strcmp(fsname, "ufs") == 0); 1058 } 1059 1060 /* 1061 * This is a list of file systems that are allowed to be writeable when a 1062 * reusable statefile checkpoint is taken. They must not have any state that 1063 * cannot be restored to consistency by simply rebooting using the checkpoint. 1064 * (In contrast to ufs, cachefs and pcfs which have disk state that could get 1065 * out of sync with the in-kernel data). 1066 */ 1067 int 1068 cpr_reusable_mount_check(void) 1069 { 1070 struct vfs *vfsp; 1071 char *fsname; 1072 char **cpp; 1073 static char *cpr_writeok_fss[] = { 1074 "autofs", "devfs", "fd", "lofs", "mntfs", "namefs", "nfs", 1075 "proc", "tmpfs", "ctfs", "objfs", "dev", NULL 1076 }; 1077 1078 vfs_list_read_lock(); 1079 vfsp = rootvfs; 1080 do { 1081 if (vfsp->vfs_flag & VFS_RDONLY) { 1082 vfsp = vfsp->vfs_next; 1083 continue; 1084 } 1085 fsname = vfssw[vfsp->vfs_fstype].vsw_name; 1086 for (cpp = cpr_writeok_fss; *cpp; cpp++) { 1087 if (strcmp(fsname, *cpp) == 0) 1088 break; 1089 } 1090 /* 1091 * if the inner loop reached the NULL terminator, 1092 * the current fs-type does not match any OK-type 1093 */ 1094 if (*cpp == NULL) { 1095 cpr_err(CE_CONT, "a filesystem of type %s is " 1096 "mounted read/write.\nReusable statefile requires " 1097 "no writeable filesystem of this type be mounted\n", 1098 fsname); 1099 vfs_list_unlock(); 1100 return (EINVAL); 1101 } 1102 vfsp = vfsp->vfs_next; 1103 } while (vfsp != rootvfs); 1104 vfs_list_unlock(); 1105 return (0); 1106 } 1107 1108 /* 1109 * Force a fresh read of the cprinfo per uadmin 3 call 1110 */ 1111 void 1112 cpr_forget_cprconfig(void) 1113 { 1114 cprconfig_loaded = 0; 1115 } 1116 1117 1118 /* 1119 * return statefile offset in DEV_BSIZE units 1120 */ 1121 int 1122 cpr_statefile_offset(void) 1123 { 1124 return (cpr_statefile_is_spec() ? btod(CPR_SPEC_OFFSET) : 0); 1125 } 1126