/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
 */

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/cpuvar.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/pathname.h>
#include <sys/callb.h>
#include <sys/fs/ufs_inode.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>	/* for swapfs_minfree */
#include <sys/kmem.h>
#include <sys/cpr.h>
#include <sys/conf.h>
#include <sys/machclock.h>

/*
 * CPR miscellaneous support routines
 */
#define	cpr_open(path, mode, vpp)	(vn_open(path, UIO_SYSSPACE, \
		mode, 0600, vpp, CRCREAT, 0))
#define	cpr_rdwr(rw, vp, basep, cnt)	(vn_rdwr(rw, vp, (caddr_t)(basep), \
		cnt, 0LL, UIO_SYSSPACE, 0, (rlim64_t)MAXOFF_T, CRED(), \
		(ssize_t *)NULL))

extern void clkset(time_t);
extern cpu_t *i_cpr_bootcpu(void);
extern caddr_t i_cpr_map_setup(void);
extern void i_cpr_free_memory_resources(void);

extern kmutex_t cpr_slock;
extern size_t cpr_buf_size;
extern char *cpr_buf;
extern size_t cpr_pagedata_size;
extern char *cpr_pagedata;
extern int cpr_bufs_allocated;
extern int cpr_bitmaps_allocated;

#if defined(__sparc)
static struct cprconfig cprconfig;
static int cprconfig_loaded = 0;
static int cpr_statefile_ok(vnode_t *, int);
static int cpr_p_online(cpu_t *, int);
static void cpr_save_mp_state(void);
#endif

int cpr_is_ufs(struct vfs *);
int cpr_is_zfs(struct vfs *);

char cpr_default_path[] = CPR_DEFAULT;

#define	COMPRESS_PERCENT 40	/* approx compression ratio in percent */
#define	SIZE_RATE	115	/* increase size by 15% */
#define	INTEGRAL	100	/* for integer math */


/*
 * cmn_err() followed by a 1/4 second delay; this gives the
 * logging service a chance to flush messages and helps avoid
 * intermixing output from prom_printf().
 */
/*PRINTFLIKE2*/
void
cpr_err(int ce, const char *fmt, ...)
{
	va_list adx;

	va_start(adx, fmt);
	vcmn_err(ce, fmt, adx);
	va_end(adx);
	drv_usecwait(MICROSEC >> 2);
}


int
cpr_init(int fcn)
{
	/*
	 * Allow only one suspend/resume process.
	 */
	if (mutex_tryenter(&cpr_slock) == 0)
		return (EBUSY);

	CPR->c_flags = 0;
	CPR->c_substate = 0;
	CPR->c_cprboot_magic = 0;
	CPR->c_alloc_cnt = 0;

	CPR->c_fcn = fcn;
	if (fcn == AD_CPR_REUSABLE)
		CPR->c_flags |= C_REUSABLE;
	else
		CPR->c_flags |= C_SUSPENDING;
	if (fcn == AD_SUSPEND_TO_RAM || fcn == DEV_SUSPEND_TO_RAM) {
		return (0);
	}
#if defined(__sparc)
	if (fcn != AD_CPR_NOCOMPRESS && fcn != AD_CPR_TESTNOZ)
		CPR->c_flags |= C_COMPRESSING;
	/*
	 * reserve CPR_MAXCONTIG virtual pages for cpr_dump()
	 */
	CPR->c_mapping_area = i_cpr_map_setup();
	if (CPR->c_mapping_area == 0) {		/* no space in kernelmap */
		cpr_err(CE_CONT, "Unable to alloc from kernelmap.\n");
		mutex_exit(&cpr_slock);
		return (EAGAIN);
	}
	if (cpr_debug & CPR_DEBUG3)
		cpr_err(CE_CONT, "Reserved virtual range from 0x%p for writing "
		    "kas\n", (void *)CPR->c_mapping_area);
#endif

	return (0);
}

/*
 * This routine releases any resources used during the checkpoint.
 */
void
cpr_done(void)
{
	cpr_stat_cleanup();
	i_cpr_bitmap_cleanup();

	/*
	 * Free pages used by cpr buffers.
	 */
	if (cpr_buf) {
		kmem_free(cpr_buf, cpr_buf_size);
		cpr_buf = NULL;
	}
	if (cpr_pagedata) {
		kmem_free(cpr_pagedata, cpr_pagedata_size);
		cpr_pagedata = NULL;
	}

	i_cpr_free_memory_resources();
	mutex_exit(&cpr_slock);
	cpr_err(CE_CONT, "System has been resumed.\n");
}


#if defined(__sparc)
/*
 * reads config data into cprconfig
 */
static int
cpr_get_config(void)
{
	static char config_path[] = CPR_CONFIG;
	struct cprconfig *cf = &cprconfig;
	struct vnode *vp;
	char *fmt;
	int err;

	if (cprconfig_loaded)
		return (0);

	fmt = "cannot %s config file \"%s\", error %d\n";
	if (err = vn_open(config_path, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0)) {
		cpr_err(CE_CONT, fmt, "open", config_path, err);
		return (err);
	}

	err = cpr_rdwr(UIO_READ, vp, cf, sizeof (*cf));
	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
	VN_RELE(vp);
	if (err) {
		cpr_err(CE_CONT, fmt, "read", config_path, err);
		return (err);
	}

	if (cf->cf_magic == CPR_CONFIG_MAGIC)
		cprconfig_loaded = 1;
	else {
		cpr_err(CE_CONT, "invalid config file \"%s\", "
		    "rerun pmconfig(1M)\n", config_path);
		err = EINVAL;
	}

	return (err);
}


/*
 * concat fs and path fields of the cprconfig structure;
 * returns pointer to the base of static data
 */
static char *
cpr_cprconfig_to_path(void)
{
	static char full_path[MAXNAMELEN];
	struct cprconfig *cf = &cprconfig;
	char *ptr;

	/*
	 * build /fs/path without extra '/'
	 */
	(void) strcpy(full_path, cf->cf_fs);
	if (strcmp(cf->cf_fs, "/"))
		(void) strcat(full_path, "/");
	ptr = cf->cf_path;
	if (*ptr == '/')
		ptr++;
	(void) strcat(full_path, ptr);
	return (full_path);
}


/*
 * Verify that the information in the configuration file regarding the
 * location for the statefile is still valid, depending on cf_type.
 * for CFT_UFS, cf_fs must still be a mounted filesystem, it must be
 *	mounted on the same device as when pmconfig was last run,
 *	and the translation of that device to a node in the prom's
 *	device tree must be the same as when pmconfig was last run.
 * for CFT_SPEC and CFT_ZVOL, cf_path must be the path to a block
 *	special file, it must have no file system mounted on it,
 *	and the translation of that device to a node in the prom's
 *	device tree must be the same as when pmconfig was last run.
 */
static int
cpr_verify_statefile_path(void)
{
	struct cprconfig *cf = &cprconfig;
	static const char long_name[] = "Statefile pathname is too long.\n";
	static const char lookup_fmt[] = "Lookup failed for "
	    "cpr statefile device %s.\n";
	static const char path_chg_fmt[] = "Device path for statefile "
	    "has changed from %s to %s.\t%s\n";
	static const char rerun[] = "Please rerun pmconfig(1m).";
	struct vfs *vfsp = NULL, *vfsp_save = rootvfs;
	ufsvfs_t *ufsvfsp = (ufsvfs_t *)rootvfs->vfs_data;
	ufsvfs_t *ufsvfsp_save = ufsvfsp;
	int error;
	struct vnode *vp;
	char *slash, *tail, *longest;
	char *errstr;
	int found = 0;
	union {
		char un_devpath[OBP_MAXPATHLEN];
		char un_sfpath[MAXNAMELEN];
	} un;
#define	devpath	un.un_devpath
#define	sfpath	un.un_sfpath

	ASSERT(cprconfig_loaded);
	/*
	 * We need not worry about locking or the timing of releasing
	 * the vnode, since we are single-threaded now.
	 */

	switch (cf->cf_type) {
	case CFT_SPEC:
		error = i_devname_to_promname(cf->cf_devfs, devpath,
		    OBP_MAXPATHLEN);
		if (error || strcmp(devpath, cf->cf_dev_prom)) {
			cpr_err(CE_CONT, path_chg_fmt,
			    cf->cf_dev_prom, devpath, rerun);
			return (error);
		}
		/*FALLTHROUGH*/
	case CFT_ZVOL:
		if (strlen(cf->cf_path) > sizeof (sfpath)) {
			cpr_err(CE_CONT, long_name);
			return (ENAMETOOLONG);
		}
		if ((error = lookupname(cf->cf_devfs,
		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
			cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
			return (error);
		}
		if (vp->v_type != VBLK)
			errstr = "statefile must be a block device";
		else if (vfs_devismounted(vp->v_rdev))
			errstr = "statefile device must not "
			    "have a file system mounted on it";
		else if (IS_SWAPVP(vp))
			errstr = "statefile device must not "
			    "be configured as swap file";
		else
			errstr = NULL;

		VN_RELE(vp);
		if (errstr) {
			cpr_err(CE_CONT, "%s.\n", errstr);
			return (ENOTSUP);
		}

		return (error);
	case CFT_UFS:
		break;		/* don't indent all the original code */
	default:
		cpr_err(CE_PANIC, "invalid cf_type");
	}

	/*
	 * The original code for UFS statefile
	 */
	if (strlen(cf->cf_fs) + strlen(cf->cf_path) + 2 > sizeof (sfpath)) {
		cpr_err(CE_CONT, long_name);
		return (ENAMETOOLONG);
	}

	bzero(sfpath, sizeof (sfpath));
	(void) strcpy(sfpath, cpr_cprconfig_to_path());

	if (*sfpath != '/') {
		cpr_err(CE_CONT, "Statefile pathname %s "
		    "must begin with a /\n", sfpath);
		return (EINVAL);
	}

	/*
	 * Find the longest prefix of the statefile pathname which
	 * is the mountpoint of a filesystem.  This string must
	 * match the cf_fs field we read from the config file.  Other-
	 * wise the user has changed things without running pmconfig.
	 */
	tail = longest = sfpath + 1;	/* point beyond the leading "/" */
	while ((slash = strchr(tail, '/')) != NULL) {
		*slash = '\0';		/* temporarily terminate the string */
		if ((error = lookupname(sfpath,
		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
			*slash = '/';
			cpr_err(CE_CONT, "A directory in the "
			    "statefile path %s was not found.\n", sfpath);
			VN_RELE(vp);

			return (error);
		}

		vfs_list_read_lock();
		vfsp = rootvfs;
		do {
			ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
			if (ufsvfsp != NULL && ufsvfsp->vfs_root == vp) {
				found = 1;
				break;
			}
			vfsp = vfsp->vfs_next;
		} while (vfsp != rootvfs);
		vfs_list_unlock();

		/*
		 * If we have found a filesystem mounted on the current
		 * path prefix, remember the end of the string in
		 * "longest".  If it happens to be the exact fs
		 * saved in the configuration file, save the current
		 * ufsvfsp so we can make additional checks further down.
		 */
		if (found) {
			longest = slash;
			if (strcmp(cf->cf_fs, sfpath) == 0) {
				ufsvfsp_save = ufsvfsp;
				vfsp_save = vfsp;
			}
			found = 0;
		}

		VN_RELE(vp);
		*slash = '/';
		tail = slash + 1;
	}
	*longest = '\0';
	if (cpr_is_ufs(vfsp_save) == 0 || strcmp(cf->cf_fs, sfpath)) {
		cpr_err(CE_CONT, "Filesystem containing "
		    "the statefile when pmconfig was run (%s) has "
		    "changed to %s. %s\n", cf->cf_fs, sfpath, rerun);
		return (EINVAL);
	}

	if ((error = lookupname(cf->cf_devfs,
	    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
		cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
		return (error);
	}

	if (ufsvfsp_save->vfs_devvp->v_rdev != vp->v_rdev) {
		cpr_err(CE_CONT, "Filesystem containing "
		    "statefile no longer mounted on device %s. "
		    "See power.conf(4).", cf->cf_devfs);
		VN_RELE(vp);
		return (ENXIO);
	}
	VN_RELE(vp);

	error = i_devname_to_promname(cf->cf_devfs, devpath, OBP_MAXPATHLEN);
	if (error || strcmp(devpath, cf->cf_dev_prom)) {
		cpr_err(CE_CONT, path_chg_fmt,
		    cf->cf_dev_prom, devpath, rerun);
		return (error);
	}

	return (0);
}

/*
 * Make sure that the statefile can be used as a block special statefile
 * (meaning that it exists and has nothing mounted on it)
 * Returns errno if not a valid statefile.
 */
int
cpr_check_spec_statefile(void)
{
	int err;

	if (err = cpr_get_config())
		return (err);
	ASSERT(cprconfig.cf_type == CFT_SPEC ||
	    cprconfig.cf_type == CFT_ZVOL);

	if (cprconfig.cf_devfs == NULL)
		return (ENXIO);

	return (cpr_verify_statefile_path());

}

int
cpr_alloc_statefile(int alloc_retry)
{
	register int rc = 0;
	char *str;

	/*
	 * Statefile size validation.  If checkpointing for the first time,
	 * disk block allocation will be done; otherwise, just do a file
	 * size check.  If statefile allocation is being retried, C_VP will
	 * already have been initialized.
	 */
	if (alloc_retry) {
		str = "\n-->Retrying statefile allocation...";
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
			prom_printf(str);
		if (C_VP->v_type != VBLK)
			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
	} else {
		/*
		 * Open an existing file for writing; the state file needs to
		 * be pre-allocated since we can't and don't want to do
		 * allocation during checkpoint (too much of the OS is
		 * disabled).
		 * - do a preliminary size check here; if it is too small,
		 *   allocate more space internally and retry.
		 * - check the vp to make sure it's the right type.
		 */
		char *path = cpr_build_statefile_path();

		if (path == NULL)
			return (ENXIO);
		else if (rc = cpr_verify_statefile_path())
			return (rc);

		if (rc = vn_open(path, UIO_SYSSPACE,
		    FCREAT|FWRITE, 0600, &C_VP, CRCREAT, 0)) {
			cpr_err(CE_WARN, "cannot open statefile %s", path);
			return (rc);
		}
	}

	/*
	 * Only ufs and block special statefiles supported
	 */
	if (C_VP->v_type != VREG && C_VP->v_type != VBLK) {
		cpr_err(CE_CONT,
		    "Statefile must be regular file or block special file.");
		return (EACCES);
	}

	if (rc = cpr_statefile_ok(C_VP, alloc_retry))
		return (rc);

	if (C_VP->v_type != VBLK) {
		/*
		 * sync out the fs change due to the statefile reservation.
		 */
		(void) VFS_SYNC(C_VP->v_vfsp, 0, CRED());

		/*
		 * Validate disk block allocation for the state file.
		 * Ask the file system to prepare itself for the dump
		 * operation.
		 */
		if (rc = VOP_DUMPCTL(C_VP, DUMP_ALLOC, NULL, NULL)) {
			cpr_err(CE_CONT, "Error allocating "
			    "blocks for cpr statefile.");
			return (rc);
		}
	}
	return (0);
}


/*
 * Lookup device size and return available space in bytes.
 * NOTE: Since prop_op(9E) can't tell the difference between a character
 * and a block reference, it is ok to ask for "Size" instead of "Nblocks".
 */
size_t
cpr_get_devsize(dev_t dev)
{
	size_t bytes = 0;

	bytes = cdev_Size(dev);
	if (bytes == 0)
		bytes = cdev_size(dev);

	if (bytes > CPR_SPEC_OFFSET)
		bytes -= CPR_SPEC_OFFSET;
	else
		bytes = 0;

	return (bytes);
}


/*
 * increase statefile size
 */
static int
cpr_grow_statefile(vnode_t *vp, u_longlong_t newsize)
{
	extern uchar_t cpr_pagecopy[];
	struct inode *ip = VTOI(vp);
	u_longlong_t offset;
	int error, increase;
	ssize_t resid;

	rw_enter(&ip->i_contents, RW_READER);
	increase = (ip->i_size < newsize);
	offset = ip->i_size;
	rw_exit(&ip->i_contents);

	if (increase == 0)
		return (0);

	/*
	 * write to each logical block to reserve disk space
	 */
	error = 0;
	cpr_pagecopy[0] = '1';
	for (; offset < newsize; offset += ip->i_fs->fs_bsize) {
		if (error = vn_rdwr(UIO_WRITE, vp, (caddr_t)cpr_pagecopy,
		    ip->i_fs->fs_bsize, (offset_t)offset, UIO_SYSSPACE, 0,
		    (rlim64_t)MAXOFF_T, CRED(), &resid)) {
			if (error == ENOSPC) {
				cpr_err(CE_WARN, "error %d while reserving "
				    "disk space for statefile %s\n"
				    "wanted %lld bytes, file is %lld short",
				    error, cpr_cprconfig_to_path(),
				    newsize, newsize - offset);
			}
			break;
		}
	}
	return (error);
}


/*
 * do a simple estimate of the space needed to hold the statefile
 * taking compression into account, but be fairly conservative
 * so we have a better chance of completing; when dump fails,
 * the retry cost is fairly high.
 *
 * Do disk blocks allocation for the state file if no space has
 * been allocated yet. Since the state file will not be removed,
 * allocation should only be done once.
 */
static int
cpr_statefile_ok(vnode_t *vp, int alloc_retry)
{
	extern size_t cpr_bitmap_size;
	struct inode *ip = VTOI(vp);
	const int UCOMP_RATE = 20;	/* comp. ratio*10 for user pages */
	u_longlong_t size, isize, ksize, raw_data;
	char *str, *est_fmt;
	size_t space;
	int error;

	/*
	 * number of pages short for swapping.
	 */
	STAT->cs_nosw_pages = k_anoninfo.ani_mem_resv;
	if (STAT->cs_nosw_pages < 0)
		STAT->cs_nosw_pages = 0;

	str = "cpr_statefile_ok:";

	CPR_DEBUG(CPR_DEBUG9, "Phys swap: max=%lu resv=%lu\n",
	    k_anoninfo.ani_max, k_anoninfo.ani_phys_resv);
	CPR_DEBUG(CPR_DEBUG9, "Mem swap: max=%ld resv=%lu\n",
	    MAX(availrmem - swapfs_minfree, 0),
	    k_anoninfo.ani_mem_resv);
	CPR_DEBUG(CPR_DEBUG9, "Total available swap: %ld\n",
	    CURRENT_TOTAL_AVAILABLE_SWAP);

	/*
	 * try increasing filesize by 15%
	 */
	if (alloc_retry) {
		/*
		 * block device doesn't get any bigger
		 */
		if (vp->v_type == VBLK) {
			if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
				prom_printf(
				    "Retry statefile on special file\n");
			return (ENOMEM);
		} else {
			rw_enter(&ip->i_contents, RW_READER);
			size = (ip->i_size * SIZE_RATE) / INTEGRAL;
			rw_exit(&ip->i_contents);
		}
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
			prom_printf("Retry statefile size = %lld\n", size);
	} else {
		u_longlong_t cpd_size;
		pgcnt_t npages, nback;
		int ndvram;

		ndvram = 0;
		(void) callb_execute_class(CB_CL_CPR_FB,
		    (int)(uintptr_t)&ndvram);
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
			prom_printf("ndvram size = %d\n", ndvram);

		/*
		 * estimate 1 cpd_t for every (CPR_MAXCONTIG / 2) pages
		 */
		npages = cpr_count_kpages(REGULAR_BITMAP, cpr_nobit);
		cpd_size = sizeof (cpd_t) * (npages / (CPR_MAXCONTIG / 2));
		raw_data = cpd_size + cpr_bitmap_size;
		ksize = ndvram + mmu_ptob(npages);

		est_fmt = "%s estimated size with "
		    "%scompression %lld, ksize %lld\n";
		nback = mmu_ptob(STAT->cs_nosw_pages);
		if (CPR->c_flags & C_COMPRESSING) {
			size = ((ksize * COMPRESS_PERCENT) / INTEGRAL) +
			    raw_data + ((nback * 10) / UCOMP_RATE);
			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "", size, ksize);
		} else {
			size = ksize + raw_data + nback;
			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "no ",
			    size, ksize);
		}
	}

	/*
	 * All this is much simpler for a block device
	 */
	if (vp->v_type == VBLK) {
		space = cpr_get_devsize(vp->v_rdev);
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
			prom_printf("statefile dev size %lu\n", space);

		/*
		 * Export the estimated filesize info, this value will be
		 * compared before dumping out the statefile in the case of
		 * no compression.
		 */
		STAT->cs_est_statefsz = size;
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
			prom_printf("%s Estimated statefile size %llu, "
			    "space %lu\n", str, size, space);
		if (size > space) {
			cpr_err(CE_CONT, "Statefile partition too small.");
			return (ENOMEM);
		}
		return (0);
	} else {
		if (CPR->c_alloc_cnt++ > C_MAX_ALLOC_RETRY) {
			cpr_err(CE_CONT, "Statefile allocation retry failed\n");
			return (ENOMEM);
		}

		/*
		 * Estimate space needed for the state file.
		 *
		 * State file size in bytes:
		 *	kernel size + non-cache pte seg +
		 *	bitmap size + cpr state file headers size
		 * (round up to fs->fs_bsize)
		 */
		size = blkroundup(ip->i_fs, size);

		/*
		 * Export the estimated filesize info, this value will be
		 * compared before dumping out the statefile in the case of
		 * no compression.
		 */
		STAT->cs_est_statefsz = size;
		error = cpr_grow_statefile(vp, size);
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) {
			rw_enter(&ip->i_contents, RW_READER);
			isize = ip->i_size;
			rw_exit(&ip->i_contents);
			prom_printf("%s Estimated statefile size %lld, "
			    "i_size %lld\n", str, size, isize);
		}

		return (error);
	}
}


void
cpr_statef_close(void)
{
	if (C_VP) {
		if (!cpr_reusable_mode)
			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
		(void) VOP_CLOSE(C_VP, FWRITE, 1, (offset_t)0, CRED(), NULL);
		VN_RELE(C_VP);
		C_VP = 0;
	}
}


/*
 * open cpr default file and display error
 */
int
cpr_open_deffile(int mode, vnode_t **vpp)
{
	int error;

	if (error = cpr_open(cpr_default_path, mode, vpp))
		cpr_err(CE_CONT, "cannot open \"%s\", error %d\n",
		    cpr_default_path, error);
	return (error);
}


/*
 * write cdef_t to disk.  This contains the original values of prom
 * properties that we modify.  We fill in the magic number of the file
 * here as a signal to the booter code that the state file is valid.
 * Be sure the file gets synced, since we may be shutting down the OS.
 */
int
cpr_write_deffile(cdef_t *cdef)
{
	struct vnode *vp;
	char *str;
	int rc;

	if (rc = cpr_open_deffile(FCREAT|FWRITE, &vp))
		return (rc);

	if (rc = cpr_rdwr(UIO_WRITE, vp, cdef, sizeof (*cdef)))
		str = "write";
	else if (rc = VOP_FSYNC(vp, FSYNC, CRED(), NULL))
		str = "fsync";
	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
	VN_RELE(vp);

	if (rc) {
		cpr_err(CE_WARN, "%s error %d, file \"%s\"",
		    str, rc, cpr_default_path);
	}
	return (rc);
}

/*
 * Clear the magic number in the defaults file.  This tells the booter
 * program that the state file is not current and thus prevents
 * any attempt to restore from an obsolete state file.
 */
void
cpr_clear_definfo(void)
{
	struct vnode *vp;
	cmini_t mini;

	if ((CPR->c_cprboot_magic != CPR_DEFAULT_MAGIC) ||
	    cpr_open_deffile(FCREAT|FWRITE, &vp))
		return;
	mini.magic = mini.reusable = 0;
	(void) cpr_rdwr(UIO_WRITE, vp, &mini, sizeof (mini));
	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
	VN_RELE(vp);
}

/*
 * If the cpr default file is invalid, then we must not be in reusable mode;
 * if it is valid, it tells us our mode
 */
int
cpr_get_reusable_mode(void)
{
	struct vnode *vp;
	cmini_t mini;
	int rc;

	if (cpr_open(cpr_default_path, FREAD, &vp))
		return (0);

	rc = cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini));
	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
	VN_RELE(vp);
	if (rc == 0 && mini.magic == CPR_DEFAULT_MAGIC)
		return (mini.reusable);

	return (0);
}
#endif

/*
 * clock/time related routines
 */
static time_t cpr_time_stamp;


void
cpr_tod_get(cpr_time_t *ctp)
{
	timestruc_t ts;

	mutex_enter(&tod_lock);
	ts = TODOP_GET(tod_ops);
	mutex_exit(&tod_lock);
	ctp->tv_sec = (time32_t)ts.tv_sec;
	ctp->tv_nsec = (int32_t)ts.tv_nsec;
}

void
cpr_tod_status_set(int tod_flag)
{
	mutex_enter(&tod_lock);
	tod_status_set(tod_flag);
	mutex_exit(&tod_lock);
}

void
cpr_save_time(void)
{
	cpr_time_stamp = gethrestime_sec();
}

/*
 * correct time based on saved time stamp or hardware clock
 */
void
cpr_restore_time(void)
{
	clkset(cpr_time_stamp);
}

#if defined(__sparc)
/*
 * CPU ONLINE/OFFLINE CODE
 */
int
cpr_mp_offline(void)
{
	cpu_t *cp, *bootcpu;
	int rc = 0;
	int brought_up_boot = 0;

	/*
	 * Do nothing for UP.
	 */
	if (ncpus == 1)
		return (0);

	mutex_enter(&cpu_lock);

	cpr_save_mp_state();

	bootcpu = i_cpr_bootcpu();
	if (!CPU_ACTIVE(bootcpu)) {
		if ((rc = cpr_p_online(bootcpu, CPU_CPR_ONLINE))) {
			mutex_exit(&cpu_lock);
			return (rc);
		}
		brought_up_boot = 1;
	}

	cp = cpu_list;
	do {
		if (cp == bootcpu)
			continue;
		if (cp->cpu_flags & CPU_OFFLINE)
			continue;
		if ((rc = cpr_p_online(cp, CPU_CPR_OFFLINE))) {
			mutex_exit(&cpu_lock);
			return (rc);
		}
	} while ((cp = cp->cpu_next) != cpu_list);
	if (brought_up_boot && (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)))
		prom_printf("changed cpu %p to state %d\n",
		    (void *)bootcpu, CPU_CPR_ONLINE);
	mutex_exit(&cpu_lock);

	return (rc);
}

int
cpr_mp_online(void)
{
	cpu_t *cp, *bootcpu = CPU;
	int rc = 0;

	/*
	 * Do nothing for UP.
	 */
	if (ncpus == 1)
		return (0);

	/*
	 * cpr_save_mp_state() sets CPU_CPR_ONLINE in cpu_cpr_flags
	 * to indicate a cpu was online at the time of cpr_suspend();
	 * now restart those cpus that were marked as CPU_CPR_ONLINE
	 * and actually are offline.
	 */
	mutex_enter(&cpu_lock);
	for (cp = bootcpu->cpu_next; cp != bootcpu; cp = cp->cpu_next) {
		/*
		 * Clear the CPU_FROZEN flag in all cases.
		 */
		cp->cpu_flags &= ~CPU_FROZEN;

		if (CPU_CPR_IS_OFFLINE(cp))
			continue;
		if (CPU_ACTIVE(cp))
			continue;
		if ((rc = cpr_p_online(cp, CPU_CPR_ONLINE))) {
			mutex_exit(&cpu_lock);
			return (rc);
		}
	}

	/*
	 * turn off the boot cpu if it was offlined
	 */
	if (CPU_CPR_IS_OFFLINE(bootcpu)) {
		if ((rc = cpr_p_online(bootcpu, CPU_CPR_OFFLINE))) {
			mutex_exit(&cpu_lock);
			return (rc);
		}
	}
	mutex_exit(&cpu_lock);
	return (0);
}

static void
cpr_save_mp_state(void)
{
	cpu_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	cp = cpu_list;
	do {
		cp->cpu_cpr_flags &= ~CPU_CPR_ONLINE;
		if (CPU_ACTIVE(cp))
			CPU_SET_CPR_FLAGS(cp, CPU_CPR_ONLINE);
	} while ((cp = cp->cpu_next) != cpu_list);
}

/*
 * change cpu to online/offline
 */
static int
cpr_p_online(cpu_t *cp, int state)
{
	int rc;

	ASSERT(MUTEX_HELD(&cpu_lock));

	switch (state) {
	case CPU_CPR_ONLINE:
		rc = cpu_online(cp);
		break;
	case CPU_CPR_OFFLINE:
		rc = cpu_offline(cp, CPU_FORCED);
		break;
	}
	if (rc) {
		cpr_err(CE_WARN, "Failed to change processor %d to "
		    "state %d, (errno %d)", cp->cpu_id, state, rc);
	}
	return (rc);
}

/*
 * Construct the pathname of the state file and return a pointer to
 * caller.  Read the config file to get the mount point of the
 * filesystem and the pathname within fs.
 */
char *
cpr_build_statefile_path(void)
{
	struct cprconfig *cf = &cprconfig;

	if (cpr_get_config())
		return (NULL);

	switch (cf->cf_type) {
	case CFT_UFS:
		if (strlen(cf->cf_path) + strlen(cf->cf_fs) >= MAXNAMELEN - 1) {
			cpr_err(CE_CONT, "Statefile path is too long.\n");
			return (NULL);
		}
		return (cpr_cprconfig_to_path());
	case CFT_ZVOL:
		/*FALLTHROUGH*/
	case CFT_SPEC:
		return (cf->cf_devfs);
	default:
		cpr_err(CE_PANIC, "invalid statefile type");
		/*NOTREACHED*/
		return (NULL);
	}
}

int
cpr_statefile_is_spec(void)
{
	if (cpr_get_config())
		return (0);
	return (cprconfig.cf_type == CFT_SPEC);
}

char *
cpr_get_statefile_prom_path(void)
{
	struct cprconfig *cf = &cprconfig;

	ASSERT(cprconfig_loaded);
	ASSERT(cf->cf_magic == CPR_CONFIG_MAGIC);
	ASSERT(cf->cf_type == CFT_SPEC || cf->cf_type == CFT_ZVOL);
	return (cf->cf_dev_prom);
}


/*
 * XXX The following routines need to be in the vfs source code.
 */

int
cpr_is_ufs(struct vfs *vfsp)
{
	char *fsname;

	fsname = vfssw[vfsp->vfs_fstype].vsw_name;
	return (strcmp(fsname, "ufs") == 0);
}

int
cpr_is_zfs(struct vfs *vfsp)
{
	char *fsname;

	fsname = vfssw[vfsp->vfs_fstype].vsw_name;
	return (strcmp(fsname, "zfs") == 0);
}

/*
 * This is a list of file systems that are allowed to be writeable when a
 * reusable statefile checkpoint is taken.  They must not have any state that
 * cannot be restored to consistency by simply rebooting using the checkpoint.
 * (In contrast to ufs and pcfs which have disk state that could get
 * out of sync with the in-kernel data).
 */
int
cpr_reusable_mount_check(void)
{
	struct vfs *vfsp;
	char *fsname;
	char **cpp;
	static char *cpr_writeok_fss[] = {
		"autofs", "devfs", "fd", "lofs", "mntfs", "namefs", "nfs",
		"proc", "tmpfs", "ctfs", "objfs", "dev", NULL
	};

	vfs_list_read_lock();
	vfsp = rootvfs;
	do {
		if (vfsp->vfs_flag & VFS_RDONLY) {
			vfsp = vfsp->vfs_next;
			continue;
		}
		fsname = vfssw[vfsp->vfs_fstype].vsw_name;
		for (cpp = cpr_writeok_fss; *cpp; cpp++) {
			if (strcmp(fsname, *cpp) == 0)
				break;
		}
		/*
		 * if the inner loop reached the NULL terminator,
		 * the current fs-type does not match any OK-type
		 */
		if (*cpp == NULL) {
			cpr_err(CE_CONT, "a filesystem of type %s is "
			    "mounted read/write.\nReusable statefile requires "
			    "no writeable filesystem of this type be mounted\n",
			    fsname);
			vfs_list_unlock();
			return (EINVAL);
		}
		vfsp = vfsp->vfs_next;
	} while (vfsp != rootvfs);
	vfs_list_unlock();
	return (0);
}

/*
 * return statefile offset in DEV_BSIZE units
 */
int
cpr_statefile_offset(void)
{
	return (cprconfig.cf_type != CFT_UFS ? btod(CPR_SPEC_OFFSET) : 0);
}

/*
 * Force a fresh read of the cprinfo per uadmin 3 call
 */
void
cpr_forget_cprconfig(void)
{
	cprconfig_loaded = 0;
}
#endif