/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 */

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/cpuvar.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/pathname.h>
#include <sys/callb.h>
#include <sys/fs/ufs_inode.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>	/* for swapfs_minfree */
#include <sys/kmem.h>
#include <sys/cpr.h>
#include <sys/conf.h>
#include <sys/machclock.h>

/*
 * CPR miscellaneous support routines
 */
#define	cpr_open(path, mode, vpp)	(vn_open(path, UIO_SYSSPACE, \
	mode, 0600, vpp, CRCREAT, 0))
#define	cpr_rdwr(rw, vp, basep, cnt)	(vn_rdwr(rw, vp, (caddr_t)(basep), \
	cnt, 0LL, UIO_SYSSPACE, 0, (rlim64_t)MAXOFF_T, CRED(), \
	(ssize_t *)NULL))

extern void clkset(time_t);
extern cpu_t *i_cpr_bootcpu(void);
extern caddr_t i_cpr_map_setup(void);
extern void i_cpr_free_memory_resources(void);

extern kmutex_t cpr_slock;
extern size_t cpr_buf_size;
extern char *cpr_buf;
extern size_t cpr_pagedata_size;
extern char *cpr_pagedata;
extern int cpr_bufs_allocated;
extern int cpr_bitmaps_allocated;

#if defined(__sparc)
static struct cprconfig cprconfig;
static int cprconfig_loaded = 0;
static int cpr_statefile_ok(vnode_t *, int);
static int cpr_p_online(cpu_t *, int);
static void cpr_save_mp_state(void);
#endif

int cpr_is_ufs(struct vfs *);
int cpr_is_zfs(struct vfs *);

char cpr_default_path[] = CPR_DEFAULT;

#define	COMPRESS_PERCENT 40	/* approx compression ratio in percent */
#define	SIZE_RATE	115	/* increase size by 15% */
#define	INTEGRAL	100	/* for integer math */


/*
 * cmn_err() followed by a 1/4 second delay; this gives the
 * logging service a chance to flush messages and helps avoid
 * intermixing output from prom_printf().
 */
/*PRINTFLIKE2*/
void
cpr_err(int ce, const char *fmt, ...)
{
	va_list adx;

	va_start(adx, fmt);
	vcmn_err(ce, fmt, adx);
	va_end(adx);
	drv_usecwait(MICROSEC >> 2);
}

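/*
 * Illustrative sketch (not compiled) of how the cpr_open/cpr_rdwr
 * wrappers above are typically used, mirroring cpr_get_reusable_mode()
 * later in this file: open, read a structure, then close and release
 * the vnode.
 *
 *	struct vnode *vp;
 *	cmini_t mini;
 *
 *	if (cpr_open(cpr_default_path, FREAD, &vp) == 0) {
 *		(void) cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini));
 *		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 *		VN_RELE(vp);
 *	}
 */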
int
cpr_init(int fcn)
{
	/*
	 * Allow only one suspend/resume process.
	 */
	if (mutex_tryenter(&cpr_slock) == 0)
		return (EBUSY);

	CPR->c_flags = 0;
	CPR->c_substate = 0;
	CPR->c_cprboot_magic = 0;
	CPR->c_alloc_cnt = 0;

	CPR->c_fcn = fcn;
	if (fcn == AD_CPR_REUSABLE)
		CPR->c_flags |= C_REUSABLE;
	else
		CPR->c_flags |= C_SUSPENDING;
	if (fcn == AD_SUSPEND_TO_RAM || fcn == DEV_SUSPEND_TO_RAM) {
		return (0);
	}
#if defined(__sparc)
	if (fcn != AD_CPR_NOCOMPRESS && fcn != AD_CPR_TESTNOZ)
		CPR->c_flags |= C_COMPRESSING;
	/*
	 * reserve CPR_MAXCONTIG virtual pages for cpr_dump()
	 */
	CPR->c_mapping_area = i_cpr_map_setup();
	if (CPR->c_mapping_area == 0) {		/* no space in kernelmap */
		cpr_err(CE_CONT, "Unable to alloc from kernelmap.\n");
		mutex_exit(&cpr_slock);
		return (EAGAIN);
	}
	if (cpr_debug & CPR_DEBUG3)
		cpr_err(CE_CONT, "Reserved virtual range from 0x%p for writing "
		    "kas\n", (void *)CPR->c_mapping_area);
#endif

	return (0);
}

/*
 * This routine releases any resources used during the checkpoint.
 */
void
cpr_done(void)
{
	cpr_stat_cleanup();
	i_cpr_bitmap_cleanup();

	/*
	 * Free pages used by cpr buffers.
	 */
	if (cpr_buf) {
		kmem_free(cpr_buf, cpr_buf_size);
		cpr_buf = NULL;
	}
	if (cpr_pagedata) {
		kmem_free(cpr_pagedata, cpr_pagedata_size);
		cpr_pagedata = NULL;
	}

	i_cpr_free_memory_resources();
	mutex_exit(&cpr_slock);
	cpr_err(CE_CONT, "System has been resumed.\n");
}


#if defined(__sparc)
/*
 * reads config data into cprconfig
 */
static int
cpr_get_config(void)
{
	static char config_path[] = CPR_CONFIG;
	struct cprconfig *cf = &cprconfig;
	struct vnode *vp;
	char *fmt;
	int err;

	if (cprconfig_loaded)
		return (0);

	fmt = "cannot %s config file \"%s\", error %d\n";
	if (err = vn_open(config_path, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0)) {
		cpr_err(CE_CONT, fmt, "open", config_path, err);
		return (err);
	}

	err = cpr_rdwr(UIO_READ, vp, cf, sizeof (*cf));
	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
	VN_RELE(vp);
	if (err) {
		cpr_err(CE_CONT, fmt, "read", config_path, err);
		return (err);
	}

	if (cf->cf_magic == CPR_CONFIG_MAGIC)
		cprconfig_loaded = 1;
	else {
		cpr_err(CE_CONT, "invalid config file \"%s\", "
		    "rerun pmconfig(1M)\n", config_path);
		err = EINVAL;
	}

	return (err);
}


/*
 * concat fs and path fields of the cprconfig structure;
 * returns pointer to the base of static data
 */
static char *
cpr_cprconfig_to_path(void)
{
	static char full_path[MAXNAMELEN];
	struct cprconfig *cf = &cprconfig;
	char *ptr;

	/*
	 * build /fs/path without extra '/'
	 */
	(void) strcpy(full_path, cf->cf_fs);
	if (strcmp(cf->cf_fs, "/"))
		(void) strcat(full_path, "/");
	ptr = cf->cf_path;
	if (*ptr == '/')
		ptr++;
	(void) strcat(full_path, ptr);
	return (full_path);
}

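/*
 * Worked example for cpr_cprconfig_to_path() above, using hypothetical
 * config values: with cf_fs = "/export" and cf_path = "/.CPR" it returns
 * "/export/.CPR"; with cf_fs = "/" it avoids the doubled slash and
 * returns "/.CPR".
 */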
/*
 * Verify that the information in the configuration file regarding the
 * location for the statefile is still valid, depending on cf_type.
 * for CFT_UFS, cf_fs must still be a mounted filesystem, it must be
 *	mounted on the same device as when pmconfig was last run,
 *	and the translation of that device to a node in the prom's
 *	device tree must be the same as when pmconfig was last run.
 * for CFT_SPEC and CFT_ZVOL, cf_path must be the path to a block
 *	special file, it must have no file system mounted on it,
 *	and the translation of that device to a node in the prom's
 *	device tree must be the same as when pmconfig was last run.
 */
static int
cpr_verify_statefile_path(void)
{
	struct cprconfig *cf = &cprconfig;
	static const char long_name[] = "Statefile pathname is too long.\n";
	static const char lookup_fmt[] = "Lookup failed for "
	    "cpr statefile device %s.\n";
	static const char path_chg_fmt[] = "Device path for statefile "
	    "has changed from %s to %s.\t%s\n";
	static const char rerun[] = "Please rerun pmconfig(1m).";
	struct vfs *vfsp = NULL, *vfsp_save = rootvfs;
	ufsvfs_t *ufsvfsp = (ufsvfs_t *)rootvfs->vfs_data;
	ufsvfs_t *ufsvfsp_save = ufsvfsp;
	int error;
	struct vnode *vp;
	char *slash, *tail, *longest;
	char *errstr;
	int found = 0;
	union {
		char un_devpath[OBP_MAXPATHLEN];
		char un_sfpath[MAXNAMELEN];
	} un;
#define	devpath	un.un_devpath
#define	sfpath	un.un_sfpath

	ASSERT(cprconfig_loaded);
	/*
	 * We need not worry about locking or the timing of releasing
	 * the vnode, since we are single-threaded now.
	 */

	switch (cf->cf_type) {
	case CFT_SPEC:
		error = i_devname_to_promname(cf->cf_devfs, devpath,
		    OBP_MAXPATHLEN);
		if (error || strcmp(devpath, cf->cf_dev_prom)) {
			cpr_err(CE_CONT, path_chg_fmt,
			    cf->cf_dev_prom, devpath, rerun);
			return (error);
		}
		/*FALLTHROUGH*/
	case CFT_ZVOL:
		if (strlen(cf->cf_path) > sizeof (sfpath)) {
			cpr_err(CE_CONT, long_name);
			return (ENAMETOOLONG);
		}
		if ((error = lookupname(cf->cf_devfs,
		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
			cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
			return (error);
		}
		if (vp->v_type != VBLK)
			errstr = "statefile must be a block device";
		else if (vfs_devismounted(vp->v_rdev))
			errstr = "statefile device must not "
			    "have a file system mounted on it";
		else if (IS_SWAPVP(vp))
			errstr = "statefile device must not "
			    "be configured as swap file";
		else
			errstr = NULL;

		VN_RELE(vp);
		if (errstr) {
			cpr_err(CE_CONT, "%s.\n", errstr);
			return (ENOTSUP);
		}

		return (error);
	case CFT_UFS:
		break;		/* don't indent all the original code */
	default:
		cpr_err(CE_PANIC, "invalid cf_type");
	}

	/*
	 * The original code for UFS statefile
	 */
	if (strlen(cf->cf_fs) + strlen(cf->cf_path) + 2 > sizeof (sfpath)) {
		cpr_err(CE_CONT, long_name);
		return (ENAMETOOLONG);
	}

	bzero(sfpath, sizeof (sfpath));
	(void) strcpy(sfpath, cpr_cprconfig_to_path());

	if (*sfpath != '/') {
		cpr_err(CE_CONT, "Statefile pathname %s "
		    "must begin with a /\n", sfpath);
		return (EINVAL);
	}

	/*
	 * Find the longest prefix of the statefile pathname which
	 * is the mountpoint of a filesystem. This string must
	 * match the cf_fs field we read from the config file. Other-
	 * wise the user has changed things without running pmconfig.
	 */
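	/*
	 * Hypothetical example of the walk below: for
	 * sfpath = "/export/home/.CPR" the loop tests the prefixes
	 * "/export" and "/export/home" in turn; if "/export/home" is a
	 * mountpoint, "longest" ends up at the '/' before ".CPR", and
	 * the resulting mountpoint string must equal cf_fs.
	 */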
	tail = longest = sfpath + 1;	/* pt beyond the leading "/" */
	while ((slash = strchr(tail, '/')) != NULL) {
		*slash = '\0';	/* temporarily terminate the string */
		if ((error = lookupname(sfpath,
		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
			*slash = '/';
			cpr_err(CE_CONT, "A directory in the "
			    "statefile path %s was not found.\n", sfpath);
			VN_RELE(vp);

			return (error);
		}

		vfs_list_read_lock();
		vfsp = rootvfs;
		do {
			ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
			if (ufsvfsp != NULL && ufsvfsp->vfs_root == vp) {
				found = 1;
				break;
			}
			vfsp = vfsp->vfs_next;
		} while (vfsp != rootvfs);
		vfs_list_unlock();

		/*
		 * If we have found a filesystem mounted on the current
		 * path prefix, remember the end of the string in
		 * "longest". If it happens to be the exact fs
		 * saved in the configuration file, save the current
		 * ufsvfsp so we can make additional checks further down.
		 */
		if (found) {
			longest = slash;
			if (strcmp(cf->cf_fs, sfpath) == 0) {
				ufsvfsp_save = ufsvfsp;
				vfsp_save = vfsp;
			}
			found = 0;
		}

		VN_RELE(vp);
		*slash = '/';
		tail = slash + 1;
	}
	*longest = '\0';
	if (cpr_is_ufs(vfsp_save) == 0 || strcmp(cf->cf_fs, sfpath)) {
		cpr_err(CE_CONT, "Filesystem containing "
		    "the statefile when pmconfig was run (%s) has "
		    "changed to %s. %s\n", cf->cf_fs, sfpath, rerun);
		return (EINVAL);
	}

	if ((error = lookupname(cf->cf_devfs,
	    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
		cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
		return (error);
	}

	if (ufsvfsp_save->vfs_devvp->v_rdev != vp->v_rdev) {
		cpr_err(CE_CONT, "Filesystem containing "
		    "statefile no longer mounted on device %s. "
		    "See power.conf(4).", cf->cf_devfs);
		VN_RELE(vp);
		return (ENXIO);
	}
	VN_RELE(vp);

	error = i_devname_to_promname(cf->cf_devfs, devpath, OBP_MAXPATHLEN);
	if (error || strcmp(devpath, cf->cf_dev_prom)) {
		cpr_err(CE_CONT, path_chg_fmt,
		    cf->cf_dev_prom, devpath, rerun);
		return (error);
	}

	return (0);
}

/*
 * Make sure that the statefile can be used as a block special statefile
 * (meaning that it exists and has nothing mounted on it).
 * Returns errno if not a valid statefile.
 */
int
cpr_check_spec_statefile(void)
{
	int err;

	if (err = cpr_get_config())
		return (err);
	ASSERT(cprconfig.cf_type == CFT_SPEC ||
	    cprconfig.cf_type == CFT_ZVOL);

	if (cprconfig.cf_devfs == NULL)
		return (ENXIO);

	return (cpr_verify_statefile_path());
}

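/*
 * Note on the allocation/retry flow below (assumed from the comments and
 * C_MAX_ALLOC_RETRY): on the first pass cpr_alloc_statefile() opens the
 * statefile and cpr_statefile_ok() estimates and reserves its size; if a
 * later dump attempt finds the file too small, the caller retries with
 * alloc_retry set, the previous reservation is freed via VOP_DUMPCTL, and
 * cpr_statefile_ok() grows the target by SIZE_RATE/INTEGRAL (15%) before
 * the blocks are reserved again.
 */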
int
cpr_alloc_statefile(int alloc_retry)
{
	register int rc = 0;
	char *str;

	/*
	 * Statefile size validation. If this is the first checkpoint, disk
	 * block allocation will be done; otherwise, just do a file size
	 * check. If statefile allocation is being retried, C_VP will
	 * already be initialized.
	 */
	if (alloc_retry) {
		str = "\n-->Retrying statefile allocation...";
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
			prom_printf(str);
		if (C_VP->v_type != VBLK)
			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
	} else {
		/*
		 * Open an existing file for writing; the state file needs to
		 * be pre-allocated since we can't and don't want to do
		 * allocation during checkpoint (too much of the OS is
		 * disabled).
		 * - do a preliminary size check here; if it is too small,
		 *   allocate more space internally and retry.
		 * - check the vp to make sure it's the right type.
		 */
		char *path = cpr_build_statefile_path();

		if (path == NULL)
			return (ENXIO);
		else if (rc = cpr_verify_statefile_path())
			return (rc);

		if (rc = vn_open(path, UIO_SYSSPACE,
		    FCREAT|FWRITE, 0600, &C_VP, CRCREAT, 0)) {
			cpr_err(CE_WARN, "cannot open statefile %s", path);
			return (rc);
		}
	}

	/*
	 * Only ufs and block special statefiles supported
	 */
	if (C_VP->v_type != VREG && C_VP->v_type != VBLK) {
		cpr_err(CE_CONT,
		    "Statefile must be regular file or block special file.");
		return (EACCES);
	}

	if (rc = cpr_statefile_ok(C_VP, alloc_retry))
		return (rc);

	if (C_VP->v_type != VBLK) {
		/*
		 * sync out the fs change due to the statefile reservation.
		 */
		(void) VFS_SYNC(C_VP->v_vfsp, 0, CRED());

		/*
		 * Validate disk block allocation for the state file.
		 * Ask the file system to prepare itself for the dump
		 * operation.
		 */
		if (rc = VOP_DUMPCTL(C_VP, DUMP_ALLOC, NULL, NULL)) {
			cpr_err(CE_CONT, "Error allocating "
			    "blocks for cpr statefile.");
			return (rc);
		}
	}
	return (0);
}


/*
 * Lookup device size and return available space in bytes.
 * NOTE: Since prop_op(9E) can't tell the difference between a character
 * and a block reference, it is ok to ask for "Size" instead of "Nblocks".
 */
size_t
cpr_get_devsize(dev_t dev)
{
	size_t bytes = 0;

	bytes = cdev_Size(dev);
	if (bytes == 0)
		bytes = cdev_size(dev);

	if (bytes > CPR_SPEC_OFFSET)
		bytes -= CPR_SPEC_OFFSET;
	else
		bytes = 0;

	return (bytes);
}


/*
 * increase statefile size
 */
static int
cpr_grow_statefile(vnode_t *vp, u_longlong_t newsize)
{
	extern uchar_t cpr_pagecopy[];
	struct inode *ip = VTOI(vp);
	u_longlong_t offset;
	int error, increase;
	ssize_t resid;

	rw_enter(&ip->i_contents, RW_READER);
	increase = (ip->i_size < newsize);
	offset = ip->i_size;
	rw_exit(&ip->i_contents);

	if (increase == 0)
		return (0);

	/*
	 * write to each logical block to reserve disk space
	 */
	error = 0;
	cpr_pagecopy[0] = '1';
	for (; offset < newsize; offset += ip->i_fs->fs_bsize) {
		if (error = vn_rdwr(UIO_WRITE, vp, (caddr_t)cpr_pagecopy,
		    ip->i_fs->fs_bsize, (offset_t)offset, UIO_SYSSPACE, 0,
		    (rlim64_t)MAXOFF_T, CRED(), &resid)) {
			if (error == ENOSPC) {
				cpr_err(CE_WARN, "error %d while reserving "
				    "disk space for statefile %s\n"
				    "wanted %lld bytes, file is %lld short",
				    error, cpr_cprconfig_to_path(),
				    newsize, newsize - offset);
			}
			break;
		}
	}
	return (error);
}

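/*
 * Note on cpr_grow_statefile() above: it reserves space by writing one
 * filesystem block at each block-aligned offset past the current EOF,
 * which forces block allocation without streaming the eventual statefile
 * contents; e.g. (hypothetically) growing the file by 64 MB with an 8 KB
 * fs_bsize issues roughly 8192 block-sized writes.
 */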
/*
 * do a simple estimate of the space needed to hold the statefile
 * taking compression into account, but be fairly conservative
 * so we have a better chance of completing; when dump fails,
 * the retry cost is fairly high.
 *
 * Do disk blocks allocation for the state file if no space has
 * been allocated yet. Since the state file will not be removed,
 * allocation should only be done once.
 */
static int
cpr_statefile_ok(vnode_t *vp, int alloc_retry)
{
	extern size_t cpr_bitmap_size;
	struct inode *ip = VTOI(vp);
	const int UCOMP_RATE = 20; /* comp. ratio*10 for user pages */
	u_longlong_t size, isize, ksize, raw_data;
	char *str, *est_fmt;
	size_t space;
	int error;

	/*
	 * number of pages short for swapping.
	 */
	STAT->cs_nosw_pages = k_anoninfo.ani_mem_resv;
	if (STAT->cs_nosw_pages < 0)
		STAT->cs_nosw_pages = 0;

	str = "cpr_statefile_ok:";

	CPR_DEBUG(CPR_DEBUG9, "Phys swap: max=%lu resv=%lu\n",
	    k_anoninfo.ani_max, k_anoninfo.ani_phys_resv);
	CPR_DEBUG(CPR_DEBUG9, "Mem swap: max=%ld resv=%lu\n",
	    MAX(availrmem - swapfs_minfree, 0),
	    k_anoninfo.ani_mem_resv);
	CPR_DEBUG(CPR_DEBUG9, "Total available swap: %ld\n",
	    CURRENT_TOTAL_AVAILABLE_SWAP);

	/*
	 * try increasing filesize by 15%
	 */
	if (alloc_retry) {
		/*
		 * block device doesn't get any bigger
		 */
		if (vp->v_type == VBLK) {
			if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
				prom_printf(
				    "Retry statefile on special file\n");
			return (ENOMEM);
		} else {
			rw_enter(&ip->i_contents, RW_READER);
			size = (ip->i_size * SIZE_RATE) / INTEGRAL;
			rw_exit(&ip->i_contents);
		}
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
			prom_printf("Retry statefile size = %lld\n", size);
	} else {
		u_longlong_t cpd_size;
		pgcnt_t npages, nback;
		int ndvram;

		ndvram = 0;
		(void) callb_execute_class(CB_CL_CPR_FB,
		    (int)(uintptr_t)&ndvram);
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
			prom_printf("ndvram size = %d\n", ndvram);

		/*
		 * estimate 1 cpd_t for every (CPR_MAXCONTIG / 2) pages
		 */
		npages = cpr_count_kpages(REGULAR_BITMAP, cpr_nobit);
		cpd_size = sizeof (cpd_t) * (npages / (CPR_MAXCONTIG / 2));
		raw_data = cpd_size + cpr_bitmap_size;
		ksize = ndvram + mmu_ptob(npages);

		est_fmt = "%s estimated size with "
		    "%scompression %lld, ksize %lld\n";
		nback = mmu_ptob(STAT->cs_nosw_pages);
		if (CPR->c_flags & C_COMPRESSING) {
			size = ((ksize * COMPRESS_PERCENT) / INTEGRAL) +
			    raw_data + ((nback * 10) / UCOMP_RATE);
			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "", size, ksize);
		} else {
			size = ksize + raw_data + nback;
			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "no ",
			    size, ksize);
		}
	}

	/*
	 * All this is much simpler for a block device
	 */
	if (vp->v_type == VBLK) {
		space = cpr_get_devsize(vp->v_rdev);
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
			prom_printf("statefile dev size %lu\n", space);

		/*
		 * Export the estimated filesize info; this value will be
		 * compared before dumping out the statefile in the case of
		 * no compression.
		 */
		STAT->cs_est_statefsz = size;
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
			prom_printf("%s Estimated statefile size %llu, "
			    "space %lu\n", str, size, space);
		if (size > space) {
			cpr_err(CE_CONT, "Statefile partition too small.");
			return (ENOMEM);
		}
		return (0);
	} else {
		if (CPR->c_alloc_cnt++ > C_MAX_ALLOC_RETRY) {
			cpr_err(CE_CONT, "Statefile allocation retry failed\n");
			return (ENOMEM);
		}

		/*
		 * Estimate space needed for the state file.
		 *
		 * State file size in bytes:
		 *	kernel size + non-cache pte seg +
		 *	bitmap size + cpr state file headers size
		 * (round up to fs->fs_bsize)
		 */
		size = blkroundup(ip->i_fs, size);

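		/*
		 * Hypothetical worked example of the estimate above, with
		 * compression enabled: for ksize = 400 MB of kernel pages,
		 * raw_data = 2 MB and nback = 100 MB of user (anon) pages,
		 * size = 400 * 40/100 + 2 + 100 * 10/20 = 212 MB, rounded
		 * up to a multiple of fs_bsize by blkroundup().
		 */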
		/*
		 * Export the estimated filesize info; this value will be
		 * compared before dumping out the statefile in the case of
		 * no compression.
		 */
		STAT->cs_est_statefsz = size;
		error = cpr_grow_statefile(vp, size);
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) {
			rw_enter(&ip->i_contents, RW_READER);
			isize = ip->i_size;
			rw_exit(&ip->i_contents);
			prom_printf("%s Estimated statefile size %lld, "
			    "i_size %lld\n", str, size, isize);
		}

		return (error);
	}
}


void
cpr_statef_close(void)
{
	if (C_VP) {
		if (!cpr_reusable_mode)
			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
		(void) VOP_CLOSE(C_VP, FWRITE, 1, (offset_t)0, CRED(), NULL);
		VN_RELE(C_VP);
		C_VP = 0;
	}
}


/*
 * open cpr default file and display error
 */
int
cpr_open_deffile(int mode, vnode_t **vpp)
{
	int error;

	if (error = cpr_open(cpr_default_path, mode, vpp))
		cpr_err(CE_CONT, "cannot open \"%s\", error %d\n",
		    cpr_default_path, error);
	return (error);
}


/*
 * write cdef_t to disk. This contains the original values of prom
 * properties that we modify. We fill in the magic number of the file
 * here as a signal to the booter code that the state file is valid.
 * Be sure the file gets synced, since we may be shutting down the OS.
 */
int
cpr_write_deffile(cdef_t *cdef)
{
	struct vnode *vp;
	char *str;
	int rc;

	if (rc = cpr_open_deffile(FCREAT|FWRITE, &vp))
		return (rc);

	if (rc = cpr_rdwr(UIO_WRITE, vp, cdef, sizeof (*cdef)))
		str = "write";
	else if (rc = VOP_FSYNC(vp, FSYNC, CRED(), NULL))
		str = "fsync";
	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
	VN_RELE(vp);

	if (rc) {
		cpr_err(CE_WARN, "%s error %d, file \"%s\"",
		    str, rc, cpr_default_path);
	}
	return (rc);
}

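/*
 * Assumed usage of the two routines around this point: cpr_write_deffile()
 * is called while suspending, so the magic number marks the statefile as
 * current for the booter, and cpr_clear_definfo() is called once that
 * statefile is no longer valid, preventing a resume from stale data.
 */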
/*
 * Clear the magic number in the defaults file. This tells the booter
 * program that the state file is not current and thus prevents
 * any attempt to restore from an obsolete state file.
 */
void
cpr_clear_definfo(void)
{
	struct vnode *vp;
	cmini_t mini;

	if ((CPR->c_cprboot_magic != CPR_DEFAULT_MAGIC) ||
	    cpr_open_deffile(FCREAT|FWRITE, &vp))
		return;
	mini.magic = mini.reusable = 0;
	(void) cpr_rdwr(UIO_WRITE, vp, &mini, sizeof (mini));
	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
	VN_RELE(vp);
}

/*
 * If the cpr default file is invalid, then we must not be in reusable
 * mode; if it is valid, it tells us our mode.
 */
int
cpr_get_reusable_mode(void)
{
	struct vnode *vp;
	cmini_t mini;
	int rc;

	if (cpr_open(cpr_default_path, FREAD, &vp))
		return (0);

	rc = cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini));
	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
	VN_RELE(vp);
	if (rc == 0 && mini.magic == CPR_DEFAULT_MAGIC)
		return (mini.reusable);

	return (0);
}
#endif

/*
 * clock/time related routines
 */
static time_t cpr_time_stamp;


void
cpr_tod_get(cpr_time_t *ctp)
{
	timestruc_t ts;

	mutex_enter(&tod_lock);
	ts = TODOP_GET(tod_ops);
	mutex_exit(&tod_lock);
	ctp->tv_sec = (time32_t)ts.tv_sec;
	ctp->tv_nsec = (int32_t)ts.tv_nsec;
}

void
cpr_tod_status_set(int tod_flag)
{
	mutex_enter(&tod_lock);
	tod_status_set(tod_flag);
	mutex_exit(&tod_lock);
}

void
cpr_save_time(void)
{
	cpr_time_stamp = gethrestime_sec();
}

/*
 * correct time based on saved time stamp or hardware clock
 */
void
cpr_restore_time(void)
{
	clkset(cpr_time_stamp);
}

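/*
 * Overview of the MP handling below (sparc only): cpr_save_mp_state()
 * records each active cpu as CPU_CPR_ONLINE, cpr_mp_offline() then takes
 * every cpu except the boot cpu offline for the suspend, and
 * cpr_mp_online() restarts exactly the cpus that were marked online,
 * offlining the boot cpu again if it had been offline beforehand.
 */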
#if defined(__sparc)
/*
 * CPU ONLINE/OFFLINE CODE
 */
int
cpr_mp_offline(void)
{
	cpu_t *cp, *bootcpu;
	int rc = 0;
	int brought_up_boot = 0;

	/*
	 * Do nothing for UP.
	 */
	if (ncpus == 1)
		return (0);

	mutex_enter(&cpu_lock);

	cpr_save_mp_state();

	bootcpu = i_cpr_bootcpu();
	if (!CPU_ACTIVE(bootcpu)) {
		if ((rc = cpr_p_online(bootcpu, CPU_CPR_ONLINE))) {
			mutex_exit(&cpu_lock);
			return (rc);
		}
		brought_up_boot = 1;
	}

	cp = cpu_list;
	do {
		if (cp == bootcpu)
			continue;
		if (cp->cpu_flags & CPU_OFFLINE)
			continue;
		if ((rc = cpr_p_online(cp, CPU_CPR_OFFLINE))) {
			mutex_exit(&cpu_lock);
			return (rc);
		}
	} while ((cp = cp->cpu_next) != cpu_list);
	if (brought_up_boot && (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)))
		prom_printf("changed cpu %p to state %d\n",
		    (void *)bootcpu, CPU_CPR_ONLINE);
	mutex_exit(&cpu_lock);

	return (rc);
}

int
cpr_mp_online(void)
{
	cpu_t *cp, *bootcpu = CPU;
	int rc = 0;

	/*
	 * Do nothing for UP.
	 */
	if (ncpus == 1)
		return (0);

	/*
	 * cpr_save_mp_state() sets CPU_CPR_ONLINE in cpu_cpr_flags
	 * to indicate a cpu was online at the time of cpr_suspend();
	 * now restart those cpus that were marked as CPU_CPR_ONLINE
	 * and actually are offline.
	 */
	mutex_enter(&cpu_lock);
	for (cp = bootcpu->cpu_next; cp != bootcpu; cp = cp->cpu_next) {
		/*
		 * Clear the CPU_FROZEN flag in all cases.
		 */
		cp->cpu_flags &= ~CPU_FROZEN;

		if (CPU_CPR_IS_OFFLINE(cp))
			continue;
		if (CPU_ACTIVE(cp))
			continue;
		if ((rc = cpr_p_online(cp, CPU_CPR_ONLINE))) {
			mutex_exit(&cpu_lock);
			return (rc);
		}
	}

	/*
	 * turn off the boot cpu if it was offlined
	 */
	if (CPU_CPR_IS_OFFLINE(bootcpu)) {
		if ((rc = cpr_p_online(bootcpu, CPU_CPR_OFFLINE))) {
			mutex_exit(&cpu_lock);
			return (rc);
		}
	}
	mutex_exit(&cpu_lock);
	return (0);
}

static void
cpr_save_mp_state(void)
{
	cpu_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	cp = cpu_list;
	do {
		cp->cpu_cpr_flags &= ~CPU_CPR_ONLINE;
		if (CPU_ACTIVE(cp))
			CPU_SET_CPR_FLAGS(cp, CPU_CPR_ONLINE);
	} while ((cp = cp->cpu_next) != cpu_list);
}

/*
 * change cpu to online/offline
 */
static int
cpr_p_online(cpu_t *cp, int state)
{
	int rc;

	ASSERT(MUTEX_HELD(&cpu_lock));

	switch (state) {
	case CPU_CPR_ONLINE:
		rc = cpu_online(cp, 0);
		break;
	case CPU_CPR_OFFLINE:
		rc = cpu_offline(cp, CPU_FORCED);
		break;
	}
	if (rc) {
		cpr_err(CE_WARN, "Failed to change processor %d to "
		    "state %d, (errno %d)", cp->cpu_id, state, rc);
	}
	return (rc);
}

/*
 * Construct the pathname of the state file and return a pointer to
 * caller. Read the config file to get the mount point of the
 * filesystem and the pathname within fs.
 */
char *
cpr_build_statefile_path(void)
{
	struct cprconfig *cf = &cprconfig;

	if (cpr_get_config())
		return (NULL);

	switch (cf->cf_type) {
	case CFT_UFS:
		if (strlen(cf->cf_path) + strlen(cf->cf_fs) >= MAXNAMELEN - 1) {
			cpr_err(CE_CONT, "Statefile path is too long.\n");
			return (NULL);
		}
		return (cpr_cprconfig_to_path());
	case CFT_ZVOL:
		/*FALLTHROUGH*/
	case CFT_SPEC:
		return (cf->cf_devfs);
	default:
		cpr_err(CE_PANIC, "invalid statefile type");
		/*NOTREACHED*/
		return (NULL);
	}
}

int
cpr_statefile_is_spec(void)
{
	if (cpr_get_config())
		return (0);
	return (cprconfig.cf_type == CFT_SPEC);
}

char *
cpr_get_statefile_prom_path(void)
{
	struct cprconfig *cf = &cprconfig;

	ASSERT(cprconfig_loaded);
	ASSERT(cf->cf_magic == CPR_CONFIG_MAGIC);
	ASSERT(cf->cf_type == CFT_SPEC || cf->cf_type == CFT_ZVOL);
	return (cf->cf_dev_prom);
}


/*
 * XXX The following routines need to be in the vfs source code.
 */

int
cpr_is_ufs(struct vfs *vfsp)
{
	char *fsname;

	fsname = vfssw[vfsp->vfs_fstype].vsw_name;
	return (strcmp(fsname, "ufs") == 0);
}

int
cpr_is_zfs(struct vfs *vfsp)
{
	char *fsname;

	fsname = vfssw[vfsp->vfs_fstype].vsw_name;
	return (strcmp(fsname, "zfs") == 0);
}

/*
 * This is a list of file systems that are allowed to be writeable when a
 * reusable statefile checkpoint is taken. They must not have any state that
 * cannot be restored to consistency by simply rebooting using the checkpoint.
 * (In contrast to ufs and pcfs which have disk state that could get
 * out of sync with the in-kernel data).
 */
int
cpr_reusable_mount_check(void)
{
	struct vfs *vfsp;
	char *fsname;
	char **cpp;
	static char *cpr_writeok_fss[] = {
		"autofs", "devfs", "fd", "lofs", "mntfs", "namefs", "nfs",
		"proc", "tmpfs", "ctfs", "objfs", "dev", NULL
	};

	vfs_list_read_lock();
	vfsp = rootvfs;
	do {
		if (vfsp->vfs_flag & VFS_RDONLY) {
			vfsp = vfsp->vfs_next;
			continue;
		}
		fsname = vfssw[vfsp->vfs_fstype].vsw_name;
		for (cpp = cpr_writeok_fss; *cpp; cpp++) {
			if (strcmp(fsname, *cpp) == 0)
				break;
		}
		/*
		 * if the inner loop reached the NULL terminator,
		 * the current fs-type does not match any OK-type
		 */
		if (*cpp == NULL) {
			cpr_err(CE_CONT, "a filesystem of type %s is "
			    "mounted read/write.\nReusable statefile requires "
			    "no writeable filesystem of this type be mounted\n",
			    fsname);
			vfs_list_unlock();
			return (EINVAL);
		}
		vfsp = vfsp->vfs_next;
	} while (vfsp != rootvfs);
	vfs_list_unlock();
	return (0);
}

/*
 * return statefile offset in DEV_BSIZE units
 */
int
cpr_statefile_offset(void)
{
	return (cprconfig.cf_type != CFT_UFS ? btod(CPR_SPEC_OFFSET) : 0);
}

/*
 * Force a fresh read of the cprinfo per uadmin 3 call
 */
void
cpr_forget_cprconfig(void)
{
	cprconfig_loaded = 0;
}
#endif