xref: /titanic_50/usr/src/uts/common/cpr/cpr_misc.c (revision 9ec394dbf343c1f23c6e13c39df427f238e5a369)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/pathname.h>
#include <sys/callb.h>
#include <sys/fs/ufs_inode.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>	/* for swapfs_minfree */
#include <sys/kmem.h>
#include <sys/cpr.h>
#include <sys/conf.h>
#include <sys/machclock.h>
42 
43 /*
44  * CPR miscellaneous support routines
45  */
46 #define	cpr_open(path, mode,  vpp)	(vn_open(path, UIO_SYSSPACE, \
47 		mode, 0600, vpp, CRCREAT, 0))
48 #define	cpr_rdwr(rw, vp, basep, cnt)	(vn_rdwr(rw, vp,  (caddr_t)(basep), \
49 		cnt, 0LL, UIO_SYSSPACE, 0, (rlim64_t)MAXOFF_T, CRED(), \
50 		(ssize_t *)NULL))
51 
52 extern void clkset(time_t);
53 extern cpu_t *i_cpr_bootcpu(void);
54 extern caddr_t i_cpr_map_setup(void);
55 extern void i_cpr_free_memory_resources(void);
56 
57 extern kmutex_t cpr_slock;
58 extern size_t cpr_buf_size;
59 extern char *cpr_buf;
60 extern size_t cpr_pagedata_size;
61 extern char *cpr_pagedata;
62 extern int cpr_bufs_allocated;
63 extern int cpr_bitmaps_allocated;
64 
65 #if defined(__sparc)
66 static struct cprconfig cprconfig;
67 static int cprconfig_loaded = 0;
68 static int cpr_statefile_ok(vnode_t *, int);
69 static int cpr_p_online(cpu_t *, int);
70 static void cpr_save_mp_state(void);
71 #endif
72 
73 int cpr_is_ufs(struct vfs *);
74 int cpr_is_zfs(struct vfs *);
75 
76 char cpr_default_path[] = CPR_DEFAULT;
77 
78 #define	COMPRESS_PERCENT 40	/* approx compression ratio in percent */
79 #define	SIZE_RATE	115	/* increase size by 15% */
80 #define	INTEGRAL	100	/* for integer math */
81 
82 
83 /*
84  * cmn_err() followed by a 1/4 second delay; this gives the
85  * logging service a chance to flush messages and helps avoid
86  * intermixing output from prom_printf().
87  */
88 /*PRINTFLIKE2*/
89 void
90 cpr_err(int ce, const char *fmt, ...)
91 {
92 	va_list adx;
93 
94 	va_start(adx, fmt);
95 	vcmn_err(ce, fmt, adx);
96 	va_end(adx);
97 	drv_usecwait(MICROSEC >> 2);
98 }
99 
100 
101 int
102 cpr_init(int fcn)
103 {
104 	/*
105 	 * Allow only one suspend/resume process.
106 	 */
107 	if (mutex_tryenter(&cpr_slock) == 0)
108 		return (EBUSY);
109 
110 	CPR->c_flags = 0;
111 	CPR->c_substate = 0;
112 	CPR->c_cprboot_magic = 0;
113 	CPR->c_alloc_cnt = 0;
114 
115 	CPR->c_fcn = fcn;
116 	if (fcn == AD_CPR_REUSABLE)
117 		CPR->c_flags |= C_REUSABLE;
118 	else
119 		CPR->c_flags |= C_SUSPENDING;
120 	if (fcn == AD_SUSPEND_TO_RAM || fcn == DEV_SUSPEND_TO_RAM) {
121 		return (0);
122 	}
123 #if defined(__sparc)
124 	if (fcn != AD_CPR_NOCOMPRESS && fcn != AD_CPR_TESTNOZ)
125 		CPR->c_flags |= C_COMPRESSING;
126 	/*
127 	 * reserve CPR_MAXCONTIG virtual pages for cpr_dump()
128 	 */
129 	CPR->c_mapping_area = i_cpr_map_setup();
130 	if (CPR->c_mapping_area == 0) {		/* no space in kernelmap */
131 		cpr_err(CE_CONT, "Unable to alloc from kernelmap.\n");
132 		mutex_exit(&cpr_slock);
133 		return (EAGAIN);
134 	}
135 	if (cpr_debug & CPR_DEBUG3)
136 		cpr_err(CE_CONT, "Reserved virtual range from 0x%p for writing "
137 		    "kas\n", (void *)CPR->c_mapping_area);
138 #endif
139 
140 	return (0);
141 }
142 
143 /*
144  * This routine releases any resources used during the checkpoint.
145  */
146 void
147 cpr_done(void)
148 {
149 	cpr_stat_cleanup();
150 	i_cpr_bitmap_cleanup();
151 
152 	/*
153 	 * Free pages used by cpr buffers.
154 	 */
155 	if (cpr_buf) {
156 		kmem_free(cpr_buf, cpr_buf_size);
157 		cpr_buf = NULL;
158 	}
159 	if (cpr_pagedata) {
160 		kmem_free(cpr_pagedata, cpr_pagedata_size);
161 		cpr_pagedata = NULL;
162 	}
163 
164 	i_cpr_free_memory_resources();
165 	mutex_exit(&cpr_slock);
166 	cpr_err(CE_CONT, "System has been resumed.\n");
167 }
168 
169 
170 #if defined(__sparc)
171 /*
172  * reads config data into cprconfig
173  */
174 static int
175 cpr_get_config(void)
176 {
177 	static char config_path[] = CPR_CONFIG;
178 	struct cprconfig *cf = &cprconfig;
179 	struct vnode *vp;
180 	char *fmt;
181 	int err;
182 
183 	if (cprconfig_loaded)
184 		return (0);
185 
186 	fmt = "cannot %s config file \"%s\", error %d\n";
187 	if (err = vn_open(config_path, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0)) {
188 		cpr_err(CE_CONT, fmt, "open", config_path, err);
189 		return (err);
190 	}
191 
192 	err = cpr_rdwr(UIO_READ, vp, cf, sizeof (*cf));
193 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
194 	VN_RELE(vp);
195 	if (err) {
196 		cpr_err(CE_CONT, fmt, "read", config_path, err);
197 		return (err);
198 	}
199 
200 	if (cf->cf_magic == CPR_CONFIG_MAGIC)
201 		cprconfig_loaded = 1;
202 	else {
203 		cpr_err(CE_CONT, "invalid config file \"%s\", "
204 		    "rerun pmconfig(1M)\n", config_path);
205 		err = EINVAL;
206 	}
207 
208 	return (err);
209 }
210 
211 
212 /*
213  * concat fs and path fields of the cprconfig structure;
214  * returns pointer to the base of static data
215  */
216 static char *
217 cpr_cprconfig_to_path(void)
218 {
219 	static char full_path[MAXNAMELEN];
220 	struct cprconfig *cf = &cprconfig;
221 	char *ptr;
222 
223 	/*
224 	 * build /fs/path without extra '/'
225 	 */
226 	(void) strcpy(full_path, cf->cf_fs);
227 	if (strcmp(cf->cf_fs, "/"))
228 		(void) strcat(full_path, "/");
229 	ptr = cf->cf_path;
230 	if (*ptr == '/')
231 		ptr++;
232 	(void) strcat(full_path, ptr);
233 	return (full_path);
234 }
235 
236 
237 /*
238  * Verify that the information in the configuration file regarding the
239  * location for the statefile is still valid, depending on cf_type.
240  * for CFT_UFS, cf_fs must still be a mounted filesystem, it must be
241  *	mounted on the same device as when pmconfig was last run,
242  *	and the translation of that device to a node in the prom's
243  *	device tree must be the same as when pmconfig was last run.
244  * for CFT_SPEC and CFT_ZVOL, cf_path must be the path to a block
245  *      special file, it must have no file system mounted on it,
246  *	and the translation of that device to a node in the prom's
247  *	device tree must be the same as when pmconfig was last run.
248  */
249 static int
250 cpr_verify_statefile_path(void)
251 {
252 	struct cprconfig *cf = &cprconfig;
253 	static const char long_name[] = "Statefile pathname is too long.\n";
254 	static const char lookup_fmt[] = "Lookup failed for "
255 	    "cpr statefile device %s.\n";
256 	static const char path_chg_fmt[] = "Device path for statefile "
257 	    "has changed from %s to %s.\t%s\n";
258 	static const char rerun[] = "Please rerun pmconfig(1m).";
259 	struct vfs *vfsp = NULL, *vfsp_save = rootvfs;
260 	ufsvfs_t *ufsvfsp = (ufsvfs_t *)rootvfs->vfs_data;
261 	ufsvfs_t *ufsvfsp_save = ufsvfsp;
262 	int error;
263 	struct vnode *vp;
264 	char *slash, *tail, *longest;
265 	char *errstr;
266 	int found = 0;
267 	union {
268 		char un_devpath[OBP_MAXPATHLEN];
269 		char un_sfpath[MAXNAMELEN];
270 	} un;
271 #define	devpath	un.un_devpath
272 #define	sfpath	un.un_sfpath
273 
274 	ASSERT(cprconfig_loaded);
275 	/*
276 	 * We need not worry about locking or the timing of releasing
277 	 * the vnode, since we are single-threaded now.
278 	 */
279 
280 	switch (cf->cf_type) {
281 	case CFT_SPEC:
282 		error = i_devname_to_promname(cf->cf_devfs, devpath,
283 		    OBP_MAXPATHLEN);
284 		if (error || strcmp(devpath, cf->cf_dev_prom)) {
285 			cpr_err(CE_CONT, path_chg_fmt,
286 			    cf->cf_dev_prom, devpath, rerun);
287 			return (error);
288 		}
289 		/*FALLTHROUGH*/
290 	case CFT_ZVOL:
291 		if (strlen(cf->cf_path) > sizeof (sfpath)) {
292 			cpr_err(CE_CONT, long_name);
293 			return (ENAMETOOLONG);
294 		}
295 		if ((error = lookupname(cf->cf_devfs,
296 		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
297 			cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
298 			return (error);
299 		}
300 		if (vp->v_type != VBLK)
301 			errstr = "statefile must be a block device";
302 		else if (vfs_devismounted(vp->v_rdev))
303 			errstr = "statefile device must not "
304 			    "have a file system mounted on it";
305 		else if (IS_SWAPVP(vp))
306 			errstr = "statefile device must not "
307 			    "be configured as swap file";
308 		else
309 			errstr = NULL;
310 
311 		VN_RELE(vp);
312 		if (errstr) {
313 			cpr_err(CE_CONT, "%s.\n", errstr);
314 			return (ENOTSUP);
315 		}
316 
317 		return (error);
318 	case CFT_UFS:
319 		break;		/* don't indent all the original code */
320 	default:
321 		cpr_err(CE_PANIC, "invalid cf_type");
322 	}
323 
324 	/*
325 	 * The original code for UFS statefile
326 	 */
327 	if (strlen(cf->cf_fs) + strlen(cf->cf_path) + 2 > sizeof (sfpath)) {
328 		cpr_err(CE_CONT, long_name);
329 		return (ENAMETOOLONG);
330 	}
331 
332 	bzero(sfpath, sizeof (sfpath));
333 	(void) strcpy(sfpath, cpr_cprconfig_to_path());
334 
335 	if (*sfpath != '/') {
336 		cpr_err(CE_CONT, "Statefile pathname %s "
337 		    "must begin with a /\n", sfpath);
338 		return (EINVAL);
339 	}
340 
341 	/*
342 	 * Find the longest prefix of the statefile pathname which
343 	 * is the mountpoint of a filesystem.  This string must
344 	 * match the cf_fs field we read from the config file.  Other-
345 	 * wise the user has changed things without running pmconfig.
346 	 */
347 	tail = longest = sfpath + 1;	/* pt beyond the leading "/" */
348 	while ((slash = strchr(tail, '/')) != NULL) {
349 		*slash = '\0';	  /* temporarily terminate the string */
350 		if ((error = lookupname(sfpath,
351 		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
352 			*slash = '/';
353 			cpr_err(CE_CONT, "A directory in the "
354 			    "statefile path %s was not found.\n", sfpath);
355 			VN_RELE(vp);
356 
357 			return (error);
358 		}
359 
360 		vfs_list_read_lock();
361 		vfsp = rootvfs;
362 		do {
363 			ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
364 			if (ufsvfsp != NULL && ufsvfsp->vfs_root == vp) {
365 				found = 1;
366 				break;
367 			}
368 			vfsp = vfsp->vfs_next;
369 		} while (vfsp != rootvfs);
370 		vfs_list_unlock();
371 
372 		/*
373 		 * If we have found a filesystem mounted on the current
374 		 * path prefix, remember the end of the string in
375 		 * "longest".  If it happens to be the the exact fs
376 		 * saved in the configuration file, save the current
377 		 * ufsvfsp so we can make additional checks further down.
378 		 */
379 		if (found) {
380 			longest = slash;
381 			if (strcmp(cf->cf_fs, sfpath) == 0) {
382 				ufsvfsp_save = ufsvfsp;
383 				vfsp_save = vfsp;
384 			}
385 			found = 0;
386 		}
387 
388 		VN_RELE(vp);
389 		*slash = '/';
390 		tail = slash + 1;
391 	}
392 	*longest = '\0';
393 	if (cpr_is_ufs(vfsp_save) == 0 || strcmp(cf->cf_fs, sfpath)) {
394 		cpr_err(CE_CONT, "Filesystem containing "
395 		    "the statefile when pmconfig was run (%s) has "
396 		    "changed to %s. %s\n", cf->cf_fs, sfpath, rerun);
397 		return (EINVAL);
398 	}
399 
400 	if ((error = lookupname(cf->cf_devfs,
401 	    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
402 		cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
403 		return (error);
404 	}
405 
406 	if (ufsvfsp_save->vfs_devvp->v_rdev != vp->v_rdev) {
407 		cpr_err(CE_CONT, "Filesystem containing "
408 		    "statefile no longer mounted on device %s. "
409 		    "See power.conf(4).", cf->cf_devfs);
410 		VN_RELE(vp);
411 		return (ENXIO);
412 	}
413 	VN_RELE(vp);
414 
415 	error = i_devname_to_promname(cf->cf_devfs, devpath, OBP_MAXPATHLEN);
416 	if (error || strcmp(devpath, cf->cf_dev_prom)) {
417 		cpr_err(CE_CONT, path_chg_fmt,
418 		    cf->cf_dev_prom, devpath, rerun);
419 		return (error);
420 	}
421 
422 	return (0);
423 }
424 
425 /*
426  * Make sure that the statefile can be used as a block special statefile
427  * (meaning that is exists and has nothing mounted on it)
428  * Returns errno if not a valid statefile.
429  */
430 int
431 cpr_check_spec_statefile(void)
432 {
433 	int err;
434 
435 	if (err = cpr_get_config())
436 		return (err);
437 	ASSERT(cprconfig.cf_type == CFT_SPEC ||
438 	    cprconfig.cf_type == CFT_ZVOL);
439 
440 	if (cprconfig.cf_devfs == NULL)
441 		return (ENXIO);
442 
443 	return (cpr_verify_statefile_path());
444 
445 }
446 
447 int
448 cpr_alloc_statefile(int alloc_retry)
449 {
450 	register int rc = 0;
451 	char *str;
452 
453 	/*
454 	 * Statefile size validation. If checkpoint the first time, disk blocks
455 	 * allocation will be done; otherwise, just do file size check.
456 	 * if statefile allocation is being retried, C_VP will be inited
457 	 */
458 	if (alloc_retry) {
459 		str = "\n-->Retrying statefile allocation...";
460 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
461 			prom_printf(str);
462 		if (C_VP->v_type != VBLK)
463 			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
464 	} else {
465 		/*
466 		 * Open an exiting file for writing, the state file needs to be
467 		 * pre-allocated since we can't and don't want to do allocation
468 		 * during checkpoint (too much of the OS is disabled).
469 		 *    - do a preliminary size checking here, if it is too small,
470 		 *	allocate more space internally and retry.
471 		 *    - check the vp to make sure it's the right type.
472 		 */
473 		char *path = cpr_build_statefile_path();
474 
475 		if (path == NULL)
476 			return (ENXIO);
477 		else if (rc = cpr_verify_statefile_path())
478 			return (rc);
479 
480 		if (rc = vn_open(path, UIO_SYSSPACE,
481 		    FCREAT|FWRITE, 0600, &C_VP, CRCREAT, 0)) {
482 			cpr_err(CE_WARN, "cannot open statefile %s", path);
483 			return (rc);
484 		}
485 	}
486 
487 	/*
488 	 * Only ufs and block special statefiles supported
489 	 */
490 	if (C_VP->v_type != VREG && C_VP->v_type != VBLK) {
491 		cpr_err(CE_CONT,
492 		    "Statefile must be regular file or block special file.");
493 		return (EACCES);
494 	}
495 
496 	if (rc = cpr_statefile_ok(C_VP, alloc_retry))
497 		return (rc);
498 
499 	if (C_VP->v_type != VBLK) {
500 		/*
501 		 * sync out the fs change due to the statefile reservation.
502 		 */
503 		(void) VFS_SYNC(C_VP->v_vfsp, 0, CRED());
504 
505 		/*
506 		 * Validate disk blocks allocation for the state file.
507 		 * Ask the file system prepare itself for the dump operation.
508 		 */
509 		if (rc = VOP_DUMPCTL(C_VP, DUMP_ALLOC, NULL, NULL)) {
510 			cpr_err(CE_CONT, "Error allocating "
511 			    "blocks for cpr statefile.");
512 			return (rc);
513 		}
514 	}
515 	return (0);
516 }
517 
518 
519 /*
520  * Lookup device size and return available space in bytes.
521  * NOTE: Since prop_op(9E) can't tell the difference between a character
522  * and a block reference, it is ok to ask for "Size" instead of "Nblocks".
523  */
524 size_t
525 cpr_get_devsize(dev_t dev)
526 {
527 	size_t bytes = 0;
528 
529 	bytes = cdev_Size(dev);
530 	if (bytes == 0)
531 		bytes = cdev_size(dev);
532 
533 	if (bytes > CPR_SPEC_OFFSET)
534 		bytes -= CPR_SPEC_OFFSET;
535 	else
536 		bytes = 0;
537 
538 	return (bytes);
539 }
540 
541 
542 /*
543  * increase statefile size
544  */
545 static int
546 cpr_grow_statefile(vnode_t *vp, u_longlong_t newsize)
547 {
548 	extern uchar_t cpr_pagecopy[];
549 	struct inode *ip = VTOI(vp);
550 	u_longlong_t offset;
551 	int error, increase;
552 	ssize_t resid;
553 
554 	rw_enter(&ip->i_contents, RW_READER);
555 	increase = (ip->i_size < newsize);
556 	offset = ip->i_size;
557 	rw_exit(&ip->i_contents);
558 
559 	if (increase == 0)
560 		return (0);
561 
562 	/*
563 	 * write to each logical block to reserve disk space
564 	 */
565 	error = 0;
566 	cpr_pagecopy[0] = '1';
567 	for (; offset < newsize; offset += ip->i_fs->fs_bsize) {
568 		if (error = vn_rdwr(UIO_WRITE, vp, (caddr_t)cpr_pagecopy,
569 		    ip->i_fs->fs_bsize, (offset_t)offset, UIO_SYSSPACE, 0,
570 		    (rlim64_t)MAXOFF_T, CRED(), &resid)) {
571 			if (error == ENOSPC) {
572 				cpr_err(CE_WARN, "error %d while reserving "
573 				    "disk space for statefile %s\n"
574 				    "wanted %lld bytes, file is %lld short",
575 				    error, cpr_cprconfig_to_path(),
576 				    newsize, newsize - offset);
577 			}
578 			break;
579 		}
580 	}
581 	return (error);
582 }
583 
584 
585 /*
586  * do a simple estimate of the space needed to hold the statefile
587  * taking compression into account, but be fairly conservative
588  * so we have a better chance of completing; when dump fails,
589  * the retry cost is fairly high.
590  *
591  * Do disk blocks allocation for the state file if no space has
592  * been allocated yet. Since the state file will not be removed,
593  * allocation should only be done once.
594  */
595 static int
596 cpr_statefile_ok(vnode_t *vp, int alloc_retry)
597 {
598 	extern size_t cpr_bitmap_size;
599 	struct inode *ip = VTOI(vp);
600 	const int UCOMP_RATE = 20; /* comp. ratio*10 for user pages */
601 	u_longlong_t size, isize, ksize, raw_data;
602 	char *str, *est_fmt;
603 	size_t space;
604 	int error;
605 
606 	/*
607 	 * number of pages short for swapping.
608 	 */
609 	STAT->cs_nosw_pages = k_anoninfo.ani_mem_resv;
610 	if (STAT->cs_nosw_pages < 0)
611 		STAT->cs_nosw_pages = 0;
612 
613 	str = "cpr_statefile_ok:";
614 
615 	CPR_DEBUG(CPR_DEBUG9, "Phys swap: max=%lu resv=%lu\n",
616 	    k_anoninfo.ani_max, k_anoninfo.ani_phys_resv);
617 	CPR_DEBUG(CPR_DEBUG9, "Mem swap: max=%ld resv=%lu\n",
618 	    MAX(availrmem - swapfs_minfree, 0),
619 	    k_anoninfo.ani_mem_resv);
620 	CPR_DEBUG(CPR_DEBUG9, "Total available swap: %ld\n",
621 	    CURRENT_TOTAL_AVAILABLE_SWAP);
622 
623 	/*
624 	 * try increasing filesize by 15%
625 	 */
626 	if (alloc_retry) {
627 		/*
628 		 * block device doesn't get any bigger
629 		 */
630 		if (vp->v_type == VBLK) {
631 			if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
632 				prom_printf(
633 				    "Retry statefile on special file\n");
634 			return (ENOMEM);
635 		} else {
636 			rw_enter(&ip->i_contents, RW_READER);
637 			size = (ip->i_size * SIZE_RATE) / INTEGRAL;
638 			rw_exit(&ip->i_contents);
639 		}
640 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
641 			prom_printf("Retry statefile size = %lld\n", size);
642 	} else {
643 		u_longlong_t cpd_size;
644 		pgcnt_t npages, nback;
645 		int ndvram;
646 
647 		ndvram = 0;
648 		(void) callb_execute_class(CB_CL_CPR_FB,
649 		    (int)(uintptr_t)&ndvram);
650 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
651 			prom_printf("ndvram size = %d\n", ndvram);
652 
653 		/*
654 		 * estimate 1 cpd_t for every (CPR_MAXCONTIG / 2) pages
655 		 */
656 		npages = cpr_count_kpages(REGULAR_BITMAP, cpr_nobit);
657 		cpd_size = sizeof (cpd_t) * (npages / (CPR_MAXCONTIG / 2));
658 		raw_data = cpd_size + cpr_bitmap_size;
659 		ksize = ndvram + mmu_ptob(npages);
660 
661 		est_fmt = "%s estimated size with "
662 		    "%scompression %lld, ksize %lld\n";
663 		nback = mmu_ptob(STAT->cs_nosw_pages);
664 		if (CPR->c_flags & C_COMPRESSING) {
665 			size = ((ksize * COMPRESS_PERCENT) / INTEGRAL) +
666 			    raw_data + ((nback * 10) / UCOMP_RATE);
667 			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "", size, ksize);
668 		} else {
669 			size = ksize + raw_data + nback;
670 			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "no ",
671 			    size, ksize);
672 		}
673 	}
674 
675 	/*
676 	 * All this is much simpler for a block device
677 	 */
678 	if (vp->v_type == VBLK) {
679 		space = cpr_get_devsize(vp->v_rdev);
680 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
681 			prom_printf("statefile dev size %lu\n", space);
682 
683 		/*
684 		 * Export the estimated filesize info, this value will be
685 		 * compared before dumping out the statefile in the case of
686 		 * no compression.
687 		 */
688 		STAT->cs_est_statefsz = size;
689 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
690 			prom_printf("%s Estimated statefile size %llu, "
691 			    "space %lu\n", str, size, space);
692 		if (size > space) {
693 			cpr_err(CE_CONT, "Statefile partition too small.");
694 			return (ENOMEM);
695 		}
696 		return (0);
697 	} else {
698 		if (CPR->c_alloc_cnt++ > C_MAX_ALLOC_RETRY) {
699 			cpr_err(CE_CONT, "Statefile allocation retry failed\n");
700 			return (ENOMEM);
701 		}
702 
703 		/*
704 		 * Estimate space needed for the state file.
705 		 *
706 		 * State file size in bytes:
707 		 * 	kernel size + non-cache pte seg +
708 		 *	bitmap size + cpr state file headers size
709 		 * (round up to fs->fs_bsize)
710 		 */
711 		size = blkroundup(ip->i_fs, size);
712 
713 		/*
714 		 * Export the estimated filesize info, this value will be
715 		 * compared before dumping out the statefile in the case of
716 		 * no compression.
717 		 */
718 		STAT->cs_est_statefsz = size;
719 		error = cpr_grow_statefile(vp, size);
720 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) {
721 			rw_enter(&ip->i_contents, RW_READER);
722 			isize = ip->i_size;
723 			rw_exit(&ip->i_contents);
724 			prom_printf("%s Estimated statefile size %lld, "
725 			    "i_size %lld\n", str, size, isize);
726 		}
727 
728 		return (error);
729 	}
730 }
731 
732 
733 void
734 cpr_statef_close(void)
735 {
736 	if (C_VP) {
737 		if (!cpr_reusable_mode)
738 			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
739 		(void) VOP_CLOSE(C_VP, FWRITE, 1, (offset_t)0, CRED(), NULL);
740 		VN_RELE(C_VP);
741 		C_VP = 0;
742 	}
743 }
744 
745 
746 /*
747  * open cpr default file and display error
748  */
749 int
750 cpr_open_deffile(int mode, vnode_t **vpp)
751 {
752 	int error;
753 
754 	if (error = cpr_open(cpr_default_path, mode, vpp))
755 		cpr_err(CE_CONT, "cannot open \"%s\", error %d\n",
756 		    cpr_default_path, error);
757 	return (error);
758 }
759 
760 
761 /*
762  * write cdef_t to disk.  This contains the original values of prom
763  * properties that we modify.  We fill in the magic number of the file
764  * here as a signal to the booter code that the state file is valid.
765  * Be sure the file gets synced, since we may be shutting down the OS.
766  */
767 int
768 cpr_write_deffile(cdef_t *cdef)
769 {
770 	struct vnode *vp;
771 	char *str;
772 	int rc;
773 
774 	if (rc = cpr_open_deffile(FCREAT|FWRITE, &vp))
775 		return (rc);
776 
777 	if (rc = cpr_rdwr(UIO_WRITE, vp, cdef, sizeof (*cdef)))
778 		str = "write";
779 	else if (rc = VOP_FSYNC(vp, FSYNC, CRED(), NULL))
780 		str = "fsync";
781 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
782 	VN_RELE(vp);
783 
784 	if (rc) {
785 		cpr_err(CE_WARN, "%s error %d, file \"%s\"",
786 		    str, rc, cpr_default_path);
787 	}
788 	return (rc);
789 }
790 
791 /*
792  * Clear the magic number in the defaults file.  This tells the booter
793  * program that the state file is not current and thus prevents
794  * any attempt to restore from an obsolete state file.
795  */
796 void
797 cpr_clear_definfo(void)
798 {
799 	struct vnode *vp;
800 	cmini_t mini;
801 
802 	if ((CPR->c_cprboot_magic != CPR_DEFAULT_MAGIC) ||
803 	    cpr_open_deffile(FCREAT|FWRITE, &vp))
804 		return;
805 	mini.magic = mini.reusable = 0;
806 	(void) cpr_rdwr(UIO_WRITE, vp, &mini, sizeof (mini));
807 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
808 	VN_RELE(vp);
809 }
810 
811 /*
812  * If the cpr default file is invalid, then we must not be in reusable mode
813  * if it is valid, it tells us our mode
814  */
815 int
816 cpr_get_reusable_mode(void)
817 {
818 	struct vnode *vp;
819 	cmini_t mini;
820 	int rc;
821 
822 	if (cpr_open(cpr_default_path, FREAD, &vp))
823 		return (0);
824 
825 	rc = cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini));
826 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
827 	VN_RELE(vp);
828 	if (rc == 0 && mini.magic == CPR_DEFAULT_MAGIC)
829 		return (mini.reusable);
830 
831 	return (0);
832 }
833 #endif
834 
835 /*
836  * clock/time related routines
837  */
838 static time_t   cpr_time_stamp;
839 
840 
841 void
842 cpr_tod_get(cpr_time_t *ctp)
843 {
844 	timestruc_t ts;
845 
846 	mutex_enter(&tod_lock);
847 	ts = TODOP_GET(tod_ops);
848 	mutex_exit(&tod_lock);
849 	ctp->tv_sec = (time32_t)ts.tv_sec;
850 	ctp->tv_nsec = (int32_t)ts.tv_nsec;
851 }
852 
853 void
854 cpr_tod_fault_reset(void)
855 {
856 	mutex_enter(&tod_lock);
857 	tod_fault_reset();
858 	mutex_exit(&tod_lock);
859 }
860 
861 void
862 cpr_save_time(void)
863 {
864 	cpr_time_stamp = gethrestime_sec();
865 }
866 
867 /*
868  * correct time based on saved time stamp or hardware clock
869  */
870 void
871 cpr_restore_time(void)
872 {
873 	clkset(cpr_time_stamp);
874 }
875 
876 #if defined(__sparc)
877 /*
878  * CPU ONLINE/OFFLINE CODE
879  */
880 int
881 cpr_mp_offline(void)
882 {
883 	cpu_t *cp, *bootcpu;
884 	int rc = 0;
885 	int brought_up_boot = 0;
886 
887 	/*
888 	 * Do nothing for UP.
889 	 */
890 	if (ncpus == 1)
891 		return (0);
892 
893 	mutex_enter(&cpu_lock);
894 
895 	cpr_save_mp_state();
896 
897 	bootcpu = i_cpr_bootcpu();
898 	if (!CPU_ACTIVE(bootcpu)) {
899 		if ((rc = cpr_p_online(bootcpu, CPU_CPR_ONLINE))) {
900 			mutex_exit(&cpu_lock);
901 			return (rc);
902 		}
903 		brought_up_boot = 1;
904 	}
905 
906 	cp = cpu_list;
907 	do {
908 		if (cp == bootcpu)
909 			continue;
910 		if (cp->cpu_flags & CPU_OFFLINE)
911 			continue;
912 		if ((rc = cpr_p_online(cp, CPU_CPR_OFFLINE))) {
913 			mutex_exit(&cpu_lock);
914 			return (rc);
915 		}
916 	} while ((cp = cp->cpu_next) != cpu_list);
917 	if (brought_up_boot && (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)))
918 		prom_printf("changed cpu %p to state %d\n",
919 		    bootcpu, CPU_CPR_ONLINE);
920 	mutex_exit(&cpu_lock);
921 
922 	return (rc);
923 }
924 
925 int
926 cpr_mp_online(void)
927 {
928 	cpu_t *cp, *bootcpu = CPU;
929 	int rc = 0;
930 
931 	/*
932 	 * Do nothing for UP.
933 	 */
934 	if (ncpus == 1)
935 		return (0);
936 
937 	/*
938 	 * cpr_save_mp_state() sets CPU_CPR_ONLINE in cpu_cpr_flags
939 	 * to indicate a cpu was online at the time of cpr_suspend();
940 	 * now restart those cpus that were marked as CPU_CPR_ONLINE
941 	 * and actually are offline.
942 	 */
943 	mutex_enter(&cpu_lock);
944 	for (cp = bootcpu->cpu_next; cp != bootcpu; cp = cp->cpu_next) {
945 		/*
946 		 * Clear the CPU_FROZEN flag in all cases.
947 		 */
948 		cp->cpu_flags &= ~CPU_FROZEN;
949 
950 		if (CPU_CPR_IS_OFFLINE(cp))
951 			continue;
952 		if (CPU_ACTIVE(cp))
953 			continue;
954 		if ((rc = cpr_p_online(cp, CPU_CPR_ONLINE))) {
955 			mutex_exit(&cpu_lock);
956 			return (rc);
957 		}
958 	}
959 
960 	/*
961 	 * turn off the boot cpu if it was offlined
962 	 */
963 	if (CPU_CPR_IS_OFFLINE(bootcpu)) {
964 		if ((rc = cpr_p_online(bootcpu, CPU_CPR_OFFLINE))) {
965 			mutex_exit(&cpu_lock);
966 			return (rc);
967 		}
968 	}
969 	mutex_exit(&cpu_lock);
970 	return (0);
971 }
972 
973 static void
974 cpr_save_mp_state(void)
975 {
976 	cpu_t *cp;
977 
978 	ASSERT(MUTEX_HELD(&cpu_lock));
979 
980 	cp = cpu_list;
981 	do {
982 		cp->cpu_cpr_flags &= ~CPU_CPR_ONLINE;
983 		if (CPU_ACTIVE(cp))
984 			CPU_SET_CPR_FLAGS(cp, CPU_CPR_ONLINE);
985 	} while ((cp = cp->cpu_next) != cpu_list);
986 }
987 
988 /*
989  * change cpu to online/offline
990  */
991 static int
992 cpr_p_online(cpu_t *cp, int state)
993 {
994 	int rc;
995 
996 	ASSERT(MUTEX_HELD(&cpu_lock));
997 
998 	switch (state) {
999 	case CPU_CPR_ONLINE:
1000 		rc = cpu_online(cp);
1001 		break;
1002 	case CPU_CPR_OFFLINE:
1003 		rc = cpu_offline(cp, CPU_FORCED);
1004 		break;
1005 	}
1006 	if (rc) {
1007 		cpr_err(CE_WARN, "Failed to change processor %d to "
1008 		    "state %d, (errno %d)", cp->cpu_id, state, rc);
1009 	}
1010 	return (rc);
1011 }
1012 
1013 /*
1014  * Construct the pathname of the state file and return a pointer to
1015  * caller.  Read the config file to get the mount point of the
1016  * filesystem and the pathname within fs.
1017  */
1018 char *
1019 cpr_build_statefile_path(void)
1020 {
1021 	struct cprconfig *cf = &cprconfig;
1022 
1023 	if (cpr_get_config())
1024 		return (NULL);
1025 
1026 	switch (cf->cf_type) {
1027 	case CFT_UFS:
1028 		if (strlen(cf->cf_path) + strlen(cf->cf_fs) >= MAXNAMELEN - 1) {
1029 			cpr_err(CE_CONT, "Statefile path is too long.\n");
1030 			return (NULL);
1031 		}
1032 		return (cpr_cprconfig_to_path());
1033 	case CFT_ZVOL:
1034 		/*FALLTHROUGH*/
1035 	case CFT_SPEC:
1036 		return (cf->cf_devfs);
1037 	default:
1038 		cpr_err(CE_PANIC, "invalid statefile type");
1039 		/*NOTREACHED*/
1040 		return (NULL);
1041 	}
1042 }
1043 
1044 int
1045 cpr_statefile_is_spec(void)
1046 {
1047 	if (cpr_get_config())
1048 		return (0);
1049 	return (cprconfig.cf_type == CFT_SPEC);
1050 }
1051 
1052 char *
1053 cpr_get_statefile_prom_path(void)
1054 {
1055 	struct cprconfig *cf = &cprconfig;
1056 
1057 	ASSERT(cprconfig_loaded);
1058 	ASSERT(cf->cf_magic == CPR_CONFIG_MAGIC);
1059 	ASSERT(cf->cf_type == CFT_SPEC || cf->cf_type == CFT_ZVOL);
1060 	return (cf->cf_dev_prom);
1061 }
1062 
1063 
1064 /*
1065  * XXX The following routines need to be in the vfs source code.
1066  */
1067 
1068 int
1069 cpr_is_ufs(struct vfs *vfsp)
1070 {
1071 	char *fsname;
1072 
1073 	fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1074 	return (strcmp(fsname, "ufs") == 0);
1075 }
1076 
1077 int
1078 cpr_is_zfs(struct vfs *vfsp)
1079 {
1080 	char *fsname;
1081 
1082 	fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1083 	return (strcmp(fsname, "zfs") == 0);
1084 }
1085 
1086 /*
1087  * This is a list of file systems that are allowed to be writeable when a
1088  * reusable statefile checkpoint is taken.  They must not have any state that
1089  * cannot be restored to consistency by simply rebooting using the checkpoint.
1090  * (In contrast to ufs, cachefs and pcfs which have disk state that could get
1091  * out of sync with the in-kernel data).
1092  */
1093 int
1094 cpr_reusable_mount_check(void)
1095 {
1096 	struct vfs *vfsp;
1097 	char *fsname;
1098 	char **cpp;
1099 	static char *cpr_writeok_fss[] = {
1100 		"autofs", "devfs", "fd", "lofs", "mntfs", "namefs", "nfs",
1101 		"proc", "tmpfs", "ctfs", "objfs", "dev", NULL
1102 	};
1103 
1104 	vfs_list_read_lock();
1105 	vfsp = rootvfs;
1106 	do {
1107 		if (vfsp->vfs_flag & VFS_RDONLY) {
1108 			vfsp = vfsp->vfs_next;
1109 			continue;
1110 		}
1111 		fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1112 		for (cpp = cpr_writeok_fss; *cpp; cpp++) {
1113 			if (strcmp(fsname, *cpp) == 0)
1114 				break;
1115 		}
1116 		/*
1117 		 * if the inner loop reached the NULL terminator,
1118 		 * the current fs-type does not match any OK-type
1119 		 */
1120 		if (*cpp == NULL) {
1121 			cpr_err(CE_CONT, "a filesystem of type %s is "
1122 			    "mounted read/write.\nReusable statefile requires "
1123 			    "no writeable filesystem of this type be mounted\n",
1124 			    fsname);
1125 			vfs_list_unlock();
1126 			return (EINVAL);
1127 		}
1128 		vfsp = vfsp->vfs_next;
1129 	} while (vfsp != rootvfs);
1130 	vfs_list_unlock();
1131 	return (0);
1132 }
1133 
1134 /*
1135  * return statefile offset in DEV_BSIZE units
1136  */
1137 int
1138 cpr_statefile_offset(void)
1139 {
1140 	return (cprconfig.cf_type != CFT_UFS ? btod(CPR_SPEC_OFFSET) : 0);
1141 }
1142 
1143 /*
1144  * Force a fresh read of the cprinfo per uadmin 3 call
1145  */
1146 void
1147 cpr_forget_cprconfig(void)
1148 {
1149 	cprconfig_loaded = 0;
1150 }
1151 #endif
1152