xref: /titanic_51/usr/src/uts/common/cpr/cpr_misc.c (revision e79c98e6c943cb3032f272714ff4ce6137d40394)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/errno.h>
30 #include <sys/cpuvar.h>
31 #include <sys/vfs.h>
32 #include <sys/vnode.h>
33 #include <sys/pathname.h>
34 #include <sys/callb.h>
35 #include <sys/fs/ufs_inode.h>
36 #include <vm/anon.h>
37 #include <sys/fs/swapnode.h>	/* for swapfs_minfree */
38 #include <sys/kmem.h>
39 #include <sys/cpr.h>
40 #include <sys/conf.h>
41 #include <sys/machclock.h>
42 
43 /*
44  * CPR miscellaneous support routines
45  */
/*
 * cpr_open: wrapper around vn_open() for a pathname held in kernel
 * space (UIO_SYSSPACE); passes 0600 and CRCREAT through and stores the
 * resulting vnode via vpp.  Evaluates to vn_open()'s return value.
 */
#define	cpr_open(path, mode,  vpp)	(vn_open(path, UIO_SYSSPACE, \
		mode, 0600, vpp, CRCREAT, 0))
/*
 * cpr_rdwr: wrapper around vn_rdwr() that transfers cnt bytes between
 * vnode vp, always at file offset 0, and the kernel buffer basep.
 * The residual-count pointer passed to vn_rdwr() is NULL.
 */
#define	cpr_rdwr(rw, vp, basep, cnt)	(vn_rdwr(rw, vp,  (caddr_t)(basep), \
		cnt, 0LL, UIO_SYSSPACE, 0, (rlim64_t)MAXOFF_T, CRED(), \
		(ssize_t *)NULL))
51 
/*
 * Routines implemented outside this file.
 */
extern void clkset(time_t);
extern cpu_t *i_cpr_bootcpu(void);
extern caddr_t i_cpr_map_setup(void);
extern void i_cpr_free_memory_resources(void);

/*
 * CPR state defined outside this file.
 */
extern kmutex_t cpr_slock;	/* taken in cpr_init(), dropped in cpr_done() */
extern size_t cpr_buf_size;	/* size used when cpr_done() frees cpr_buf */
extern char *cpr_buf;
extern size_t cpr_pagedata_size; /* size used when freeing cpr_pagedata */
extern char *cpr_pagedata;
extern int cpr_bufs_allocated;
extern int cpr_bitmaps_allocated;

#if defined(__sparc)
/* in-memory copy of the config file; filled in by cpr_get_config() */
static struct cprconfig cprconfig;
/* nonzero once cprconfig holds contents with a valid magic number */
static int cprconfig_loaded = 0;
static int cpr_statefile_ok(vnode_t *, int);
static int cpr_p_online(cpu_t *, int);
static void cpr_save_mp_state(void);
#endif

int cpr_is_ufs(struct vfs *);

/* pathname of the cpr default file opened by the *_deffile routines */
char cpr_default_path[] = CPR_DEFAULT;

#define	COMPRESS_PERCENT 40	/* approx compression ratio in percent */
#define	SIZE_RATE	115	/* increase size by 15% */
#define	INTEGRAL	100	/* for integer math */
80 
81 
82 /*
83  * cmn_err() followed by a 1/4 second delay; this gives the
84  * logging service a chance to flush messages and helps avoid
85  * intermixing output from prom_printf().
86  */
87 /*PRINTFLIKE2*/
88 void
89 cpr_err(int ce, const char *fmt, ...)
90 {
91 	va_list adx;
92 
93 	va_start(adx, fmt);
94 	vcmn_err(ce, fmt, adx);
95 	va_end(adx);
96 	drv_usecwait(MICROSEC >> 2);
97 }
98 
99 
100 int
101 cpr_init(int fcn)
102 {
103 	/*
104 	 * Allow only one suspend/resume process.
105 	 */
106 	if (mutex_tryenter(&cpr_slock) == 0)
107 		return (EBUSY);
108 
109 	CPR->c_flags = 0;
110 	CPR->c_substate = 0;
111 	CPR->c_cprboot_magic = 0;
112 	CPR->c_alloc_cnt = 0;
113 
114 	CPR->c_fcn = fcn;
115 	if (fcn == AD_CPR_REUSABLE)
116 		CPR->c_flags |= C_REUSABLE;
117 	else
118 		CPR->c_flags |= C_SUSPENDING;
119 	if (fcn == AD_SUSPEND_TO_RAM || fcn == DEV_SUSPEND_TO_RAM) {
120 		return (0);
121 	}
122 #if defined(__sparc)
123 	if (fcn != AD_CPR_NOCOMPRESS && fcn != AD_CPR_TESTNOZ)
124 		CPR->c_flags |= C_COMPRESSING;
125 	/*
126 	 * reserve CPR_MAXCONTIG virtual pages for cpr_dump()
127 	 */
128 	CPR->c_mapping_area = i_cpr_map_setup();
129 	if (CPR->c_mapping_area == 0) {		/* no space in kernelmap */
130 		cpr_err(CE_CONT, "Unable to alloc from kernelmap.\n");
131 		mutex_exit(&cpr_slock);
132 		return (EAGAIN);
133 	}
134 	if (cpr_debug & CPR_DEBUG3)
135 		cpr_err(CE_CONT, "Reserved virtual range from 0x%p for writing "
136 		    "kas\n", (void *)CPR->c_mapping_area);
137 #endif
138 
139 	return (0);
140 }
141 
142 /*
143  * This routine releases any resources used during the checkpoint.
144  */
145 void
146 cpr_done(void)
147 {
148 	cpr_stat_cleanup();
149 	i_cpr_bitmap_cleanup();
150 
151 	/*
152 	 * Free pages used by cpr buffers.
153 	 */
154 	if (cpr_buf) {
155 		kmem_free(cpr_buf, cpr_buf_size);
156 		cpr_buf = NULL;
157 	}
158 	if (cpr_pagedata) {
159 		kmem_free(cpr_pagedata, cpr_pagedata_size);
160 		cpr_pagedata = NULL;
161 	}
162 
163 	i_cpr_free_memory_resources();
164 	mutex_exit(&cpr_slock);
165 	cpr_err(CE_CONT, "System has been resumed.\n");
166 }
167 
168 
169 #if defined(__sparc)
170 /*
171  * reads config data into cprconfig
172  */
173 static int
174 cpr_get_config(void)
175 {
176 	static char config_path[] = CPR_CONFIG;
177 	struct cprconfig *cf = &cprconfig;
178 	struct vnode *vp;
179 	char *fmt;
180 	int err;
181 
182 	if (cprconfig_loaded)
183 		return (0);
184 
185 	fmt = "cannot %s config file \"%s\", error %d\n";
186 	if (err = vn_open(config_path, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0)) {
187 		cpr_err(CE_CONT, fmt, "open", config_path, err);
188 		return (err);
189 	}
190 
191 	err = cpr_rdwr(UIO_READ, vp, cf, sizeof (*cf));
192 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
193 	VN_RELE(vp);
194 	if (err) {
195 		cpr_err(CE_CONT, fmt, "read", config_path, err);
196 		return (err);
197 	}
198 
199 	if (cf->cf_magic == CPR_CONFIG_MAGIC)
200 		cprconfig_loaded = 1;
201 	else {
202 		cpr_err(CE_CONT, "invalid config file \"%s\", "
203 		    "rerun pmconfig(1M)\n", config_path);
204 		err = EINVAL;
205 	}
206 
207 	return (err);
208 }
209 
210 
211 /*
212  * concat fs and path fields of the cprconfig structure;
213  * returns pointer to the base of static data
214  */
215 static char *
216 cpr_cprconfig_to_path(void)
217 {
218 	static char full_path[MAXNAMELEN];
219 	struct cprconfig *cf = &cprconfig;
220 	char *ptr;
221 
222 	/*
223 	 * build /fs/path without extra '/'
224 	 */
225 	(void) strcpy(full_path, cf->cf_fs);
226 	if (strcmp(cf->cf_fs, "/"))
227 		(void) strcat(full_path, "/");
228 	ptr = cf->cf_path;
229 	if (*ptr == '/')
230 		ptr++;
231 	(void) strcat(full_path, ptr);
232 	return (full_path);
233 }
234 
235 
236 /*
237  * Verify that the information in the configuration file regarding the
238  * location for the statefile is still valid, depending on cf_type.
239  * for CFT_UFS, cf_fs must still be a mounted filesystem, it must be
240  *	mounted on the same device as when pmconfig was last run,
241  *	and the translation of that device to a node in the prom's
242  *	device tree must be the same as when pmconfig was last run.
243  * for CFT_SPEC, cf_path must be the path to a block special file,
244  *	it must have no file system mounted on it,
245  *	and the translation of that device to a node in the prom's
246  *	device tree must be the same as when pmconfig was last run.
247  */
248 static int
249 cpr_verify_statefile_path(void)
250 {
251 	struct cprconfig *cf = &cprconfig;
252 	static const char long_name[] = "Statefile pathname is too long.\n";
253 	static const char lookup_fmt[] = "Lookup failed for "
254 	    "cpr statefile device %s.\n";
255 	static const char path_chg_fmt[] = "Device path for statefile "
256 	    "has changed from %s to %s.\t%s\n";
257 	static const char rerun[] = "Please rerun pmconfig(1m).";
258 	struct vfs *vfsp = NULL, *vfsp_save = rootvfs;
259 	ufsvfs_t *ufsvfsp = (ufsvfs_t *)rootvfs->vfs_data;
260 	ufsvfs_t *ufsvfsp_save = ufsvfsp;
261 	int error;
262 	struct vnode *vp;
263 	char *slash, *tail, *longest;
264 	char *errstr;
265 	int found = 0;
266 	union {
267 		char un_devpath[OBP_MAXPATHLEN];
268 		char un_sfpath[MAXNAMELEN];
269 	} un;
270 #define	devpath	un.un_devpath
271 #define	sfpath	un.un_sfpath
272 
273 	ASSERT(cprconfig_loaded);
274 	/*
275 	 * We need not worry about locking or the timing of releasing
276 	 * the vnode, since we are single-threaded now.
277 	 */
278 
279 	switch (cf->cf_type) {
280 	case CFT_SPEC:
281 		if (strlen(cf->cf_path) > sizeof (sfpath)) {
282 			cpr_err(CE_CONT, long_name);
283 			return (ENAMETOOLONG);
284 		}
285 		if ((error = lookupname(cf->cf_devfs,
286 		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
287 			cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
288 			return (error);
289 		}
290 		if (vp->v_type != VBLK)
291 			errstr = "statefile must be a block device";
292 		else if (vfs_devismounted(vp->v_rdev))
293 			errstr = "statefile device must not "
294 			    "have a file system mounted on it";
295 		else if (IS_SWAPVP(vp))
296 			errstr = "statefile device must not "
297 			    "be configured as swap file";
298 		else
299 			errstr = NULL;
300 
301 		VN_RELE(vp);
302 		if (errstr) {
303 			cpr_err(CE_CONT, "%s.\n", errstr);
304 			return (ENOTSUP);
305 		}
306 
307 		error = i_devname_to_promname(cf->cf_devfs, devpath,
308 		    OBP_MAXPATHLEN);
309 		if (error || strcmp(devpath, cf->cf_dev_prom)) {
310 			cpr_err(CE_CONT, path_chg_fmt,
311 			    cf->cf_dev_prom, devpath, rerun);
312 		}
313 		return (error);
314 	case CFT_UFS:
315 		break;		/* don't indent all the original code */
316 	default:
317 		cpr_err(CE_PANIC, "invalid cf_type");
318 	}
319 
320 	/*
321 	 * The original code for UFS statefile
322 	 */
323 	if (strlen(cf->cf_fs) + strlen(cf->cf_path) + 2 > sizeof (sfpath)) {
324 		cpr_err(CE_CONT, long_name);
325 		return (ENAMETOOLONG);
326 	}
327 
328 	bzero(sfpath, sizeof (sfpath));
329 	(void) strcpy(sfpath, cpr_cprconfig_to_path());
330 
331 	if (*sfpath != '/') {
332 		cpr_err(CE_CONT, "Statefile pathname %s "
333 		    "must begin with a /\n", sfpath);
334 		return (EINVAL);
335 	}
336 
337 	/*
338 	 * Find the longest prefix of the statefile pathname which
339 	 * is the mountpoint of a filesystem.  This string must
340 	 * match the cf_fs field we read from the config file.  Other-
341 	 * wise the user has changed things without running pmconfig.
342 	 */
343 	tail = longest = sfpath + 1;	/* pt beyond the leading "/" */
344 	while ((slash = strchr(tail, '/')) != NULL) {
345 		*slash = '\0';	  /* temporarily terminate the string */
346 		if ((error = lookupname(sfpath,
347 		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
348 			*slash = '/';
349 			cpr_err(CE_CONT, "A directory in the "
350 			    "statefile path %s was not found.\n", sfpath);
351 			VN_RELE(vp);
352 
353 			return (error);
354 		}
355 
356 		vfs_list_read_lock();
357 		vfsp = rootvfs;
358 		do {
359 			ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
360 			if (ufsvfsp != NULL && ufsvfsp->vfs_root == vp) {
361 				found = 1;
362 				break;
363 			}
364 			vfsp = vfsp->vfs_next;
365 		} while (vfsp != rootvfs);
366 		vfs_list_unlock();
367 
368 		/*
369 		 * If we have found a filesystem mounted on the current
370 		 * path prefix, remember the end of the string in
371 		 * "longest".  If it happens to be the the exact fs
372 		 * saved in the configuration file, save the current
373 		 * ufsvfsp so we can make additional checks further down.
374 		 */
375 		if (found) {
376 			longest = slash;
377 			if (strcmp(cf->cf_fs, sfpath) == 0) {
378 				ufsvfsp_save = ufsvfsp;
379 				vfsp_save = vfsp;
380 			}
381 			found = 0;
382 		}
383 
384 		VN_RELE(vp);
385 		*slash = '/';
386 		tail = slash + 1;
387 	}
388 	*longest = '\0';
389 	if (cpr_is_ufs(vfsp_save) == 0 || strcmp(cf->cf_fs, sfpath)) {
390 		cpr_err(CE_CONT, "Filesystem containing "
391 		    "the statefile when pmconfig was run (%s) has "
392 		    "changed to %s. %s\n", cf->cf_fs, sfpath, rerun);
393 		return (EINVAL);
394 	}
395 
396 	if ((error = lookupname(cf->cf_devfs,
397 	    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
398 		cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
399 		return (error);
400 	}
401 
402 	if (ufsvfsp_save->vfs_devvp->v_rdev != vp->v_rdev) {
403 		cpr_err(CE_CONT, "Filesystem containing "
404 		    "statefile no longer mounted on device %s. "
405 		    "See power.conf(4).", cf->cf_devfs);
406 		VN_RELE(vp);
407 		return (ENXIO);
408 	}
409 	VN_RELE(vp);
410 
411 	error = i_devname_to_promname(cf->cf_devfs, devpath, OBP_MAXPATHLEN);
412 	if (error || strcmp(devpath, cf->cf_dev_prom)) {
413 		cpr_err(CE_CONT, path_chg_fmt,
414 		    cf->cf_dev_prom, devpath, rerun);
415 		return (error);
416 	}
417 
418 	return (0);
419 }
420 
421 /*
422  * Make sure that the statefile can be used as a block special statefile
423  * (meaning that is exists and has nothing mounted on it)
424  * Returns errno if not a valid statefile.
425  */
426 int
427 cpr_check_spec_statefile(void)
428 {
429 	int err;
430 
431 	if (err = cpr_get_config())
432 		return (err);
433 	ASSERT(cprconfig.cf_type == CFT_SPEC);
434 
435 	if (cprconfig.cf_devfs == NULL)
436 		return (ENXIO);
437 
438 	return (cpr_verify_statefile_path());
439 
440 }
441 
442 int
443 cpr_alloc_statefile(int alloc_retry)
444 {
445 	register int rc = 0;
446 	char *str;
447 
448 	/*
449 	 * Statefile size validation. If checkpoint the first time, disk blocks
450 	 * allocation will be done; otherwise, just do file size check.
451 	 * if statefile allocation is being retried, C_VP will be inited
452 	 */
453 	if (alloc_retry) {
454 		str = "\n-->Retrying statefile allocation...";
455 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
456 			prom_printf(str);
457 		if (C_VP->v_type != VBLK)
458 			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
459 	} else {
460 		/*
461 		 * Open an exiting file for writing, the state file needs to be
462 		 * pre-allocated since we can't and don't want to do allocation
463 		 * during checkpoint (too much of the OS is disabled).
464 		 *    - do a preliminary size checking here, if it is too small,
465 		 *	allocate more space internally and retry.
466 		 *    - check the vp to make sure it's the right type.
467 		 */
468 		char *path = cpr_build_statefile_path();
469 
470 		if (path == NULL)
471 			return (ENXIO);
472 		else if (rc = cpr_verify_statefile_path())
473 			return (rc);
474 
475 		if (rc = vn_open(path, UIO_SYSSPACE,
476 		    FCREAT|FWRITE, 0600, &C_VP, CRCREAT, 0)) {
477 			cpr_err(CE_WARN, "cannot open statefile %s", path);
478 			return (rc);
479 		}
480 	}
481 
482 	/*
483 	 * Only ufs and block special statefiles supported
484 	 */
485 	if (C_VP->v_type != VREG && C_VP->v_type != VBLK) {
486 		cpr_err(CE_CONT,
487 		    "Statefile must be regular file or block special file.");
488 		return (EACCES);
489 	}
490 
491 	if (rc = cpr_statefile_ok(C_VP, alloc_retry))
492 		return (rc);
493 
494 	if (C_VP->v_type != VBLK) {
495 		/*
496 		 * sync out the fs change due to the statefile reservation.
497 		 */
498 		(void) VFS_SYNC(C_VP->v_vfsp, 0, CRED());
499 
500 		/*
501 		 * Validate disk blocks allocation for the state file.
502 		 * Ask the file system prepare itself for the dump operation.
503 		 */
504 		if (rc = VOP_DUMPCTL(C_VP, DUMP_ALLOC, NULL, NULL)) {
505 			cpr_err(CE_CONT, "Error allocating "
506 			    "blocks for cpr statefile.");
507 			return (rc);
508 		}
509 	}
510 	return (0);
511 }
512 
513 
514 /*
515  * Lookup device size and return available space in bytes.
516  * NOTE: Since prop_op(9E) can't tell the difference between a character
517  * and a block reference, it is ok to ask for "Size" instead of "Nblocks".
518  */
519 size_t
520 cpr_get_devsize(dev_t dev)
521 {
522 	size_t bytes = 0;
523 
524 	bytes = cdev_Size(dev);
525 	if (bytes == 0)
526 		bytes = cdev_size(dev);
527 
528 	if (bytes > CPR_SPEC_OFFSET)
529 		bytes -= CPR_SPEC_OFFSET;
530 	else
531 		bytes = 0;
532 
533 	return (bytes);
534 }
535 
536 
537 /*
538  * increase statefile size
539  */
540 static int
541 cpr_grow_statefile(vnode_t *vp, u_longlong_t newsize)
542 {
543 	extern uchar_t cpr_pagecopy[];
544 	struct inode *ip = VTOI(vp);
545 	u_longlong_t offset;
546 	int error, increase;
547 	ssize_t resid;
548 
549 	rw_enter(&ip->i_contents, RW_READER);
550 	increase = (ip->i_size < newsize);
551 	offset = ip->i_size;
552 	rw_exit(&ip->i_contents);
553 
554 	if (increase == 0)
555 		return (0);
556 
557 	/*
558 	 * write to each logical block to reserve disk space
559 	 */
560 	error = 0;
561 	cpr_pagecopy[0] = '1';
562 	for (; offset < newsize; offset += ip->i_fs->fs_bsize) {
563 		if (error = vn_rdwr(UIO_WRITE, vp, (caddr_t)cpr_pagecopy,
564 		    ip->i_fs->fs_bsize, (offset_t)offset, UIO_SYSSPACE, 0,
565 		    (rlim64_t)MAXOFF_T, CRED(), &resid)) {
566 			if (error == ENOSPC) {
567 				cpr_err(CE_WARN, "error %d while reserving "
568 				    "disk space for statefile %s\n"
569 				    "wanted %lld bytes, file is %lld short",
570 				    error, cpr_cprconfig_to_path(),
571 				    newsize, newsize - offset);
572 			}
573 			break;
574 		}
575 	}
576 	return (error);
577 }
578 
579 
580 /*
581  * do a simple estimate of the space needed to hold the statefile
582  * taking compression into account, but be fairly conservative
583  * so we have a better chance of completing; when dump fails,
584  * the retry cost is fairly high.
585  *
586  * Do disk blocks allocation for the state file if no space has
587  * been allocated yet. Since the state file will not be removed,
588  * allocation should only be done once.
589  */
590 static int
591 cpr_statefile_ok(vnode_t *vp, int alloc_retry)
592 {
593 	extern size_t cpr_bitmap_size;
594 	struct inode *ip = VTOI(vp);
595 	const int UCOMP_RATE = 20; /* comp. ratio*10 for user pages */
596 	u_longlong_t size, isize, ksize, raw_data;
597 	char *str, *est_fmt;
598 	size_t space;
599 	int error;
600 
601 	/*
602 	 * number of pages short for swapping.
603 	 */
604 	STAT->cs_nosw_pages = k_anoninfo.ani_mem_resv;
605 	if (STAT->cs_nosw_pages < 0)
606 		STAT->cs_nosw_pages = 0;
607 
608 	str = "cpr_statefile_ok:";
609 
610 	CPR_DEBUG(CPR_DEBUG9, "Phys swap: max=%lu resv=%lu\n",
611 	    k_anoninfo.ani_max, k_anoninfo.ani_phys_resv);
612 	CPR_DEBUG(CPR_DEBUG9, "Mem swap: max=%ld resv=%lu\n",
613 	    MAX(availrmem - swapfs_minfree, 0),
614 	    k_anoninfo.ani_mem_resv);
615 	CPR_DEBUG(CPR_DEBUG9, "Total available swap: %ld\n",
616 	    CURRENT_TOTAL_AVAILABLE_SWAP);
617 
618 	/*
619 	 * try increasing filesize by 15%
620 	 */
621 	if (alloc_retry) {
622 		/*
623 		 * block device doesn't get any bigger
624 		 */
625 		if (vp->v_type == VBLK) {
626 			if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
627 				prom_printf(
628 				    "Retry statefile on special file\n");
629 			return (ENOMEM);
630 		} else {
631 			rw_enter(&ip->i_contents, RW_READER);
632 			size = (ip->i_size * SIZE_RATE) / INTEGRAL;
633 			rw_exit(&ip->i_contents);
634 		}
635 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
636 			prom_printf("Retry statefile size = %lld\n", size);
637 	} else {
638 		u_longlong_t cpd_size;
639 		pgcnt_t npages, nback;
640 		int ndvram;
641 
642 		ndvram = 0;
643 		(void) callb_execute_class(CB_CL_CPR_FB,
644 		    (int)(uintptr_t)&ndvram);
645 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
646 			prom_printf("ndvram size = %d\n", ndvram);
647 
648 		/*
649 		 * estimate 1 cpd_t for every (CPR_MAXCONTIG / 2) pages
650 		 */
651 		npages = cpr_count_kpages(REGULAR_BITMAP, cpr_nobit);
652 		cpd_size = sizeof (cpd_t) * (npages / (CPR_MAXCONTIG / 2));
653 		raw_data = cpd_size + cpr_bitmap_size;
654 		ksize = ndvram + mmu_ptob(npages);
655 
656 		est_fmt = "%s estimated size with "
657 		    "%scompression %lld, ksize %lld\n";
658 		nback = mmu_ptob(STAT->cs_nosw_pages);
659 		if (CPR->c_flags & C_COMPRESSING) {
660 			size = ((ksize * COMPRESS_PERCENT) / INTEGRAL) +
661 			    raw_data + ((nback * 10) / UCOMP_RATE);
662 			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "", size, ksize);
663 		} else {
664 			size = ksize + raw_data + nback;
665 			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "no ",
666 			    size, ksize);
667 		}
668 	}
669 
670 	/*
671 	 * All this is much simpler for a block device
672 	 */
673 	if (vp->v_type == VBLK) {
674 		space = cpr_get_devsize(vp->v_rdev);
675 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
676 			prom_printf("statefile dev size %lu\n", space);
677 
678 		/*
679 		 * Export the estimated filesize info, this value will be
680 		 * compared before dumping out the statefile in the case of
681 		 * no compression.
682 		 */
683 		STAT->cs_est_statefsz = size;
684 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
685 			prom_printf("%s Estimated statefile size %llu, "
686 			    "space %lu\n", str, size, space);
687 		if (size > space) {
688 			cpr_err(CE_CONT, "Statefile partition too small.");
689 			return (ENOMEM);
690 		}
691 		return (0);
692 	} else {
693 		if (CPR->c_alloc_cnt++ > C_MAX_ALLOC_RETRY) {
694 			cpr_err(CE_CONT, "Statefile allocation retry failed\n");
695 			return (ENOMEM);
696 		}
697 
698 		/*
699 		 * Estimate space needed for the state file.
700 		 *
701 		 * State file size in bytes:
702 		 * 	kernel size + non-cache pte seg +
703 		 *	bitmap size + cpr state file headers size
704 		 * (round up to fs->fs_bsize)
705 		 */
706 		size = blkroundup(ip->i_fs, size);
707 
708 		/*
709 		 * Export the estimated filesize info, this value will be
710 		 * compared before dumping out the statefile in the case of
711 		 * no compression.
712 		 */
713 		STAT->cs_est_statefsz = size;
714 		error = cpr_grow_statefile(vp, size);
715 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) {
716 			rw_enter(&ip->i_contents, RW_READER);
717 			isize = ip->i_size;
718 			rw_exit(&ip->i_contents);
719 			prom_printf("%s Estimated statefile size %lld, "
720 			    "i_size %lld\n", str, size, isize);
721 		}
722 
723 		return (error);
724 	}
725 }
726 
727 
728 void
729 cpr_statef_close(void)
730 {
731 	if (C_VP) {
732 		if (!cpr_reusable_mode)
733 			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
734 		(void) VOP_CLOSE(C_VP, FWRITE, 1, (offset_t)0, CRED(), NULL);
735 		VN_RELE(C_VP);
736 		C_VP = 0;
737 	}
738 }
739 
740 
741 /*
742  * open cpr default file and display error
743  */
744 int
745 cpr_open_deffile(int mode, vnode_t **vpp)
746 {
747 	int error;
748 
749 	if (error = cpr_open(cpr_default_path, mode, vpp))
750 		cpr_err(CE_CONT, "cannot open \"%s\", error %d\n",
751 		    cpr_default_path, error);
752 	return (error);
753 }
754 
755 
756 /*
757  * write cdef_t to disk.  This contains the original values of prom
758  * properties that we modify.  We fill in the magic number of the file
759  * here as a signal to the booter code that the state file is valid.
760  * Be sure the file gets synced, since we may be shutting down the OS.
761  */
762 int
763 cpr_write_deffile(cdef_t *cdef)
764 {
765 	struct vnode *vp;
766 	char *str;
767 	int rc;
768 
769 	if (rc = cpr_open_deffile(FCREAT|FWRITE, &vp))
770 		return (rc);
771 
772 	if (rc = cpr_rdwr(UIO_WRITE, vp, cdef, sizeof (*cdef)))
773 		str = "write";
774 	else if (rc = VOP_FSYNC(vp, FSYNC, CRED(), NULL))
775 		str = "fsync";
776 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
777 	VN_RELE(vp);
778 
779 	if (rc) {
780 		cpr_err(CE_WARN, "%s error %d, file \"%s\"",
781 		    str, rc, cpr_default_path);
782 	}
783 	return (rc);
784 }
785 
786 /*
787  * Clear the magic number in the defaults file.  This tells the booter
788  * program that the state file is not current and thus prevents
789  * any attempt to restore from an obsolete state file.
790  */
791 void
792 cpr_clear_definfo(void)
793 {
794 	struct vnode *vp;
795 	cmini_t mini;
796 
797 	if ((CPR->c_cprboot_magic != CPR_DEFAULT_MAGIC) ||
798 	    cpr_open_deffile(FCREAT|FWRITE, &vp))
799 		return;
800 	mini.magic = mini.reusable = 0;
801 	(void) cpr_rdwr(UIO_WRITE, vp, &mini, sizeof (mini));
802 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
803 	VN_RELE(vp);
804 }
805 
806 /*
807  * If the cpr default file is invalid, then we must not be in reusable mode
808  * if it is valid, it tells us our mode
809  */
810 int
811 cpr_get_reusable_mode(void)
812 {
813 	struct vnode *vp;
814 	cmini_t mini;
815 	int rc;
816 
817 	if (cpr_open(cpr_default_path, FREAD, &vp))
818 		return (0);
819 
820 	rc = cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini));
821 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
822 	VN_RELE(vp);
823 	if (rc == 0 && mini.magic == CPR_DEFAULT_MAGIC)
824 		return (mini.reusable);
825 
826 	return (0);
827 }
828 #endif
829 
/*
 * clock/time related routines
 */
/* seconds value stored by cpr_save_time(), read by cpr_restore_time() */
static time_t   cpr_time_stamp;
834 
835 
836 void
837 cpr_tod_get(cpr_time_t *ctp)
838 {
839 	timestruc_t ts;
840 
841 	mutex_enter(&tod_lock);
842 	ts = TODOP_GET(tod_ops);
843 	mutex_exit(&tod_lock);
844 	ctp->tv_sec = (time32_t)ts.tv_sec;
845 	ctp->tv_nsec = (int32_t)ts.tv_nsec;
846 }
847 
/*
 * Call tod_fault_reset() with tod_lock held.
 */
void
cpr_tod_fault_reset(void)
{
	mutex_enter(&tod_lock);
	tod_fault_reset();
	mutex_exit(&tod_lock);
}
855 
/*
 * Record the current gethrestime_sec() value in cpr_time_stamp so that
 * cpr_restore_time() can use it later.
 */
void
cpr_save_time(void)
{
	cpr_time_stamp = gethrestime_sec();
}
861 
/*
 * correct time based on saved time stamp or hardware clock
 *
 * The stamp recorded by cpr_save_time() is handed to clkset(), which
 * is defined outside this file; presumably clkset() decides between
 * that value and the hardware clock -- confirm against clkset().
 */
void
cpr_restore_time(void)
{
	clkset(cpr_time_stamp);
}
870 
871 #if defined(__sparc)
872 /*
873  * CPU ONLINE/OFFLINE CODE
874  */
875 int
876 cpr_mp_offline(void)
877 {
878 	cpu_t *cp, *bootcpu;
879 	int rc = 0;
880 	int brought_up_boot = 0;
881 
882 	/*
883 	 * Do nothing for UP.
884 	 */
885 	if (ncpus == 1)
886 		return (0);
887 
888 	mutex_enter(&cpu_lock);
889 
890 	cpr_save_mp_state();
891 
892 	bootcpu = i_cpr_bootcpu();
893 	if (!CPU_ACTIVE(bootcpu)) {
894 		if ((rc = cpr_p_online(bootcpu, CPU_CPR_ONLINE))) {
895 			mutex_exit(&cpu_lock);
896 			return (rc);
897 		}
898 		brought_up_boot = 1;
899 	}
900 
901 	cp = cpu_list;
902 	do {
903 		if (cp == bootcpu)
904 			continue;
905 		if (cp->cpu_flags & CPU_OFFLINE)
906 			continue;
907 		if ((rc = cpr_p_online(cp, CPU_CPR_OFFLINE))) {
908 			mutex_exit(&cpu_lock);
909 			return (rc);
910 		}
911 	} while ((cp = cp->cpu_next) != cpu_list);
912 	if (brought_up_boot && (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)))
913 		prom_printf("changed cpu %p to state %d\n",
914 		    bootcpu, CPU_CPR_ONLINE);
915 	mutex_exit(&cpu_lock);
916 
917 	return (rc);
918 }
919 
920 int
921 cpr_mp_online(void)
922 {
923 	cpu_t *cp, *bootcpu = CPU;
924 	int rc = 0;
925 
926 	/*
927 	 * Do nothing for UP.
928 	 */
929 	if (ncpus == 1)
930 		return (0);
931 
932 	/*
933 	 * cpr_save_mp_state() sets CPU_CPR_ONLINE in cpu_cpr_flags
934 	 * to indicate a cpu was online at the time of cpr_suspend();
935 	 * now restart those cpus that were marked as CPU_CPR_ONLINE
936 	 * and actually are offline.
937 	 */
938 	mutex_enter(&cpu_lock);
939 	for (cp = bootcpu->cpu_next; cp != bootcpu; cp = cp->cpu_next) {
940 		/*
941 		 * Clear the CPU_FROZEN flag in all cases.
942 		 */
943 		cp->cpu_flags &= ~CPU_FROZEN;
944 
945 		if (CPU_CPR_IS_OFFLINE(cp))
946 			continue;
947 		if (CPU_ACTIVE(cp))
948 			continue;
949 		if ((rc = cpr_p_online(cp, CPU_CPR_ONLINE))) {
950 			mutex_exit(&cpu_lock);
951 			return (rc);
952 		}
953 	}
954 
955 	/*
956 	 * turn off the boot cpu if it was offlined
957 	 */
958 	if (CPU_CPR_IS_OFFLINE(bootcpu)) {
959 		if ((rc = cpr_p_online(bootcpu, CPU_CPR_OFFLINE))) {
960 			mutex_exit(&cpu_lock);
961 			return (rc);
962 		}
963 	}
964 	mutex_exit(&cpu_lock);
965 	return (0);
966 }
967 
968 static void
969 cpr_save_mp_state(void)
970 {
971 	cpu_t *cp;
972 
973 	ASSERT(MUTEX_HELD(&cpu_lock));
974 
975 	cp = cpu_list;
976 	do {
977 		cp->cpu_cpr_flags &= ~CPU_CPR_ONLINE;
978 		if (CPU_ACTIVE(cp))
979 			CPU_SET_CPR_FLAGS(cp, CPU_CPR_ONLINE);
980 	} while ((cp = cp->cpu_next) != cpu_list);
981 }
982 
983 /*
984  * change cpu to online/offline
985  */
986 static int
987 cpr_p_online(cpu_t *cp, int state)
988 {
989 	int rc;
990 
991 	ASSERT(MUTEX_HELD(&cpu_lock));
992 
993 	switch (state) {
994 	case CPU_CPR_ONLINE:
995 		rc = cpu_online(cp);
996 		break;
997 	case CPU_CPR_OFFLINE:
998 		rc = cpu_offline(cp, CPU_FORCED);
999 		break;
1000 	}
1001 	if (rc) {
1002 		cpr_err(CE_WARN, "Failed to change processor %d to "
1003 		    "state %d, (errno %d)", cp->cpu_id, state, rc);
1004 	}
1005 	return (rc);
1006 }
1007 
1008 /*
1009  * Construct the pathname of the state file and return a pointer to
1010  * caller.  Read the config file to get the mount point of the
1011  * filesystem and the pathname within fs.
1012  */
1013 char *
1014 cpr_build_statefile_path(void)
1015 {
1016 	struct cprconfig *cf = &cprconfig;
1017 
1018 	if (cpr_get_config())
1019 		return (NULL);
1020 
1021 	switch (cf->cf_type) {
1022 	case CFT_UFS:
1023 		if (strlen(cf->cf_path) + strlen(cf->cf_fs) >= MAXNAMELEN - 1) {
1024 			cpr_err(CE_CONT, "Statefile path is too long.\n");
1025 			return (NULL);
1026 		}
1027 		return (cpr_cprconfig_to_path());
1028 	case CFT_SPEC:
1029 		return (cf->cf_devfs);
1030 	default:
1031 		cpr_err(CE_PANIC, "invalid statefile type");
1032 		/*NOTREACHED*/
1033 		return (NULL);
1034 	}
1035 }
1036 
1037 int
1038 cpr_statefile_is_spec(void)
1039 {
1040 	if (cpr_get_config())
1041 		return (0);
1042 	return (cprconfig.cf_type == CFT_SPEC);
1043 }
1044 
1045 char *
1046 cpr_get_statefile_prom_path(void)
1047 {
1048 	struct cprconfig *cf = &cprconfig;
1049 
1050 	ASSERT(cprconfig_loaded);
1051 	ASSERT(cf->cf_magic == CPR_CONFIG_MAGIC);
1052 	ASSERT(cf->cf_type == CFT_SPEC);
1053 	return (cf->cf_dev_prom);
1054 }
1055 
1056 
1057 /*
1058  * XXX The following routines need to be in the vfs source code.
1059  */
1060 
1061 int
1062 cpr_is_ufs(struct vfs *vfsp)
1063 {
1064 	char *fsname;
1065 
1066 	fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1067 	return (strcmp(fsname, "ufs") == 0);
1068 }
1069 
1070 /*
1071  * This is a list of file systems that are allowed to be writeable when a
1072  * reusable statefile checkpoint is taken.  They must not have any state that
1073  * cannot be restored to consistency by simply rebooting using the checkpoint.
1074  * (In contrast to ufs, cachefs and pcfs which have disk state that could get
1075  * out of sync with the in-kernel data).
1076  */
1077 int
1078 cpr_reusable_mount_check(void)
1079 {
1080 	struct vfs *vfsp;
1081 	char *fsname;
1082 	char **cpp;
1083 	static char *cpr_writeok_fss[] = {
1084 		"autofs", "devfs", "fd", "lofs", "mntfs", "namefs", "nfs",
1085 		"proc", "tmpfs", "ctfs", "objfs", "dev", NULL
1086 	};
1087 
1088 	vfs_list_read_lock();
1089 	vfsp = rootvfs;
1090 	do {
1091 		if (vfsp->vfs_flag & VFS_RDONLY) {
1092 			vfsp = vfsp->vfs_next;
1093 			continue;
1094 		}
1095 		fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1096 		for (cpp = cpr_writeok_fss; *cpp; cpp++) {
1097 			if (strcmp(fsname, *cpp) == 0)
1098 				break;
1099 		}
1100 		/*
1101 		 * if the inner loop reached the NULL terminator,
1102 		 * the current fs-type does not match any OK-type
1103 		 */
1104 		if (*cpp == NULL) {
1105 			cpr_err(CE_CONT, "a filesystem of type %s is "
1106 			    "mounted read/write.\nReusable statefile requires "
1107 			    "no writeable filesystem of this type be mounted\n",
1108 			    fsname);
1109 			vfs_list_unlock();
1110 			return (EINVAL);
1111 		}
1112 		vfsp = vfsp->vfs_next;
1113 	} while (vfsp != rootvfs);
1114 	vfs_list_unlock();
1115 	return (0);
1116 }
1117 
1118 /*
1119  * return statefile offset in DEV_BSIZE units
1120  */
1121 int
1122 cpr_statefile_offset(void)
1123 {
1124 	return (cpr_statefile_is_spec() ? btod(CPR_SPEC_OFFSET) : 0);
1125 }
1126 
/*
 * Force a fresh read of the cprinfo per uadmin 3 call
 *
 * Clearing cprconfig_loaded makes the next cpr_get_config() call read
 * the config file again instead of returning early.
 */
void
cpr_forget_cprconfig(void)
{
	cprconfig_loaded = 0;
}
1135 #endif
1136