xref: /titanic_52/usr/src/uts/common/cpr/cpr_misc.c (revision 2788047ed466bca04477ef667f8b70b9367bb0f8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/errno.h>
28 #include <sys/cpuvar.h>
29 #include <sys/vfs.h>
30 #include <sys/vnode.h>
31 #include <sys/pathname.h>
32 #include <sys/callb.h>
33 #include <sys/fs/ufs_inode.h>
34 #include <vm/anon.h>
35 #include <sys/fs/swapnode.h>	/* for swapfs_minfree */
36 #include <sys/kmem.h>
37 #include <sys/cpr.h>
38 #include <sys/conf.h>
39 #include <sys/machclock.h>
40 
41 /*
42  * CPR miscellaneous support routines
43  */
44 #define	cpr_open(path, mode,  vpp)	(vn_open(path, UIO_SYSSPACE, \
45 		mode, 0600, vpp, CRCREAT, 0))
46 #define	cpr_rdwr(rw, vp, basep, cnt)	(vn_rdwr(rw, vp,  (caddr_t)(basep), \
47 		cnt, 0LL, UIO_SYSSPACE, 0, (rlim64_t)MAXOFF_T, CRED(), \
48 		(ssize_t *)NULL))
49 
50 extern void clkset(time_t);
51 extern cpu_t *i_cpr_bootcpu(void);
52 extern caddr_t i_cpr_map_setup(void);
53 extern void i_cpr_free_memory_resources(void);
54 
55 extern kmutex_t cpr_slock;
56 extern size_t cpr_buf_size;
57 extern char *cpr_buf;
58 extern size_t cpr_pagedata_size;
59 extern char *cpr_pagedata;
60 extern int cpr_bufs_allocated;
61 extern int cpr_bitmaps_allocated;
62 
63 #if defined(__sparc)
64 static struct cprconfig cprconfig;
65 static int cprconfig_loaded = 0;
66 static int cpr_statefile_ok(vnode_t *, int);
67 static int cpr_p_online(cpu_t *, int);
68 static void cpr_save_mp_state(void);
69 #endif
70 
71 int cpr_is_ufs(struct vfs *);
72 int cpr_is_zfs(struct vfs *);
73 
74 char cpr_default_path[] = CPR_DEFAULT;
75 
76 #define	COMPRESS_PERCENT 40	/* approx compression ratio in percent */
77 #define	SIZE_RATE	115	/* increase size by 15% */
78 #define	INTEGRAL	100	/* for integer math */
79 
80 
81 /*
82  * cmn_err() followed by a 1/4 second delay; this gives the
83  * logging service a chance to flush messages and helps avoid
84  * intermixing output from prom_printf().
85  */
86 /*PRINTFLIKE2*/
87 void
88 cpr_err(int ce, const char *fmt, ...)
89 {
90 	va_list adx;
91 
92 	va_start(adx, fmt);
93 	vcmn_err(ce, fmt, adx);
94 	va_end(adx);
95 	drv_usecwait(MICROSEC >> 2);
96 }
97 
98 
99 int
100 cpr_init(int fcn)
101 {
102 	/*
103 	 * Allow only one suspend/resume process.
104 	 */
105 	if (mutex_tryenter(&cpr_slock) == 0)
106 		return (EBUSY);
107 
108 	CPR->c_flags = 0;
109 	CPR->c_substate = 0;
110 	CPR->c_cprboot_magic = 0;
111 	CPR->c_alloc_cnt = 0;
112 
113 	CPR->c_fcn = fcn;
114 	if (fcn == AD_CPR_REUSABLE)
115 		CPR->c_flags |= C_REUSABLE;
116 	else
117 		CPR->c_flags |= C_SUSPENDING;
118 	if (fcn == AD_SUSPEND_TO_RAM || fcn == DEV_SUSPEND_TO_RAM) {
119 		return (0);
120 	}
121 #if defined(__sparc)
122 	if (fcn != AD_CPR_NOCOMPRESS && fcn != AD_CPR_TESTNOZ)
123 		CPR->c_flags |= C_COMPRESSING;
124 	/*
125 	 * reserve CPR_MAXCONTIG virtual pages for cpr_dump()
126 	 */
127 	CPR->c_mapping_area = i_cpr_map_setup();
128 	if (CPR->c_mapping_area == 0) {		/* no space in kernelmap */
129 		cpr_err(CE_CONT, "Unable to alloc from kernelmap.\n");
130 		mutex_exit(&cpr_slock);
131 		return (EAGAIN);
132 	}
133 	if (cpr_debug & CPR_DEBUG3)
134 		cpr_err(CE_CONT, "Reserved virtual range from 0x%p for writing "
135 		    "kas\n", (void *)CPR->c_mapping_area);
136 #endif
137 
138 	return (0);
139 }
140 
141 /*
142  * This routine releases any resources used during the checkpoint.
143  */
144 void
145 cpr_done(void)
146 {
147 	cpr_stat_cleanup();
148 	i_cpr_bitmap_cleanup();
149 
150 	/*
151 	 * Free pages used by cpr buffers.
152 	 */
153 	if (cpr_buf) {
154 		kmem_free(cpr_buf, cpr_buf_size);
155 		cpr_buf = NULL;
156 	}
157 	if (cpr_pagedata) {
158 		kmem_free(cpr_pagedata, cpr_pagedata_size);
159 		cpr_pagedata = NULL;
160 	}
161 
162 	i_cpr_free_memory_resources();
163 	mutex_exit(&cpr_slock);
164 	cpr_err(CE_CONT, "System has been resumed.\n");
165 }
166 
167 
168 #if defined(__sparc)
169 /*
170  * reads config data into cprconfig
171  */
172 static int
173 cpr_get_config(void)
174 {
175 	static char config_path[] = CPR_CONFIG;
176 	struct cprconfig *cf = &cprconfig;
177 	struct vnode *vp;
178 	char *fmt;
179 	int err;
180 
181 	if (cprconfig_loaded)
182 		return (0);
183 
184 	fmt = "cannot %s config file \"%s\", error %d\n";
185 	if (err = vn_open(config_path, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0)) {
186 		cpr_err(CE_CONT, fmt, "open", config_path, err);
187 		return (err);
188 	}
189 
190 	err = cpr_rdwr(UIO_READ, vp, cf, sizeof (*cf));
191 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
192 	VN_RELE(vp);
193 	if (err) {
194 		cpr_err(CE_CONT, fmt, "read", config_path, err);
195 		return (err);
196 	}
197 
198 	if (cf->cf_magic == CPR_CONFIG_MAGIC)
199 		cprconfig_loaded = 1;
200 	else {
201 		cpr_err(CE_CONT, "invalid config file \"%s\", "
202 		    "rerun pmconfig(1M)\n", config_path);
203 		err = EINVAL;
204 	}
205 
206 	return (err);
207 }
208 
209 
210 /*
211  * concat fs and path fields of the cprconfig structure;
212  * returns pointer to the base of static data
213  */
214 static char *
215 cpr_cprconfig_to_path(void)
216 {
217 	static char full_path[MAXNAMELEN];
218 	struct cprconfig *cf = &cprconfig;
219 	char *ptr;
220 
221 	/*
222 	 * build /fs/path without extra '/'
223 	 */
224 	(void) strcpy(full_path, cf->cf_fs);
225 	if (strcmp(cf->cf_fs, "/"))
226 		(void) strcat(full_path, "/");
227 	ptr = cf->cf_path;
228 	if (*ptr == '/')
229 		ptr++;
230 	(void) strcat(full_path, ptr);
231 	return (full_path);
232 }
233 
234 
235 /*
236  * Verify that the information in the configuration file regarding the
237  * location for the statefile is still valid, depending on cf_type.
238  * for CFT_UFS, cf_fs must still be a mounted filesystem, it must be
239  *	mounted on the same device as when pmconfig was last run,
240  *	and the translation of that device to a node in the prom's
241  *	device tree must be the same as when pmconfig was last run.
242  * for CFT_SPEC and CFT_ZVOL, cf_path must be the path to a block
243  *      special file, it must have no file system mounted on it,
244  *	and the translation of that device to a node in the prom's
245  *	device tree must be the same as when pmconfig was last run.
246  */
247 static int
248 cpr_verify_statefile_path(void)
249 {
250 	struct cprconfig *cf = &cprconfig;
251 	static const char long_name[] = "Statefile pathname is too long.\n";
252 	static const char lookup_fmt[] = "Lookup failed for "
253 	    "cpr statefile device %s.\n";
254 	static const char path_chg_fmt[] = "Device path for statefile "
255 	    "has changed from %s to %s.\t%s\n";
256 	static const char rerun[] = "Please rerun pmconfig(1m).";
257 	struct vfs *vfsp = NULL, *vfsp_save = rootvfs;
258 	ufsvfs_t *ufsvfsp = (ufsvfs_t *)rootvfs->vfs_data;
259 	ufsvfs_t *ufsvfsp_save = ufsvfsp;
260 	int error;
261 	struct vnode *vp;
262 	char *slash, *tail, *longest;
263 	char *errstr;
264 	int found = 0;
265 	union {
266 		char un_devpath[OBP_MAXPATHLEN];
267 		char un_sfpath[MAXNAMELEN];
268 	} un;
269 #define	devpath	un.un_devpath
270 #define	sfpath	un.un_sfpath
271 
272 	ASSERT(cprconfig_loaded);
273 	/*
274 	 * We need not worry about locking or the timing of releasing
275 	 * the vnode, since we are single-threaded now.
276 	 */
277 
278 	switch (cf->cf_type) {
279 	case CFT_SPEC:
280 		error = i_devname_to_promname(cf->cf_devfs, devpath,
281 		    OBP_MAXPATHLEN);
282 		if (error || strcmp(devpath, cf->cf_dev_prom)) {
283 			cpr_err(CE_CONT, path_chg_fmt,
284 			    cf->cf_dev_prom, devpath, rerun);
285 			return (error);
286 		}
287 		/*FALLTHROUGH*/
288 	case CFT_ZVOL:
289 		if (strlen(cf->cf_path) > sizeof (sfpath)) {
290 			cpr_err(CE_CONT, long_name);
291 			return (ENAMETOOLONG);
292 		}
293 		if ((error = lookupname(cf->cf_devfs,
294 		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
295 			cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
296 			return (error);
297 		}
298 		if (vp->v_type != VBLK)
299 			errstr = "statefile must be a block device";
300 		else if (vfs_devismounted(vp->v_rdev))
301 			errstr = "statefile device must not "
302 			    "have a file system mounted on it";
303 		else if (IS_SWAPVP(vp))
304 			errstr = "statefile device must not "
305 			    "be configured as swap file";
306 		else
307 			errstr = NULL;
308 
309 		VN_RELE(vp);
310 		if (errstr) {
311 			cpr_err(CE_CONT, "%s.\n", errstr);
312 			return (ENOTSUP);
313 		}
314 
315 		return (error);
316 	case CFT_UFS:
317 		break;		/* don't indent all the original code */
318 	default:
319 		cpr_err(CE_PANIC, "invalid cf_type");
320 	}
321 
322 	/*
323 	 * The original code for UFS statefile
324 	 */
325 	if (strlen(cf->cf_fs) + strlen(cf->cf_path) + 2 > sizeof (sfpath)) {
326 		cpr_err(CE_CONT, long_name);
327 		return (ENAMETOOLONG);
328 	}
329 
330 	bzero(sfpath, sizeof (sfpath));
331 	(void) strcpy(sfpath, cpr_cprconfig_to_path());
332 
333 	if (*sfpath != '/') {
334 		cpr_err(CE_CONT, "Statefile pathname %s "
335 		    "must begin with a /\n", sfpath);
336 		return (EINVAL);
337 	}
338 
339 	/*
340 	 * Find the longest prefix of the statefile pathname which
341 	 * is the mountpoint of a filesystem.  This string must
342 	 * match the cf_fs field we read from the config file.  Other-
343 	 * wise the user has changed things without running pmconfig.
344 	 */
345 	tail = longest = sfpath + 1;	/* pt beyond the leading "/" */
346 	while ((slash = strchr(tail, '/')) != NULL) {
347 		*slash = '\0';	  /* temporarily terminate the string */
348 		if ((error = lookupname(sfpath,
349 		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
350 			*slash = '/';
351 			cpr_err(CE_CONT, "A directory in the "
352 			    "statefile path %s was not found.\n", sfpath);
353 			VN_RELE(vp);
354 
355 			return (error);
356 		}
357 
358 		vfs_list_read_lock();
359 		vfsp = rootvfs;
360 		do {
361 			ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
362 			if (ufsvfsp != NULL && ufsvfsp->vfs_root == vp) {
363 				found = 1;
364 				break;
365 			}
366 			vfsp = vfsp->vfs_next;
367 		} while (vfsp != rootvfs);
368 		vfs_list_unlock();
369 
370 		/*
371 		 * If we have found a filesystem mounted on the current
372 		 * path prefix, remember the end of the string in
373 		 * "longest".  If it happens to be the the exact fs
374 		 * saved in the configuration file, save the current
375 		 * ufsvfsp so we can make additional checks further down.
376 		 */
377 		if (found) {
378 			longest = slash;
379 			if (strcmp(cf->cf_fs, sfpath) == 0) {
380 				ufsvfsp_save = ufsvfsp;
381 				vfsp_save = vfsp;
382 			}
383 			found = 0;
384 		}
385 
386 		VN_RELE(vp);
387 		*slash = '/';
388 		tail = slash + 1;
389 	}
390 	*longest = '\0';
391 	if (cpr_is_ufs(vfsp_save) == 0 || strcmp(cf->cf_fs, sfpath)) {
392 		cpr_err(CE_CONT, "Filesystem containing "
393 		    "the statefile when pmconfig was run (%s) has "
394 		    "changed to %s. %s\n", cf->cf_fs, sfpath, rerun);
395 		return (EINVAL);
396 	}
397 
398 	if ((error = lookupname(cf->cf_devfs,
399 	    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
400 		cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
401 		return (error);
402 	}
403 
404 	if (ufsvfsp_save->vfs_devvp->v_rdev != vp->v_rdev) {
405 		cpr_err(CE_CONT, "Filesystem containing "
406 		    "statefile no longer mounted on device %s. "
407 		    "See power.conf(4).", cf->cf_devfs);
408 		VN_RELE(vp);
409 		return (ENXIO);
410 	}
411 	VN_RELE(vp);
412 
413 	error = i_devname_to_promname(cf->cf_devfs, devpath, OBP_MAXPATHLEN);
414 	if (error || strcmp(devpath, cf->cf_dev_prom)) {
415 		cpr_err(CE_CONT, path_chg_fmt,
416 		    cf->cf_dev_prom, devpath, rerun);
417 		return (error);
418 	}
419 
420 	return (0);
421 }
422 
423 /*
424  * Make sure that the statefile can be used as a block special statefile
425  * (meaning that is exists and has nothing mounted on it)
426  * Returns errno if not a valid statefile.
427  */
428 int
429 cpr_check_spec_statefile(void)
430 {
431 	int err;
432 
433 	if (err = cpr_get_config())
434 		return (err);
435 	ASSERT(cprconfig.cf_type == CFT_SPEC ||
436 	    cprconfig.cf_type == CFT_ZVOL);
437 
438 	if (cprconfig.cf_devfs == NULL)
439 		return (ENXIO);
440 
441 	return (cpr_verify_statefile_path());
442 
443 }
444 
445 int
446 cpr_alloc_statefile(int alloc_retry)
447 {
448 	register int rc = 0;
449 	char *str;
450 
451 	/*
452 	 * Statefile size validation. If checkpoint the first time, disk blocks
453 	 * allocation will be done; otherwise, just do file size check.
454 	 * if statefile allocation is being retried, C_VP will be inited
455 	 */
456 	if (alloc_retry) {
457 		str = "\n-->Retrying statefile allocation...";
458 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
459 			prom_printf(str);
460 		if (C_VP->v_type != VBLK)
461 			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
462 	} else {
463 		/*
464 		 * Open an exiting file for writing, the state file needs to be
465 		 * pre-allocated since we can't and don't want to do allocation
466 		 * during checkpoint (too much of the OS is disabled).
467 		 *    - do a preliminary size checking here, if it is too small,
468 		 *	allocate more space internally and retry.
469 		 *    - check the vp to make sure it's the right type.
470 		 */
471 		char *path = cpr_build_statefile_path();
472 
473 		if (path == NULL)
474 			return (ENXIO);
475 		else if (rc = cpr_verify_statefile_path())
476 			return (rc);
477 
478 		if (rc = vn_open(path, UIO_SYSSPACE,
479 		    FCREAT|FWRITE, 0600, &C_VP, CRCREAT, 0)) {
480 			cpr_err(CE_WARN, "cannot open statefile %s", path);
481 			return (rc);
482 		}
483 	}
484 
485 	/*
486 	 * Only ufs and block special statefiles supported
487 	 */
488 	if (C_VP->v_type != VREG && C_VP->v_type != VBLK) {
489 		cpr_err(CE_CONT,
490 		    "Statefile must be regular file or block special file.");
491 		return (EACCES);
492 	}
493 
494 	if (rc = cpr_statefile_ok(C_VP, alloc_retry))
495 		return (rc);
496 
497 	if (C_VP->v_type != VBLK) {
498 		/*
499 		 * sync out the fs change due to the statefile reservation.
500 		 */
501 		(void) VFS_SYNC(C_VP->v_vfsp, 0, CRED());
502 
503 		/*
504 		 * Validate disk blocks allocation for the state file.
505 		 * Ask the file system prepare itself for the dump operation.
506 		 */
507 		if (rc = VOP_DUMPCTL(C_VP, DUMP_ALLOC, NULL, NULL)) {
508 			cpr_err(CE_CONT, "Error allocating "
509 			    "blocks for cpr statefile.");
510 			return (rc);
511 		}
512 	}
513 	return (0);
514 }
515 
516 
517 /*
518  * Lookup device size and return available space in bytes.
519  * NOTE: Since prop_op(9E) can't tell the difference between a character
520  * and a block reference, it is ok to ask for "Size" instead of "Nblocks".
521  */
522 size_t
523 cpr_get_devsize(dev_t dev)
524 {
525 	size_t bytes = 0;
526 
527 	bytes = cdev_Size(dev);
528 	if (bytes == 0)
529 		bytes = cdev_size(dev);
530 
531 	if (bytes > CPR_SPEC_OFFSET)
532 		bytes -= CPR_SPEC_OFFSET;
533 	else
534 		bytes = 0;
535 
536 	return (bytes);
537 }
538 
539 
540 /*
541  * increase statefile size
542  */
543 static int
544 cpr_grow_statefile(vnode_t *vp, u_longlong_t newsize)
545 {
546 	extern uchar_t cpr_pagecopy[];
547 	struct inode *ip = VTOI(vp);
548 	u_longlong_t offset;
549 	int error, increase;
550 	ssize_t resid;
551 
552 	rw_enter(&ip->i_contents, RW_READER);
553 	increase = (ip->i_size < newsize);
554 	offset = ip->i_size;
555 	rw_exit(&ip->i_contents);
556 
557 	if (increase == 0)
558 		return (0);
559 
560 	/*
561 	 * write to each logical block to reserve disk space
562 	 */
563 	error = 0;
564 	cpr_pagecopy[0] = '1';
565 	for (; offset < newsize; offset += ip->i_fs->fs_bsize) {
566 		if (error = vn_rdwr(UIO_WRITE, vp, (caddr_t)cpr_pagecopy,
567 		    ip->i_fs->fs_bsize, (offset_t)offset, UIO_SYSSPACE, 0,
568 		    (rlim64_t)MAXOFF_T, CRED(), &resid)) {
569 			if (error == ENOSPC) {
570 				cpr_err(CE_WARN, "error %d while reserving "
571 				    "disk space for statefile %s\n"
572 				    "wanted %lld bytes, file is %lld short",
573 				    error, cpr_cprconfig_to_path(),
574 				    newsize, newsize - offset);
575 			}
576 			break;
577 		}
578 	}
579 	return (error);
580 }
581 
582 
583 /*
584  * do a simple estimate of the space needed to hold the statefile
585  * taking compression into account, but be fairly conservative
586  * so we have a better chance of completing; when dump fails,
587  * the retry cost is fairly high.
588  *
589  * Do disk blocks allocation for the state file if no space has
590  * been allocated yet. Since the state file will not be removed,
591  * allocation should only be done once.
592  */
593 static int
594 cpr_statefile_ok(vnode_t *vp, int alloc_retry)
595 {
596 	extern size_t cpr_bitmap_size;
597 	struct inode *ip = VTOI(vp);
598 	const int UCOMP_RATE = 20; /* comp. ratio*10 for user pages */
599 	u_longlong_t size, isize, ksize, raw_data;
600 	char *str, *est_fmt;
601 	size_t space;
602 	int error;
603 
604 	/*
605 	 * number of pages short for swapping.
606 	 */
607 	STAT->cs_nosw_pages = k_anoninfo.ani_mem_resv;
608 	if (STAT->cs_nosw_pages < 0)
609 		STAT->cs_nosw_pages = 0;
610 
611 	str = "cpr_statefile_ok:";
612 
613 	CPR_DEBUG(CPR_DEBUG9, "Phys swap: max=%lu resv=%lu\n",
614 	    k_anoninfo.ani_max, k_anoninfo.ani_phys_resv);
615 	CPR_DEBUG(CPR_DEBUG9, "Mem swap: max=%ld resv=%lu\n",
616 	    MAX(availrmem - swapfs_minfree, 0),
617 	    k_anoninfo.ani_mem_resv);
618 	CPR_DEBUG(CPR_DEBUG9, "Total available swap: %ld\n",
619 	    CURRENT_TOTAL_AVAILABLE_SWAP);
620 
621 	/*
622 	 * try increasing filesize by 15%
623 	 */
624 	if (alloc_retry) {
625 		/*
626 		 * block device doesn't get any bigger
627 		 */
628 		if (vp->v_type == VBLK) {
629 			if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
630 				prom_printf(
631 				    "Retry statefile on special file\n");
632 			return (ENOMEM);
633 		} else {
634 			rw_enter(&ip->i_contents, RW_READER);
635 			size = (ip->i_size * SIZE_RATE) / INTEGRAL;
636 			rw_exit(&ip->i_contents);
637 		}
638 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
639 			prom_printf("Retry statefile size = %lld\n", size);
640 	} else {
641 		u_longlong_t cpd_size;
642 		pgcnt_t npages, nback;
643 		int ndvram;
644 
645 		ndvram = 0;
646 		(void) callb_execute_class(CB_CL_CPR_FB,
647 		    (int)(uintptr_t)&ndvram);
648 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
649 			prom_printf("ndvram size = %d\n", ndvram);
650 
651 		/*
652 		 * estimate 1 cpd_t for every (CPR_MAXCONTIG / 2) pages
653 		 */
654 		npages = cpr_count_kpages(REGULAR_BITMAP, cpr_nobit);
655 		cpd_size = sizeof (cpd_t) * (npages / (CPR_MAXCONTIG / 2));
656 		raw_data = cpd_size + cpr_bitmap_size;
657 		ksize = ndvram + mmu_ptob(npages);
658 
659 		est_fmt = "%s estimated size with "
660 		    "%scompression %lld, ksize %lld\n";
661 		nback = mmu_ptob(STAT->cs_nosw_pages);
662 		if (CPR->c_flags & C_COMPRESSING) {
663 			size = ((ksize * COMPRESS_PERCENT) / INTEGRAL) +
664 			    raw_data + ((nback * 10) / UCOMP_RATE);
665 			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "", size, ksize);
666 		} else {
667 			size = ksize + raw_data + nback;
668 			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "no ",
669 			    size, ksize);
670 		}
671 	}
672 
673 	/*
674 	 * All this is much simpler for a block device
675 	 */
676 	if (vp->v_type == VBLK) {
677 		space = cpr_get_devsize(vp->v_rdev);
678 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
679 			prom_printf("statefile dev size %lu\n", space);
680 
681 		/*
682 		 * Export the estimated filesize info, this value will be
683 		 * compared before dumping out the statefile in the case of
684 		 * no compression.
685 		 */
686 		STAT->cs_est_statefsz = size;
687 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
688 			prom_printf("%s Estimated statefile size %llu, "
689 			    "space %lu\n", str, size, space);
690 		if (size > space) {
691 			cpr_err(CE_CONT, "Statefile partition too small.");
692 			return (ENOMEM);
693 		}
694 		return (0);
695 	} else {
696 		if (CPR->c_alloc_cnt++ > C_MAX_ALLOC_RETRY) {
697 			cpr_err(CE_CONT, "Statefile allocation retry failed\n");
698 			return (ENOMEM);
699 		}
700 
701 		/*
702 		 * Estimate space needed for the state file.
703 		 *
704 		 * State file size in bytes:
705 		 * 	kernel size + non-cache pte seg +
706 		 *	bitmap size + cpr state file headers size
707 		 * (round up to fs->fs_bsize)
708 		 */
709 		size = blkroundup(ip->i_fs, size);
710 
711 		/*
712 		 * Export the estimated filesize info, this value will be
713 		 * compared before dumping out the statefile in the case of
714 		 * no compression.
715 		 */
716 		STAT->cs_est_statefsz = size;
717 		error = cpr_grow_statefile(vp, size);
718 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) {
719 			rw_enter(&ip->i_contents, RW_READER);
720 			isize = ip->i_size;
721 			rw_exit(&ip->i_contents);
722 			prom_printf("%s Estimated statefile size %lld, "
723 			    "i_size %lld\n", str, size, isize);
724 		}
725 
726 		return (error);
727 	}
728 }
729 
730 
731 void
732 cpr_statef_close(void)
733 {
734 	if (C_VP) {
735 		if (!cpr_reusable_mode)
736 			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
737 		(void) VOP_CLOSE(C_VP, FWRITE, 1, (offset_t)0, CRED(), NULL);
738 		VN_RELE(C_VP);
739 		C_VP = 0;
740 	}
741 }
742 
743 
744 /*
745  * open cpr default file and display error
746  */
747 int
748 cpr_open_deffile(int mode, vnode_t **vpp)
749 {
750 	int error;
751 
752 	if (error = cpr_open(cpr_default_path, mode, vpp))
753 		cpr_err(CE_CONT, "cannot open \"%s\", error %d\n",
754 		    cpr_default_path, error);
755 	return (error);
756 }
757 
758 
759 /*
760  * write cdef_t to disk.  This contains the original values of prom
761  * properties that we modify.  We fill in the magic number of the file
762  * here as a signal to the booter code that the state file is valid.
763  * Be sure the file gets synced, since we may be shutting down the OS.
764  */
765 int
766 cpr_write_deffile(cdef_t *cdef)
767 {
768 	struct vnode *vp;
769 	char *str;
770 	int rc;
771 
772 	if (rc = cpr_open_deffile(FCREAT|FWRITE, &vp))
773 		return (rc);
774 
775 	if (rc = cpr_rdwr(UIO_WRITE, vp, cdef, sizeof (*cdef)))
776 		str = "write";
777 	else if (rc = VOP_FSYNC(vp, FSYNC, CRED(), NULL))
778 		str = "fsync";
779 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
780 	VN_RELE(vp);
781 
782 	if (rc) {
783 		cpr_err(CE_WARN, "%s error %d, file \"%s\"",
784 		    str, rc, cpr_default_path);
785 	}
786 	return (rc);
787 }
788 
789 /*
790  * Clear the magic number in the defaults file.  This tells the booter
791  * program that the state file is not current and thus prevents
792  * any attempt to restore from an obsolete state file.
793  */
794 void
795 cpr_clear_definfo(void)
796 {
797 	struct vnode *vp;
798 	cmini_t mini;
799 
800 	if ((CPR->c_cprboot_magic != CPR_DEFAULT_MAGIC) ||
801 	    cpr_open_deffile(FCREAT|FWRITE, &vp))
802 		return;
803 	mini.magic = mini.reusable = 0;
804 	(void) cpr_rdwr(UIO_WRITE, vp, &mini, sizeof (mini));
805 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
806 	VN_RELE(vp);
807 }
808 
809 /*
810  * If the cpr default file is invalid, then we must not be in reusable mode
811  * if it is valid, it tells us our mode
812  */
813 int
814 cpr_get_reusable_mode(void)
815 {
816 	struct vnode *vp;
817 	cmini_t mini;
818 	int rc;
819 
820 	if (cpr_open(cpr_default_path, FREAD, &vp))
821 		return (0);
822 
823 	rc = cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini));
824 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
825 	VN_RELE(vp);
826 	if (rc == 0 && mini.magic == CPR_DEFAULT_MAGIC)
827 		return (mini.reusable);
828 
829 	return (0);
830 }
831 #endif
832 
833 /*
834  * clock/time related routines
835  */
836 static time_t   cpr_time_stamp;
837 
838 
839 void
840 cpr_tod_get(cpr_time_t *ctp)
841 {
842 	timestruc_t ts;
843 
844 	mutex_enter(&tod_lock);
845 	ts = TODOP_GET(tod_ops);
846 	mutex_exit(&tod_lock);
847 	ctp->tv_sec = (time32_t)ts.tv_sec;
848 	ctp->tv_nsec = (int32_t)ts.tv_nsec;
849 }
850 
851 void
852 cpr_tod_status_set(int tod_flag)
853 {
854 	mutex_enter(&tod_lock);
855 	tod_status_set(tod_flag);
856 	mutex_exit(&tod_lock);
857 }
858 
859 void
860 cpr_save_time(void)
861 {
862 	cpr_time_stamp = gethrestime_sec();
863 }
864 
865 /*
866  * correct time based on saved time stamp or hardware clock
867  */
868 void
869 cpr_restore_time(void)
870 {
871 	clkset(cpr_time_stamp);
872 }
873 
874 #if defined(__sparc)
875 /*
876  * CPU ONLINE/OFFLINE CODE
877  */
878 int
879 cpr_mp_offline(void)
880 {
881 	cpu_t *cp, *bootcpu;
882 	int rc = 0;
883 	int brought_up_boot = 0;
884 
885 	/*
886 	 * Do nothing for UP.
887 	 */
888 	if (ncpus == 1)
889 		return (0);
890 
891 	mutex_enter(&cpu_lock);
892 
893 	cpr_save_mp_state();
894 
895 	bootcpu = i_cpr_bootcpu();
896 	if (!CPU_ACTIVE(bootcpu)) {
897 		if ((rc = cpr_p_online(bootcpu, CPU_CPR_ONLINE))) {
898 			mutex_exit(&cpu_lock);
899 			return (rc);
900 		}
901 		brought_up_boot = 1;
902 	}
903 
904 	cp = cpu_list;
905 	do {
906 		if (cp == bootcpu)
907 			continue;
908 		if (cp->cpu_flags & CPU_OFFLINE)
909 			continue;
910 		if ((rc = cpr_p_online(cp, CPU_CPR_OFFLINE))) {
911 			mutex_exit(&cpu_lock);
912 			return (rc);
913 		}
914 	} while ((cp = cp->cpu_next) != cpu_list);
915 	if (brought_up_boot && (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)))
916 		prom_printf("changed cpu %p to state %d\n",
917 		    (void *)bootcpu, CPU_CPR_ONLINE);
918 	mutex_exit(&cpu_lock);
919 
920 	return (rc);
921 }
922 
923 int
924 cpr_mp_online(void)
925 {
926 	cpu_t *cp, *bootcpu = CPU;
927 	int rc = 0;
928 
929 	/*
930 	 * Do nothing for UP.
931 	 */
932 	if (ncpus == 1)
933 		return (0);
934 
935 	/*
936 	 * cpr_save_mp_state() sets CPU_CPR_ONLINE in cpu_cpr_flags
937 	 * to indicate a cpu was online at the time of cpr_suspend();
938 	 * now restart those cpus that were marked as CPU_CPR_ONLINE
939 	 * and actually are offline.
940 	 */
941 	mutex_enter(&cpu_lock);
942 	for (cp = bootcpu->cpu_next; cp != bootcpu; cp = cp->cpu_next) {
943 		/*
944 		 * Clear the CPU_FROZEN flag in all cases.
945 		 */
946 		cp->cpu_flags &= ~CPU_FROZEN;
947 
948 		if (CPU_CPR_IS_OFFLINE(cp))
949 			continue;
950 		if (CPU_ACTIVE(cp))
951 			continue;
952 		if ((rc = cpr_p_online(cp, CPU_CPR_ONLINE))) {
953 			mutex_exit(&cpu_lock);
954 			return (rc);
955 		}
956 	}
957 
958 	/*
959 	 * turn off the boot cpu if it was offlined
960 	 */
961 	if (CPU_CPR_IS_OFFLINE(bootcpu)) {
962 		if ((rc = cpr_p_online(bootcpu, CPU_CPR_OFFLINE))) {
963 			mutex_exit(&cpu_lock);
964 			return (rc);
965 		}
966 	}
967 	mutex_exit(&cpu_lock);
968 	return (0);
969 }
970 
971 static void
972 cpr_save_mp_state(void)
973 {
974 	cpu_t *cp;
975 
976 	ASSERT(MUTEX_HELD(&cpu_lock));
977 
978 	cp = cpu_list;
979 	do {
980 		cp->cpu_cpr_flags &= ~CPU_CPR_ONLINE;
981 		if (CPU_ACTIVE(cp))
982 			CPU_SET_CPR_FLAGS(cp, CPU_CPR_ONLINE);
983 	} while ((cp = cp->cpu_next) != cpu_list);
984 }
985 
986 /*
987  * change cpu to online/offline
988  */
989 static int
990 cpr_p_online(cpu_t *cp, int state)
991 {
992 	int rc;
993 
994 	ASSERT(MUTEX_HELD(&cpu_lock));
995 
996 	switch (state) {
997 	case CPU_CPR_ONLINE:
998 		rc = cpu_online(cp);
999 		break;
1000 	case CPU_CPR_OFFLINE:
1001 		rc = cpu_offline(cp, CPU_FORCED);
1002 		break;
1003 	}
1004 	if (rc) {
1005 		cpr_err(CE_WARN, "Failed to change processor %d to "
1006 		    "state %d, (errno %d)", cp->cpu_id, state, rc);
1007 	}
1008 	return (rc);
1009 }
1010 
1011 /*
1012  * Construct the pathname of the state file and return a pointer to
1013  * caller.  Read the config file to get the mount point of the
1014  * filesystem and the pathname within fs.
1015  */
1016 char *
1017 cpr_build_statefile_path(void)
1018 {
1019 	struct cprconfig *cf = &cprconfig;
1020 
1021 	if (cpr_get_config())
1022 		return (NULL);
1023 
1024 	switch (cf->cf_type) {
1025 	case CFT_UFS:
1026 		if (strlen(cf->cf_path) + strlen(cf->cf_fs) >= MAXNAMELEN - 1) {
1027 			cpr_err(CE_CONT, "Statefile path is too long.\n");
1028 			return (NULL);
1029 		}
1030 		return (cpr_cprconfig_to_path());
1031 	case CFT_ZVOL:
1032 		/*FALLTHROUGH*/
1033 	case CFT_SPEC:
1034 		return (cf->cf_devfs);
1035 	default:
1036 		cpr_err(CE_PANIC, "invalid statefile type");
1037 		/*NOTREACHED*/
1038 		return (NULL);
1039 	}
1040 }
1041 
1042 int
1043 cpr_statefile_is_spec(void)
1044 {
1045 	if (cpr_get_config())
1046 		return (0);
1047 	return (cprconfig.cf_type == CFT_SPEC);
1048 }
1049 
1050 char *
1051 cpr_get_statefile_prom_path(void)
1052 {
1053 	struct cprconfig *cf = &cprconfig;
1054 
1055 	ASSERT(cprconfig_loaded);
1056 	ASSERT(cf->cf_magic == CPR_CONFIG_MAGIC);
1057 	ASSERT(cf->cf_type == CFT_SPEC || cf->cf_type == CFT_ZVOL);
1058 	return (cf->cf_dev_prom);
1059 }
1060 
1061 
1062 /*
1063  * XXX The following routines need to be in the vfs source code.
1064  */
1065 
1066 int
1067 cpr_is_ufs(struct vfs *vfsp)
1068 {
1069 	char *fsname;
1070 
1071 	fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1072 	return (strcmp(fsname, "ufs") == 0);
1073 }
1074 
1075 int
1076 cpr_is_zfs(struct vfs *vfsp)
1077 {
1078 	char *fsname;
1079 
1080 	fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1081 	return (strcmp(fsname, "zfs") == 0);
1082 }
1083 
1084 /*
1085  * This is a list of file systems that are allowed to be writeable when a
1086  * reusable statefile checkpoint is taken.  They must not have any state that
1087  * cannot be restored to consistency by simply rebooting using the checkpoint.
1088  * (In contrast to ufs, cachefs and pcfs which have disk state that could get
1089  * out of sync with the in-kernel data).
1090  */
1091 int
1092 cpr_reusable_mount_check(void)
1093 {
1094 	struct vfs *vfsp;
1095 	char *fsname;
1096 	char **cpp;
1097 	static char *cpr_writeok_fss[] = {
1098 		"autofs", "devfs", "fd", "lofs", "mntfs", "namefs", "nfs",
1099 		"proc", "tmpfs", "ctfs", "objfs", "dev", NULL
1100 	};
1101 
1102 	vfs_list_read_lock();
1103 	vfsp = rootvfs;
1104 	do {
1105 		if (vfsp->vfs_flag & VFS_RDONLY) {
1106 			vfsp = vfsp->vfs_next;
1107 			continue;
1108 		}
1109 		fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1110 		for (cpp = cpr_writeok_fss; *cpp; cpp++) {
1111 			if (strcmp(fsname, *cpp) == 0)
1112 				break;
1113 		}
1114 		/*
1115 		 * if the inner loop reached the NULL terminator,
1116 		 * the current fs-type does not match any OK-type
1117 		 */
1118 		if (*cpp == NULL) {
1119 			cpr_err(CE_CONT, "a filesystem of type %s is "
1120 			    "mounted read/write.\nReusable statefile requires "
1121 			    "no writeable filesystem of this type be mounted\n",
1122 			    fsname);
1123 			vfs_list_unlock();
1124 			return (EINVAL);
1125 		}
1126 		vfsp = vfsp->vfs_next;
1127 	} while (vfsp != rootvfs);
1128 	vfs_list_unlock();
1129 	return (0);
1130 }
1131 
1132 /*
1133  * return statefile offset in DEV_BSIZE units
1134  */
1135 int
1136 cpr_statefile_offset(void)
1137 {
1138 	return (cprconfig.cf_type != CFT_UFS ? btod(CPR_SPEC_OFFSET) : 0);
1139 }
1140 
1141 /*
1142  * Force a fresh read of the cprinfo per uadmin 3 call
1143  */
1144 void
1145 cpr_forget_cprconfig(void)
1146 {
1147 	cprconfig_loaded = 0;
1148 }
1149 #endif
1150