xref: /titanic_52/usr/src/uts/common/cpr/cpr_misc.c (revision 9113a79cf228b8f7bd509b1328adf88659dfe218)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/errno.h>
30 #include <sys/cpuvar.h>
31 #include <sys/vfs.h>
32 #include <sys/vnode.h>
33 #include <sys/pathname.h>
34 #include <sys/callb.h>
35 #include <sys/fs/ufs_inode.h>
36 #include <vm/anon.h>
37 #include <sys/fs/swapnode.h>	/* for swapfs_minfree */
38 #include <sys/kmem.h>
39 #include <sys/cpr.h>
40 #include <sys/conf.h>
41 
42 /*
43  * CPR miscellaneous support routines
44  */
/*
 * cpr_open: vn_open() a path held in kernel space (UIO_SYSSPACE),
 * passing create permissions 0600 and CRCREAT.
 */
#define	cpr_open(path, mode, vpp)	\
	(vn_open(path, UIO_SYSSPACE, mode, 0600, vpp, CRCREAT, 0))

/*
 * cpr_rdwr: vn_rdwr() cnt bytes at file offset 0 to or from a kernel
 * buffer using CRED(); no residual count pointer is supplied.
 */
#define	cpr_rdwr(rw, vp, basep, cnt)	\
	(vn_rdwr(rw, vp, (caddr_t)(basep), cnt, 0LL, UIO_SYSSPACE, 0, \
	    (rlim64_t)MAXOFF_T, CRED(), (ssize_t *)NULL))
50 
51 extern void clkset(time_t);
52 extern cpu_t *i_cpr_bootcpu(void);
53 extern caddr_t i_cpr_map_setup(void);
54 extern void i_cpr_free_memory_resources(void);
55 
56 extern kmutex_t cpr_slock;
57 extern size_t cpr_buf_size;
58 extern char *cpr_buf;
59 extern size_t cpr_pagedata_size;
60 extern char *cpr_pagedata;
61 extern int cpr_bufs_allocated;
62 extern int cpr_bitmaps_allocated;
63 
64 static struct cprconfig cprconfig;
65 static int cprconfig_loaded = 0;
66 static int cpr_statefile_ok(vnode_t *, int);
67 static int cpr_p_online(cpu_t *, int);
68 static void cpr_save_mp_state(void);
69 int cpr_is_ufs(struct vfs *);
70 
71 char cpr_default_path[] = CPR_DEFAULT;
72 
73 #define	COMPRESS_PERCENT 40	/* approx compression ratio in percent */
74 #define	SIZE_RATE	115	/* increase size by 15% */
75 #define	INTEGRAL	100	/* for integer math */
76 
77 
78 /*
79  * cmn_err() followed by a 1/4 second delay; this gives the
80  * logging service a chance to flush messages and helps avoid
81  * intermixing output from prom_printf().
82  */
83 /*PRINTFLIKE2*/
84 void
85 cpr_err(int ce, const char *fmt, ...)
86 {
87 	va_list adx;
88 
89 	va_start(adx, fmt);
90 	vcmn_err(ce, fmt, adx);
91 	va_end(adx);
92 	drv_usecwait(MICROSEC >> 2);
93 }
94 
95 
96 int
97 cpr_init(int fcn)
98 {
99 	/*
100 	 * Allow only one suspend/resume process.
101 	 */
102 	if (mutex_tryenter(&cpr_slock) == 0)
103 		return (EBUSY);
104 
105 	CPR->c_flags = 0;
106 	CPR->c_substate = 0;
107 	CPR->c_cprboot_magic = 0;
108 	CPR->c_alloc_cnt = 0;
109 
110 	CPR->c_fcn = fcn;
111 	if (fcn == AD_CPR_REUSABLE)
112 		CPR->c_flags |= C_REUSABLE;
113 	else
114 		CPR->c_flags |= C_SUSPENDING;
115 	if (fcn != AD_CPR_NOCOMPRESS && fcn != AD_CPR_TESTNOZ)
116 		CPR->c_flags |= C_COMPRESSING;
117 	/*
118 	 * reserve CPR_MAXCONTIG virtual pages for cpr_dump()
119 	 */
120 	CPR->c_mapping_area = i_cpr_map_setup();
121 	if (CPR->c_mapping_area == 0) {		/* no space in kernelmap */
122 		cpr_err(CE_CONT, "Unable to alloc from kernelmap.\n");
123 		mutex_exit(&cpr_slock);
124 		return (EAGAIN);
125 	}
126 	if (cpr_debug & CPR_DEBUG3)
127 		cpr_err(CE_CONT, "Reserved virtual range from 0x%p for writing "
128 		    "kas\n", (void *)CPR->c_mapping_area);
129 
130 	return (0);
131 }
132 
133 /*
134  * This routine releases any resources used during the checkpoint.
135  */
136 void
137 cpr_done(void)
138 {
139 	cpr_stat_cleanup();
140 	i_cpr_bitmap_cleanup();
141 
142 	/*
143 	 * Free pages used by cpr buffers.
144 	 */
145 	if (cpr_buf) {
146 		kmem_free(cpr_buf, cpr_buf_size);
147 		cpr_buf = NULL;
148 	}
149 	if (cpr_pagedata) {
150 		kmem_free(cpr_pagedata, cpr_pagedata_size);
151 		cpr_pagedata = NULL;
152 	}
153 
154 	i_cpr_free_memory_resources();
155 	mutex_exit(&cpr_slock);
156 	cpr_err(CE_CONT, "System has been resumed.\n");
157 }
158 
159 
160 /*
161  * reads config data into cprconfig
162  */
163 static int
164 cpr_get_config(void)
165 {
166 	static char config_path[] = CPR_CONFIG;
167 	struct cprconfig *cf = &cprconfig;
168 	struct vnode *vp;
169 	char *fmt;
170 	int err;
171 
172 	if (cprconfig_loaded)
173 		return (0);
174 
175 	fmt = "cannot %s config file \"%s\", error %d\n";
176 	if (err = vn_open(config_path, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0)) {
177 		cpr_err(CE_CONT, fmt, "open", config_path, err);
178 		return (err);
179 	}
180 
181 	err = cpr_rdwr(UIO_READ, vp, cf, sizeof (*cf));
182 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
183 	VN_RELE(vp);
184 	if (err) {
185 		cpr_err(CE_CONT, fmt, "read", config_path, err);
186 		return (err);
187 	}
188 
189 	if (cf->cf_magic == CPR_CONFIG_MAGIC)
190 		cprconfig_loaded = 1;
191 	else {
192 		cpr_err(CE_CONT, "invalid config file \"%s\", "
193 		    "rerun pmconfig(1M)\n", config_path);
194 		err = EINVAL;
195 	}
196 
197 	return (err);
198 }
199 
200 
201 /*
202  * concat fs and path fields of the cprconfig structure;
203  * returns pointer to the base of static data
204  */
205 static char *
206 cpr_cprconfig_to_path(void)
207 {
208 	static char full_path[MAXNAMELEN];
209 	struct cprconfig *cf = &cprconfig;
210 	char *ptr;
211 
212 	/*
213 	 * build /fs/path without extra '/'
214 	 */
215 	(void) strcpy(full_path, cf->cf_fs);
216 	if (strcmp(cf->cf_fs, "/"))
217 		(void) strcat(full_path, "/");
218 	ptr = cf->cf_path;
219 	if (*ptr == '/')
220 		ptr++;
221 	(void) strcat(full_path, ptr);
222 	return (full_path);
223 }
224 
225 
226 /*
227  * Verify that the information in the configuration file regarding the
228  * location for the statefile is still valid, depending on cf_type.
229  * for CFT_UFS, cf_fs must still be a mounted filesystem, it must be
230  *	mounted on the same device as when pmconfig was last run,
231  *	and the translation of that device to a node in the prom's
232  *	device tree must be the same as when pmconfig was last run.
233  * for CFT_SPEC, cf_path must be the path to a block special file,
234  *	it must have no file system mounted on it,
235  *	and the translation of that device to a node in the prom's
236  *	device tree must be the same as when pmconfig was last run.
237  */
238 static int
239 cpr_verify_statefile_path(void)
240 {
241 	struct cprconfig *cf = &cprconfig;
242 	static const char long_name[] = "Statefile pathname is too long.\n";
243 	static const char lookup_fmt[] = "Lookup failed for "
244 	    "cpr statefile device %s.\n";
245 	static const char path_chg_fmt[] = "Device path for statefile "
246 	    "has changed from %s to %s.\t%s\n";
247 	static const char rerun[] = "Please rerun pmconfig(1m).";
248 	struct vfs *vfsp = NULL, *vfsp_save = rootvfs;
249 	ufsvfs_t *ufsvfsp = (ufsvfs_t *)rootvfs->vfs_data;
250 	ufsvfs_t *ufsvfsp_save = ufsvfsp;
251 	int error;
252 	struct vnode *vp;
253 	char *slash, *tail, *longest;
254 	char *errstr;
255 	int found = 0;
256 	union {
257 		char un_devpath[OBP_MAXPATHLEN];
258 		char un_sfpath[MAXNAMELEN];
259 	} un;
260 #define	devpath	un.un_devpath
261 #define	sfpath	un.un_sfpath
262 
263 	ASSERT(cprconfig_loaded);
264 	/*
265 	 * We need not worry about locking or the timing of releasing
266 	 * the vnode, since we are single-threaded now.
267 	 */
268 
269 	switch (cf->cf_type) {
270 	case CFT_SPEC:
271 		if (strlen(cf->cf_path) > sizeof (sfpath)) {
272 			cpr_err(CE_CONT, long_name);
273 			return (ENAMETOOLONG);
274 		}
275 		if ((error = lookupname(cf->cf_devfs,
276 		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
277 			cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
278 			return (error);
279 		}
280 		if (vp->v_type != VBLK)
281 			errstr = "statefile must be a block device";
282 		else if (vfs_devismounted(vp->v_rdev))
283 			errstr = "statefile device must not "
284 			    "have a file system mounted on it";
285 		else if (IS_SWAPVP(vp))
286 			errstr = "statefile device must not "
287 			    "be configured as swap file";
288 		else
289 			errstr = NULL;
290 
291 		VN_RELE(vp);
292 		if (errstr) {
293 			cpr_err(CE_CONT, "%s.\n", errstr);
294 			return (ENOTSUP);
295 		}
296 
297 		error = i_devname_to_promname(cf->cf_devfs, devpath,
298 		    OBP_MAXPATHLEN);
299 		if (error || strcmp(devpath, cf->cf_dev_prom)) {
300 			cpr_err(CE_CONT, path_chg_fmt,
301 			    cf->cf_dev_prom, devpath, rerun);
302 		}
303 		return (error);
304 	case CFT_UFS:
305 		break;		/* don't indent all the original code */
306 	default:
307 		cpr_err(CE_PANIC, "invalid cf_type");
308 	}
309 
310 	/*
311 	 * The original code for UFS statefile
312 	 */
313 	if (strlen(cf->cf_fs) + strlen(cf->cf_path) + 2 > sizeof (sfpath)) {
314 		cpr_err(CE_CONT, long_name);
315 		return (ENAMETOOLONG);
316 	}
317 
318 	bzero(sfpath, sizeof (sfpath));
319 	(void) strcpy(sfpath, cpr_cprconfig_to_path());
320 
321 	if (*sfpath != '/') {
322 		cpr_err(CE_CONT, "Statefile pathname %s "
323 		    "must begin with a /\n", sfpath);
324 		return (EINVAL);
325 	}
326 
327 	/*
328 	 * Find the longest prefix of the statefile pathname which
329 	 * is the mountpoint of a filesystem.  This string must
330 	 * match the cf_fs field we read from the config file.  Other-
331 	 * wise the user has changed things without running pmconfig.
332 	 */
333 	tail = longest = sfpath + 1;	/* pt beyond the leading "/" */
334 	while ((slash = strchr(tail, '/')) != NULL) {
335 		*slash = '\0';	  /* temporarily terminate the string */
336 		if ((error = lookupname(sfpath,
337 		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
338 			*slash = '/';
339 			cpr_err(CE_CONT, "A directory in the "
340 			    "statefile path %s was not found.\n", sfpath);
341 			VN_RELE(vp);
342 
343 			return (error);
344 		}
345 
346 		vfs_list_read_lock();
347 		vfsp = rootvfs;
348 		do {
349 			ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
350 			if (ufsvfsp != NULL && ufsvfsp->vfs_root == vp) {
351 				found = 1;
352 				break;
353 			}
354 			vfsp = vfsp->vfs_next;
355 		} while (vfsp != rootvfs);
356 		vfs_list_unlock();
357 
358 		/*
359 		 * If we have found a filesystem mounted on the current
360 		 * path prefix, remember the end of the string in
361 		 * "longest".  If it happens to be the the exact fs
362 		 * saved in the configuration file, save the current
363 		 * ufsvfsp so we can make additional checks further down.
364 		 */
365 		if (found) {
366 			longest = slash;
367 			if (strcmp(cf->cf_fs, sfpath) == 0) {
368 				ufsvfsp_save = ufsvfsp;
369 				vfsp_save = vfsp;
370 			}
371 			found = 0;
372 		}
373 
374 		VN_RELE(vp);
375 		*slash = '/';
376 		tail = slash + 1;
377 	}
378 	*longest = '\0';
379 	if (cpr_is_ufs(vfsp_save) == 0 || strcmp(cf->cf_fs, sfpath)) {
380 		cpr_err(CE_CONT, "Filesystem containing "
381 		    "the statefile when pmconfig was run (%s) has "
382 		    "changed to %s. %s\n", cf->cf_fs, sfpath, rerun);
383 		return (EINVAL);
384 	}
385 
386 	if ((error = lookupname(cf->cf_devfs,
387 	    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
388 		cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
389 		return (error);
390 	}
391 
392 	if (ufsvfsp_save->vfs_devvp->v_rdev != vp->v_rdev) {
393 		cpr_err(CE_CONT, "Filesystem containing "
394 		    "statefile no longer mounted on device %s. "
395 		    "See power.conf(4).", cf->cf_devfs);
396 		VN_RELE(vp);
397 		return (ENXIO);
398 	}
399 	VN_RELE(vp);
400 
401 	error = i_devname_to_promname(cf->cf_devfs, devpath, OBP_MAXPATHLEN);
402 	if (error || strcmp(devpath, cf->cf_dev_prom)) {
403 		cpr_err(CE_CONT, path_chg_fmt,
404 		    cf->cf_dev_prom, devpath, rerun);
405 		return (error);
406 	}
407 
408 	return (0);
409 }
410 
411 /*
412  * Make sure that the statefile can be used as a block special statefile
413  * (meaning that is exists and has nothing mounted on it)
414  * Returns errno if not a valid statefile.
415  */
416 int
417 cpr_check_spec_statefile(void)
418 {
419 	int err;
420 
421 	if (err = cpr_get_config())
422 		return (err);
423 	ASSERT(cprconfig.cf_type == CFT_SPEC);
424 
425 	if (cprconfig.cf_devfs == NULL)
426 		return (ENXIO);
427 
428 	return (cpr_verify_statefile_path());
429 
430 }
431 
432 int
433 cpr_alloc_statefile(int alloc_retry)
434 {
435 	register int rc = 0;
436 	char *str;
437 
438 	/*
439 	 * Statefile size validation. If checkpoint the first time, disk blocks
440 	 * allocation will be done; otherwise, just do file size check.
441 	 * if statefile allocation is being retried, C_VP will be inited
442 	 */
443 	if (alloc_retry) {
444 		str = "\n-->Retrying statefile allocation...";
445 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
446 			prom_printf(str);
447 		if (C_VP->v_type != VBLK)
448 			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL);
449 	} else {
450 		/*
451 		 * Open an exiting file for writing, the state file needs to be
452 		 * pre-allocated since we can't and don't want to do allocation
453 		 * during checkpoint (too much of the OS is disabled).
454 		 *    - do a preliminary size checking here, if it is too small,
455 		 *	allocate more space internally and retry.
456 		 *    - check the vp to make sure it's the right type.
457 		 */
458 		char *path = cpr_build_statefile_path();
459 
460 		if (path == NULL)
461 			return (ENXIO);
462 		else if (rc = cpr_verify_statefile_path())
463 			return (rc);
464 
465 		if (rc = vn_open(path, UIO_SYSSPACE,
466 		    FCREAT|FWRITE, 0600, &C_VP, CRCREAT, 0)) {
467 			cpr_err(CE_WARN, "cannot open statefile %s", path);
468 			return (rc);
469 		}
470 	}
471 
472 	/*
473 	 * Only ufs and block special statefiles supported
474 	 */
475 	if (C_VP->v_type != VREG && C_VP->v_type != VBLK) {
476 		cpr_err(CE_CONT,
477 		    "Statefile must be regular file or block special file.");
478 		return (EACCES);
479 	}
480 
481 	if (rc = cpr_statefile_ok(C_VP, alloc_retry))
482 		return (rc);
483 
484 	if (C_VP->v_type != VBLK) {
485 		/*
486 		 * sync out the fs change due to the statefile reservation.
487 		 */
488 		(void) VFS_SYNC(C_VP->v_vfsp, 0, CRED());
489 
490 		/*
491 		 * Validate disk blocks allocation for the state file.
492 		 * Ask the file system prepare itself for the dump operation.
493 		 */
494 		if (rc = VOP_DUMPCTL(C_VP, DUMP_ALLOC, NULL)) {
495 			cpr_err(CE_CONT, "Error allocating "
496 			    "blocks for cpr statefile.");
497 			return (rc);
498 		}
499 	}
500 	return (0);
501 }
502 
503 
504 /*
505  * lookup device size in blocks,
506  * and return available space in bytes
507  */
508 size_t
509 cpr_get_devsize(dev_t dev)
510 {
511 	size_t bytes = 0;
512 	int64_t Nblocks;
513 	int nblocks;
514 
515 	if ((Nblocks = bdev_Size(dev)) != -1)
516 		bytes = dbtob(Nblocks);
517 	else if ((nblocks = bdev_size(dev)) != -1)
518 		bytes = dbtob(nblocks);
519 
520 	if (bytes > CPR_SPEC_OFFSET)
521 		bytes -= CPR_SPEC_OFFSET;
522 	else
523 		bytes = 0;
524 
525 	return (bytes);
526 }
527 
528 
529 /*
530  * increase statefile size
531  */
532 static int
533 cpr_grow_statefile(vnode_t *vp, u_longlong_t newsize)
534 {
535 	extern uchar_t cpr_pagecopy[];
536 	struct inode *ip = VTOI(vp);
537 	u_longlong_t offset;
538 	int error, increase;
539 	ssize_t resid;
540 
541 	rw_enter(&ip->i_contents, RW_READER);
542 	increase = (ip->i_size < newsize);
543 	offset = ip->i_size;
544 	rw_exit(&ip->i_contents);
545 
546 	if (increase == 0)
547 		return (0);
548 
549 	/*
550 	 * write to each logical block to reserve disk space
551 	 */
552 	error = 0;
553 	cpr_pagecopy[0] = '1';
554 	for (; offset < newsize; offset += ip->i_fs->fs_bsize) {
555 		if (error = vn_rdwr(UIO_WRITE, vp, (caddr_t)cpr_pagecopy,
556 		    ip->i_fs->fs_bsize, (offset_t)offset, UIO_SYSSPACE, 0,
557 		    (rlim64_t)MAXOFF_T, CRED(), &resid)) {
558 			if (error == ENOSPC) {
559 				cpr_err(CE_WARN, "error %d while reserving "
560 				    "disk space for statefile %s\n"
561 				    "wanted %lld bytes, file is %lld short",
562 				    error, cpr_cprconfig_to_path(),
563 				    newsize, newsize - offset);
564 			}
565 			break;
566 		}
567 	}
568 	return (error);
569 }
570 
571 
572 /*
573  * do a simple estimate of the space needed to hold the statefile
574  * taking compression into account, but be fairly conservative
575  * so we have a better chance of completing; when dump fails,
576  * the retry cost is fairly high.
577  *
578  * Do disk blocks allocation for the state file if no space has
579  * been allocated yet. Since the state file will not be removed,
580  * allocation should only be done once.
581  */
582 static int
583 cpr_statefile_ok(vnode_t *vp, int alloc_retry)
584 {
585 	extern size_t cpr_bitmap_size;
586 	struct inode *ip = VTOI(vp);
587 	const int UCOMP_RATE = 20; /* comp. ratio*10 for user pages */
588 	u_longlong_t size, isize, ksize, raw_data;
589 	char *str, *est_fmt;
590 	size_t space;
591 	int error;
592 
593 	/*
594 	 * number of pages short for swapping.
595 	 */
596 	STAT->cs_nosw_pages = k_anoninfo.ani_mem_resv;
597 	if (STAT->cs_nosw_pages < 0)
598 		STAT->cs_nosw_pages = 0;
599 
600 	str = "cpr_statefile_ok:";
601 
602 	CPR_DEBUG(CPR_DEBUG9, "Phys swap: max=%lu resv=%lu\n",
603 	    k_anoninfo.ani_max, k_anoninfo.ani_phys_resv);
604 	CPR_DEBUG(CPR_DEBUG9, "Mem swap: max=%ld resv=%lu\n",
605 	    MAX(availrmem - swapfs_minfree, 0),
606 	    k_anoninfo.ani_mem_resv);
607 	CPR_DEBUG(CPR_DEBUG9, "Total available swap: %ld\n",
608 		CURRENT_TOTAL_AVAILABLE_SWAP);
609 
610 	/*
611 	 * try increasing filesize by 15%
612 	 */
613 	if (alloc_retry) {
614 		/*
615 		 * block device doesn't get any bigger
616 		 */
617 		if (vp->v_type == VBLK) {
618 			if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
619 				prom_printf(
620 				    "Retry statefile on special file\n");
621 			return (ENOMEM);
622 		} else {
623 			rw_enter(&ip->i_contents, RW_READER);
624 			size = (ip->i_size * SIZE_RATE) / INTEGRAL;
625 			rw_exit(&ip->i_contents);
626 		}
627 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
628 			prom_printf("Retry statefile size = %lld\n", size);
629 	} else {
630 		u_longlong_t cpd_size;
631 		pgcnt_t npages, nback;
632 		int ndvram;
633 
634 		ndvram = 0;
635 		(void) callb_execute_class(CB_CL_CPR_FB,
636 		    (int)(uintptr_t)&ndvram);
637 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
638 			prom_printf("ndvram size = %d\n", ndvram);
639 
640 		/*
641 		 * estimate 1 cpd_t for every (CPR_MAXCONTIG / 2) pages
642 		 */
643 		npages = cpr_count_kpages(REGULAR_BITMAP, cpr_nobit);
644 		cpd_size = sizeof (cpd_t) * (npages / (CPR_MAXCONTIG / 2));
645 		raw_data = cpd_size + cpr_bitmap_size;
646 		ksize = ndvram + mmu_ptob(npages);
647 
648 		est_fmt = "%s estimated size with "
649 		    "%scompression %lld, ksize %lld\n";
650 		nback = mmu_ptob(STAT->cs_nosw_pages);
651 		if (CPR->c_flags & C_COMPRESSING) {
652 			size = ((ksize * COMPRESS_PERCENT) / INTEGRAL) +
653 			    raw_data + ((nback * 10) / UCOMP_RATE);
654 			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "", size, ksize);
655 		} else {
656 			size = ksize + raw_data + nback;
657 			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "no ",
658 			    size, ksize);
659 		}
660 	}
661 
662 	/*
663 	 * All this is much simpler for a block device
664 	 */
665 	if (vp->v_type == VBLK) {
666 		space = cpr_get_devsize(vp->v_rdev);
667 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
668 			prom_printf("statefile dev size %lu\n", space);
669 
670 		/*
671 		 * Export the estimated filesize info, this value will be
672 		 * compared before dumping out the statefile in the case of
673 		 * no compression.
674 		 */
675 		STAT->cs_est_statefsz = size;
676 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
677 			prom_printf("%s Estimated statefile size %llu, "
678 			    "space %lu\n", str, size, space);
679 		if (size > space) {
680 			cpr_err(CE_CONT, "Statefile partition too small.");
681 			return (ENOMEM);
682 		}
683 		return (0);
684 	} else {
685 		if (CPR->c_alloc_cnt++ > C_MAX_ALLOC_RETRY) {
686 			cpr_err(CE_CONT, "Statefile allocation retry failed\n");
687 			return (ENOMEM);
688 		}
689 
690 		/*
691 		 * Estimate space needed for the state file.
692 		 *
693 		 * State file size in bytes:
694 		 * 	kernel size + non-cache pte seg +
695 		 *	bitmap size + cpr state file headers size
696 		 * (round up to fs->fs_bsize)
697 		 */
698 		size = blkroundup(ip->i_fs, size);
699 
700 		/*
701 		 * Export the estimated filesize info, this value will be
702 		 * compared before dumping out the statefile in the case of
703 		 * no compression.
704 		 */
705 		STAT->cs_est_statefsz = size;
706 		error = cpr_grow_statefile(vp, size);
707 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) {
708 			rw_enter(&ip->i_contents, RW_READER);
709 			isize = ip->i_size;
710 			rw_exit(&ip->i_contents);
711 			prom_printf("%s Estimated statefile size %lld, "
712 			    "i_size %lld\n", str, size, isize);
713 		}
714 
715 		return (error);
716 	}
717 }
718 
719 
720 void
721 cpr_statef_close(void)
722 {
723 	if (C_VP) {
724 		if (!cpr_reusable_mode)
725 			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL);
726 		(void) VOP_CLOSE(C_VP, FWRITE, 1, (offset_t)0, CRED());
727 		VN_RELE(C_VP);
728 		C_VP = 0;
729 	}
730 }
731 
732 
733 /*
734  * open cpr default file and display error
735  */
736 int
737 cpr_open_deffile(int mode, vnode_t **vpp)
738 {
739 	int error;
740 
741 	if (error = cpr_open(cpr_default_path, mode, vpp))
742 		cpr_err(CE_CONT, "cannot open \"%s\", error %d\n",
743 		    cpr_default_path, error);
744 	return (error);
745 }
746 
747 
748 /*
749  * write cdef_t to disk.  This contains the original values of prom
750  * properties that we modify.  We fill in the magic number of the file
751  * here as a signal to the booter code that the state file is valid.
752  * Be sure the file gets synced, since we may be shutting down the OS.
753  */
754 int
755 cpr_write_deffile(cdef_t *cdef)
756 {
757 	struct vnode *vp;
758 	char *str;
759 	int rc;
760 
761 	if (rc = cpr_open_deffile(FCREAT|FWRITE, &vp))
762 		return (rc);
763 
764 	if (rc = cpr_rdwr(UIO_WRITE, vp, cdef, sizeof (*cdef)))
765 		str = "write";
766 	else if (rc = VOP_FSYNC(vp, FSYNC, CRED()))
767 		str = "fsync";
768 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED());
769 	VN_RELE(vp);
770 
771 	if (rc) {
772 		cpr_err(CE_WARN, "%s error %d, file \"%s\"",
773 		    str, rc, cpr_default_path);
774 	}
775 	return (rc);
776 }
777 
778 /*
779  * Clear the magic number in the defaults file.  This tells the booter
780  * program that the state file is not current and thus prevents
781  * any attempt to restore from an obsolete state file.
782  */
783 void
784 cpr_clear_definfo(void)
785 {
786 	struct vnode *vp;
787 	cmini_t mini;
788 
789 	if ((CPR->c_cprboot_magic != CPR_DEFAULT_MAGIC) ||
790 	    cpr_open_deffile(FCREAT|FWRITE, &vp))
791 		return;
792 	mini.magic = mini.reusable = 0;
793 	(void) cpr_rdwr(UIO_WRITE, vp, &mini, sizeof (mini));
794 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED());
795 	VN_RELE(vp);
796 }
797 
798 /*
799  * If the cpr default file is invalid, then we must not be in reusable mode
800  * if it is valid, it tells us our mode
801  */
802 int
803 cpr_get_reusable_mode(void)
804 {
805 	struct vnode *vp;
806 	cmini_t mini;
807 	int rc;
808 
809 	if (cpr_open(cpr_default_path, FREAD, &vp))
810 		return (0);
811 
812 	rc = cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini));
813 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
814 	VN_RELE(vp);
815 	if (rc == 0 && mini.magic == CPR_DEFAULT_MAGIC)
816 		return (mini.reusable);
817 
818 	return (0);
819 }
820 
821 /*
822  * clock/time related routines
823  */
824 static time_t   cpr_time_stamp;
825 
826 
827 void
828 cpr_tod_get(cpr_time_t *ctp)
829 {
830 	timestruc_t ts;
831 
832 	mutex_enter(&tod_lock);
833 	ts = tod_get();
834 	mutex_exit(&tod_lock);
835 	ctp->tv_sec = (time32_t)ts.tv_sec;
836 	ctp->tv_nsec = (int32_t)ts.tv_nsec;
837 }
838 
839 void
840 cpr_tod_fault_reset(void)
841 {
842 	mutex_enter(&tod_lock);
843 	tod_fault_reset();
844 	mutex_exit(&tod_lock);
845 }
846 
847 void
848 cpr_save_time(void)
849 {
850 	cpr_time_stamp = gethrestime_sec();
851 }
852 
853 /*
854  * correct time based on saved time stamp or hardware clock
855  */
856 void
857 cpr_restore_time(void)
858 {
859 	clkset(cpr_time_stamp);
860 }
861 
862 /*
863  * CPU ONLINE/OFFLINE CODE
864  */
865 int
866 cpr_mp_offline(void)
867 {
868 	cpu_t *cp, *bootcpu;
869 	int rc = 0;
870 	int brought_up_boot = 0;
871 
872 	/*
873 	 * Do nothing for UP.
874 	 */
875 	if (ncpus == 1)
876 		return (0);
877 
878 	mutex_enter(&cpu_lock);
879 
880 	cpr_save_mp_state();
881 
882 	bootcpu = i_cpr_bootcpu();
883 	if (!CPU_ACTIVE(bootcpu)) {
884 		if ((rc = cpr_p_online(bootcpu, CPU_CPR_ONLINE))) {
885 			mutex_exit(&cpu_lock);
886 			return (rc);
887 		}
888 		brought_up_boot = 1;
889 	}
890 
891 	cp = cpu_list;
892 	do {
893 		if (cp == bootcpu)
894 			continue;
895 		if (cp->cpu_flags & CPU_OFFLINE)
896 			continue;
897 		if ((rc = cpr_p_online(cp, CPU_CPR_OFFLINE))) {
898 			mutex_exit(&cpu_lock);
899 			return (rc);
900 		}
901 	} while ((cp = cp->cpu_next) != cpu_list);
902 	if (brought_up_boot && (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)))
903 		prom_printf("changed cpu %p to state %d\n",
904 		    bootcpu, CPU_CPR_ONLINE);
905 	mutex_exit(&cpu_lock);
906 
907 	return (rc);
908 }
909 
910 int
911 cpr_mp_online(void)
912 {
913 	cpu_t *cp, *bootcpu = CPU;
914 	int rc = 0;
915 
916 	/*
917 	 * Do nothing for UP.
918 	 */
919 	if (ncpus == 1)
920 		return (0);
921 
922 	/*
923 	 * cpr_save_mp_state() sets CPU_CPR_ONLINE in cpu_cpr_flags
924 	 * to indicate a cpu was online at the time of cpr_suspend();
925 	 * now restart those cpus that were marked as CPU_CPR_ONLINE
926 	 * and actually are offline.
927 	 */
928 	mutex_enter(&cpu_lock);
929 	for (cp = bootcpu->cpu_next; cp != bootcpu; cp = cp->cpu_next) {
930 		/*
931 		 * Clear the CPU_FROZEN flag in all cases.
932 		 */
933 		cp->cpu_flags &= ~CPU_FROZEN;
934 
935 		if (CPU_CPR_IS_OFFLINE(cp))
936 			continue;
937 		if (CPU_ACTIVE(cp))
938 			continue;
939 		if ((rc = cpr_p_online(cp, CPU_CPR_ONLINE))) {
940 			mutex_exit(&cpu_lock);
941 			return (rc);
942 		}
943 	}
944 
945 	/*
946 	 * turn off the boot cpu if it was offlined
947 	 */
948 	if (CPU_CPR_IS_OFFLINE(bootcpu)) {
949 		if ((rc = cpr_p_online(bootcpu, CPU_CPR_OFFLINE))) {
950 			mutex_exit(&cpu_lock);
951 			return (rc);
952 		}
953 	}
954 	mutex_exit(&cpu_lock);
955 	return (0);
956 }
957 
958 static void
959 cpr_save_mp_state(void)
960 {
961 	cpu_t *cp;
962 
963 	ASSERT(MUTEX_HELD(&cpu_lock));
964 
965 	cp = cpu_list;
966 	do {
967 		cp->cpu_cpr_flags &= ~CPU_CPR_ONLINE;
968 		if (CPU_ACTIVE(cp))
969 			CPU_SET_CPR_FLAGS(cp, CPU_CPR_ONLINE);
970 	} while ((cp = cp->cpu_next) != cpu_list);
971 }
972 
973 /*
974  * change cpu to online/offline
975  */
976 static int
977 cpr_p_online(cpu_t *cp, int state)
978 {
979 	int rc;
980 
981 	ASSERT(MUTEX_HELD(&cpu_lock));
982 
983 	switch (state) {
984 	case CPU_CPR_ONLINE:
985 		rc = cpu_online(cp);
986 		break;
987 	case CPU_CPR_OFFLINE:
988 		rc = cpu_offline(cp, CPU_FORCED);
989 		break;
990 	}
991 	if (rc) {
992 		cpr_err(CE_WARN, "Failed to change processor %d to "
993 		    "state %d, (errno %d)", cp->cpu_id, state, rc);
994 	}
995 	return (rc);
996 }
997 
998 /*
999  * Construct the pathname of the state file and return a pointer to
1000  * caller.  Read the config file to get the mount point of the
1001  * filesystem and the pathname within fs.
1002  */
1003 char *
1004 cpr_build_statefile_path(void)
1005 {
1006 	struct cprconfig *cf = &cprconfig;
1007 
1008 	if (cpr_get_config())
1009 		return (NULL);
1010 
1011 	switch (cf->cf_type) {
1012 	case CFT_UFS:
1013 		if (strlen(cf->cf_path) + strlen(cf->cf_fs) >= MAXNAMELEN - 1) {
1014 			cpr_err(CE_CONT, "Statefile path is too long.\n");
1015 			return (NULL);
1016 		}
1017 		return (cpr_cprconfig_to_path());
1018 	case CFT_SPEC:
1019 		return (cf->cf_devfs);
1020 	default:
1021 		cpr_err(CE_PANIC, "invalid statefile type");
1022 		/*NOTREACHED*/
1023 		return (NULL);
1024 	}
1025 }
1026 
1027 int
1028 cpr_statefile_is_spec(void)
1029 {
1030 	if (cpr_get_config())
1031 		return (0);
1032 	return (cprconfig.cf_type == CFT_SPEC);
1033 }
1034 
1035 char *
1036 cpr_get_statefile_prom_path(void)
1037 {
1038 	struct cprconfig *cf = &cprconfig;
1039 
1040 	ASSERT(cprconfig_loaded);
1041 	ASSERT(cf->cf_magic == CPR_CONFIG_MAGIC);
1042 	ASSERT(cf->cf_type == CFT_SPEC);
1043 	return (cf->cf_dev_prom);
1044 }
1045 
1046 
1047 /*
1048  * XXX The following routines need to be in the vfs source code.
1049  */
1050 
1051 int
1052 cpr_is_ufs(struct vfs *vfsp)
1053 {
1054 	char *fsname;
1055 
1056 	fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1057 	return (strcmp(fsname, "ufs") == 0);
1058 }
1059 
1060 /*
1061  * This is a list of file systems that are allowed to be writeable when a
1062  * reusable statefile checkpoint is taken.  They must not have any state that
1063  * cannot be restored to consistency by simply rebooting using the checkpoint.
1064  * (In contrast to ufs, cachefs and pcfs which have disk state that could get
1065  * out of sync with the in-kernel data).
1066  */
1067 int
1068 cpr_reusable_mount_check(void)
1069 {
1070 	struct vfs *vfsp;
1071 	char *fsname;
1072 	char **cpp;
1073 	static char *cpr_writeok_fss[] = {
1074 		"autofs", "devfs", "fd", "lofs", "mntfs", "namefs", "nfs",
1075 		"proc", "tmpfs", "ctfs", "objfs", "dev", NULL
1076 	};
1077 
1078 	vfs_list_read_lock();
1079 	vfsp = rootvfs;
1080 	do {
1081 		if (vfsp->vfs_flag & VFS_RDONLY) {
1082 			vfsp = vfsp->vfs_next;
1083 			continue;
1084 		}
1085 		fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1086 		for (cpp = cpr_writeok_fss; *cpp; cpp++) {
1087 			if (strcmp(fsname, *cpp) == 0)
1088 				break;
1089 		}
1090 		/*
1091 		 * if the inner loop reached the NULL terminator,
1092 		 * the current fs-type does not match any OK-type
1093 		 */
1094 		if (*cpp == NULL) {
1095 			cpr_err(CE_CONT, "a filesystem of type %s is "
1096 			    "mounted read/write.\nReusable statefile requires "
1097 			    "no writeable filesystem of this type be mounted\n",
1098 			    fsname);
1099 			vfs_list_unlock();
1100 			return (EINVAL);
1101 		}
1102 		vfsp = vfsp->vfs_next;
1103 	} while (vfsp != rootvfs);
1104 	vfs_list_unlock();
1105 	return (0);
1106 }
1107 
1108 /*
1109  * Force a fresh read of the cprinfo per uadmin 3 call
1110  */
1111 void
1112 cpr_forget_cprconfig(void)
1113 {
1114 	cprconfig_loaded = 0;
1115 }
1116 
1117 
1118 /*
1119  * return statefile offset in DEV_BSIZE units
1120  */
1121 int
1122 cpr_statefile_offset(void)
1123 {
1124 	return (cpr_statefile_is_spec() ? btod(CPR_SPEC_OFFSET) : 0);
1125 }
1126