xref: /illumos-gate/usr/src/uts/common/cpr/cpr_misc.c (revision 753d2d2e8e7fd0c9bcf736d9bf2f2faf4d6234cc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/errno.h>
30 #include <sys/cpuvar.h>
31 #include <sys/vfs.h>
32 #include <sys/vnode.h>
33 #include <sys/pathname.h>
34 #include <sys/callb.h>
35 #include <sys/fs/ufs_inode.h>
36 #include <vm/anon.h>
37 #include <sys/fs/swapnode.h>	/* for swapfs_minfree */
38 #include <sys/kmem.h>
39 #include <sys/cpr.h>
40 #include <sys/conf.h>
41 
/*
 * CPR miscellaneous support routines
 */

/*
 * cpr_open: vn_open() on a kernel-space (UIO_SYSSPACE) pathname, always
 * passing create mode 0600 and CRCREAT.
 * cpr_rdwr: vn_rdwr() of cnt bytes at file offset 0 with the caller's
 * credentials (CRED()); no residual count is requested (NULL pointer).
 */
#define	cpr_open(path, mode,  vpp)	(vn_open(path, UIO_SYSSPACE, \
		mode, 0600, vpp, CRCREAT, 0))
#define	cpr_rdwr(rw, vp, basep, cnt)	(vn_rdwr(rw, vp,  (caddr_t)(basep), \
		cnt, 0LL, UIO_SYSSPACE, 0, (rlim64_t)MAXOFF_T, CRED(), \
		(ssize_t *)NULL))

/* routines defined outside this file */
extern void clkset(time_t);
extern cpu_t *i_cpr_bootcpu(void);
extern caddr_t i_cpr_map_setup(void);
extern void i_cpr_free_memory_resources(void);

/*
 * Shared cpr state defined outside this file.  cpr_slock is taken in
 * cpr_init() and dropped in cpr_done(); cpr_buf and cpr_pagedata are
 * freed (using their *_size companions) in cpr_done().
 */
extern kmutex_t cpr_slock;
extern size_t cpr_buf_size;
extern char *cpr_buf;
extern size_t cpr_pagedata_size;
extern char *cpr_pagedata;
extern int cpr_bufs_allocated;
extern int cpr_bitmaps_allocated;

/*
 * cprconfig holds the contents of the config file once cpr_get_config()
 * has read it; cprconfig_loaded is set after a copy with a valid magic
 * number has been read and is cleared again by cpr_forget_cprconfig().
 */
static struct cprconfig cprconfig;
static int cprconfig_loaded = 0;
static int cpr_statefile_ok(vnode_t *, int);
static int cpr_p_online(cpu_t *, int);
static void cpr_save_mp_state(void);
int cpr_is_ufs(struct vfs *);

/* pathname of the cpr default file, see cpr_open_deffile() */
char cpr_default_path[] = CPR_DEFAULT;

/* constants used when estimating and growing the statefile size */
#define	COMPRESS_PERCENT 40	/* approx compression ratio in percent */
#define	SIZE_RATE	115	/* increase size by 15% */
#define	INTEGRAL	100	/* for integer math */
76 
77 
78 /*
79  * cmn_err() followed by a 1/4 second delay; this gives the
80  * logging service a chance to flush messages and helps avoid
81  * intermixing output from prom_printf().
82  */
83 /*PRINTFLIKE2*/
84 void
85 cpr_err(int ce, const char *fmt, ...)
86 {
87 	va_list adx;
88 
89 	va_start(adx, fmt);
90 	vcmn_err(ce, fmt, adx);
91 	va_end(adx);
92 	drv_usecwait(MICROSEC >> 2);
93 }
94 
95 
96 int
97 cpr_init(int fcn)
98 {
99 	/*
100 	 * Allow only one suspend/resume process.
101 	 */
102 	if (mutex_tryenter(&cpr_slock) == 0)
103 		return (EBUSY);
104 
105 	CPR->c_flags = 0;
106 	CPR->c_substate = 0;
107 	CPR->c_cprboot_magic = 0;
108 	CPR->c_alloc_cnt = 0;
109 
110 	CPR->c_fcn = fcn;
111 	if (fcn == AD_CPR_REUSABLE)
112 		CPR->c_flags |= C_REUSABLE;
113 	else
114 		CPR->c_flags |= C_SUSPENDING;
115 	if (fcn != AD_CPR_NOCOMPRESS && fcn != AD_CPR_TESTNOZ)
116 		CPR->c_flags |= C_COMPRESSING;
117 	/*
118 	 * reserve CPR_MAXCONTIG virtual pages for cpr_dump()
119 	 */
120 	CPR->c_mapping_area = i_cpr_map_setup();
121 	if (CPR->c_mapping_area == 0) {		/* no space in kernelmap */
122 		cpr_err(CE_CONT, "Unable to alloc from kernelmap.\n");
123 		mutex_exit(&cpr_slock);
124 		return (EAGAIN);
125 	}
126 	DEBUG3(cpr_err(CE_CONT, "Reserved virtual range from 0x%p for writing "
127 	    "kas\n", (void *)CPR->c_mapping_area));
128 
129 	return (0);
130 }
131 
132 /*
133  * This routine releases any resources used during the checkpoint.
134  */
135 void
136 cpr_done(void)
137 {
138 	cpr_stat_cleanup();
139 	i_cpr_bitmap_cleanup();
140 
141 	/*
142 	 * Free pages used by cpr buffers.
143 	 */
144 	if (cpr_buf) {
145 		kmem_free(cpr_buf, cpr_buf_size);
146 		cpr_buf = NULL;
147 	}
148 	if (cpr_pagedata) {
149 		kmem_free(cpr_pagedata, cpr_pagedata_size);
150 		cpr_pagedata = NULL;
151 	}
152 
153 	i_cpr_free_memory_resources();
154 	mutex_exit(&cpr_slock);
155 	cpr_err(CE_CONT, "System has been resumed.\n");
156 }
157 
158 
159 /*
160  * reads config data into cprconfig
161  */
162 static int
163 cpr_get_config(void)
164 {
165 	static char config_path[] = CPR_CONFIG;
166 	struct cprconfig *cf = &cprconfig;
167 	struct vnode *vp;
168 	char *fmt;
169 	int err;
170 
171 	if (cprconfig_loaded)
172 		return (0);
173 
174 	fmt = "cannot %s config file \"%s\", error %d\n";
175 	if (err = vn_open(config_path, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0)) {
176 		cpr_err(CE_CONT, fmt, "open", config_path, err);
177 		return (err);
178 	}
179 
180 	err = cpr_rdwr(UIO_READ, vp, cf, sizeof (*cf));
181 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
182 	VN_RELE(vp);
183 	if (err) {
184 		cpr_err(CE_CONT, fmt, "read", config_path, err);
185 		return (err);
186 	}
187 
188 	if (cf->cf_magic == CPR_CONFIG_MAGIC)
189 		cprconfig_loaded = 1;
190 	else {
191 		cpr_err(CE_CONT, "invalid config file \"%s\", "
192 		    "rerun pmconfig(1M)\n", config_path);
193 		err = EINVAL;
194 	}
195 
196 	return (err);
197 }
198 
199 
200 /*
201  * concat fs and path fields of the cprconfig structure;
202  * returns pointer to the base of static data
203  */
204 static char *
205 cpr_cprconfig_to_path(void)
206 {
207 	static char full_path[MAXNAMELEN];
208 	struct cprconfig *cf = &cprconfig;
209 	char *ptr;
210 
211 	/*
212 	 * build /fs/path without extra '/'
213 	 */
214 	(void) strcpy(full_path, cf->cf_fs);
215 	if (strcmp(cf->cf_fs, "/"))
216 		(void) strcat(full_path, "/");
217 	ptr = cf->cf_path;
218 	if (*ptr == '/')
219 		ptr++;
220 	(void) strcat(full_path, ptr);
221 	return (full_path);
222 }
223 
224 
225 /*
226  * Verify that the information in the configuration file regarding the
227  * location for the statefile is still valid, depending on cf_type.
228  * for CFT_UFS, cf_fs must still be a mounted filesystem, it must be
229  *	mounted on the same device as when pmconfig was last run,
230  *	and the translation of that device to a node in the prom's
231  *	device tree must be the same as when pmconfig was last run.
232  * for CFT_SPEC, cf_path must be the path to a block special file,
233  *	it must have no file system mounted on it,
234  *	and the translation of that device to a node in the prom's
235  *	device tree must be the same as when pmconfig was last run.
236  */
237 static int
238 cpr_verify_statefile_path(void)
239 {
240 	struct cprconfig *cf = &cprconfig;
241 	static const char long_name[] = "Statefile pathname is too long.\n";
242 	static const char lookup_fmt[] = "Lookup failed for "
243 	    "cpr statefile device %s.\n";
244 	static const char path_chg_fmt[] = "Device path for statefile "
245 	    "has changed from %s to %s.\t%s\n";
246 	static const char rerun[] = "Please rerun pmconfig(1m).";
247 	struct vfs *vfsp = NULL, *vfsp_save = rootvfs;
248 	ufsvfs_t *ufsvfsp = (ufsvfs_t *)rootvfs->vfs_data;
249 	ufsvfs_t *ufsvfsp_save = ufsvfsp;
250 	int error;
251 	struct vnode *vp;
252 	char *slash, *tail, *longest;
253 	char *errstr;
254 	int found = 0;
255 	union {
256 		char un_devpath[OBP_MAXPATHLEN];
257 		char un_sfpath[MAXNAMELEN];
258 	} un;
259 #define	devpath	un.un_devpath
260 #define	sfpath	un.un_sfpath
261 
262 	ASSERT(cprconfig_loaded);
263 	/*
264 	 * We need not worry about locking or the timing of releasing
265 	 * the vnode, since we are single-threaded now.
266 	 */
267 
268 	switch (cf->cf_type) {
269 	case CFT_SPEC:
270 		if (strlen(cf->cf_path) > sizeof (sfpath)) {
271 			cpr_err(CE_CONT, long_name);
272 			return (ENAMETOOLONG);
273 		}
274 		if ((error = lookupname(cf->cf_devfs,
275 		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
276 			cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
277 			return (error);
278 		}
279 		if (vp->v_type != VBLK)
280 			errstr = "statefile must be a block device";
281 		else if (vfs_devismounted(vp->v_rdev))
282 			errstr = "statefile device must not "
283 			    "have a file system mounted on it";
284 		else if (IS_SWAPVP(vp))
285 			errstr = "statefile device must not "
286 			    "be configured as swap file";
287 		else
288 			errstr = NULL;
289 
290 		VN_RELE(vp);
291 		if (errstr) {
292 			cpr_err(CE_CONT, "%s.\n", errstr);
293 			return (ENOTSUP);
294 		}
295 
296 		error = i_devname_to_promname(cf->cf_devfs, devpath,
297 		    OBP_MAXPATHLEN);
298 		if (error || strcmp(devpath, cf->cf_dev_prom)) {
299 			cpr_err(CE_CONT, path_chg_fmt,
300 			    cf->cf_dev_prom, devpath, rerun);
301 		}
302 		return (error);
303 	case CFT_UFS:
304 		break;		/* don't indent all the original code */
305 	default:
306 		cpr_err(CE_PANIC, "invalid cf_type");
307 	}
308 
309 	/*
310 	 * The original code for UFS statefile
311 	 */
312 	if (strlen(cf->cf_fs) + strlen(cf->cf_path) + 2 > sizeof (sfpath)) {
313 		cpr_err(CE_CONT, long_name);
314 		return (ENAMETOOLONG);
315 	}
316 
317 	bzero(sfpath, sizeof (sfpath));
318 	(void) strcpy(sfpath, cpr_cprconfig_to_path());
319 
320 	if (*sfpath != '/') {
321 		cpr_err(CE_CONT, "Statefile pathname %s "
322 		    "must begin with a /\n", sfpath);
323 		return (EINVAL);
324 	}
325 
326 	/*
327 	 * Find the longest prefix of the statefile pathname which
328 	 * is the mountpoint of a filesystem.  This string must
329 	 * match the cf_fs field we read from the config file.  Other-
330 	 * wise the user has changed things without running pmconfig.
331 	 */
332 	tail = longest = sfpath + 1;	/* pt beyond the leading "/" */
333 	while ((slash = strchr(tail, '/')) != NULL) {
334 		*slash = '\0';	  /* temporarily terminate the string */
335 		if ((error = lookupname(sfpath,
336 		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
337 			*slash = '/';
338 			cpr_err(CE_CONT, "A directory in the "
339 			    "statefile path %s was not found.\n", sfpath);
340 			VN_RELE(vp);
341 
342 			return (error);
343 		}
344 
345 		vfs_list_read_lock();
346 		vfsp = rootvfs;
347 		do {
348 			ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
349 			if (ufsvfsp != NULL && ufsvfsp->vfs_root == vp) {
350 				found = 1;
351 				break;
352 			}
353 			vfsp = vfsp->vfs_next;
354 		} while (vfsp != rootvfs);
355 		vfs_list_unlock();
356 
357 		/*
358 		 * If we have found a filesystem mounted on the current
359 		 * path prefix, remember the end of the string in
360 		 * "longest".  If it happens to be the the exact fs
361 		 * saved in the configuration file, save the current
362 		 * ufsvfsp so we can make additional checks further down.
363 		 */
364 		if (found) {
365 			longest = slash;
366 			if (strcmp(cf->cf_fs, sfpath) == 0) {
367 				ufsvfsp_save = ufsvfsp;
368 				vfsp_save = vfsp;
369 			}
370 			found = 0;
371 		}
372 
373 		VN_RELE(vp);
374 		*slash = '/';
375 		tail = slash + 1;
376 	}
377 	*longest = '\0';
378 	if (cpr_is_ufs(vfsp_save) == 0 || strcmp(cf->cf_fs, sfpath)) {
379 		cpr_err(CE_CONT, "Filesystem containing "
380 		    "the statefile when pmconfig was run (%s) has "
381 		    "changed to %s. %s\n", cf->cf_fs, sfpath, rerun);
382 		return (EINVAL);
383 	}
384 
385 	if ((error = lookupname(cf->cf_devfs,
386 	    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
387 		cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
388 		return (error);
389 	}
390 
391 	if (ufsvfsp_save->vfs_devvp->v_rdev != vp->v_rdev) {
392 		cpr_err(CE_CONT, "Filesystem containing "
393 		    "statefile no longer mounted on device %s. "
394 		    "See power.conf(4).", cf->cf_devfs);
395 		VN_RELE(vp);
396 		return (ENXIO);
397 	}
398 	VN_RELE(vp);
399 
400 	error = i_devname_to_promname(cf->cf_devfs, devpath, OBP_MAXPATHLEN);
401 	if (error || strcmp(devpath, cf->cf_dev_prom)) {
402 		cpr_err(CE_CONT, path_chg_fmt,
403 		    cf->cf_dev_prom, devpath, rerun);
404 		return (error);
405 	}
406 
407 	return (0);
408 }
409 
410 /*
411  * Make sure that the statefile can be used as a block special statefile
412  * (meaning that is exists and has nothing mounted on it)
413  * Returns errno if not a valid statefile.
414  */
415 int
416 cpr_check_spec_statefile(void)
417 {
418 	int err;
419 
420 	if (err = cpr_get_config())
421 		return (err);
422 	ASSERT(cprconfig.cf_type == CFT_SPEC);
423 
424 	if (cprconfig.cf_devfs == NULL)
425 		return (ENXIO);
426 
427 	return (cpr_verify_statefile_path());
428 
429 }
430 
431 int
432 cpr_alloc_statefile(int alloc_retry)
433 {
434 	register int rc = 0;
435 	char *str;
436 
437 	/*
438 	 * Statefile size validation. If checkpoint the first time, disk blocks
439 	 * allocation will be done; otherwise, just do file size check.
440 	 * if statefile allocation is being retried, C_VP will be inited
441 	 */
442 	if (alloc_retry) {
443 		str = "\n-->Retrying statefile allocation...";
444 		if (cpr_debug & (LEVEL1 | LEVEL7))
445 			errp(str);
446 		if (C_VP->v_type != VBLK)
447 			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL);
448 	} else {
449 		/*
450 		 * Open an exiting file for writing, the state file needs to be
451 		 * pre-allocated since we can't and don't want to do allocation
452 		 * during checkpoint (too much of the OS is disabled).
453 		 *    - do a preliminary size checking here, if it is too small,
454 		 *	allocate more space internally and retry.
455 		 *    - check the vp to make sure it's the right type.
456 		 */
457 		char *path = cpr_build_statefile_path();
458 
459 		if (path == NULL)
460 			return (ENXIO);
461 		else if (rc = cpr_verify_statefile_path())
462 			return (rc);
463 
464 		if (rc = vn_open(path, UIO_SYSSPACE,
465 		    FCREAT|FWRITE, 0600, &C_VP, CRCREAT, 0)) {
466 			cpr_err(CE_WARN, "cannot open statefile %s", path);
467 			return (rc);
468 		}
469 	}
470 
471 	/*
472 	 * Only ufs and block special statefiles supported
473 	 */
474 	if (C_VP->v_type != VREG && C_VP->v_type != VBLK) {
475 		cpr_err(CE_CONT,
476 		    "Statefile must be regular file or block special file.");
477 		return (EACCES);
478 	}
479 
480 	if (rc = cpr_statefile_ok(C_VP, alloc_retry))
481 		return (rc);
482 
483 	if (C_VP->v_type != VBLK) {
484 		/*
485 		 * sync out the fs change due to the statefile reservation.
486 		 */
487 		(void) VFS_SYNC(C_VP->v_vfsp, 0, CRED());
488 
489 		/*
490 		 * Validate disk blocks allocation for the state file.
491 		 * Ask the file system prepare itself for the dump operation.
492 		 */
493 		if (rc = VOP_DUMPCTL(C_VP, DUMP_ALLOC, NULL)) {
494 			cpr_err(CE_CONT, "Error allocating "
495 			    "blocks for cpr statefile.");
496 			return (rc);
497 		}
498 	}
499 	return (0);
500 }
501 
502 
503 /*
504  * lookup device size in blocks,
505  * and return available space in bytes
506  */
507 size_t
508 cpr_get_devsize(dev_t dev)
509 {
510 	size_t bytes = 0;
511 	int64_t Nblocks;
512 	int nblocks;
513 
514 	if ((Nblocks = bdev_Size(dev)) != -1)
515 		bytes = dbtob(Nblocks);
516 	else if ((nblocks = bdev_size(dev)) != -1)
517 		bytes = dbtob(nblocks);
518 
519 	if (bytes > CPR_SPEC_OFFSET)
520 		bytes -= CPR_SPEC_OFFSET;
521 	else
522 		bytes = 0;
523 
524 	return (bytes);
525 }
526 
527 
528 /*
529  * increase statefile size
530  */
531 static int
532 cpr_grow_statefile(vnode_t *vp, u_longlong_t newsize)
533 {
534 	extern uchar_t cpr_pagecopy[];
535 	struct inode *ip = VTOI(vp);
536 	u_longlong_t offset;
537 	int error, increase;
538 	ssize_t resid;
539 
540 	rw_enter(&ip->i_contents, RW_READER);
541 	increase = (ip->i_size < newsize);
542 	offset = ip->i_size;
543 	rw_exit(&ip->i_contents);
544 
545 	if (increase == 0)
546 		return (0);
547 
548 	/*
549 	 * write to each logical block to reserve disk space
550 	 */
551 	error = 0;
552 	cpr_pagecopy[0] = '1';
553 	for (; offset < newsize; offset += ip->i_fs->fs_bsize) {
554 		if (error = vn_rdwr(UIO_WRITE, vp, (caddr_t)cpr_pagecopy,
555 		    ip->i_fs->fs_bsize, (offset_t)offset, UIO_SYSSPACE, 0,
556 		    (rlim64_t)MAXOFF_T, CRED(), &resid)) {
557 			if (error == ENOSPC) {
558 				cpr_err(CE_WARN, "error %d while reserving "
559 				    "disk space for statefile %s\n"
560 				    "wanted %lld bytes, file is %lld short",
561 				    error, cpr_cprconfig_to_path(),
562 				    newsize, newsize - offset);
563 			}
564 			break;
565 		}
566 	}
567 	return (error);
568 }
569 
570 
/*
 * do a simple estimate of the space needed to hold the statefile
 * taking compression into account, but be fairly conservative
 * so we have a better chance of completing; when dump fails,
 * the retry cost is fairly high.
 *
 * Do disk blocks allocation for the state file if no space has
 * been allocated yet. Since the state file will not be removed,
 * allocation should only be done once.
 *
 * vp is the statefile vnode; alloc_retry is nonzero when an earlier
 * estimate turned out to be too small.  The estimate is stored in
 * STAT->cs_est_statefsz.  Returns 0 on success, ENOMEM when a block
 * device is too small (or is being retried) or when the retry count
 * is exceeded, or the error from cpr_grow_statefile().
 */
static int
cpr_statefile_ok(vnode_t *vp, int alloc_retry)
{
	extern size_t cpr_bitmap_size;
	struct inode *ip = VTOI(vp);
	const int UCOMP_RATE = 20; /* comp. ratio*10 for user pages */
	u_longlong_t size, isize, ksize, raw_data;
	char *str, *est_fmt;
	size_t space;
	int error;

	/*
	 * number of pages short for swapping.
	 */
	STAT->cs_nosw_pages = k_anoninfo.ani_mem_resv;
	if (STAT->cs_nosw_pages < 0)
		STAT->cs_nosw_pages = 0;

	/* prefix used in the debug messages below */
	str = "cpr_statefile_ok:";

	DEBUG9(errp("Phys swap: max=%lu resv=%lu\n",
	    k_anoninfo.ani_max, k_anoninfo.ani_phys_resv));
	DEBUG9(errp("Mem swap: max=%ld resv=%lu\n",
	    MAX(availrmem - swapfs_minfree, 0),
	    k_anoninfo.ani_mem_resv));
	DEBUG9(errp("Total available swap: %ld\n",
		CURRENT_TOTAL_AVAILABLE_SWAP));

	/*
	 * try increasing filesize by 15%
	 */
	if (alloc_retry) {
		/*
		 * block device doesn't get any bigger
		 */
		if (vp->v_type == VBLK) {
			if (cpr_debug & (LEVEL1 | LEVEL6))
				errp("Retry statefile on special file\n");
			return (ENOMEM);
		} else {
			/* new size is the current size times 115/100 */
			rw_enter(&ip->i_contents, RW_READER);
			size = (ip->i_size * SIZE_RATE) / INTEGRAL;
			rw_exit(&ip->i_contents);
		}
		if (cpr_debug & (LEVEL1 | LEVEL6))
			errp("Retry statefile size = %lld\n", size);
	} else {
		u_longlong_t cpd_size;
		pgcnt_t npages, nback;
		int ndvram;

		/*
		 * NOTE(review): the address of ndvram is passed through
		 * the int argument of callb_execute_class(); presumably
		 * the CB_CL_CPR_FB callbacks store a size through it --
		 * confirm, and check the pointer-to-int narrowing.
		 */
		ndvram = 0;
		(void) callb_execute_class(CB_CL_CPR_FB,
		    (int)(uintptr_t)&ndvram);
		if (cpr_debug & (LEVEL1 | LEVEL6))
			errp("ndvram size = %d\n", ndvram);

		/*
		 * estimate 1 cpd_t for every (CPR_MAXCONTIG / 2) pages
		 */
		npages = cpr_count_kpages(REGULAR_BITMAP, cpr_nobit);
		cpd_size = sizeof (cpd_t) * (npages / (CPR_MAXCONTIG / 2));
		/* raw_data is added to the estimate without compression */
		raw_data = cpd_size + cpr_bitmap_size;
		ksize = ndvram + mmu_ptob(npages);

		est_fmt = "%s estimated size with "
		    "%scompression %lld, ksize %lld\n";
		/* nback holds a byte count here, not a page count */
		nback = mmu_ptob(STAT->cs_nosw_pages);
		if (CPR->c_flags & C_COMPRESSING) {
			/*
			 * ksize is scaled by COMPRESS_PERCENT / INTEGRAL,
			 * nback by 10 / UCOMP_RATE.
			 */
			size = ((ksize * COMPRESS_PERCENT) / INTEGRAL) +
			    raw_data + ((nback * 10) / UCOMP_RATE);
			DEBUG1(errp(est_fmt, str, "", size, ksize));
		} else {
			size = ksize + raw_data + nback;
			DEBUG1(errp(est_fmt, str, "no ", size, ksize));
		}
	}

	/*
	 * All this is much simpler for a block device
	 */
	if (vp->v_type == VBLK) {
		space = cpr_get_devsize(vp->v_rdev);
		if (cpr_debug & (LEVEL1 | LEVEL6))
			errp("statefile dev size %lu\n", space);

		/*
		 * Export the estimated filesize info, this value will be
		 * compared before dumping out the statefile in the case of
		 * no compression.
		 */
		STAT->cs_est_statefsz = size;
		if (cpr_debug & (LEVEL1 | LEVEL6))
			errp("%s Estimated statefile size %llu, space %lu\n",
			    str, size, space);
		if (size > space) {
			cpr_err(CE_CONT, "Statefile partition too small.");
			return (ENOMEM);
		}
		return (0);
	} else {
		/* every pass for a regular file counts against the limit */
		if (CPR->c_alloc_cnt++ > C_MAX_ALLOC_RETRY) {
			cpr_err(CE_CONT, "Statefile allocation retry failed\n");
			return (ENOMEM);
		}

		/*
		 * Estimate space needed for the state file.
		 *
		 * State file size in bytes:
		 *	kernel size + non-cache pte seg +
		 *	bitmap size + cpr state file headers size
		 * (round up to fs->fs_bsize)
		 */
		size = blkroundup(ip->i_fs, size);

		/*
		 * Export the estimated filesize info, this value will be
		 * compared before dumping out the statefile in the case of
		 * no compression.
		 */
		STAT->cs_est_statefsz = size;
		error = cpr_grow_statefile(vp, size);
		if (cpr_debug & (LEVEL1 | LEVEL6)) {
			rw_enter(&ip->i_contents, RW_READER);
			isize = ip->i_size;
			rw_exit(&ip->i_contents);
			errp("%s Estimated statefile size %lld, i_size %lld\n",
			    str, size, isize);
		}

		return (error);
	}
}
715 
716 
717 void
718 cpr_statef_close(void)
719 {
720 	if (C_VP) {
721 		if (!cpr_reusable_mode)
722 			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL);
723 		(void) VOP_CLOSE(C_VP, FWRITE, 1, (offset_t)0, CRED());
724 		VN_RELE(C_VP);
725 		C_VP = 0;
726 	}
727 }
728 
729 
730 /*
731  * open cpr default file and display error
732  */
733 int
734 cpr_open_deffile(int mode, vnode_t **vpp)
735 {
736 	int error;
737 
738 	if (error = cpr_open(cpr_default_path, mode, vpp))
739 		cpr_err(CE_CONT, "cannot open \"%s\", error %d\n",
740 		    cpr_default_path, error);
741 	return (error);
742 }
743 
744 
745 /*
746  * write cdef_t to disk.  This contains the original values of prom
747  * properties that we modify.  We fill in the magic number of the file
748  * here as a signal to the booter code that the state file is valid.
749  * Be sure the file gets synced, since we may be shutting down the OS.
750  */
751 int
752 cpr_write_deffile(cdef_t *cdef)
753 {
754 	struct vnode *vp;
755 	char *str;
756 	int rc;
757 
758 	if (rc = cpr_open_deffile(FCREAT|FWRITE, &vp))
759 		return (rc);
760 
761 	if (rc = cpr_rdwr(UIO_WRITE, vp, cdef, sizeof (*cdef)))
762 		str = "write";
763 	else if (rc = VOP_FSYNC(vp, FSYNC, CRED()))
764 		str = "fsync";
765 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED());
766 	VN_RELE(vp);
767 
768 	if (rc) {
769 		cpr_err(CE_WARN, "%s error %d, file \"%s\"",
770 		    str, rc, cpr_default_path);
771 	}
772 	return (rc);
773 }
774 
775 /*
776  * Clear the magic number in the defaults file.  This tells the booter
777  * program that the state file is not current and thus prevents
778  * any attempt to restore from an obsolete state file.
779  */
780 void
781 cpr_clear_definfo(void)
782 {
783 	struct vnode *vp;
784 	cmini_t mini;
785 
786 	if ((CPR->c_cprboot_magic != CPR_DEFAULT_MAGIC) ||
787 	    cpr_open_deffile(FCREAT|FWRITE, &vp))
788 		return;
789 	mini.magic = mini.reusable = 0;
790 	(void) cpr_rdwr(UIO_WRITE, vp, &mini, sizeof (mini));
791 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED());
792 	VN_RELE(vp);
793 }
794 
795 /*
796  * If the cpr default file is invalid, then we must not be in reusable mode
797  * if it is valid, it tells us our mode
798  */
799 int
800 cpr_get_reusable_mode(void)
801 {
802 	struct vnode *vp;
803 	cmini_t mini;
804 	int rc;
805 
806 	if (cpr_open(cpr_default_path, FREAD, &vp))
807 		return (0);
808 
809 	rc = cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini));
810 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
811 	VN_RELE(vp);
812 	if (rc == 0 && mini.magic == CPR_DEFAULT_MAGIC)
813 		return (mini.reusable);
814 
815 	return (0);
816 }
817 
/*
 * clock/time related routines
 */
/*
 * Time in seconds recorded by cpr_save_time() and handed to clkset()
 * by cpr_restore_time().
 */
static time_t   cpr_time_stamp;
822 
823 
824 void
825 cpr_tod_get(cpr_time_t *ctp)
826 {
827 	timestruc_t ts;
828 
829 	mutex_enter(&tod_lock);
830 	ts = tod_get();
831 	mutex_exit(&tod_lock);
832 	ctp->tv_sec = (time32_t)ts.tv_sec;
833 	ctp->tv_nsec = (int32_t)ts.tv_nsec;
834 }
835 
836 void
837 cpr_tod_fault_reset(void)
838 {
839 	mutex_enter(&tod_lock);
840 	tod_fault_reset();
841 	mutex_exit(&tod_lock);
842 }
843 
844 void
845 cpr_save_time(void)
846 {
847 	cpr_time_stamp = gethrestime_sec();
848 }
849 
850 /*
851  * correct time based on saved time stamp or hardware clock
852  */
853 void
854 cpr_restore_time(void)
855 {
856 	clkset(cpr_time_stamp);
857 }
858 
859 /*
860  * CPU ONLINE/OFFLINE CODE
861  */
862 int
863 cpr_mp_offline(void)
864 {
865 	cpu_t *cp, *bootcpu;
866 	int rc = 0;
867 	int brought_up_boot = 0;
868 
869 	/*
870 	 * Do nothing for UP.
871 	 */
872 	if (ncpus == 1)
873 		return (0);
874 
875 	mutex_enter(&cpu_lock);
876 
877 	cpr_save_mp_state();
878 
879 	bootcpu = i_cpr_bootcpu();
880 	if (!CPU_ACTIVE(bootcpu)) {
881 		if ((rc = cpr_p_online(bootcpu, CPU_CPR_ONLINE))) {
882 			mutex_exit(&cpu_lock);
883 			return (rc);
884 		}
885 		brought_up_boot = 1;
886 	}
887 
888 	cp = cpu_list;
889 	do {
890 		if (cp == bootcpu)
891 			continue;
892 		if (cp->cpu_flags & CPU_OFFLINE)
893 			continue;
894 		if ((rc = cpr_p_online(cp, CPU_CPR_OFFLINE))) {
895 			mutex_exit(&cpu_lock);
896 			return (rc);
897 		}
898 	} while ((cp = cp->cpu_next) != cpu_list);
899 	if (brought_up_boot && (cpr_debug & (LEVEL1 | LEVEL6)))
900 		errp("changed cpu %p to state %d\n", bootcpu, CPU_CPR_ONLINE);
901 	mutex_exit(&cpu_lock);
902 
903 	return (rc);
904 }
905 
906 int
907 cpr_mp_online(void)
908 {
909 	cpu_t *cp, *bootcpu = CPU;
910 	int rc = 0;
911 
912 	/*
913 	 * Do nothing for UP.
914 	 */
915 	if (ncpus == 1)
916 		return (0);
917 
918 	/*
919 	 * cpr_save_mp_state() sets CPU_CPR_ONLINE in cpu_cpr_flags
920 	 * to indicate a cpu was online at the time of cpr_suspend();
921 	 * now restart those cpus that were marked as CPU_CPR_ONLINE
922 	 * and actually are offline.
923 	 */
924 	mutex_enter(&cpu_lock);
925 	for (cp = bootcpu->cpu_next; cp != bootcpu; cp = cp->cpu_next) {
926 		/*
927 		 * Clear the CPU_FROZEN flag in all cases.
928 		 */
929 		cp->cpu_flags &= ~CPU_FROZEN;
930 
931 		if (CPU_CPR_IS_OFFLINE(cp))
932 			continue;
933 		if (CPU_ACTIVE(cp))
934 			continue;
935 		if ((rc = cpr_p_online(cp, CPU_CPR_ONLINE))) {
936 			mutex_exit(&cpu_lock);
937 			return (rc);
938 		}
939 	}
940 
941 	/*
942 	 * turn off the boot cpu if it was offlined
943 	 */
944 	if (CPU_CPR_IS_OFFLINE(bootcpu)) {
945 		if ((rc = cpr_p_online(bootcpu, CPU_CPR_OFFLINE))) {
946 			mutex_exit(&cpu_lock);
947 			return (rc);
948 		}
949 	}
950 	mutex_exit(&cpu_lock);
951 	return (0);
952 }
953 
954 static void
955 cpr_save_mp_state(void)
956 {
957 	cpu_t *cp;
958 
959 	ASSERT(MUTEX_HELD(&cpu_lock));
960 
961 	cp = cpu_list;
962 	do {
963 		cp->cpu_cpr_flags &= ~CPU_CPR_ONLINE;
964 		if (CPU_ACTIVE(cp))
965 			CPU_SET_CPR_FLAGS(cp, CPU_CPR_ONLINE);
966 	} while ((cp = cp->cpu_next) != cpu_list);
967 }
968 
969 /*
970  * change cpu to online/offline
971  */
972 static int
973 cpr_p_online(cpu_t *cp, int state)
974 {
975 	int rc;
976 
977 	ASSERT(MUTEX_HELD(&cpu_lock));
978 
979 	switch (state) {
980 	case CPU_CPR_ONLINE:
981 		rc = cpu_online(cp);
982 		break;
983 	case CPU_CPR_OFFLINE:
984 		rc = cpu_offline(cp, CPU_FORCED);
985 		break;
986 	}
987 	if (rc) {
988 		cpr_err(CE_WARN, "Failed to change processor %d to "
989 		    "state %d, (errno %d)", cp->cpu_id, state, rc);
990 	}
991 	return (rc);
992 }
993 
994 /*
995  * Construct the pathname of the state file and return a pointer to
996  * caller.  Read the config file to get the mount point of the
997  * filesystem and the pathname within fs.
998  */
999 char *
1000 cpr_build_statefile_path(void)
1001 {
1002 	struct cprconfig *cf = &cprconfig;
1003 
1004 	if (cpr_get_config())
1005 		return (NULL);
1006 
1007 	switch (cf->cf_type) {
1008 	case CFT_UFS:
1009 		if (strlen(cf->cf_path) + strlen(cf->cf_fs) >= MAXNAMELEN - 1) {
1010 			cpr_err(CE_CONT, "Statefile path is too long.\n");
1011 			return (NULL);
1012 		}
1013 		return (cpr_cprconfig_to_path());
1014 	case CFT_SPEC:
1015 		return (cf->cf_devfs);
1016 	default:
1017 		cpr_err(CE_PANIC, "invalid statefile type");
1018 		/*NOTREACHED*/
1019 		return (NULL);
1020 	}
1021 }
1022 
1023 int
1024 cpr_statefile_is_spec(void)
1025 {
1026 	if (cpr_get_config())
1027 		return (0);
1028 	return (cprconfig.cf_type == CFT_SPEC);
1029 }
1030 
/*
 * Return the cf_dev_prom field of the loaded configuration.  The
 * configuration must already be loaded, carry a valid magic number and
 * be of type CFT_SPEC; this is only checked by the ASSERTs below.
 */
char *
cpr_get_statefile_prom_path(void)
{
	struct cprconfig *cf = &cprconfig;

	ASSERT(cprconfig_loaded);
	ASSERT(cf->cf_magic == CPR_CONFIG_MAGIC);
	ASSERT(cf->cf_type == CFT_SPEC);
	return (cf->cf_dev_prom);
}
1041 
1042 
1043 /*
1044  * XXX The following routines need to be in the vfs source code.
1045  */
1046 
1047 int
1048 cpr_is_ufs(struct vfs *vfsp)
1049 {
1050 	char *fsname;
1051 
1052 	fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1053 	return (strcmp(fsname, "ufs") == 0);
1054 }
1055 
1056 /*
1057  * This is a list of file systems that are allowed to be writeable when a
1058  * reusable statefile checkpoint is taken.  They must not have any state that
1059  * cannot be restored to consistency by simply rebooting using the checkpoint.
1060  * (In contrast to ufs, cachefs and pcfs which have disk state that could get
1061  * out of sync with the in-kernel data).
1062  */
1063 int
1064 cpr_reusable_mount_check(void)
1065 {
1066 	struct vfs *vfsp;
1067 	char *fsname;
1068 	char **cpp;
1069 	static char *cpr_writeok_fss[] = {
1070 		"autofs", "devfs", "fd", "lofs", "mntfs", "namefs", "nfs",
1071 		"proc", "tmpfs", "ctfs", "objfs", "dev", NULL
1072 	};
1073 
1074 	vfs_list_read_lock();
1075 	vfsp = rootvfs;
1076 	do {
1077 		if (vfsp->vfs_flag & VFS_RDONLY) {
1078 			vfsp = vfsp->vfs_next;
1079 			continue;
1080 		}
1081 		fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1082 		for (cpp = cpr_writeok_fss; *cpp; cpp++) {
1083 			if (strcmp(fsname, *cpp) == 0)
1084 				break;
1085 		}
1086 		/*
1087 		 * if the inner loop reached the NULL terminator,
1088 		 * the current fs-type does not match any OK-type
1089 		 */
1090 		if (*cpp == NULL) {
1091 			cpr_err(CE_CONT, "a filesystem of type %s is "
1092 			    "mounted read/write.\nReusable statefile requires "
1093 			    "no writeable filesystem of this type be mounted\n",
1094 			    fsname);
1095 			vfs_list_unlock();
1096 			return (EINVAL);
1097 		}
1098 		vfsp = vfsp->vfs_next;
1099 	} while (vfsp != rootvfs);
1100 	vfs_list_unlock();
1101 	return (0);
1102 }
1103 
/*
 * Force a fresh read of the cprinfo per uadmin 3 call
 *
 * Clearing cprconfig_loaded makes the next cpr_get_config() read the
 * config file again instead of returning the cached copy.
 */
void
cpr_forget_cprconfig(void)
{
	cprconfig_loaded = 0;
}
1112 
1113 
1114 /*
1115  * return statefile offset in DEV_BSIZE units
1116  */
1117 int
1118 cpr_statefile_offset(void)
1119 {
1120 	return (cpr_statefile_is_spec() ? btod(CPR_SPEC_OFFSET) : 0);
1121 }
1122