xref: /illumos-gate/usr/src/uts/common/cpr/cpr_dump.c (revision 4e93fb0f6383eaac21897dcdae56b87118131e4d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Fill in and write out the cpr state file
30  *	1. Allocate and write headers, ELF and cpr dump header
31  *	2. Allocate bitmaps according to phys_install
32  *	3. Tag kernel pages into corresponding bitmap
33  *	4. Write bitmaps to state file
34  *	5. Write actual physical page data to state file
35  */
36 
37 #include <sys/types.h>
38 #include <sys/systm.h>
39 #include <sys/vm.h>
40 #include <sys/memlist.h>
41 #include <sys/kmem.h>
42 #include <sys/vnode.h>
43 #include <sys/fs/ufs_inode.h>
44 #include <sys/errno.h>
45 #include <sys/cmn_err.h>
46 #include <sys/debug.h>
47 #include <vm/page.h>
48 #include <vm/seg.h>
49 #include <vm/seg_kmem.h>
50 #include <vm/seg_kpm.h>
51 #include <vm/hat.h>
52 #include <sys/cpr.h>
53 #include <sys/conf.h>
54 #include <sys/ddi.h>
55 #include <sys/panic.h>
56 #include <sys/thread.h>
57 
/* Local defines and variables */
#define	BTOb(bytes)	((bytes) << 3)		/* Bytes to bits, log2(NBBY) */
#define	bTOB(bits)	((bits) >> 3)		/* bits to Bytes, log2(NBBY) */

/*
 * Page accounting used for the end-of-dump sanity check:
 * cpr_pages_tobe_dumped is set by cpr_write_header() from the computed
 * cdd_dumppgsize; cpr_regular_pgs_dumped is zeroed by
 * cpr_dump_regular_pages() and advanced by cpr_compress_and_write().
 */
static uint_t cpr_pages_tobe_dumped;
static uint_t cpr_regular_pgs_dumped;

/* Forward declarations for routines defined later in this file */
static int cpr_dump_regular_pages(vnode_t *);
static int cpr_count_upages(int, bitfunc_t);
static int cpr_compress_and_write(vnode_t *, uint_t, pfn_t, pgcnt_t);
int cpr_flush_write(vnode_t *);

int cpr_contig_pages(vnode_t *, int);

void cpr_clear_bitmaps();

/* Symbols defined outside this file */
extern size_t cpr_get_devsize(dev_t);
extern int i_cpr_dump_setup(vnode_t *);
extern int i_cpr_blockzero(char *, char **, int *, vnode_t *);
extern int cpr_test_mode;

/* Terminator record; filled in and written by cpr_write_terminator() */
ctrm_t cpr_term;

/*
 * Statefile write buffer.  cpr_buf_end is set to cpr_buf + cpr_buf_size
 * by cpr_alloc_bufs().
 */
char *cpr_buf, *cpr_buf_end;
int cpr_buf_blocks;		/* size of cpr_buf in blocks */
size_t cpr_buf_size;		/* size of cpr_buf in bytes */
size_t cpr_bitmap_size;		/* set by cpr_set_bitmap_size() */
int cpr_nbitmaps;		/* copied into cdd_bitmaprec in the header */

char *cpr_pagedata;		/* page buffer for compression / tmp copy */
size_t cpr_pagedata_size;	/* page buffer size in bytes */

static char *cpr_wptr;		/* keep track of where to write to next */
static int cpr_file_bn;		/* cpr state-file block offset */
/* cleared by cpr_dump(), set by cpr_write_statefile(); checked in cpr_write() */
static int cpr_disk_writes_ok;
/* device size cached by cpr_write() for VBLK targets; 0 means "not yet set" */
static size_t cpr_dev_space = 0;

/*
 * Scratch area: used by cpr_write_bitmap() to merge bitmaps and, on
 * DEBUG kernels, by cpr_compress_pages() for checksum verification.
 */
char cpr_pagecopy[CPR_MAXCONTIG * MMU_PAGESIZE];
96 
97 /*
98  * On some platforms bcopy may modify the thread structure
99  * during bcopy (eg, to prevent cpu migration).  If the
100  * range we are currently writing out includes our own
101  * thread structure then it will be snapshotted by bcopy
102  * including those modified members - and the updates made
103  * on exit from bcopy will no longer be seen when we later
104  * restore the mid-bcopy kthread_t.  So if the range we
105  * need to copy overlaps with our thread structure we will
106  * use a simple byte copy.
107  */
108 void
109 cprbcopy(void *from, void *to, size_t bytes)
110 {
111 	extern int curthreadremapped;
112 	caddr_t kthrend;
113 
114 	kthrend = (caddr_t)curthread + sizeof (kthread_t) - 1;
115 	if (curthreadremapped || (kthrend >= (caddr_t)from &&
116 	    kthrend < (caddr_t)from + bytes + sizeof (kthread_t) - 1)) {
117 		caddr_t src = from, dst = to;
118 
119 		while (bytes-- > 0)
120 			*dst++ = *src++;
121 	} else {
122 		bcopy(from, to, bytes);
123 	}
124 }
125 
126 /*
127  * Allocate pages for buffers used in writing out the statefile
128  */
129 static int
130 cpr_alloc_bufs(void)
131 {
132 	char *allocerr = "Unable to allocate memory for cpr buffer";
133 	size_t size;
134 
135 	/*
136 	 * set the cpr write buffer size to at least the historic
137 	 * size (128k) or large enough to store the both the early
138 	 * set of statefile structures (well under 0x800) plus the
139 	 * bitmaps, and roundup to the next pagesize.
140 	 */
141 	size = PAGE_ROUNDUP(dbtob(4) + cpr_bitmap_size);
142 	cpr_buf_size = MAX(size, CPRBUFSZ);
143 	cpr_buf_blocks = btodb(cpr_buf_size);
144 	cpr_buf = kmem_alloc(cpr_buf_size, KM_NOSLEEP);
145 	if (cpr_buf == NULL) {
146 		cpr_err(CE_WARN, allocerr);
147 		return (ENOMEM);
148 	}
149 	cpr_buf_end = cpr_buf + cpr_buf_size;
150 
151 	cpr_pagedata_size = mmu_ptob(CPR_MAXCONTIG + 1);
152 	cpr_pagedata = kmem_alloc(cpr_pagedata_size, KM_NOSLEEP);
153 	if (cpr_pagedata == NULL) {
154 		kmem_free(cpr_buf, cpr_buf_size);
155 		cpr_buf = NULL;
156 		cpr_err(CE_WARN, allocerr);
157 		return (ENOMEM);
158 	}
159 
160 	return (0);
161 }
162 
163 
164 /*
165  * Set bitmap size in bytes based on phys_install.
166  */
167 void
168 cpr_set_bitmap_size(void)
169 {
170 	struct memlist *pmem;
171 	size_t size = 0;
172 
173 	memlist_read_lock();
174 	for (pmem = phys_install; pmem; pmem = pmem->next)
175 		size += pmem->size;
176 	memlist_read_unlock();
177 	cpr_bitmap_size = BITMAP_BYTES(size);
178 }
179 
180 
181 /*
182  * CPR dump header contains the following information:
183  *	1. header magic -- unique to cpr state file
184  *	2. kernel return pc & ppn for resume
185  *	3. current thread info
186  *	4. debug level and test mode
187  *	5. number of bitmaps allocated
188  *	6. number of page records
189  */
190 static int
191 cpr_write_header(vnode_t *vp)
192 {
193 	extern ushort_t cpr_mach_type;
194 	struct cpr_dump_desc cdump;
195 	pgcnt_t bitmap_pages;
196 	pgcnt_t kpages, vpages, upages;
197 
198 	cdump.cdd_magic = (uint_t)CPR_DUMP_MAGIC;
199 	cdump.cdd_version = CPR_VERSION;
200 	cdump.cdd_machine = cpr_mach_type;
201 	cdump.cdd_debug = cpr_debug;
202 	cdump.cdd_test_mode = cpr_test_mode;
203 	cdump.cdd_bitmaprec = cpr_nbitmaps;
204 
205 	cpr_clear_bitmaps();
206 
207 	/*
208 	 * Remember how many pages we plan to save to statefile.
209 	 * This information will be used for sanity checks.
210 	 * Untag those pages that will not be saved to statefile.
211 	 */
212 	kpages = cpr_count_kpages(REGULAR_BITMAP, cpr_setbit);
213 	vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);
214 	upages = cpr_count_upages(REGULAR_BITMAP, cpr_setbit);
215 	cdump.cdd_dumppgsize = kpages - vpages + upages;
216 	cpr_pages_tobe_dumped = cdump.cdd_dumppgsize;
217 	CPR_DEBUG(CPR_DEBUG7,
218 	    "\ncpr_write_header: kpages %ld - vpages %ld + upages %ld = %d\n",
219 	    kpages, vpages, upages, cdump.cdd_dumppgsize);
220 
221 	/*
222 	 * Some pages contain volatile data (cpr_buf and storage area for
223 	 * sensitive kpages), which are no longer needed after the statefile
224 	 * is dumped to disk.  We have already untagged them from regular
225 	 * bitmaps.  Now tag them into the volatile bitmaps.  The pages in
226 	 * volatile bitmaps will be claimed during resume, and the resumed
227 	 * kernel will free them.
228 	 */
229 	(void) cpr_count_volatile_pages(VOLATILE_BITMAP, cpr_setbit);
230 
231 	bitmap_pages = mmu_btopr(cpr_bitmap_size);
232 
233 	/*
234 	 * Export accurate statefile size for statefile allocation retry.
235 	 * statefile_size = all the headers + total pages +
236 	 * number of pages used by the bitmaps.
237 	 * Roundup will be done in the file allocation code.
238 	 */
239 	STAT->cs_nocomp_statefsz = sizeof (cdd_t) + sizeof (cmd_t) +
240 		(sizeof (cbd_t) * cdump.cdd_bitmaprec) +
241 		(sizeof (cpd_t) * cdump.cdd_dumppgsize) +
242 		mmu_ptob(cdump.cdd_dumppgsize + bitmap_pages);
243 
244 	/*
245 	 * If the estimated statefile is not big enough,
246 	 * go retry now to save un-necessary operations.
247 	 */
248 	if (!(CPR->c_flags & C_COMPRESSING) &&
249 		(STAT->cs_nocomp_statefsz > STAT->cs_est_statefsz)) {
250 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
251 		    prom_printf("cpr_write_header: STAT->cs_nocomp_statefsz > "
252 			"STAT->cs_est_statefsz\n");
253 		return (ENOSPC);
254 	}
255 
256 	/* now write cpr dump descriptor */
257 	return (cpr_write(vp, (caddr_t)&cdump, sizeof (cdd_t)));
258 }
259 
260 
261 /*
262  * CPR dump tail record contains the following information:
263  *	1. header magic -- unique to cpr state file
264  *	2. all misc info that needs to be passed to cprboot or resumed kernel
265  */
266 static int
267 cpr_write_terminator(vnode_t *vp)
268 {
269 	cpr_term.magic = (uint_t)CPR_TERM_MAGIC;
270 	cpr_term.va = (cpr_ptr)&cpr_term;
271 	cpr_term.pfn = (cpr_ext)va_to_pfn(&cpr_term);
272 
273 	/* count the last one (flush) */
274 	cpr_term.real_statef_size = STAT->cs_real_statefsz +
275 		btod(cpr_wptr - cpr_buf) * DEV_BSIZE;
276 
277 	CPR_DEBUG(CPR_DEBUG9, "cpr_dump: Real Statefile Size: %ld\n",
278 		STAT->cs_real_statefsz);
279 
280 	cpr_tod_get(&cpr_term.tm_shutdown);
281 
282 	return (cpr_write(vp, (caddr_t)&cpr_term, sizeof (cpr_term)));
283 }
284 
285 /*
286  * Write bitmap descriptor array, followed by merged bitmaps.
287  */
288 static int
289 cpr_write_bitmap(vnode_t *vp)
290 {
291 	char *rmap, *vmap, *dst, *tail;
292 	size_t size, bytes;
293 	cbd_t *dp;
294 	int err;
295 
296 	dp = CPR->c_bmda;
297 	if (err = cpr_write(vp, (caddr_t)dp, cpr_nbitmaps * sizeof (*dp)))
298 		return (err);
299 
300 	/*
301 	 * merge regular and volatile bitmaps into tmp space
302 	 * and write to disk
303 	 */
304 	for (; dp->cbd_size; dp++) {
305 		rmap = (char *)dp->cbd_reg_bitmap;
306 		vmap = (char *)dp->cbd_vlt_bitmap;
307 		for (size = dp->cbd_size; size; size -= bytes) {
308 			bytes = min(size, sizeof (cpr_pagecopy));
309 			tail = &cpr_pagecopy[bytes];
310 			for (dst = cpr_pagecopy; dst < tail; dst++)
311 				*dst = *rmap++ | *vmap++;
312 			if (err = cpr_write(vp, cpr_pagecopy, bytes))
313 				break;
314 		}
315 	}
316 
317 	return (err);
318 }
319 
320 
321 static int
322 cpr_write_statefile(vnode_t *vp)
323 {
324 	uint_t error = 0;
325 	extern	int	i_cpr_check_pgs_dumped();
326 	void flush_windows(void);
327 	pgcnt_t spages;
328 	char *str;
329 
330 	flush_windows();
331 
332 	/*
333 	 * to get an accurate view of kas, we need to untag sensitive
334 	 * pages *before* dumping them because the disk driver makes
335 	 * allocations and changes kas along the way.  The remaining
336 	 * pages referenced in the bitmaps are dumped out later as
337 	 * regular kpages.
338 	 */
339 	str = "cpr_write_statefile:";
340 	spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_clrbit);
341 	CPR_DEBUG(CPR_DEBUG7, "%s untag %ld sens pages\n", str, spages);
342 
343 	/*
344 	 * now it's OK to call a driver that makes allocations
345 	 */
346 	cpr_disk_writes_ok = 1;
347 
348 	/*
349 	 * now write out the clean sensitive kpages
350 	 * according to the sensitive descriptors
351 	 */
352 	error = i_cpr_dump_sensitive_kpages(vp);
353 	if (error) {
354 		CPR_DEBUG(CPR_DEBUG7,
355 		    "%s cpr_dump_sensitive_kpages() failed!\n", str);
356 		return (error);
357 	}
358 
359 	/*
360 	 * cpr_dump_regular_pages() counts cpr_regular_pgs_dumped
361 	 */
362 	error = cpr_dump_regular_pages(vp);
363 	if (error) {
364 		CPR_DEBUG(CPR_DEBUG7,
365 		    "%s cpr_dump_regular_pages() failed!\n", str);
366 		return (error);
367 	}
368 
369 	/*
370 	 * sanity check to verify the right number of pages were dumped
371 	 */
372 	error = i_cpr_check_pgs_dumped(cpr_pages_tobe_dumped,
373 	    cpr_regular_pgs_dumped);
374 
375 	if (error) {
376 		prom_printf("\n%s page count mismatch!\n", str);
377 #ifdef DEBUG
378 		if (cpr_test_mode)
379 			debug_enter(NULL);
380 #endif
381 	}
382 
383 	return (error);
384 }
385 
386 
387 /*
388  * creates the CPR state file, the following sections are
389  * written out in sequence:
390  *    - writes the cpr dump header
391  *    - writes the memory usage bitmaps
392  *    - writes the platform dependent info
393  *    - writes the remaining user pages
394  *    - writes the kernel pages
395  */
396 int
397 cpr_dump(vnode_t *vp)
398 {
399 	int error;
400 
401 	if (cpr_buf == NULL) {
402 		ASSERT(cpr_pagedata == NULL);
403 		if (error = cpr_alloc_bufs())
404 			return (error);
405 	}
406 	/* point to top of internal buffer */
407 	cpr_wptr = cpr_buf;
408 
409 	/* initialize global variables used by the write operation */
410 	cpr_file_bn = cpr_statefile_offset();
411 	cpr_dev_space = 0;
412 
413 	/* allocate bitmaps */
414 	if (CPR->c_bmda == NULL) {
415 		if (error = i_cpr_alloc_bitmaps()) {
416 			cpr_err(CE_WARN, "cannot allocate bitmaps");
417 			return (error);
418 		}
419 	}
420 
421 	if (error = i_cpr_prom_pages(CPR_PROM_SAVE))
422 		return (error);
423 
424 	if (error = i_cpr_dump_setup(vp))
425 		return (error);
426 
427 	/*
428 	 * set internal cross checking; we dont want to call
429 	 * a disk driver that makes allocations until after
430 	 * sensitive pages are saved
431 	 */
432 	cpr_disk_writes_ok = 0;
433 
434 	/*
435 	 * 1253112: heap corruption due to memory allocation when dumpping
436 	 *	    statefile.
437 	 * Theoretically on Sun4u only the kernel data nucleus, kvalloc and
438 	 * kvseg segments can be contaminated should memory allocations happen
439 	 * during sddump, which is not supposed to happen after the system
440 	 * is quiesced. Let's call the kernel pages that tend to be affected
441 	 * 'sensitive kpages' here. To avoid saving inconsistent pages, we
442 	 * will allocate some storage space to save the clean sensitive pages
443 	 * aside before statefile dumping takes place. Since there may not be
444 	 * much memory left at this stage, the sensitive pages will be
445 	 * compressed before they are saved into the storage area.
446 	 */
447 	if (error = i_cpr_save_sensitive_kpages()) {
448 		CPR_DEBUG(CPR_DEBUG7,
449 		    "cpr_dump: save_sensitive_kpages failed!\n");
450 		return (error);
451 	}
452 
453 	/*
454 	 * since all cpr allocations are done (space for sensitive kpages,
455 	 * bitmaps, cpr_buf), kas is stable, and now we can accurately
456 	 * count regular and sensitive kpages.
457 	 */
458 	if (error = cpr_write_header(vp)) {
459 		CPR_DEBUG(CPR_DEBUG7,
460 		    "cpr_dump: cpr_write_header() failed!\n");
461 		return (error);
462 	}
463 
464 	if (error = i_cpr_write_machdep(vp))
465 		return (error);
466 
467 	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, NULL, NULL))
468 		return (error);
469 
470 	if (error = cpr_write_bitmap(vp))
471 		return (error);
472 
473 	if (error = cpr_write_statefile(vp)) {
474 		CPR_DEBUG(CPR_DEBUG7,
475 		    "cpr_dump: cpr_write_statefile() failed!\n");
476 		return (error);
477 	}
478 
479 	if (error = cpr_write_terminator(vp))
480 		return (error);
481 
482 	if (error = cpr_flush_write(vp))
483 		return (error);
484 
485 	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, &cpr_file_bn, vp))
486 		return (error);
487 
488 	return (0);
489 }
490 
491 
492 /*
493  * cpr_xwalk() is called many 100x with a range within kvseg or kvseg_reloc;
494  * a page-count from each range is accumulated at arg->pages.
495  */
496 static void
497 cpr_xwalk(void *arg, void *base, size_t size)
498 {
499 	struct cpr_walkinfo *cwip = arg;
500 
501 	cwip->pages += cpr_count_pages(base, size,
502 	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
503 	cwip->size += size;
504 	cwip->ranges++;
505 }
506 
507 /*
508  * cpr_walk() is called many 100x with a range within kvseg or kvseg_reloc;
509  * a page-count from each range is accumulated at arg->pages.
510  */
511 static void
512 cpr_walk(void *arg, void *base, size_t size)
513 {
514 	caddr_t addr = base;
515 	caddr_t addr_end = addr + size;
516 
517 	/*
518 	 * If we are about to start walking the range of addresses we
519 	 * carved out of the kernel heap for the large page heap walk
520 	 * heap_lp_arena to find what segments are actually populated
521 	 */
522 	if (SEGKMEM_USE_LARGEPAGES &&
523 	    addr == heap_lp_base && addr_end == heap_lp_end &&
524 	    vmem_size(heap_lp_arena, VMEM_ALLOC) < size) {
525 		vmem_walk(heap_lp_arena, VMEM_ALLOC, cpr_xwalk, arg);
526 	} else {
527 		cpr_xwalk(arg, base, size);
528 	}
529 }
530 
531 
532 /*
533  * faster scan of kvseg using vmem_walk() to visit
534  * allocated ranges.
535  */
536 pgcnt_t
537 cpr_scan_kvseg(int mapflag, bitfunc_t bitfunc, struct seg *seg)
538 {
539 	struct cpr_walkinfo cwinfo;
540 
541 	bzero(&cwinfo, sizeof (cwinfo));
542 	cwinfo.mapflag = mapflag;
543 	cwinfo.bitfunc = bitfunc;
544 
545 	vmem_walk(heap_arena, VMEM_ALLOC, cpr_walk, &cwinfo);
546 
547 	if (cpr_debug & CPR_DEBUG7) {
548 		prom_printf("walked %d sub-ranges, total pages %ld\n",
549 		    cwinfo.ranges, mmu_btop(cwinfo.size));
550 		cpr_show_range(seg->s_base, seg->s_size,
551 		    mapflag, bitfunc, cwinfo.pages);
552 	}
553 
554 	return (cwinfo.pages);
555 }
556 
557 
558 /*
559  * cpr_walk_kpm() is called for every used area within the large
560  * segkpm virtual address window. A page-count is accumulated at
561  * arg->pages.
562  */
563 static void
564 cpr_walk_kpm(void *arg, void *base, size_t size)
565 {
566 	struct cpr_walkinfo *cwip = arg;
567 
568 	cwip->pages += cpr_count_pages(base, size,
569 	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
570 	cwip->size += size;
571 	cwip->ranges++;
572 }
573 
574 
575 /*
576  * faster scan of segkpm using hat_kpm_walk() to visit only used ranges.
577  */
578 /*ARGSUSED*/
579 static pgcnt_t
580 cpr_scan_segkpm(int mapflag, bitfunc_t bitfunc, struct seg *seg)
581 {
582 	struct cpr_walkinfo cwinfo;
583 
584 	if (kpm_enable == 0)
585 		return (0);
586 
587 	bzero(&cwinfo, sizeof (cwinfo));
588 	cwinfo.mapflag = mapflag;
589 	cwinfo.bitfunc = bitfunc;
590 	hat_kpm_walk(cpr_walk_kpm, &cwinfo);
591 
592 	if (cpr_debug & CPR_DEBUG7) {
593 		prom_printf("walked %d sub-ranges, total pages %ld\n",
594 		    cwinfo.ranges, mmu_btop(cwinfo.size));
595 		cpr_show_range(segkpm->s_base, segkpm->s_size,
596 		    mapflag, bitfunc, cwinfo.pages);
597 	}
598 
599 	return (cwinfo.pages);
600 }
601 
602 
/*
 * Sparsely filled kernel segments are registered in kseg_table for
 * easier lookup. See also block comment for cpr_count_seg_pages.
 *
 * Each entry pairs a segment with the scan function that
 * cpr_count_seg_pages() calls for it instead of the generic
 * cpr_count_pages() sweep.  The table ends with an entry whose st_seg
 * is NULL; cpr_sparse_seg_check() relies on that terminator.
 */

/*
 * st_addrtype values: they tell cpr_sparse_seg_check() whether st_seg
 * is itself the segment address or must be dereferenced to get it.
 */
#define	KSEG_SEG_ADDR	0	/* address of struct seg */
#define	KSEG_PTR_ADDR	1	/* address of pointer to struct seg */

typedef struct {
	struct seg **st_seg;		/* segment pointer or segment address */
	pgcnt_t	(*st_fcn)(int, bitfunc_t, struct seg *); /* function to call */
	int	st_addrtype;		/* address type in st_seg */
} ksegtbl_entry_t;

ksegtbl_entry_t kseg_table[] = {
	/* kvseg: the address is cast to fit st_seg and used without a deref */
	{(struct seg **)&kvseg,		cpr_scan_kvseg,		KSEG_SEG_ADDR},
	/* segkpm: st_seg is dereferenced at lookup time */
	{&segkpm,			cpr_scan_segkpm,	KSEG_PTR_ADDR},
	/* terminator */
	{NULL,				0,			0}
};
622 
623 
624 /*
625  * Compare seg with each entry in kseg_table; when there is a match
626  * return the entry pointer, otherwise return NULL.
627  */
628 static ksegtbl_entry_t *
629 cpr_sparse_seg_check(struct seg *seg)
630 {
631 	ksegtbl_entry_t *ste = &kseg_table[0];
632 	struct seg *tseg;
633 
634 	for (; ste->st_seg; ste++) {
635 		tseg = (ste->st_addrtype == KSEG_PTR_ADDR) ?
636 				*ste->st_seg : (struct seg *)ste->st_seg;
637 		if (seg == tseg)
638 			return (ste);
639 	}
640 
641 	return ((ksegtbl_entry_t *)NULL);
642 }
643 
644 
645 /*
646  * Count pages within each kernel segment; call cpr_sparse_seg_check()
647  * to find out whether a sparsely filled segment needs special
648  * treatment (e.g. kvseg).
649  * Todo: A "SEGOP_CPR" like SEGOP_DUMP should be introduced, the cpr
650  *       module shouldn't need to know segment details like if it is
651  *       sparsely filled or not (makes kseg_table obsolete).
652  */
653 pgcnt_t
654 cpr_count_seg_pages(int mapflag, bitfunc_t bitfunc)
655 {
656 	struct seg *segp;
657 	pgcnt_t pages;
658 	ksegtbl_entry_t *ste;
659 
660 	pages = 0;
661 	for (segp = AS_SEGFIRST(&kas); segp; segp = AS_SEGNEXT(&kas, segp)) {
662 		if (ste = cpr_sparse_seg_check(segp)) {
663 			pages += (ste->st_fcn)(mapflag, bitfunc, segp);
664 		} else {
665 			pages += cpr_count_pages(segp->s_base,
666 			    segp->s_size, mapflag, bitfunc, DBG_SHOWRANGE);
667 		}
668 	}
669 
670 	return (pages);
671 }
672 
673 
674 /*
675  * count kernel pages within kas and any special ranges
676  */
677 pgcnt_t
678 cpr_count_kpages(int mapflag, bitfunc_t bitfunc)
679 {
680 	pgcnt_t kas_cnt;
681 
682 	/*
683 	 * Some pages need to be taken care of differently.
684 	 * eg: panicbuf pages of sun4m are not in kas but they need
685 	 * to be saved.  On sun4u, the physical pages of panicbuf are
686 	 * allocated via prom_retain().
687 	 */
688 	kas_cnt = i_cpr_count_special_kpages(mapflag, bitfunc);
689 	kas_cnt += cpr_count_seg_pages(mapflag, bitfunc);
690 
691 	CPR_DEBUG(CPR_DEBUG9, "cpr_count_kpages: kas_cnt=%ld\n", kas_cnt);
692 	CPR_DEBUG(CPR_DEBUG7, "\ncpr_count_kpages: %ld pages, 0x%lx bytes\n",
693 		kas_cnt, mmu_ptob(kas_cnt));
694 	return (kas_cnt);
695 }
696 
697 
698 /*
699  * Set a bit corresponding to the arg phys page number;
700  * returns 0 when the ppn is valid and the corresponding
701  * map bit was clear, otherwise returns 1.
702  */
703 int
704 cpr_setbit(pfn_t ppn, int mapflag)
705 {
706 	char *bitmap;
707 	cbd_t *dp;
708 	pfn_t rel;
709 	int clr;
710 
711 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
712 		if (PPN_IN_RANGE(ppn, dp)) {
713 			bitmap = DESC_TO_MAP(dp, mapflag);
714 			rel = ppn - dp->cbd_spfn;
715 			if ((clr = isclr(bitmap, rel)) != 0)
716 				setbit(bitmap, rel);
717 			return (clr == 0);
718 		}
719 	}
720 
721 	return (1);
722 }
723 
724 
725 /*
726  * Clear a bit corresponding to the arg phys page number.
727  */
728 int
729 cpr_clrbit(pfn_t ppn, int mapflag)
730 {
731 	char *bitmap;
732 	cbd_t *dp;
733 	pfn_t rel;
734 	int set;
735 
736 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
737 		if (PPN_IN_RANGE(ppn, dp)) {
738 			bitmap = DESC_TO_MAP(dp, mapflag);
739 			rel = ppn - dp->cbd_spfn;
740 			if ((set = isset(bitmap, rel)) != 0)
741 				clrbit(bitmap, rel);
742 			return (set == 0);
743 		}
744 	}
745 
746 	return (1);
747 }
748 
749 
750 /* ARGSUSED */
751 int
752 cpr_nobit(pfn_t ppn, int mapflag)
753 {
754 	return (0);
755 }
756 
757 
758 /*
759  * Lookup a bit corresponding to the arg phys page number.
760  */
761 int
762 cpr_isset(pfn_t ppn, int mapflag)
763 {
764 	char *bitmap;
765 	cbd_t *dp;
766 	pfn_t rel;
767 
768 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
769 		if (PPN_IN_RANGE(ppn, dp)) {
770 			bitmap = DESC_TO_MAP(dp, mapflag);
771 			rel = ppn - dp->cbd_spfn;
772 			return (isset(bitmap, rel));
773 		}
774 	}
775 
776 	return (0);
777 }
778 
779 
780 /*
781  * Go thru all pages and pick up any page not caught during the invalidation
782  * stage. This is also used to save pages with cow lock or phys page lock held
783  * (none zero p_lckcnt or p_cowcnt)
784  */
785 static	int
786 cpr_count_upages(int mapflag, bitfunc_t bitfunc)
787 {
788 	page_t *pp, *page0;
789 	pgcnt_t dcnt = 0, tcnt = 0;
790 	pfn_t pfn;
791 
792 	page0 = pp = page_first();
793 
794 	do {
795 #if defined(__sparc)
796 		extern struct vnode prom_ppages;
797 		if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
798 		    pp->p_vnode == &prom_ppages ||
799 			PP_ISFREE(pp) && PP_ISAGED(pp))
800 #else
801 		if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
802 		    PP_ISFREE(pp) && PP_ISAGED(pp))
803 #endif /* __sparc */
804 			continue;
805 
806 		pfn = page_pptonum(pp);
807 		if (pf_is_memory(pfn)) {
808 			tcnt++;
809 			if ((*bitfunc)(pfn, mapflag) == 0)
810 				dcnt++; /* dirty count */
811 		}
812 	} while ((pp = page_next(pp)) != page0);
813 
814 	STAT->cs_upage2statef = dcnt;
815 	CPR_DEBUG(CPR_DEBUG9, "cpr_count_upages: dirty=%ld total=%ld\n",
816 		dcnt, tcnt);
817 	CPR_DEBUG(CPR_DEBUG7, "cpr_count_upages: %ld pages, 0x%lx bytes\n",
818 		dcnt, mmu_ptob(dcnt));
819 	return (dcnt);
820 }
821 
822 
823 /*
824  * try compressing pages based on cflag,
825  * and for DEBUG kernels, verify uncompressed data checksum;
826  *
827  * this routine replaces common code from
828  * i_cpr_compress_and_save() and cpr_compress_and_write()
829  */
830 char *
831 cpr_compress_pages(cpd_t *dp, pgcnt_t pages, int cflag)
832 {
833 	size_t nbytes, clen, len;
834 	uint32_t test_sum;
835 	char *datap;
836 
837 	nbytes = mmu_ptob(pages);
838 
839 	/*
840 	 * set length to the original uncompressed data size;
841 	 * always init cpd_flag to zero
842 	 */
843 	dp->cpd_length = nbytes;
844 	dp->cpd_flag = 0;
845 
846 #ifdef	DEBUG
847 	/*
848 	 * Make a copy of the uncompressed data so we can checksum it.
849 	 * Compress that copy so the checksum works at the other end
850 	 */
851 	cprbcopy(CPR->c_mapping_area, cpr_pagecopy, nbytes);
852 	dp->cpd_usum = checksum32(cpr_pagecopy, nbytes);
853 	dp->cpd_flag |= CPD_USUM;
854 	datap = cpr_pagecopy;
855 #else
856 	datap = CPR->c_mapping_area;
857 	dp->cpd_usum = 0;
858 #endif
859 
860 	/*
861 	 * try compressing the raw data to cpr_pagedata;
862 	 * if there was a size reduction: record the new length,
863 	 * flag the compression, and point to the compressed data.
864 	 */
865 	dp->cpd_csum = 0;
866 	if (cflag) {
867 		clen = compress(datap, cpr_pagedata, nbytes);
868 		if (clen < nbytes) {
869 			dp->cpd_flag |= CPD_COMPRESS;
870 			dp->cpd_length = clen;
871 			datap = cpr_pagedata;
872 #ifdef	DEBUG
873 			dp->cpd_csum = checksum32(datap, clen);
874 			dp->cpd_flag |= CPD_CSUM;
875 
876 			/*
877 			 * decompress the data back to a scratch area
878 			 * and compare the new checksum with the original
879 			 * checksum to verify the compression.
880 			 */
881 			bzero(cpr_pagecopy, sizeof (cpr_pagecopy));
882 			len = decompress(datap, cpr_pagecopy,
883 			    clen, sizeof (cpr_pagecopy));
884 			test_sum = checksum32(cpr_pagecopy, len);
885 			ASSERT(test_sum == dp->cpd_usum);
886 #endif
887 		}
888 	}
889 
890 	return (datap);
891 }
892 
893 
894 /*
895  * 1. Prepare cpr page descriptor and write it to file
896  * 2. Compress page data and write it out
897  */
898 static int
899 cpr_compress_and_write(vnode_t *vp, uint_t va, pfn_t pfn, pgcnt_t npg)
900 {
901 	int error = 0;
902 	char *datap;
903 	cpd_t cpd;	/* cpr page descriptor */
904 	extern void i_cpr_mapin(caddr_t, uint_t, pfn_t);
905 	extern void i_cpr_mapout(caddr_t, uint_t);
906 
907 	i_cpr_mapin(CPR->c_mapping_area, npg, pfn);
908 
909 	CPR_DEBUG(CPR_DEBUG3, "mapped-in %ld pages, vaddr 0x%p, pfn 0x%lx\n",
910 		npg, CPR->c_mapping_area, pfn);
911 
912 	/*
913 	 * Fill cpr page descriptor.
914 	 */
915 	cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC;
916 	cpd.cpd_pfn = pfn;
917 	cpd.cpd_pages = npg;
918 
919 	STAT->cs_dumped_statefsz += mmu_ptob(npg);
920 
921 	datap = cpr_compress_pages(&cpd, npg, CPR->c_flags & C_COMPRESSING);
922 
923 	/* Write cpr page descriptor */
924 	error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd_t));
925 
926 	/* Write compressed page data */
927 	error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length);
928 
929 	/*
930 	 * Unmap the pages for tlb and vac flushing
931 	 */
932 	i_cpr_mapout(CPR->c_mapping_area, npg);
933 
934 	if (error) {
935 		CPR_DEBUG(CPR_DEBUG1,
936 		    "cpr_compress_and_write: vp 0x%p va 0x%x ", vp, va);
937 		CPR_DEBUG(CPR_DEBUG1, "pfn 0x%lx blk %d err %d\n",
938 		    pfn, cpr_file_bn, error);
939 	} else {
940 		cpr_regular_pgs_dumped += npg;
941 	}
942 
943 	return (error);
944 }
945 
946 
947 int
948 cpr_write(vnode_t *vp, caddr_t buffer, size_t size)
949 {
950 	caddr_t	fromp = buffer;
951 	size_t bytes, wbytes;
952 	int error;
953 
954 	if (cpr_dev_space == 0) {
955 		if (vp->v_type == VBLK) {
956 			cpr_dev_space = cpr_get_devsize(vp->v_rdev);
957 			ASSERT(cpr_dev_space);
958 		} else
959 			cpr_dev_space = 1;	/* not used in this case */
960 	}
961 
962 	/*
963 	 * break the write into multiple part if request is large,
964 	 * calculate count up to buf page boundary, then write it out.
965 	 * repeat until done.
966 	 */
967 	while (size) {
968 		bytes = MIN(size, cpr_buf_end - cpr_wptr);
969 		cprbcopy(fromp, cpr_wptr, bytes);
970 		cpr_wptr += bytes;
971 		fromp += bytes;
972 		size -= bytes;
973 		if (cpr_wptr < cpr_buf_end)
974 			return (0);	/* buffer not full yet */
975 		ASSERT(cpr_wptr == cpr_buf_end);
976 
977 		wbytes = dbtob(cpr_file_bn + cpr_buf_blocks);
978 		if (vp->v_type == VBLK) {
979 			if (wbytes > cpr_dev_space)
980 				return (ENOSPC);
981 		} else {
982 			if (wbytes > VTOI(vp)->i_size)
983 				return (ENOSPC);
984 		}
985 
986 		CPR_DEBUG(CPR_DEBUG3,
987 		    "cpr_write: frmp=%p wptr=%p cnt=%lx...",
988 		    fromp, cpr_wptr, bytes);
989 		/*
990 		 * cross check, this should not happen!
991 		 */
992 		if (cpr_disk_writes_ok == 0) {
993 			prom_printf("cpr_write: disk write too early!\n");
994 			return (EINVAL);
995 		}
996 
997 		do_polled_io = 1;
998 		error = VOP_DUMP(vp, cpr_buf, cpr_file_bn, cpr_buf_blocks);
999 		do_polled_io = 0;
1000 		CPR_DEBUG(CPR_DEBUG3, "done\n");
1001 
1002 		STAT->cs_real_statefsz += cpr_buf_size;
1003 
1004 		if (error) {
1005 			cpr_err(CE_WARN, "cpr_write error %d", error);
1006 			return (error);
1007 		}
1008 		cpr_file_bn += cpr_buf_blocks;	/* Increment block count */
1009 		cpr_wptr = cpr_buf;		/* back to top of buffer */
1010 	}
1011 	return (0);
1012 }
1013 
1014 
1015 int
1016 cpr_flush_write(vnode_t *vp)
1017 {
1018 	int	nblk;
1019 	int	error;
1020 
1021 	/*
1022 	 * Calculate remaining blocks in buffer, rounded up to nearest
1023 	 * disk block
1024 	 */
1025 	nblk = btod(cpr_wptr - cpr_buf);
1026 
1027 	do_polled_io = 1;
1028 	error = VOP_DUMP(vp, (caddr_t)cpr_buf, cpr_file_bn, nblk);
1029 	do_polled_io = 0;
1030 
1031 	cpr_file_bn += nblk;
1032 	if (error)
1033 		CPR_DEBUG(CPR_DEBUG2, "cpr_flush_write: error (%d)\n",
1034 		    error);
1035 	return (error);
1036 }
1037 
1038 void
1039 cpr_clear_bitmaps(void)
1040 {
1041 	cbd_t *dp;
1042 
1043 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
1044 		bzero((void *)dp->cbd_reg_bitmap,
1045 		    (size_t)dp->cbd_size * 2);
1046 	}
1047 	CPR_DEBUG(CPR_DEBUG7, "\ncleared reg and vlt bitmaps\n");
1048 }
1049 
1050 int
1051 cpr_contig_pages(vnode_t *vp, int flag)
1052 {
1053 	int chunks = 0, error = 0;
1054 	pgcnt_t i, j, totbit;
1055 	pfn_t spfn;
1056 	cbd_t *dp;
1057 	uint_t	spin_cnt = 0;
1058 	extern	int i_cpr_compress_and_save();
1059 
1060 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
1061 		spfn = dp->cbd_spfn;
1062 		totbit = BTOb(dp->cbd_size);
1063 		i = 0; /* Beginning of bitmap */
1064 		j = 0;
1065 		while (i < totbit) {
1066 			while ((j < CPR_MAXCONTIG) && ((j + i) < totbit)) {
1067 				if (isset((char *)dp->cbd_reg_bitmap, j+i))
1068 					j++;
1069 				else /* not contiguous anymore */
1070 					break;
1071 			}
1072 
1073 			if (j) {
1074 				chunks++;
1075 				if (flag == SAVE_TO_STORAGE) {
1076 					error = i_cpr_compress_and_save(
1077 					    chunks, spfn + i, j);
1078 					if (error)
1079 						return (error);
1080 				} else if (flag == WRITE_TO_STATEFILE) {
1081 					error = cpr_compress_and_write(vp, 0,
1082 					    spfn + i, j);
1083 					if (error)
1084 						return (error);
1085 					else {
1086 						spin_cnt++;
1087 						if ((spin_cnt & 0x5F) == 1)
1088 							cpr_spinning_bar();
1089 					}
1090 				}
1091 			}
1092 
1093 			i += j;
1094 			if (j != CPR_MAXCONTIG) {
1095 				/* Stopped on a non-tagged page */
1096 				i++;
1097 			}
1098 
1099 			j = 0;
1100 		}
1101 	}
1102 
1103 	if (flag == STORAGE_DESC_ALLOC)
1104 		return (chunks);
1105 	else
1106 		return (0);
1107 }
1108 
1109 
1110 void
1111 cpr_show_range(caddr_t vaddr, size_t size,
1112     int mapflag, bitfunc_t bitfunc, pgcnt_t count)
1113 {
1114 	char *action, *bname;
1115 
1116 	bname = (mapflag == REGULAR_BITMAP) ? "regular" : "volatile";
1117 	if (bitfunc == cpr_setbit)
1118 		action = "tag";
1119 	else if (bitfunc == cpr_clrbit)
1120 		action = "untag";
1121 	else
1122 		action = "none";
1123 	prom_printf("range (0x%p, 0x%p), %s bitmap, %s %ld\n",
1124 	    vaddr, vaddr + size, bname, action, count);
1125 }
1126 
1127 
1128 pgcnt_t
1129 cpr_count_pages(caddr_t sva, size_t size,
1130     int mapflag, bitfunc_t bitfunc, int showrange)
1131 {
1132 	caddr_t	va, eva;
1133 	pfn_t pfn;
1134 	pgcnt_t count = 0;
1135 
1136 	eva = sva + PAGE_ROUNDUP(size);
1137 	for (va = sva; va < eva; va += MMU_PAGESIZE) {
1138 		pfn = va_to_pfn(va);
1139 		if (pfn != PFN_INVALID && pf_is_memory(pfn)) {
1140 			if ((*bitfunc)(pfn, mapflag) == 0)
1141 				count++;
1142 		}
1143 	}
1144 
1145 	if ((cpr_debug & CPR_DEBUG7) && showrange == DBG_SHOWRANGE)
1146 		cpr_show_range(sva, size, mapflag, bitfunc, count);
1147 
1148 	return (count);
1149 }
1150 
1151 
1152 pgcnt_t
1153 cpr_count_volatile_pages(int mapflag, bitfunc_t bitfunc)
1154 {
1155 	pgcnt_t count = 0;
1156 
1157 	if (cpr_buf) {
1158 		count += cpr_count_pages(cpr_buf, cpr_buf_size,
1159 		    mapflag, bitfunc, DBG_SHOWRANGE);
1160 	}
1161 	if (cpr_pagedata) {
1162 		count += cpr_count_pages(cpr_pagedata, cpr_pagedata_size,
1163 		    mapflag, bitfunc, DBG_SHOWRANGE);
1164 	}
1165 	count += i_cpr_count_storage_pages(mapflag, bitfunc);
1166 
1167 	CPR_DEBUG(CPR_DEBUG7, "cpr_count_vpages: %ld pages, 0x%lx bytes\n",
1168 	    count, mmu_ptob(count));
1169 	return (count);
1170 }
1171 
1172 
1173 static int
1174 cpr_dump_regular_pages(vnode_t *vp)
1175 {
1176 	int error;
1177 
1178 	cpr_regular_pgs_dumped = 0;
1179 	error = cpr_contig_pages(vp, WRITE_TO_STATEFILE);
1180 	if (!error)
1181 		CPR_DEBUG(CPR_DEBUG7, "cpr_dump_regular_pages() done.\n");
1182 	return (error);
1183 }
1184