xref: /titanic_41/usr/src/uts/common/cpr/cpr_dump.c (revision fe1c642d06e14b412cd83ae2179303186ab08972)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Fill in and write out the cpr state file
28  *	1. Allocate and write headers, ELF and cpr dump header
29  *	2. Allocate bitmaps according to phys_install
30  *	3. Tag kernel pages into corresponding bitmap
31  *	4. Write bitmaps to state file
32  *	5. Write actual physical page data to state file
33  */
34 
35 #include <sys/types.h>
36 #include <sys/systm.h>
37 #include <sys/vm.h>
38 #include <sys/memlist.h>
39 #include <sys/kmem.h>
40 #include <sys/vnode.h>
41 #include <sys/fs/ufs_inode.h>
42 #include <sys/errno.h>
43 #include <sys/cmn_err.h>
44 #include <sys/debug.h>
45 #include <vm/page.h>
46 #include <vm/seg.h>
47 #include <vm/seg_kmem.h>
48 #include <vm/seg_kpm.h>
49 #include <vm/hat.h>
50 #include <sys/cpr.h>
51 #include <sys/conf.h>
52 #include <sys/ddi.h>
53 #include <sys/panic.h>
54 #include <sys/thread.h>
55 #include <sys/note.h>
56 
/* Local defines and variables */
#define	BTOb(bytes)	((bytes) << 3)		/* Bytes to bits, log2(NBBY) */
#define	bTOB(bits)	((bits) >> 3)		/* bits to Bytes, log2(NBBY) */

#if defined(__sparc)
/* page total computed in cpr_write_header(), cross-checked after the dump */
static uint_t cpr_pages_tobe_dumped;
/* reset by cpr_dump_regular_pages(), bumped by cpr_compress_and_write() */
static uint_t cpr_regular_pgs_dumped;
static int cpr_dump_regular_pages(vnode_t *);
static int cpr_count_upages(int, bitfunc_t);
static int cpr_compress_and_write(vnode_t *, uint_t, pfn_t, pgcnt_t);
#endif

int cpr_flush_write(vnode_t *);

int cpr_contig_pages(vnode_t *, int);

void cpr_clear_bitmaps();

extern size_t cpr_get_devsize(dev_t);
extern int i_cpr_dump_setup(vnode_t *);
extern int i_cpr_blockzero(char *, char **, int *, vnode_t *);
extern int cpr_test_mode;
int cpr_setbit(pfn_t, int);
int cpr_clrbit(pfn_t, int);

/* terminator record; filled in and written by cpr_write_terminator() */
ctrm_t cpr_term;

/* statefile write buffer and the address one past its last byte */
char *cpr_buf, *cpr_buf_end;
int cpr_buf_blocks;		/* size of cpr_buf in blocks */
size_t cpr_buf_size;		/* size of cpr_buf in bytes */
size_t cpr_bitmap_size;		/* set by cpr_set_bitmap_size() */
int cpr_nbitmaps;		/* number of cbd_t descriptors written */

char *cpr_pagedata;		/* page buffer for compression / tmp copy */
size_t cpr_pagedata_size;	/* page buffer size in bytes */

#if defined(__sparc)
static char *cpr_wptr;		/* keep track of where to write to next */
static int cpr_file_bn;		/* cpr state-file block offset */
/* cleared in cpr_dump(), set in cpr_write_statefile(), tested in cpr_write() */
static int cpr_disk_writes_ok;
/* device size cached by cpr_write(); 0 means "not looked up yet" */
static size_t cpr_dev_space = 0;
#endif

/*
 * Static scratch area: used to merge bitmaps in cpr_write_bitmap() and,
 * on DEBUG kernels, as the checksum/decompress copy in cpr_compress_pages().
 */
char cpr_pagecopy[CPR_MAXCONTIG * MMU_PAGESIZE];
101 
102 #if defined(__sparc)
103 /*
104  * On some platforms bcopy may modify the thread structure
105  * during bcopy (eg, to prevent cpu migration).  If the
106  * range we are currently writing out includes our own
107  * thread structure then it will be snapshotted by bcopy
108  * including those modified members - and the updates made
109  * on exit from bcopy will no longer be seen when we later
110  * restore the mid-bcopy kthread_t.  So if the range we
111  * need to copy overlaps with our thread structure we will
112  * use a simple byte copy.
113  */
114 void
115 cprbcopy(void *from, void *to, size_t bytes)
116 {
117 	extern int curthreadremapped;
118 	caddr_t kthrend;
119 
120 	kthrend = (caddr_t)curthread + sizeof (kthread_t) - 1;
121 	if (curthreadremapped || (kthrend >= (caddr_t)from &&
122 	    kthrend < (caddr_t)from + bytes + sizeof (kthread_t) - 1)) {
123 		caddr_t src = from, dst = to;
124 
125 		while (bytes-- > 0)
126 			*dst++ = *src++;
127 	} else {
128 		bcopy(from, to, bytes);
129 	}
130 }
131 
132 /*
133  * Allocate pages for buffers used in writing out the statefile
134  */
135 static int
136 cpr_alloc_bufs(void)
137 {
138 	char *allocerr = "Unable to allocate memory for cpr buffer";
139 	size_t size;
140 
141 	/*
142 	 * set the cpr write buffer size to at least the historic
143 	 * size (128k) or large enough to store the both the early
144 	 * set of statefile structures (well under 0x800) plus the
145 	 * bitmaps, and roundup to the next pagesize.
146 	 */
147 	size = PAGE_ROUNDUP(dbtob(4) + cpr_bitmap_size);
148 	cpr_buf_size = MAX(size, CPRBUFSZ);
149 	cpr_buf_blocks = btodb(cpr_buf_size);
150 	cpr_buf = kmem_alloc(cpr_buf_size, KM_NOSLEEP);
151 	if (cpr_buf == NULL) {
152 		cpr_err(CE_WARN, allocerr);
153 		return (ENOMEM);
154 	}
155 	cpr_buf_end = cpr_buf + cpr_buf_size;
156 
157 	cpr_pagedata_size = mmu_ptob(CPR_MAXCONTIG + 1);
158 	cpr_pagedata = kmem_alloc(cpr_pagedata_size, KM_NOSLEEP);
159 	if (cpr_pagedata == NULL) {
160 		kmem_free(cpr_buf, cpr_buf_size);
161 		cpr_buf = NULL;
162 		cpr_err(CE_WARN, allocerr);
163 		return (ENOMEM);
164 	}
165 
166 	return (0);
167 }
168 
169 
170 /*
171  * Set bitmap size in bytes based on phys_install.
172  */
173 void
174 cpr_set_bitmap_size(void)
175 {
176 	struct memlist *pmem;
177 	size_t size = 0;
178 
179 	memlist_read_lock();
180 	for (pmem = phys_install; pmem; pmem = pmem->next)
181 		size += pmem->size;
182 	memlist_read_unlock();
183 	cpr_bitmap_size = BITMAP_BYTES(size);
184 }
185 
186 
187 /*
188  * CPR dump header contains the following information:
189  *	1. header magic -- unique to cpr state file
190  *	2. kernel return pc & ppn for resume
191  *	3. current thread info
192  *	4. debug level and test mode
193  *	5. number of bitmaps allocated
194  *	6. number of page records
195  */
196 static int
197 cpr_write_header(vnode_t *vp)
198 {
199 	extern ushort_t cpr_mach_type;
200 	struct cpr_dump_desc cdump;
201 	pgcnt_t bitmap_pages;
202 	pgcnt_t kpages, vpages, upages;
203 	pgcnt_t cpr_count_kpages(int mapflag, bitfunc_t bitfunc);
204 
205 	cdump.cdd_magic = (uint_t)CPR_DUMP_MAGIC;
206 	cdump.cdd_version = CPR_VERSION;
207 	cdump.cdd_machine = cpr_mach_type;
208 	cdump.cdd_debug = cpr_debug;
209 	cdump.cdd_test_mode = cpr_test_mode;
210 	cdump.cdd_bitmaprec = cpr_nbitmaps;
211 
212 	cpr_clear_bitmaps();
213 
214 	/*
215 	 * Remember how many pages we plan to save to statefile.
216 	 * This information will be used for sanity checks.
217 	 * Untag those pages that will not be saved to statefile.
218 	 */
219 	kpages = cpr_count_kpages(REGULAR_BITMAP, cpr_setbit);
220 	vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);
221 	upages = cpr_count_upages(REGULAR_BITMAP, cpr_setbit);
222 	cdump.cdd_dumppgsize = kpages - vpages + upages;
223 	cpr_pages_tobe_dumped = cdump.cdd_dumppgsize;
224 	CPR_DEBUG(CPR_DEBUG7,
225 	    "\ncpr_write_header: kpages %ld - vpages %ld + upages %ld = %d\n",
226 	    kpages, vpages, upages, cdump.cdd_dumppgsize);
227 
228 	/*
229 	 * Some pages contain volatile data (cpr_buf and storage area for
230 	 * sensitive kpages), which are no longer needed after the statefile
231 	 * is dumped to disk.  We have already untagged them from regular
232 	 * bitmaps.  Now tag them into the volatile bitmaps.  The pages in
233 	 * volatile bitmaps will be claimed during resume, and the resumed
234 	 * kernel will free them.
235 	 */
236 	(void) cpr_count_volatile_pages(VOLATILE_BITMAP, cpr_setbit);
237 
238 	bitmap_pages = mmu_btopr(cpr_bitmap_size);
239 
240 	/*
241 	 * Export accurate statefile size for statefile allocation retry.
242 	 * statefile_size = all the headers + total pages +
243 	 * number of pages used by the bitmaps.
244 	 * Roundup will be done in the file allocation code.
245 	 */
246 	STAT->cs_nocomp_statefsz = sizeof (cdd_t) + sizeof (cmd_t) +
247 	    (sizeof (cbd_t) * cdump.cdd_bitmaprec) +
248 	    (sizeof (cpd_t) * cdump.cdd_dumppgsize) +
249 	    mmu_ptob(cdump.cdd_dumppgsize + bitmap_pages);
250 
251 	/*
252 	 * If the estimated statefile is not big enough,
253 	 * go retry now to save un-necessary operations.
254 	 */
255 	if (!(CPR->c_flags & C_COMPRESSING) &&
256 	    (STAT->cs_nocomp_statefsz > STAT->cs_est_statefsz)) {
257 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
258 			prom_printf("cpr_write_header: "
259 			    "STAT->cs_nocomp_statefsz > "
260 			    "STAT->cs_est_statefsz\n");
261 		return (ENOSPC);
262 	}
263 
264 	/* now write cpr dump descriptor */
265 	return (cpr_write(vp, (caddr_t)&cdump, sizeof (cdd_t)));
266 }
267 
268 
269 /*
270  * CPR dump tail record contains the following information:
271  *	1. header magic -- unique to cpr state file
272  *	2. all misc info that needs to be passed to cprboot or resumed kernel
273  */
274 static int
275 cpr_write_terminator(vnode_t *vp)
276 {
277 	cpr_term.magic = (uint_t)CPR_TERM_MAGIC;
278 	cpr_term.va = (cpr_ptr)&cpr_term;
279 	cpr_term.pfn = (cpr_ext)va_to_pfn(&cpr_term);
280 
281 	/* count the last one (flush) */
282 	cpr_term.real_statef_size = STAT->cs_real_statefsz +
283 	    btod(cpr_wptr - cpr_buf) * DEV_BSIZE;
284 
285 	CPR_DEBUG(CPR_DEBUG9, "cpr_dump: Real Statefile Size: %ld\n",
286 	    STAT->cs_real_statefsz);
287 
288 	cpr_tod_get(&cpr_term.tm_shutdown);
289 
290 	return (cpr_write(vp, (caddr_t)&cpr_term, sizeof (cpr_term)));
291 }
292 
293 /*
294  * Write bitmap descriptor array, followed by merged bitmaps.
295  */
296 static int
297 cpr_write_bitmap(vnode_t *vp)
298 {
299 	char *rmap, *vmap, *dst, *tail;
300 	size_t size, bytes;
301 	cbd_t *dp;
302 	int err;
303 
304 	dp = CPR->c_bmda;
305 	if (err = cpr_write(vp, (caddr_t)dp, cpr_nbitmaps * sizeof (*dp)))
306 		return (err);
307 
308 	/*
309 	 * merge regular and volatile bitmaps into tmp space
310 	 * and write to disk
311 	 */
312 	for (; dp->cbd_size; dp++) {
313 		rmap = (char *)dp->cbd_reg_bitmap;
314 		vmap = (char *)dp->cbd_vlt_bitmap;
315 		for (size = dp->cbd_size; size; size -= bytes) {
316 			bytes = min(size, sizeof (cpr_pagecopy));
317 			tail = &cpr_pagecopy[bytes];
318 			for (dst = cpr_pagecopy; dst < tail; dst++)
319 				*dst = *rmap++ | *vmap++;
320 			if (err = cpr_write(vp, cpr_pagecopy, bytes))
321 				break;
322 		}
323 	}
324 
325 	return (err);
326 }
327 
328 
329 static int
330 cpr_write_statefile(vnode_t *vp)
331 {
332 	uint_t error = 0;
333 	extern	int	i_cpr_check_pgs_dumped();
334 	void flush_windows(void);
335 	pgcnt_t spages;
336 	char *str;
337 
338 	flush_windows();
339 
340 	/*
341 	 * to get an accurate view of kas, we need to untag sensitive
342 	 * pages *before* dumping them because the disk driver makes
343 	 * allocations and changes kas along the way.  The remaining
344 	 * pages referenced in the bitmaps are dumped out later as
345 	 * regular kpages.
346 	 */
347 	str = "cpr_write_statefile:";
348 	spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_clrbit);
349 	CPR_DEBUG(CPR_DEBUG7, "%s untag %ld sens pages\n", str, spages);
350 
351 	/*
352 	 * now it's OK to call a driver that makes allocations
353 	 */
354 	cpr_disk_writes_ok = 1;
355 
356 	/*
357 	 * now write out the clean sensitive kpages
358 	 * according to the sensitive descriptors
359 	 */
360 	error = i_cpr_dump_sensitive_kpages(vp);
361 	if (error) {
362 		CPR_DEBUG(CPR_DEBUG7,
363 		    "%s cpr_dump_sensitive_kpages() failed!\n", str);
364 		return (error);
365 	}
366 
367 	/*
368 	 * cpr_dump_regular_pages() counts cpr_regular_pgs_dumped
369 	 */
370 	error = cpr_dump_regular_pages(vp);
371 	if (error) {
372 		CPR_DEBUG(CPR_DEBUG7,
373 		    "%s cpr_dump_regular_pages() failed!\n", str);
374 		return (error);
375 	}
376 
377 	/*
378 	 * sanity check to verify the right number of pages were dumped
379 	 */
380 	error = i_cpr_check_pgs_dumped(cpr_pages_tobe_dumped,
381 	    cpr_regular_pgs_dumped);
382 
383 	if (error) {
384 		prom_printf("\n%s page count mismatch!\n", str);
385 #ifdef DEBUG
386 		if (cpr_test_mode)
387 			debug_enter(NULL);
388 #endif
389 	}
390 
391 	return (error);
392 }
393 #endif
394 
395 
396 /*
397  * creates the CPR state file, the following sections are
398  * written out in sequence:
399  *    - writes the cpr dump header
400  *    - writes the memory usage bitmaps
401  *    - writes the platform dependent info
402  *    - writes the remaining user pages
403  *    - writes the kernel pages
404  */
405 #if defined(__x86)
406 	_NOTE(ARGSUSED(0))
407 #endif
408 int
409 cpr_dump(vnode_t *vp)
410 {
411 #if defined(__sparc)
412 	int error;
413 
414 	if (cpr_buf == NULL) {
415 		ASSERT(cpr_pagedata == NULL);
416 		if (error = cpr_alloc_bufs())
417 			return (error);
418 	}
419 	/* point to top of internal buffer */
420 	cpr_wptr = cpr_buf;
421 
422 	/* initialize global variables used by the write operation */
423 	cpr_file_bn = cpr_statefile_offset();
424 	cpr_dev_space = 0;
425 
426 	/* allocate bitmaps */
427 	if (CPR->c_bmda == NULL) {
428 		if (error = i_cpr_alloc_bitmaps()) {
429 			cpr_err(CE_WARN, "cannot allocate bitmaps");
430 			return (error);
431 		}
432 	}
433 
434 	if (error = i_cpr_prom_pages(CPR_PROM_SAVE))
435 		return (error);
436 
437 	if (error = i_cpr_dump_setup(vp))
438 		return (error);
439 
440 	/*
441 	 * set internal cross checking; we dont want to call
442 	 * a disk driver that makes allocations until after
443 	 * sensitive pages are saved
444 	 */
445 	cpr_disk_writes_ok = 0;
446 
447 	/*
448 	 * 1253112: heap corruption due to memory allocation when dumpping
449 	 *	    statefile.
450 	 * Theoretically on Sun4u only the kernel data nucleus, kvalloc and
451 	 * kvseg segments can be contaminated should memory allocations happen
452 	 * during sddump, which is not supposed to happen after the system
453 	 * is quiesced. Let's call the kernel pages that tend to be affected
454 	 * 'sensitive kpages' here. To avoid saving inconsistent pages, we
455 	 * will allocate some storage space to save the clean sensitive pages
456 	 * aside before statefile dumping takes place. Since there may not be
457 	 * much memory left at this stage, the sensitive pages will be
458 	 * compressed before they are saved into the storage area.
459 	 */
460 	if (error = i_cpr_save_sensitive_kpages()) {
461 		CPR_DEBUG(CPR_DEBUG7,
462 		    "cpr_dump: save_sensitive_kpages failed!\n");
463 		return (error);
464 	}
465 
466 	/*
467 	 * since all cpr allocations are done (space for sensitive kpages,
468 	 * bitmaps, cpr_buf), kas is stable, and now we can accurately
469 	 * count regular and sensitive kpages.
470 	 */
471 	if (error = cpr_write_header(vp)) {
472 		CPR_DEBUG(CPR_DEBUG7,
473 		    "cpr_dump: cpr_write_header() failed!\n");
474 		return (error);
475 	}
476 
477 	if (error = i_cpr_write_machdep(vp))
478 		return (error);
479 
480 	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, NULL, NULL))
481 		return (error);
482 
483 	if (error = cpr_write_bitmap(vp))
484 		return (error);
485 
486 	if (error = cpr_write_statefile(vp)) {
487 		CPR_DEBUG(CPR_DEBUG7,
488 		    "cpr_dump: cpr_write_statefile() failed!\n");
489 		return (error);
490 	}
491 
492 	if (error = cpr_write_terminator(vp))
493 		return (error);
494 
495 	if (error = cpr_flush_write(vp))
496 		return (error);
497 
498 	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, &cpr_file_bn, vp))
499 		return (error);
500 #endif
501 
502 	return (0);
503 }
504 
505 
506 #if defined(__sparc)
507 /*
508  * cpr_xwalk() is called many 100x with a range within kvseg or kvseg_reloc;
509  * a page-count from each range is accumulated at arg->pages.
510  */
511 static void
512 cpr_xwalk(void *arg, void *base, size_t size)
513 {
514 	struct cpr_walkinfo *cwip = arg;
515 
516 	cwip->pages += cpr_count_pages(base, size,
517 	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
518 	cwip->size += size;
519 	cwip->ranges++;
520 }
521 
522 /*
523  * cpr_walk() is called many 100x with a range within kvseg or kvseg_reloc;
524  * a page-count from each range is accumulated at arg->pages.
525  */
526 static void
527 cpr_walk(void *arg, void *base, size_t size)
528 {
529 	caddr_t addr = base;
530 	caddr_t addr_end = addr + size;
531 
532 	/*
533 	 * If we are about to start walking the range of addresses we
534 	 * carved out of the kernel heap for the large page heap walk
535 	 * heap_lp_arena to find what segments are actually populated
536 	 */
537 	if (SEGKMEM_USE_LARGEPAGES &&
538 	    addr == heap_lp_base && addr_end == heap_lp_end &&
539 	    vmem_size(heap_lp_arena, VMEM_ALLOC) < size) {
540 		vmem_walk(heap_lp_arena, VMEM_ALLOC, cpr_xwalk, arg);
541 	} else {
542 		cpr_xwalk(arg, base, size);
543 	}
544 }
545 
546 
547 /*
548  * faster scan of kvseg using vmem_walk() to visit
549  * allocated ranges.
550  */
551 pgcnt_t
552 cpr_scan_kvseg(int mapflag, bitfunc_t bitfunc, struct seg *seg)
553 {
554 	struct cpr_walkinfo cwinfo;
555 
556 	bzero(&cwinfo, sizeof (cwinfo));
557 	cwinfo.mapflag = mapflag;
558 	cwinfo.bitfunc = bitfunc;
559 
560 	vmem_walk(heap_arena, VMEM_ALLOC, cpr_walk, &cwinfo);
561 
562 	if (cpr_debug & CPR_DEBUG7) {
563 		prom_printf("walked %d sub-ranges, total pages %ld\n",
564 		    cwinfo.ranges, mmu_btop(cwinfo.size));
565 		cpr_show_range(seg->s_base, seg->s_size,
566 		    mapflag, bitfunc, cwinfo.pages);
567 	}
568 
569 	return (cwinfo.pages);
570 }
571 
572 
573 /*
574  * cpr_walk_kpm() is called for every used area within the large
575  * segkpm virtual address window. A page-count is accumulated at
576  * arg->pages.
577  */
578 static void
579 cpr_walk_kpm(void *arg, void *base, size_t size)
580 {
581 	struct cpr_walkinfo *cwip = arg;
582 
583 	cwip->pages += cpr_count_pages(base, size,
584 	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
585 	cwip->size += size;
586 	cwip->ranges++;
587 }
588 
589 
590 /*
591  * faster scan of segkpm using hat_kpm_walk() to visit only used ranges.
592  */
593 /*ARGSUSED*/
594 static pgcnt_t
595 cpr_scan_segkpm(int mapflag, bitfunc_t bitfunc, struct seg *seg)
596 {
597 	struct cpr_walkinfo cwinfo;
598 
599 	if (kpm_enable == 0)
600 		return (0);
601 
602 	bzero(&cwinfo, sizeof (cwinfo));
603 	cwinfo.mapflag = mapflag;
604 	cwinfo.bitfunc = bitfunc;
605 	hat_kpm_walk(cpr_walk_kpm, &cwinfo);
606 
607 	if (cpr_debug & CPR_DEBUG7) {
608 		prom_printf("walked %d sub-ranges, total pages %ld\n",
609 		    cwinfo.ranges, mmu_btop(cwinfo.size));
610 		cpr_show_range(segkpm->s_base, segkpm->s_size,
611 		    mapflag, bitfunc, cwinfo.pages);
612 	}
613 
614 	return (cwinfo.pages);
615 }
616 
617 
/*
 * Sparsely filled kernel segments are registered in kseg_table for
 * easier lookup. See also block comment for cpr_count_seg_pages.
 *
 * Each entry pairs a segment with the scan function that
 * cpr_count_seg_pages() calls for it instead of cpr_count_pages().
 * st_addrtype tells cpr_sparse_seg_check() how to interpret st_seg:
 * as the address of the struct seg itself, or as the address of a
 * pointer that must be dereferenced to reach the struct seg.
 */

#define	KSEG_SEG_ADDR	0	/* address of struct seg */
#define	KSEG_PTR_ADDR	1	/* address of pointer to struct seg */

typedef struct {
	struct seg **st_seg;		/* segment pointer or segment address */
	pgcnt_t	(*st_fcn)(int, bitfunc_t, struct seg *); /* function to call */
	int	st_addrtype;		/* address type in st_seg */
} ksegtbl_entry_t;

/* the table is terminated by an entry whose st_seg is NULL */
ksegtbl_entry_t kseg_table[] = {
	{(struct seg **)&kvseg,		cpr_scan_kvseg,		KSEG_SEG_ADDR},
	{&segkpm,			cpr_scan_segkpm,	KSEG_PTR_ADDR},
	{NULL,				0,			0}
};
637 
638 
639 /*
640  * Compare seg with each entry in kseg_table; when there is a match
641  * return the entry pointer, otherwise return NULL.
642  */
643 static ksegtbl_entry_t *
644 cpr_sparse_seg_check(struct seg *seg)
645 {
646 	ksegtbl_entry_t *ste = &kseg_table[0];
647 	struct seg *tseg;
648 
649 	for (; ste->st_seg; ste++) {
650 		tseg = (ste->st_addrtype == KSEG_PTR_ADDR) ?
651 		    *ste->st_seg : (struct seg *)ste->st_seg;
652 
653 		if (seg == tseg)
654 			return (ste);
655 	}
656 
657 	return ((ksegtbl_entry_t *)NULL);
658 }
659 
660 
661 /*
662  * Count pages within each kernel segment; call cpr_sparse_seg_check()
663  * to find out whether a sparsely filled segment needs special
664  * treatment (e.g. kvseg).
665  * Todo: A "SEGOP_CPR" like SEGOP_DUMP should be introduced, the cpr
666  *       module shouldn't need to know segment details like if it is
667  *       sparsely filled or not (makes kseg_table obsolete).
668  */
669 pgcnt_t
670 cpr_count_seg_pages(int mapflag, bitfunc_t bitfunc)
671 {
672 	struct seg *segp;
673 	pgcnt_t pages;
674 	ksegtbl_entry_t *ste;
675 
676 	pages = 0;
677 	for (segp = AS_SEGFIRST(&kas); segp; segp = AS_SEGNEXT(&kas, segp)) {
678 		if (ste = cpr_sparse_seg_check(segp)) {
679 			pages += (ste->st_fcn)(mapflag, bitfunc, segp);
680 		} else {
681 			pages += cpr_count_pages(segp->s_base,
682 			    segp->s_size, mapflag, bitfunc, DBG_SHOWRANGE);
683 		}
684 	}
685 
686 	return (pages);
687 }
688 
689 
690 /*
691  * count kernel pages within kas and any special ranges
692  */
693 pgcnt_t
694 cpr_count_kpages(int mapflag, bitfunc_t bitfunc)
695 {
696 	pgcnt_t kas_cnt;
697 
698 	/*
699 	 * Some pages need to be taken care of differently.
700 	 * eg: panicbuf pages of sun4m are not in kas but they need
701 	 * to be saved.  On sun4u, the physical pages of panicbuf are
702 	 * allocated via prom_retain().
703 	 */
704 	kas_cnt = i_cpr_count_special_kpages(mapflag, bitfunc);
705 	kas_cnt += cpr_count_seg_pages(mapflag, bitfunc);
706 
707 	CPR_DEBUG(CPR_DEBUG9, "cpr_count_kpages: kas_cnt=%ld\n", kas_cnt);
708 	CPR_DEBUG(CPR_DEBUG7, "\ncpr_count_kpages: %ld pages, 0x%lx bytes\n",
709 	    kas_cnt, mmu_ptob(kas_cnt));
710 
711 	return (kas_cnt);
712 }
713 
714 
715 /*
716  * Set a bit corresponding to the arg phys page number;
717  * returns 0 when the ppn is valid and the corresponding
718  * map bit was clear, otherwise returns 1.
719  */
720 int
721 cpr_setbit(pfn_t ppn, int mapflag)
722 {
723 	char *bitmap;
724 	cbd_t *dp;
725 	pfn_t rel;
726 	int clr;
727 
728 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
729 		if (PPN_IN_RANGE(ppn, dp)) {
730 			bitmap = DESC_TO_MAP(dp, mapflag);
731 			rel = ppn - dp->cbd_spfn;
732 			if ((clr = isclr(bitmap, rel)) != 0)
733 				setbit(bitmap, rel);
734 			return (clr == 0);
735 		}
736 	}
737 
738 	return (1);
739 }
740 
741 
742 /*
743  * Clear a bit corresponding to the arg phys page number.
744  */
745 int
746 cpr_clrbit(pfn_t ppn, int mapflag)
747 {
748 	char *bitmap;
749 	cbd_t *dp;
750 	pfn_t rel;
751 	int set;
752 
753 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
754 		if (PPN_IN_RANGE(ppn, dp)) {
755 			bitmap = DESC_TO_MAP(dp, mapflag);
756 			rel = ppn - dp->cbd_spfn;
757 			if ((set = isset(bitmap, rel)) != 0)
758 				clrbit(bitmap, rel);
759 			return (set == 0);
760 		}
761 	}
762 
763 	return (1);
764 }
765 
766 
767 /* ARGSUSED */
768 int
769 cpr_nobit(pfn_t ppn, int mapflag)
770 {
771 	return (0);
772 }
773 
774 
775 /*
776  * Lookup a bit corresponding to the arg phys page number.
777  */
778 int
779 cpr_isset(pfn_t ppn, int mapflag)
780 {
781 	char *bitmap;
782 	cbd_t *dp;
783 	pfn_t rel;
784 
785 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
786 		if (PPN_IN_RANGE(ppn, dp)) {
787 			bitmap = DESC_TO_MAP(dp, mapflag);
788 			rel = ppn - dp->cbd_spfn;
789 			return (isset(bitmap, rel));
790 		}
791 	}
792 
793 	return (0);
794 }
795 
796 
797 /*
798  * Go thru all pages and pick up any page not caught during the invalidation
799  * stage. This is also used to save pages with cow lock or phys page lock held
800  * (none zero p_lckcnt or p_cowcnt)
801  */
802 static	int
803 cpr_count_upages(int mapflag, bitfunc_t bitfunc)
804 {
805 	page_t *pp, *page0;
806 	pgcnt_t dcnt = 0, tcnt = 0;
807 	pfn_t pfn;
808 
809 	page0 = pp = page_first();
810 
811 	do {
812 		if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
813 		    PP_ISFREE(pp) && PP_ISAGED(pp))
814 			continue;
815 
816 		pfn = page_pptonum(pp);
817 		if (pf_is_memory(pfn)) {
818 			tcnt++;
819 			if ((*bitfunc)(pfn, mapflag) == 0)
820 				dcnt++; /* dirty count */
821 		}
822 	} while ((pp = page_next(pp)) != page0);
823 
824 	STAT->cs_upage2statef = dcnt;
825 	CPR_DEBUG(CPR_DEBUG9, "cpr_count_upages: dirty=%ld total=%ld\n",
826 	    dcnt, tcnt);
827 	CPR_DEBUG(CPR_DEBUG7, "cpr_count_upages: %ld pages, 0x%lx bytes\n",
828 	    dcnt, mmu_ptob(dcnt));
829 	page0 = NULL; /* for Lint */
830 	return (dcnt);
831 }
832 
833 
834 /*
835  * try compressing pages based on cflag,
836  * and for DEBUG kernels, verify uncompressed data checksum;
837  *
838  * this routine replaces common code from
839  * i_cpr_compress_and_save() and cpr_compress_and_write()
840  */
841 char *
842 cpr_compress_pages(cpd_t *dp, pgcnt_t pages, int cflag)
843 {
844 	size_t nbytes, clen, len;
845 	uint32_t test_sum;
846 	char *datap;
847 
848 	nbytes = mmu_ptob(pages);
849 
850 	/*
851 	 * set length to the original uncompressed data size;
852 	 * always init cpd_flag to zero
853 	 */
854 	dp->cpd_length = nbytes;
855 	dp->cpd_flag = 0;
856 
857 #ifdef	DEBUG
858 	/*
859 	 * Make a copy of the uncompressed data so we can checksum it.
860 	 * Compress that copy so the checksum works at the other end
861 	 */
862 	cprbcopy(CPR->c_mapping_area, cpr_pagecopy, nbytes);
863 	dp->cpd_usum = checksum32(cpr_pagecopy, nbytes);
864 	dp->cpd_flag |= CPD_USUM;
865 	datap = cpr_pagecopy;
866 #else
867 	datap = CPR->c_mapping_area;
868 	dp->cpd_usum = 0;
869 #endif
870 
871 	/*
872 	 * try compressing the raw data to cpr_pagedata;
873 	 * if there was a size reduction: record the new length,
874 	 * flag the compression, and point to the compressed data.
875 	 */
876 	dp->cpd_csum = 0;
877 	if (cflag) {
878 		clen = compress(datap, cpr_pagedata, nbytes);
879 		if (clen < nbytes) {
880 			dp->cpd_flag |= CPD_COMPRESS;
881 			dp->cpd_length = clen;
882 			datap = cpr_pagedata;
883 #ifdef	DEBUG
884 			dp->cpd_csum = checksum32(datap, clen);
885 			dp->cpd_flag |= CPD_CSUM;
886 
887 			/*
888 			 * decompress the data back to a scratch area
889 			 * and compare the new checksum with the original
890 			 * checksum to verify the compression.
891 			 */
892 			bzero(cpr_pagecopy, sizeof (cpr_pagecopy));
893 			len = decompress(datap, cpr_pagecopy,
894 			    clen, sizeof (cpr_pagecopy));
895 			test_sum = checksum32(cpr_pagecopy, len);
896 			ASSERT(test_sum == dp->cpd_usum);
897 #endif
898 		}
899 	}
900 
901 	return (datap);
902 }
903 
904 
905 /*
906  * 1. Prepare cpr page descriptor and write it to file
907  * 2. Compress page data and write it out
908  */
909 static int
910 cpr_compress_and_write(vnode_t *vp, uint_t va, pfn_t pfn, pgcnt_t npg)
911 {
912 	int error = 0;
913 	char *datap;
914 	cpd_t cpd;	/* cpr page descriptor */
915 	extern void i_cpr_mapin(caddr_t, uint_t, pfn_t);
916 	extern void i_cpr_mapout(caddr_t, uint_t);
917 
918 	i_cpr_mapin(CPR->c_mapping_area, npg, pfn);
919 
920 	CPR_DEBUG(CPR_DEBUG3, "mapped-in %ld pages, vaddr 0x%p, pfn 0x%lx\n",
921 	    npg, (void *)CPR->c_mapping_area, pfn);
922 
923 	/*
924 	 * Fill cpr page descriptor.
925 	 */
926 	cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC;
927 	cpd.cpd_pfn = pfn;
928 	cpd.cpd_pages = npg;
929 
930 	STAT->cs_dumped_statefsz += mmu_ptob(npg);
931 
932 	datap = cpr_compress_pages(&cpd, npg, CPR->c_flags & C_COMPRESSING);
933 
934 	/* Write cpr page descriptor */
935 	error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd_t));
936 
937 	/* Write compressed page data */
938 	error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length);
939 
940 	/*
941 	 * Unmap the pages for tlb and vac flushing
942 	 */
943 	i_cpr_mapout(CPR->c_mapping_area, npg);
944 
945 	if (error) {
946 		CPR_DEBUG(CPR_DEBUG1,
947 		    "cpr_compress_and_write: vp 0x%p va 0x%x ", (void *)vp, va);
948 		CPR_DEBUG(CPR_DEBUG1, "pfn 0x%lx blk %d err %d\n",
949 		    pfn, cpr_file_bn, error);
950 	} else {
951 		cpr_regular_pgs_dumped += npg;
952 	}
953 
954 	return (error);
955 }
956 
957 
958 int
959 cpr_write(vnode_t *vp, caddr_t buffer, size_t size)
960 {
961 	caddr_t	fromp = buffer;
962 	size_t bytes, wbytes;
963 	int error;
964 
965 	if (cpr_dev_space == 0) {
966 		if (vp->v_type == VBLK) {
967 			cpr_dev_space = cpr_get_devsize(vp->v_rdev);
968 			ASSERT(cpr_dev_space);
969 		} else
970 			cpr_dev_space = 1;	/* not used in this case */
971 	}
972 
973 	/*
974 	 * break the write into multiple part if request is large,
975 	 * calculate count up to buf page boundary, then write it out.
976 	 * repeat until done.
977 	 */
978 	while (size) {
979 		bytes = MIN(size, cpr_buf_end - cpr_wptr);
980 		cprbcopy(fromp, cpr_wptr, bytes);
981 		cpr_wptr += bytes;
982 		fromp += bytes;
983 		size -= bytes;
984 		if (cpr_wptr < cpr_buf_end)
985 			return (0);	/* buffer not full yet */
986 		ASSERT(cpr_wptr == cpr_buf_end);
987 
988 		wbytes = dbtob(cpr_file_bn + cpr_buf_blocks);
989 		if (vp->v_type == VBLK) {
990 			if (wbytes > cpr_dev_space)
991 				return (ENOSPC);
992 		} else {
993 			if (wbytes > VTOI(vp)->i_size)
994 				return (ENOSPC);
995 		}
996 
997 		CPR_DEBUG(CPR_DEBUG3,
998 		    "cpr_write: frmp=%p wptr=%p cnt=%lx...",
999 		    (void *)fromp, (void *)cpr_wptr, bytes);
1000 		/*
1001 		 * cross check, this should not happen!
1002 		 */
1003 		if (cpr_disk_writes_ok == 0) {
1004 			prom_printf("cpr_write: disk write too early!\n");
1005 			return (EINVAL);
1006 		}
1007 
1008 		do_polled_io = 1;
1009 		error = VOP_DUMP(vp, cpr_buf, cpr_file_bn, cpr_buf_blocks,
1010 		    NULL);
1011 		do_polled_io = 0;
1012 		CPR_DEBUG(CPR_DEBUG3, "done\n");
1013 
1014 		STAT->cs_real_statefsz += cpr_buf_size;
1015 
1016 		if (error) {
1017 			cpr_err(CE_WARN, "cpr_write error %d", error);
1018 			return (error);
1019 		}
1020 		cpr_file_bn += cpr_buf_blocks;	/* Increment block count */
1021 		cpr_wptr = cpr_buf;		/* back to top of buffer */
1022 	}
1023 	return (0);
1024 }
1025 
1026 
1027 int
1028 cpr_flush_write(vnode_t *vp)
1029 {
1030 	int	nblk;
1031 	int	error;
1032 
1033 	/*
1034 	 * Calculate remaining blocks in buffer, rounded up to nearest
1035 	 * disk block
1036 	 */
1037 	nblk = btod(cpr_wptr - cpr_buf);
1038 
1039 	do_polled_io = 1;
1040 	error = VOP_DUMP(vp, (caddr_t)cpr_buf, cpr_file_bn, nblk, NULL);
1041 	do_polled_io = 0;
1042 
1043 	cpr_file_bn += nblk;
1044 	if (error)
1045 		CPR_DEBUG(CPR_DEBUG2, "cpr_flush_write: error (%d)\n",
1046 		    error);
1047 	return (error);
1048 }
1049 
1050 void
1051 cpr_clear_bitmaps(void)
1052 {
1053 	cbd_t *dp;
1054 
1055 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
1056 		bzero((void *)dp->cbd_reg_bitmap,
1057 		    (size_t)dp->cbd_size * 2);
1058 	}
1059 	CPR_DEBUG(CPR_DEBUG7, "\ncleared reg and vlt bitmaps\n");
1060 }
1061 
1062 int
1063 cpr_contig_pages(vnode_t *vp, int flag)
1064 {
1065 	int chunks = 0, error = 0;
1066 	pgcnt_t i, j, totbit;
1067 	pfn_t spfn;
1068 	cbd_t *dp;
1069 	uint_t	spin_cnt = 0;
1070 	extern	int i_cpr_compress_and_save();
1071 
1072 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
1073 		spfn = dp->cbd_spfn;
1074 		totbit = BTOb(dp->cbd_size);
1075 		i = 0; /* Beginning of bitmap */
1076 		j = 0;
1077 		while (i < totbit) {
1078 			while ((j < CPR_MAXCONTIG) && ((j + i) < totbit)) {
1079 				if (isset((char *)dp->cbd_reg_bitmap, j+i))
1080 					j++;
1081 				else /* not contiguous anymore */
1082 					break;
1083 			}
1084 
1085 			if (j) {
1086 				chunks++;
1087 				if (flag == SAVE_TO_STORAGE) {
1088 					error = i_cpr_compress_and_save(
1089 					    chunks, spfn + i, j);
1090 					if (error)
1091 						return (error);
1092 				} else if (flag == WRITE_TO_STATEFILE) {
1093 					error = cpr_compress_and_write(vp, 0,
1094 					    spfn + i, j);
1095 					if (error)
1096 						return (error);
1097 					else {
1098 						spin_cnt++;
1099 						if ((spin_cnt & 0x5F) == 1)
1100 							cpr_spinning_bar();
1101 					}
1102 				}
1103 			}
1104 
1105 			i += j;
1106 			if (j != CPR_MAXCONTIG) {
1107 				/* Stopped on a non-tagged page */
1108 				i++;
1109 			}
1110 
1111 			j = 0;
1112 		}
1113 	}
1114 
1115 	if (flag == STORAGE_DESC_ALLOC)
1116 		return (chunks);
1117 	else
1118 		return (0);
1119 }
1120 
1121 
1122 void
1123 cpr_show_range(caddr_t vaddr, size_t size,
1124     int mapflag, bitfunc_t bitfunc, pgcnt_t count)
1125 {
1126 	char *action, *bname;
1127 
1128 	bname = (mapflag == REGULAR_BITMAP) ? "regular" : "volatile";
1129 	if (bitfunc == cpr_setbit)
1130 		action = "tag";
1131 	else if (bitfunc == cpr_clrbit)
1132 		action = "untag";
1133 	else
1134 		action = "none";
1135 	prom_printf("range (0x%p, 0x%p), %s bitmap, %s %ld\n",
1136 	    (void *)vaddr, (void *)(vaddr + size), bname, action, count);
1137 }
1138 
1139 
1140 pgcnt_t
1141 cpr_count_pages(caddr_t sva, size_t size,
1142     int mapflag, bitfunc_t bitfunc, int showrange)
1143 {
1144 	caddr_t	va, eva;
1145 	pfn_t pfn;
1146 	pgcnt_t count = 0;
1147 
1148 	eva = sva + PAGE_ROUNDUP(size);
1149 	for (va = sva; va < eva; va += MMU_PAGESIZE) {
1150 		pfn = va_to_pfn(va);
1151 		if (pfn != PFN_INVALID && pf_is_memory(pfn)) {
1152 			if ((*bitfunc)(pfn, mapflag) == 0)
1153 				count++;
1154 		}
1155 	}
1156 
1157 	if ((cpr_debug & CPR_DEBUG7) && showrange == DBG_SHOWRANGE)
1158 		cpr_show_range(sva, size, mapflag, bitfunc, count);
1159 
1160 	return (count);
1161 }
1162 
1163 
1164 pgcnt_t
1165 cpr_count_volatile_pages(int mapflag, bitfunc_t bitfunc)
1166 {
1167 	pgcnt_t count = 0;
1168 
1169 	if (cpr_buf) {
1170 		count += cpr_count_pages(cpr_buf, cpr_buf_size,
1171 		    mapflag, bitfunc, DBG_SHOWRANGE);
1172 	}
1173 	if (cpr_pagedata) {
1174 		count += cpr_count_pages(cpr_pagedata, cpr_pagedata_size,
1175 		    mapflag, bitfunc, DBG_SHOWRANGE);
1176 	}
1177 	count += i_cpr_count_storage_pages(mapflag, bitfunc);
1178 
1179 	CPR_DEBUG(CPR_DEBUG7, "cpr_count_vpages: %ld pages, 0x%lx bytes\n",
1180 	    count, mmu_ptob(count));
1181 	return (count);
1182 }
1183 
1184 
1185 static int
1186 cpr_dump_regular_pages(vnode_t *vp)
1187 {
1188 	int error;
1189 
1190 	cpr_regular_pgs_dumped = 0;
1191 	error = cpr_contig_pages(vp, WRITE_TO_STATEFILE);
1192 	if (!error)
1193 		CPR_DEBUG(CPR_DEBUG7, "cpr_dump_regular_pages() done.\n");
1194 	return (error);
1195 }
1196 #endif
1197