xref: /titanic_41/usr/src/uts/common/cpr/cpr_dump.c (revision 02e56f3f1bfc8d9977bafb8cb5202f576dcded27)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Fill in and write out the cpr state file
31  *	1. Allocate and write headers, ELF and cpr dump header
32  *	2. Allocate bitmaps according to phys_install
33  *	3. Tag kernel pages into corresponding bitmap
34  *	4. Write bitmaps to state file
35  *	5. Write actual physical page data to state file
36  */
37 
38 #include <sys/types.h>
39 #include <sys/systm.h>
40 #include <sys/vm.h>
41 #include <sys/memlist.h>
42 #include <sys/kmem.h>
43 #include <sys/vnode.h>
44 #include <sys/fs/ufs_inode.h>
45 #include <sys/errno.h>
46 #include <sys/cmn_err.h>
47 #include <sys/debug.h>
48 #include <vm/page.h>
49 #include <vm/seg.h>
50 #include <vm/seg_kmem.h>
51 #include <vm/seg_kpm.h>
52 #include <vm/hat.h>
53 #include <sys/cpr.h>
54 #include <sys/conf.h>
55 #include <sys/ddi.h>
56 #include <sys/panic.h>
57 #include <sys/thread.h>
58 
/* Local defines and variables */
#define	BTOb(bytes)	((bytes) << 3)		/* Bytes to bits, log2(NBBY) */
#define	bTOB(bits)	((bits) >> 3)		/* bits to Bytes, log2(NBBY) */

/*
 * Page accounting for the sanity check in cpr_write_statefile():
 * cpr_pages_tobe_dumped is computed by cpr_write_header();
 * cpr_regular_pgs_dumped is reset by cpr_dump_regular_pages() and
 * advanced by cpr_compress_and_write().
 */
static uint_t cpr_pages_tobe_dumped;
static uint_t cpr_regular_pgs_dumped;

static int cpr_dump_regular_pages(vnode_t *);
static int cpr_count_upages(int, bitfunc_t);
static int cpr_compress_and_write(vnode_t *, uint_t, pfn_t, pgcnt_t);
int cpr_flush_write(vnode_t *);

int cpr_contig_pages(vnode_t *, int);

void cpr_clear_bitmaps();

extern size_t cpr_get_devsize(dev_t);
extern int i_cpr_dump_setup(vnode_t *);
extern int i_cpr_blockzero(char *, char **, int *, vnode_t *);
extern int cpr_test_mode;

/* terminator record; filled in and written by cpr_write_terminator() */
ctrm_t cpr_term;

/* statefile write buffer; cpr_buf_end points one byte past its end */
char *cpr_buf, *cpr_buf_end;
int cpr_buf_blocks;		/* size of cpr_buf in blocks */
size_t cpr_buf_size;		/* size of cpr_buf in bytes */
size_t cpr_bitmap_size;		/* bytes; set by cpr_set_bitmap_size() */
int cpr_nbitmaps;		/* count of bitmap descriptors written */

char *cpr_pagedata;		/* page buffer for compression / tmp copy */
size_t cpr_pagedata_size;	/* page buffer size in bytes */

static char *cpr_wptr;		/* keep track of where to write to next */
static int cpr_file_bn;		/* cpr state-file block offset */
/*
 * Cross-check flag: cleared by cpr_dump() and set by
 * cpr_write_statefile(); cpr_write() refuses to call VOP_DUMP()
 * while it is zero.
 */
static int cpr_disk_writes_ok;
/* size limit cached by cpr_write() for VBLK targets; 0 means not yet set */
static size_t cpr_dev_space = 0;

/*
 * Scratch area: holds the merged bitmaps in cpr_write_bitmap() and,
 * on DEBUG kernels, the page copy used by cpr_compress_pages().
 */
char cpr_pagecopy[CPR_MAXCONTIG * MMU_PAGESIZE];
97 
98 /*
99  * On some platforms bcopy may modify the thread structure
100  * during bcopy (eg, to prevent cpu migration).  If the
101  * range we are currently writing out includes our own
102  * thread structure then it will be snapshotted by bcopy
103  * including those modified members - and the updates made
104  * on exit from bcopy will no longer be seen when we later
105  * restore the mid-bcopy kthread_t.  So if the range we
106  * need to copy overlaps with our thread structure we will
107  * use a simple byte copy.
108  */
109 void
110 cprbcopy(void *from, void *to, size_t bytes)
111 {
112 	extern int curthreadremapped;
113 	caddr_t kthrend;
114 
115 	kthrend = (caddr_t)curthread + sizeof (kthread_t) - 1;
116 	if (curthreadremapped || (kthrend >= (caddr_t)from &&
117 	    kthrend < (caddr_t)from + bytes + sizeof (kthread_t) - 1)) {
118 		caddr_t src = from, dst = to;
119 
120 		while (bytes-- > 0)
121 			*dst++ = *src++;
122 	} else {
123 		bcopy(from, to, bytes);
124 	}
125 }
126 
127 /*
128  * Allocate pages for buffers used in writing out the statefile
129  */
130 static int
131 cpr_alloc_bufs(void)
132 {
133 	char *allocerr = "Unable to allocate memory for cpr buffer";
134 	size_t size;
135 
136 	/*
137 	 * set the cpr write buffer size to at least the historic
138 	 * size (128k) or large enough to store the both the early
139 	 * set of statefile structures (well under 0x800) plus the
140 	 * bitmaps, and roundup to the next pagesize.
141 	 */
142 	size = PAGE_ROUNDUP(dbtob(4) + cpr_bitmap_size);
143 	cpr_buf_size = MAX(size, CPRBUFSZ);
144 	cpr_buf_blocks = btodb(cpr_buf_size);
145 	cpr_buf = kmem_alloc(cpr_buf_size, KM_NOSLEEP);
146 	if (cpr_buf == NULL) {
147 		cpr_err(CE_WARN, allocerr);
148 		return (ENOMEM);
149 	}
150 	cpr_buf_end = cpr_buf + cpr_buf_size;
151 
152 	cpr_pagedata_size = mmu_ptob(CPR_MAXCONTIG + 1);
153 	cpr_pagedata = kmem_alloc(cpr_pagedata_size, KM_NOSLEEP);
154 	if (cpr_pagedata == NULL) {
155 		kmem_free(cpr_buf, cpr_buf_size);
156 		cpr_buf = NULL;
157 		cpr_err(CE_WARN, allocerr);
158 		return (ENOMEM);
159 	}
160 
161 	return (0);
162 }
163 
164 
165 /*
166  * Set bitmap size in bytes based on phys_install.
167  */
168 void
169 cpr_set_bitmap_size(void)
170 {
171 	struct memlist *pmem;
172 	size_t size = 0;
173 
174 	memlist_read_lock();
175 	for (pmem = phys_install; pmem; pmem = pmem->next)
176 		size += pmem->size;
177 	memlist_read_unlock();
178 	cpr_bitmap_size = BITMAP_BYTES(size);
179 }
180 
181 
182 /*
183  * CPR dump header contains the following information:
184  *	1. header magic -- unique to cpr state file
185  *	2. kernel return pc & ppn for resume
186  *	3. current thread info
187  *	4. debug level and test mode
188  *	5. number of bitmaps allocated
189  *	6. number of page records
190  */
191 static int
192 cpr_write_header(vnode_t *vp)
193 {
194 	extern ushort_t cpr_mach_type;
195 	struct cpr_dump_desc cdump;
196 	pgcnt_t bitmap_pages;
197 	pgcnt_t kpages, vpages, upages;
198 
199 	cdump.cdd_magic = (uint_t)CPR_DUMP_MAGIC;
200 	cdump.cdd_version = CPR_VERSION;
201 	cdump.cdd_machine = cpr_mach_type;
202 	cdump.cdd_debug = cpr_debug;
203 	cdump.cdd_test_mode = cpr_test_mode;
204 	cdump.cdd_bitmaprec = cpr_nbitmaps;
205 
206 	cpr_clear_bitmaps();
207 
208 	/*
209 	 * Remember how many pages we plan to save to statefile.
210 	 * This information will be used for sanity checks.
211 	 * Untag those pages that will not be saved to statefile.
212 	 */
213 	kpages = cpr_count_kpages(REGULAR_BITMAP, cpr_setbit);
214 	vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);
215 	upages = cpr_count_upages(REGULAR_BITMAP, cpr_setbit);
216 	cdump.cdd_dumppgsize = kpages - vpages + upages;
217 	cpr_pages_tobe_dumped = cdump.cdd_dumppgsize;
218 	DEBUG7(errp(
219 	    "\ncpr_write_header: kpages %ld - vpages %ld + upages %ld = %d\n",
220 	    kpages, vpages, upages, cdump.cdd_dumppgsize));
221 
222 	/*
223 	 * Some pages contain volatile data (cpr_buf and storage area for
224 	 * sensitive kpages), which are no longer needed after the statefile
225 	 * is dumped to disk.  We have already untagged them from regular
226 	 * bitmaps.  Now tag them into the volatile bitmaps.  The pages in
227 	 * volatile bitmaps will be claimed during resume, and the resumed
228 	 * kernel will free them.
229 	 */
230 	(void) cpr_count_volatile_pages(VOLATILE_BITMAP, cpr_setbit);
231 
232 	bitmap_pages = mmu_btopr(cpr_bitmap_size);
233 
234 	/*
235 	 * Export accurate statefile size for statefile allocation retry.
236 	 * statefile_size = all the headers + total pages +
237 	 * number of pages used by the bitmaps.
238 	 * Roundup will be done in the file allocation code.
239 	 */
240 	STAT->cs_nocomp_statefsz = sizeof (cdd_t) + sizeof (cmd_t) +
241 		(sizeof (cbd_t) * cdump.cdd_bitmaprec) +
242 		(sizeof (cpd_t) * cdump.cdd_dumppgsize) +
243 		mmu_ptob(cdump.cdd_dumppgsize + bitmap_pages);
244 
245 	/*
246 	 * If the estimated statefile is not big enough,
247 	 * go retry now to save un-necessary operations.
248 	 */
249 	if (!(CPR->c_flags & C_COMPRESSING) &&
250 		(STAT->cs_nocomp_statefsz > STAT->cs_est_statefsz)) {
251 		if (cpr_debug & (LEVEL1 | LEVEL7))
252 		    errp("cpr_write_header: STAT->cs_nocomp_statefsz > "
253 			"STAT->cs_est_statefsz\n");
254 		return (ENOSPC);
255 	}
256 
257 	/* now write cpr dump descriptor */
258 	return (cpr_write(vp, (caddr_t)&cdump, sizeof (cdd_t)));
259 }
260 
261 
262 /*
263  * CPR dump tail record contains the following information:
264  *	1. header magic -- unique to cpr state file
265  *	2. all misc info that needs to be passed to cprboot or resumed kernel
266  */
267 static int
268 cpr_write_terminator(vnode_t *vp)
269 {
270 	cpr_term.magic = (uint_t)CPR_TERM_MAGIC;
271 	cpr_term.va = (cpr_ptr)&cpr_term;
272 	cpr_term.pfn = (cpr_ext)va_to_pfn(&cpr_term);
273 
274 	/* count the last one (flush) */
275 	cpr_term.real_statef_size = STAT->cs_real_statefsz +
276 		btod(cpr_wptr - cpr_buf) * DEV_BSIZE;
277 
278 	DEBUG9(errp("cpr_dump: Real Statefile Size: %d\n",
279 		STAT->cs_real_statefsz));
280 
281 	cpr_tod_get(&cpr_term.tm_shutdown);
282 
283 	return (cpr_write(vp, (caddr_t)&cpr_term, sizeof (cpr_term)));
284 }
285 
286 /*
287  * Write bitmap descriptor array, followed by merged bitmaps.
288  */
289 static int
290 cpr_write_bitmap(vnode_t *vp)
291 {
292 	char *rmap, *vmap, *dst, *tail;
293 	size_t size, bytes;
294 	cbd_t *dp;
295 	int err;
296 
297 	dp = CPR->c_bmda;
298 	if (err = cpr_write(vp, (caddr_t)dp, cpr_nbitmaps * sizeof (*dp)))
299 		return (err);
300 
301 	/*
302 	 * merge regular and volatile bitmaps into tmp space
303 	 * and write to disk
304 	 */
305 	for (; dp->cbd_size; dp++) {
306 		rmap = (char *)dp->cbd_reg_bitmap;
307 		vmap = (char *)dp->cbd_vlt_bitmap;
308 		for (size = dp->cbd_size; size; size -= bytes) {
309 			bytes = min(size, sizeof (cpr_pagecopy));
310 			tail = &cpr_pagecopy[bytes];
311 			for (dst = cpr_pagecopy; dst < tail; dst++)
312 				*dst = *rmap++ | *vmap++;
313 			if (err = cpr_write(vp, cpr_pagecopy, bytes))
314 				break;
315 		}
316 	}
317 
318 	return (err);
319 }
320 
321 
322 static int
323 cpr_write_statefile(vnode_t *vp)
324 {
325 	uint_t error = 0;
326 	extern	int	i_cpr_check_pgs_dumped();
327 	void flush_windows(void);
328 	pgcnt_t spages;
329 	char *str;
330 
331 	flush_windows();
332 
333 	/*
334 	 * to get an accurate view of kas, we need to untag sensitive
335 	 * pages *before* dumping them because the disk driver makes
336 	 * allocations and changes kas along the way.  The remaining
337 	 * pages referenced in the bitmaps are dumped out later as
338 	 * regular kpages.
339 	 */
340 	str = "cpr_write_statefile:";
341 	spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_clrbit);
342 	DEBUG7(errp("%s untag %ld sens pages\n", str, spages));
343 
344 	/*
345 	 * now it's OK to call a driver that makes allocations
346 	 */
347 	cpr_disk_writes_ok = 1;
348 
349 	/*
350 	 * now write out the clean sensitive kpages
351 	 * according to the sensitive descriptors
352 	 */
353 	error = i_cpr_dump_sensitive_kpages(vp);
354 	if (error) {
355 		DEBUG7(errp("%s cpr_dump_sensitive_kpages() failed!\n", str));
356 		return (error);
357 	}
358 
359 	/*
360 	 * cpr_dump_regular_pages() counts cpr_regular_pgs_dumped
361 	 */
362 	error = cpr_dump_regular_pages(vp);
363 	if (error) {
364 		DEBUG7(errp("%s cpr_dump_regular_pages() failed!\n", str));
365 		return (error);
366 	}
367 
368 	/*
369 	 * sanity check to verify the right number of pages were dumped
370 	 */
371 	error = i_cpr_check_pgs_dumped(cpr_pages_tobe_dumped,
372 	    cpr_regular_pgs_dumped);
373 
374 	if (error) {
375 		errp("\n%s page count mismatch!\n", str);
376 #ifdef DEBUG
377 		if (cpr_test_mode)
378 			debug_enter(NULL);
379 #endif
380 	}
381 
382 	return (error);
383 }
384 
385 
386 /*
387  * creates the CPR state file, the following sections are
388  * written out in sequence:
389  *    - writes the cpr dump header
390  *    - writes the memory usage bitmaps
391  *    - writes the platform dependent info
392  *    - writes the remaining user pages
393  *    - writes the kernel pages
394  */
395 int
396 cpr_dump(vnode_t *vp)
397 {
398 	int error;
399 
400 	if (cpr_buf == NULL) {
401 		ASSERT(cpr_pagedata == NULL);
402 		if (error = cpr_alloc_bufs())
403 			return (error);
404 	}
405 	/* point to top of internal buffer */
406 	cpr_wptr = cpr_buf;
407 
408 	/* initialize global variables used by the write operation */
409 	cpr_file_bn = cpr_statefile_offset();
410 	cpr_dev_space = 0;
411 
412 	/* allocate bitmaps */
413 	if (CPR->c_bmda == NULL) {
414 		if (error = i_cpr_alloc_bitmaps()) {
415 			cpr_err(CE_WARN, "cannot allocate bitmaps");
416 			return (error);
417 		}
418 	}
419 
420 	if (error = i_cpr_prom_pages(CPR_PROM_SAVE))
421 		return (error);
422 
423 	if (error = i_cpr_dump_setup(vp))
424 		return (error);
425 
426 	/*
427 	 * set internal cross checking; we dont want to call
428 	 * a disk driver that makes allocations until after
429 	 * sensitive pages are saved
430 	 */
431 	cpr_disk_writes_ok = 0;
432 
433 	/*
434 	 * 1253112: heap corruption due to memory allocation when dumpping
435 	 *	    statefile.
436 	 * Theoretically on Sun4u only the kernel data nucleus, kvalloc and
437 	 * kvseg segments can be contaminated should memory allocations happen
438 	 * during sddump, which is not supposed to happen after the system
439 	 * is quiesced. Let's call the kernel pages that tend to be affected
440 	 * 'sensitive kpages' here. To avoid saving inconsistent pages, we
441 	 * will allocate some storage space to save the clean sensitive pages
442 	 * aside before statefile dumping takes place. Since there may not be
443 	 * much memory left at this stage, the sensitive pages will be
444 	 * compressed before they are saved into the storage area.
445 	 */
446 	if (error = i_cpr_save_sensitive_kpages()) {
447 		DEBUG7(errp("cpr_dump: save_sensitive_kpages failed!\n"));
448 		return (error);
449 	}
450 
451 	/*
452 	 * since all cpr allocations are done (space for sensitive kpages,
453 	 * bitmaps, cpr_buf), kas is stable, and now we can accurately
454 	 * count regular and sensitive kpages.
455 	 */
456 	if (error = cpr_write_header(vp)) {
457 		DEBUG7(errp("cpr_dump: cpr_write_header() failed!\n"));
458 		return (error);
459 	}
460 
461 	if (error = i_cpr_write_machdep(vp))
462 		return (error);
463 
464 	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, NULL, NULL))
465 		return (error);
466 
467 	if (error = cpr_write_bitmap(vp))
468 		return (error);
469 
470 	if (error = cpr_write_statefile(vp)) {
471 		DEBUG7(errp("cpr_dump: cpr_write_statefile() failed!\n"));
472 		return (error);
473 	}
474 
475 	if (error = cpr_write_terminator(vp))
476 		return (error);
477 
478 	if (error = cpr_flush_write(vp))
479 		return (error);
480 
481 	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, &cpr_file_bn, vp))
482 		return (error);
483 
484 	return (0);
485 }
486 
487 
488 /*
489  * cpr_xwalk() is called many 100x with a range within kvseg or kvseg_reloc;
490  * a page-count from each range is accumulated at arg->pages.
491  */
492 static void
493 cpr_xwalk(void *arg, void *base, size_t size)
494 {
495 	struct cpr_walkinfo *cwip = arg;
496 
497 	cwip->pages += cpr_count_pages(base, size,
498 	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
499 	cwip->size += size;
500 	cwip->ranges++;
501 }
502 
503 /*
504  * cpr_walk() is called many 100x with a range within kvseg or kvseg_reloc;
505  * a page-count from each range is accumulated at arg->pages.
506  */
507 static void
508 cpr_walk(void *arg, void *base, size_t size)
509 {
510 	caddr_t addr = base;
511 	caddr_t addr_end = addr + size;
512 
513 	/*
514 	 * If we are about to start walking the range of addresses we
515 	 * carved out of the kernel heap for the large page heap walk
516 	 * heap_lp_arena to find what segments are actually populated
517 	 */
518 	if (SEGKMEM_USE_LARGEPAGES &&
519 	    addr == heap_lp_base && addr_end == heap_lp_end &&
520 	    vmem_size(heap_lp_arena, VMEM_ALLOC) < size) {
521 		vmem_walk(heap_lp_arena, VMEM_ALLOC, cpr_xwalk, arg);
522 	} else {
523 		cpr_xwalk(arg, base, size);
524 	}
525 }
526 
527 
528 /*
529  * faster scan of kvseg using vmem_walk() to visit
530  * allocated ranges.
531  */
532 pgcnt_t
533 cpr_scan_kvseg(int mapflag, bitfunc_t bitfunc, struct seg *seg)
534 {
535 	struct cpr_walkinfo cwinfo;
536 
537 	bzero(&cwinfo, sizeof (cwinfo));
538 	cwinfo.mapflag = mapflag;
539 	cwinfo.bitfunc = bitfunc;
540 
541 	vmem_walk(heap_arena, VMEM_ALLOC, cpr_walk, &cwinfo);
542 
543 	if (cpr_debug & LEVEL7) {
544 		errp("walked %d sub-ranges, total pages %ld\n",
545 		    cwinfo.ranges, mmu_btop(cwinfo.size));
546 		cpr_show_range(seg->s_base, seg->s_size,
547 		    mapflag, bitfunc, cwinfo.pages);
548 	}
549 
550 	return (cwinfo.pages);
551 }
552 
553 
554 /*
555  * cpr_walk_kpm() is called for every used area within the large
556  * segkpm virtual address window. A page-count is accumulated at
557  * arg->pages.
558  */
559 static void
560 cpr_walk_kpm(void *arg, void *base, size_t size)
561 {
562 	struct cpr_walkinfo *cwip = arg;
563 
564 	cwip->pages += cpr_count_pages(base, size,
565 	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
566 	cwip->size += size;
567 	cwip->ranges++;
568 }
569 
570 
571 /*
572  * faster scan of segkpm using hat_kpm_walk() to visit only used ranges.
573  */
574 /*ARGSUSED*/
575 static pgcnt_t
576 cpr_scan_segkpm(int mapflag, bitfunc_t bitfunc, struct seg *seg)
577 {
578 	struct cpr_walkinfo cwinfo;
579 
580 	if (kpm_enable == 0)
581 		return (0);
582 
583 	bzero(&cwinfo, sizeof (cwinfo));
584 	cwinfo.mapflag = mapflag;
585 	cwinfo.bitfunc = bitfunc;
586 	hat_kpm_walk(cpr_walk_kpm, &cwinfo);
587 
588 	if (cpr_debug & LEVEL7) {
589 		errp("walked %d sub-ranges, total pages %ld\n",
590 		    cwinfo.ranges, mmu_btop(cwinfo.size));
591 		cpr_show_range(segkpm->s_base, segkpm->s_size,
592 		    mapflag, bitfunc, cwinfo.pages);
593 	}
594 
595 	return (cwinfo.pages);
596 }
597 
598 
/*
 * Sparsely filled kernel segments are registered in kseg_table for
 * easier lookup. See also block comment for cpr_count_seg_pages.
 *
 * Each entry names a segment and the scan function to call for it.
 * st_addrtype says how st_seg is to be read: for KSEG_SEG_ADDR the
 * stored value is itself the address of the struct seg (cast to fit
 * the field); for KSEG_PTR_ADDR it is the address of a pointer that
 * must be dereferenced to reach the struct seg.  The table ends with
 * an entry whose st_seg is NULL.
 */

#define	KSEG_SEG_ADDR	0	/* address of struct seg */
#define	KSEG_PTR_ADDR	1	/* address of pointer to struct seg */

typedef struct {
	struct seg **st_seg;		/* segment pointer or segment address */
	pgcnt_t	(*st_fcn)(int, bitfunc_t, struct seg *); /* function to call */
	int	st_addrtype;		/* address type in st_seg */
} ksegtbl_entry_t;

ksegtbl_entry_t kseg_table[] = {
	{(struct seg **)&kvseg,		cpr_scan_kvseg,		KSEG_SEG_ADDR},
	{&segkpm,			cpr_scan_segkpm,	KSEG_PTR_ADDR},
	{NULL,				0,			0}
};
618 
619 
620 /*
621  * Compare seg with each entry in kseg_table; when there is a match
622  * return the entry pointer, otherwise return NULL.
623  */
624 static ksegtbl_entry_t *
625 cpr_sparse_seg_check(struct seg *seg)
626 {
627 	ksegtbl_entry_t *ste = &kseg_table[0];
628 	struct seg *tseg;
629 
630 	for (; ste->st_seg; ste++) {
631 		tseg = (ste->st_addrtype == KSEG_PTR_ADDR) ?
632 				*ste->st_seg : (struct seg *)ste->st_seg;
633 		if (seg == tseg)
634 			return (ste);
635 	}
636 
637 	return ((ksegtbl_entry_t *)NULL);
638 }
639 
640 
641 /*
642  * Count pages within each kernel segment; call cpr_sparse_seg_check()
643  * to find out whether a sparsely filled segment needs special
644  * treatment (e.g. kvseg).
645  * Todo: A "SEGOP_CPR" like SEGOP_DUMP should be introduced, the cpr
646  *       module shouldn't need to know segment details like if it is
647  *       sparsely filled or not (makes kseg_table obsolete).
648  */
649 pgcnt_t
650 cpr_count_seg_pages(int mapflag, bitfunc_t bitfunc)
651 {
652 	struct seg *segp;
653 	pgcnt_t pages;
654 	ksegtbl_entry_t *ste;
655 
656 	pages = 0;
657 	for (segp = AS_SEGFIRST(&kas); segp; segp = AS_SEGNEXT(&kas, segp)) {
658 		if (ste = cpr_sparse_seg_check(segp)) {
659 			pages += (ste->st_fcn)(mapflag, bitfunc, segp);
660 		} else {
661 			pages += cpr_count_pages(segp->s_base,
662 			    segp->s_size, mapflag, bitfunc, DBG_SHOWRANGE);
663 		}
664 	}
665 
666 	return (pages);
667 }
668 
669 
670 /*
671  * count kernel pages within kas and any special ranges
672  */
673 pgcnt_t
674 cpr_count_kpages(int mapflag, bitfunc_t bitfunc)
675 {
676 	pgcnt_t kas_cnt;
677 
678 	/*
679 	 * Some pages need to be taken care of differently.
680 	 * eg: panicbuf pages of sun4m are not in kas but they need
681 	 * to be saved.  On sun4u, the physical pages of panicbuf are
682 	 * allocated via prom_retain().
683 	 */
684 	kas_cnt = i_cpr_count_special_kpages(mapflag, bitfunc);
685 	kas_cnt += cpr_count_seg_pages(mapflag, bitfunc);
686 
687 	DEBUG9(errp("cpr_count_kpages: kas_cnt=%d\n", kas_cnt));
688 	DEBUG7(errp("\ncpr_count_kpages: %ld pages, 0x%lx bytes\n",
689 		kas_cnt, mmu_ptob(kas_cnt)));
690 	return (kas_cnt);
691 }
692 
693 
694 /*
695  * Set a bit corresponding to the arg phys page number;
696  * returns 0 when the ppn is valid and the corresponding
697  * map bit was clear, otherwise returns 1.
698  */
699 int
700 cpr_setbit(pfn_t ppn, int mapflag)
701 {
702 	char *bitmap;
703 	cbd_t *dp;
704 	pfn_t rel;
705 	int clr;
706 
707 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
708 		if (PPN_IN_RANGE(ppn, dp)) {
709 			bitmap = DESC_TO_MAP(dp, mapflag);
710 			rel = ppn - dp->cbd_spfn;
711 			if ((clr = isclr(bitmap, rel)) != 0)
712 				setbit(bitmap, rel);
713 			return (clr == 0);
714 		}
715 	}
716 
717 	return (1);
718 }
719 
720 
721 /*
722  * Clear a bit corresponding to the arg phys page number.
723  */
724 int
725 cpr_clrbit(pfn_t ppn, int mapflag)
726 {
727 	char *bitmap;
728 	cbd_t *dp;
729 	pfn_t rel;
730 	int set;
731 
732 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
733 		if (PPN_IN_RANGE(ppn, dp)) {
734 			bitmap = DESC_TO_MAP(dp, mapflag);
735 			rel = ppn - dp->cbd_spfn;
736 			if ((set = isset(bitmap, rel)) != 0)
737 				clrbit(bitmap, rel);
738 			return (set == 0);
739 		}
740 	}
741 
742 	return (1);
743 }
744 
745 
/*
 * Bit function that touches no bitmap: both arguments are ignored
 * and 0 is always returned.
 */
/* ARGSUSED */
int
cpr_nobit(pfn_t ppn, int mapflag)
{
	return (0);
}
752 
753 
754 /*
755  * Lookup a bit corresponding to the arg phys page number.
756  */
757 int
758 cpr_isset(pfn_t ppn, int mapflag)
759 {
760 	char *bitmap;
761 	cbd_t *dp;
762 	pfn_t rel;
763 
764 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
765 		if (PPN_IN_RANGE(ppn, dp)) {
766 			bitmap = DESC_TO_MAP(dp, mapflag);
767 			rel = ppn - dp->cbd_spfn;
768 			return (isset(bitmap, rel));
769 		}
770 	}
771 
772 	return (0);
773 }
774 
775 
776 /*
777  * Go thru all pages and pick up any page not caught during the invalidation
778  * stage. This is also used to save pages with cow lock or phys page lock held
779  * (none zero p_lckcnt or p_cowcnt)
780  */
781 static	int
782 cpr_count_upages(int mapflag, bitfunc_t bitfunc)
783 {
784 	page_t *pp, *page0;
785 	pgcnt_t dcnt = 0, tcnt = 0;
786 	pfn_t pfn;
787 
788 	page0 = pp = page_first();
789 
790 	do {
791 #if defined(__sparc)
792 		extern struct vnode prom_ppages;
793 		if (pp->p_vnode == NULL || pp->p_vnode == &kvp ||
794 		    pp->p_vnode == &prom_ppages ||
795 			PP_ISFREE(pp) && PP_ISAGED(pp))
796 #else
797 		if (pp->p_vnode == NULL || pp->p_vnode == &kvp ||
798 		    PP_ISFREE(pp) && PP_ISAGED(pp))
799 #endif /* __sparc */
800 			continue;
801 
802 		pfn = page_pptonum(pp);
803 		if (pf_is_memory(pfn)) {
804 			tcnt++;
805 			if ((*bitfunc)(pfn, mapflag) == 0)
806 				dcnt++; /* dirty count */
807 		}
808 	} while ((pp = page_next(pp)) != page0);
809 
810 	STAT->cs_upage2statef = dcnt;
811 	DEBUG9(errp("cpr_count_upages: dirty=%ld total=%ld\n",
812 		dcnt, tcnt));
813 	DEBUG7(errp("cpr_count_upages: %ld pages, 0x%lx bytes\n",
814 		dcnt, mmu_ptob(dcnt)));
815 	return (dcnt);
816 }
817 
818 
819 /*
820  * try compressing pages based on cflag,
821  * and for DEBUG kernels, verify uncompressed data checksum;
822  *
823  * this routine replaces common code from
824  * i_cpr_compress_and_save() and cpr_compress_and_write()
825  */
826 char *
827 cpr_compress_pages(cpd_t *dp, pgcnt_t pages, int cflag)
828 {
829 	size_t nbytes, clen, len;
830 	uint32_t test_sum;
831 	char *datap;
832 
833 	nbytes = mmu_ptob(pages);
834 
835 	/*
836 	 * set length to the original uncompressed data size;
837 	 * always init cpd_flag to zero
838 	 */
839 	dp->cpd_length = nbytes;
840 	dp->cpd_flag = 0;
841 
842 #ifdef	DEBUG
843 	/*
844 	 * Make a copy of the uncompressed data so we can checksum it.
845 	 * Compress that copy so the checksum works at the other end
846 	 */
847 	cprbcopy(CPR->c_mapping_area, cpr_pagecopy, nbytes);
848 	dp->cpd_usum = checksum32(cpr_pagecopy, nbytes);
849 	dp->cpd_flag |= CPD_USUM;
850 	datap = cpr_pagecopy;
851 #else
852 	datap = CPR->c_mapping_area;
853 	dp->cpd_usum = 0;
854 #endif
855 
856 	/*
857 	 * try compressing the raw data to cpr_pagedata;
858 	 * if there was a size reduction: record the new length,
859 	 * flag the compression, and point to the compressed data.
860 	 */
861 	dp->cpd_csum = 0;
862 	if (cflag) {
863 		clen = compress(datap, cpr_pagedata, nbytes);
864 		if (clen < nbytes) {
865 			dp->cpd_flag |= CPD_COMPRESS;
866 			dp->cpd_length = clen;
867 			datap = cpr_pagedata;
868 #ifdef	DEBUG
869 			dp->cpd_csum = checksum32(datap, clen);
870 			dp->cpd_flag |= CPD_CSUM;
871 
872 			/*
873 			 * decompress the data back to a scratch area
874 			 * and compare the new checksum with the original
875 			 * checksum to verify the compression.
876 			 */
877 			bzero(cpr_pagecopy, sizeof (cpr_pagecopy));
878 			len = decompress(datap, cpr_pagecopy,
879 			    clen, sizeof (cpr_pagecopy));
880 			test_sum = checksum32(cpr_pagecopy, len);
881 			ASSERT(test_sum == dp->cpd_usum);
882 #endif
883 		}
884 	}
885 
886 	return (datap);
887 }
888 
889 
890 /*
891  * 1. Prepare cpr page descriptor and write it to file
892  * 2. Compress page data and write it out
893  */
894 static int
895 cpr_compress_and_write(vnode_t *vp, uint_t va, pfn_t pfn, pgcnt_t npg)
896 {
897 	int error = 0;
898 	char *datap;
899 	cpd_t cpd;	/* cpr page descriptor */
900 	extern void i_cpr_mapin(caddr_t, uint_t, pfn_t);
901 	extern void i_cpr_mapout(caddr_t, uint_t);
902 
903 	i_cpr_mapin(CPR->c_mapping_area, npg, pfn);
904 
905 	DEBUG3(errp("mapped-in %d pages, vaddr 0x%p, pfn 0x%x\n",
906 		npg, CPR->c_mapping_area, pfn));
907 
908 	/*
909 	 * Fill cpr page descriptor.
910 	 */
911 	cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC;
912 	cpd.cpd_pfn = pfn;
913 	cpd.cpd_pages = npg;
914 
915 	STAT->cs_dumped_statefsz += mmu_ptob(npg);
916 
917 	datap = cpr_compress_pages(&cpd, npg, CPR->c_flags & C_COMPRESSING);
918 
919 	/* Write cpr page descriptor */
920 	error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd_t));
921 
922 	/* Write compressed page data */
923 	error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length);
924 
925 	/*
926 	 * Unmap the pages for tlb and vac flushing
927 	 */
928 	i_cpr_mapout(CPR->c_mapping_area, npg);
929 
930 	if (error) {
931 		DEBUG1(errp("cpr_compress_and_write: vp 0x%p va 0x%x ",
932 		    vp, va));
933 		DEBUG1(errp("pfn 0x%lx blk %d err %d\n",
934 		    pfn, cpr_file_bn, error));
935 	} else {
936 		cpr_regular_pgs_dumped += npg;
937 	}
938 
939 	return (error);
940 }
941 
942 
943 int
944 cpr_write(vnode_t *vp, caddr_t buffer, size_t size)
945 {
946 	caddr_t	fromp = buffer;
947 	size_t bytes, wbytes;
948 	int error;
949 
950 	if (cpr_dev_space == 0) {
951 		if (vp->v_type == VBLK) {
952 			cpr_dev_space = cpr_get_devsize(vp->v_rdev);
953 			ASSERT(cpr_dev_space);
954 		} else
955 			cpr_dev_space = 1;	/* not used in this case */
956 	}
957 
958 	/*
959 	 * break the write into multiple part if request is large,
960 	 * calculate count up to buf page boundary, then write it out.
961 	 * repeat until done.
962 	 */
963 	while (size) {
964 		bytes = MIN(size, cpr_buf_end - cpr_wptr);
965 		cprbcopy(fromp, cpr_wptr, bytes);
966 		cpr_wptr += bytes;
967 		fromp += bytes;
968 		size -= bytes;
969 		if (cpr_wptr < cpr_buf_end)
970 			return (0);	/* buffer not full yet */
971 		ASSERT(cpr_wptr == cpr_buf_end);
972 
973 		wbytes = dbtob(cpr_file_bn + cpr_buf_blocks);
974 		if (vp->v_type == VBLK) {
975 			if (wbytes > cpr_dev_space)
976 				return (ENOSPC);
977 		} else {
978 			if (wbytes > VTOI(vp)->i_size)
979 				return (ENOSPC);
980 		}
981 
982 		DEBUG3(errp("cpr_write: frmp=%x wptr=%x cnt=%x...",
983 			fromp, cpr_wptr, bytes));
984 		/*
985 		 * cross check, this should not happen!
986 		 */
987 		if (cpr_disk_writes_ok == 0) {
988 			errp("cpr_write: disk write too early!\n");
989 			return (EINVAL);
990 		}
991 
992 		do_polled_io = 1;
993 		error = VOP_DUMP(vp, cpr_buf, cpr_file_bn, cpr_buf_blocks);
994 		do_polled_io = 0;
995 		DEBUG3(errp("done\n"));
996 
997 		STAT->cs_real_statefsz += cpr_buf_size;
998 
999 		if (error) {
1000 			cpr_err(CE_WARN, "cpr_write error %d", error);
1001 			return (error);
1002 		}
1003 		cpr_file_bn += cpr_buf_blocks;	/* Increment block count */
1004 		cpr_wptr = cpr_buf;		/* back to top of buffer */
1005 	}
1006 	return (0);
1007 }
1008 
1009 
1010 int
1011 cpr_flush_write(vnode_t *vp)
1012 {
1013 	int	nblk;
1014 	int	error;
1015 
1016 	/*
1017 	 * Calculate remaining blocks in buffer, rounded up to nearest
1018 	 * disk block
1019 	 */
1020 	nblk = btod(cpr_wptr - cpr_buf);
1021 
1022 	do_polled_io = 1;
1023 	error = VOP_DUMP(vp, (caddr_t)cpr_buf, cpr_file_bn, nblk);
1024 	do_polled_io = 0;
1025 
1026 	cpr_file_bn += nblk;
1027 	if (error)
1028 		DEBUG2(errp("cpr_flush_write: error (%d)\n", error));
1029 	return (error);
1030 }
1031 
1032 void
1033 cpr_clear_bitmaps(void)
1034 {
1035 	cbd_t *dp;
1036 
1037 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
1038 		bzero((void *)dp->cbd_reg_bitmap,
1039 		    (size_t)dp->cbd_size * 2);
1040 	}
1041 	DEBUG7(errp("\ncleared reg and vlt bitmaps\n"));
1042 }
1043 
1044 int
1045 cpr_contig_pages(vnode_t *vp, int flag)
1046 {
1047 	int chunks = 0, error = 0;
1048 	pgcnt_t i, j, totbit;
1049 	pfn_t spfn;
1050 	cbd_t *dp;
1051 	uint_t	spin_cnt = 0;
1052 	extern	int i_cpr_compress_and_save();
1053 
1054 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
1055 		spfn = dp->cbd_spfn;
1056 		totbit = BTOb(dp->cbd_size);
1057 		i = 0; /* Beginning of bitmap */
1058 		j = 0;
1059 		while (i < totbit) {
1060 			while ((j < CPR_MAXCONTIG) && ((j + i) < totbit)) {
1061 				if (isset((char *)dp->cbd_reg_bitmap, j+i))
1062 					j++;
1063 				else /* not contiguous anymore */
1064 					break;
1065 			}
1066 
1067 			if (j) {
1068 				chunks++;
1069 				if (flag == SAVE_TO_STORAGE) {
1070 					error = i_cpr_compress_and_save(
1071 					    chunks, spfn + i, j);
1072 					if (error)
1073 						return (error);
1074 				} else if (flag == WRITE_TO_STATEFILE) {
1075 					error = cpr_compress_and_write(vp, 0,
1076 					    spfn + i, j);
1077 					if (error)
1078 						return (error);
1079 					else {
1080 						spin_cnt++;
1081 						if ((spin_cnt & 0x5F) == 1)
1082 							cpr_spinning_bar();
1083 					}
1084 				}
1085 			}
1086 
1087 			i += j;
1088 			if (j != CPR_MAXCONTIG) {
1089 				/* Stopped on a non-tagged page */
1090 				i++;
1091 			}
1092 
1093 			j = 0;
1094 		}
1095 	}
1096 
1097 	if (flag == STORAGE_DESC_ALLOC)
1098 		return (chunks);
1099 	else
1100 		return (0);
1101 }
1102 
1103 
1104 void
1105 cpr_show_range(caddr_t vaddr, size_t size,
1106     int mapflag, bitfunc_t bitfunc, pgcnt_t count)
1107 {
1108 	char *action, *bname;
1109 
1110 	bname = (mapflag == REGULAR_BITMAP) ? "regular" : "volatile";
1111 	if (bitfunc == cpr_setbit)
1112 		action = "tag";
1113 	else if (bitfunc == cpr_clrbit)
1114 		action = "untag";
1115 	else
1116 		action = "none";
1117 	errp("range (0x%p, 0x%p), %s bitmap, %s %ld\n",
1118 	    vaddr, vaddr + size, bname, action, count);
1119 }
1120 
1121 
1122 pgcnt_t
1123 cpr_count_pages(caddr_t sva, size_t size,
1124     int mapflag, bitfunc_t bitfunc, int showrange)
1125 {
1126 	caddr_t	va, eva;
1127 	pfn_t pfn;
1128 	pgcnt_t count = 0;
1129 
1130 	eva = sva + PAGE_ROUNDUP(size);
1131 	for (va = sva; va < eva; va += MMU_PAGESIZE) {
1132 		pfn = va_to_pfn(va);
1133 		if (pfn != PFN_INVALID && pf_is_memory(pfn)) {
1134 			if ((*bitfunc)(pfn, mapflag) == 0)
1135 				count++;
1136 		}
1137 	}
1138 
1139 	if ((cpr_debug & LEVEL7) && showrange == DBG_SHOWRANGE)
1140 		cpr_show_range(sva, size, mapflag, bitfunc, count);
1141 
1142 	return (count);
1143 }
1144 
1145 
1146 pgcnt_t
1147 cpr_count_volatile_pages(int mapflag, bitfunc_t bitfunc)
1148 {
1149 	pgcnt_t count = 0;
1150 
1151 	if (cpr_buf) {
1152 		count += cpr_count_pages(cpr_buf, cpr_buf_size,
1153 		    mapflag, bitfunc, DBG_SHOWRANGE);
1154 	}
1155 	if (cpr_pagedata) {
1156 		count += cpr_count_pages(cpr_pagedata, cpr_pagedata_size,
1157 		    mapflag, bitfunc, DBG_SHOWRANGE);
1158 	}
1159 	count += i_cpr_count_storage_pages(mapflag, bitfunc);
1160 
1161 	DEBUG7(errp("cpr_count_vpages: %ld pages, 0x%lx bytes\n",
1162 	    count, mmu_ptob(count)));
1163 	return (count);
1164 }
1165 
1166 
1167 static int
1168 cpr_dump_regular_pages(vnode_t *vp)
1169 {
1170 	int error;
1171 
1172 	cpr_regular_pgs_dumped = 0;
1173 	error = cpr_contig_pages(vp, WRITE_TO_STATEFILE);
1174 	if (!error)
1175 		DEBUG7(errp("cpr_dump_regular_pages() done.\n"));
1176 	return (error);
1177 }
1178