xref: /titanic_50/usr/src/uts/common/cpr/cpr_dump.c (revision df0345f7d6cc87cde9e532e8362f1aca053d98cc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Fill in and write out the cpr state file
30  *	1. Allocate and write headers, ELF and cpr dump header
31  *	2. Allocate bitmaps according to phys_install
32  *	3. Tag kernel pages into corresponding bitmap
33  *	4. Write bitmaps to state file
34  *	5. Write actual physical page data to state file
35  */
36 
37 #include <sys/types.h>
38 #include <sys/systm.h>
39 #include <sys/vm.h>
40 #include <sys/memlist.h>
41 #include <sys/kmem.h>
42 #include <sys/vnode.h>
43 #include <sys/fs/ufs_inode.h>
44 #include <sys/errno.h>
45 #include <sys/cmn_err.h>
46 #include <sys/debug.h>
47 #include <vm/page.h>
48 #include <vm/seg.h>
49 #include <vm/seg_kmem.h>
50 #include <vm/seg_kpm.h>
51 #include <vm/hat.h>
52 #include <sys/cpr.h>
53 #include <sys/conf.h>
54 #include <sys/ddi.h>
55 #include <sys/panic.h>
56 #include <sys/thread.h>
57 #include <sys/note.h>
58 
/* Local defines and variables */
#define	BTOb(bytes)	((bytes) << 3)		/* Bytes to bits, log2(NBBY) */
#define	bTOB(bits)	((bits) >> 3)		/* bits to Bytes, log2(NBBY) */

#if defined(__sparc)
/*
 * Page accounting for the sanity check in cpr_write_statefile():
 * cpr_pages_tobe_dumped is set by cpr_write_header(), and
 * cpr_regular_pgs_dumped is reset by cpr_dump_regular_pages() and
 * advanced by cpr_compress_and_write().
 */
static uint_t cpr_pages_tobe_dumped;
static uint_t cpr_regular_pgs_dumped;
static int cpr_dump_regular_pages(vnode_t *);
static int cpr_count_upages(int, bitfunc_t);
static int cpr_compress_and_write(vnode_t *, uint_t, pfn_t, pgcnt_t);
#endif

int cpr_flush_write(vnode_t *);

int cpr_contig_pages(vnode_t *, int);

void cpr_clear_bitmaps();

extern size_t cpr_get_devsize(dev_t);
extern int i_cpr_dump_setup(vnode_t *);
extern int i_cpr_blockzero(char *, char **, int *, vnode_t *);
extern int cpr_test_mode;
int cpr_setbit(pfn_t, int);
int cpr_clrbit(pfn_t, int);

/* terminator record, filled in and written by cpr_write_terminator() */
ctrm_t cpr_term;

/*
 * Statefile write buffer: allocated by cpr_alloc_bufs(), filled by
 * cpr_write() and pushed to the device whenever it becomes full.
 */
char *cpr_buf, *cpr_buf_end;
int cpr_buf_blocks;		/* size of cpr_buf in blocks */
size_t cpr_buf_size;		/* size of cpr_buf in bytes */
size_t cpr_bitmap_size;		/* set by cpr_set_bitmap_size() */
int cpr_nbitmaps;		/* number of bitmap descriptors written */

char *cpr_pagedata;		/* page buffer for compression / tmp copy */
size_t cpr_pagedata_size;	/* page buffer size in bytes */

#if defined(__sparc)
static char *cpr_wptr;		/* keep track of where to write to next */
static int cpr_file_bn;		/* cpr state-file block offset */
/*
 * Cross check: cleared in cpr_dump() and set in cpr_write_statefile();
 * cpr_write() refuses to call VOP_DUMP() while it is zero.
 */
static int cpr_disk_writes_ok;
/*
 * Device size used for the ENOSPC check in cpr_write(); zero means
 * "not fetched yet" and cpr_write() fills it in on first use.
 */
static size_t cpr_dev_space = 0;
#endif

/*
 * Scratch area: used by cpr_write_bitmap() to merge bitmaps and, on
 * DEBUG kernels, by cpr_compress_pages() for checksum verification.
 */
char cpr_pagecopy[CPR_MAXCONTIG * MMU_PAGESIZE];
103 
104 #if defined(__sparc)
105 /*
106  * On some platforms bcopy may modify the thread structure
107  * during bcopy (eg, to prevent cpu migration).  If the
108  * range we are currently writing out includes our own
109  * thread structure then it will be snapshotted by bcopy
110  * including those modified members - and the updates made
111  * on exit from bcopy will no longer be seen when we later
112  * restore the mid-bcopy kthread_t.  So if the range we
113  * need to copy overlaps with our thread structure we will
114  * use a simple byte copy.
115  */
116 void
117 cprbcopy(void *from, void *to, size_t bytes)
118 {
119 	extern int curthreadremapped;
120 	caddr_t kthrend;
121 
122 	kthrend = (caddr_t)curthread + sizeof (kthread_t) - 1;
123 	if (curthreadremapped || (kthrend >= (caddr_t)from &&
124 	    kthrend < (caddr_t)from + bytes + sizeof (kthread_t) - 1)) {
125 		caddr_t src = from, dst = to;
126 
127 		while (bytes-- > 0)
128 			*dst++ = *src++;
129 	} else {
130 		bcopy(from, to, bytes);
131 	}
132 }
133 
134 /*
135  * Allocate pages for buffers used in writing out the statefile
136  */
137 static int
138 cpr_alloc_bufs(void)
139 {
140 	char *allocerr = "Unable to allocate memory for cpr buffer";
141 	size_t size;
142 
143 	/*
144 	 * set the cpr write buffer size to at least the historic
145 	 * size (128k) or large enough to store the both the early
146 	 * set of statefile structures (well under 0x800) plus the
147 	 * bitmaps, and roundup to the next pagesize.
148 	 */
149 	size = PAGE_ROUNDUP(dbtob(4) + cpr_bitmap_size);
150 	cpr_buf_size = MAX(size, CPRBUFSZ);
151 	cpr_buf_blocks = btodb(cpr_buf_size);
152 	cpr_buf = kmem_alloc(cpr_buf_size, KM_NOSLEEP);
153 	if (cpr_buf == NULL) {
154 		cpr_err(CE_WARN, allocerr);
155 		return (ENOMEM);
156 	}
157 	cpr_buf_end = cpr_buf + cpr_buf_size;
158 
159 	cpr_pagedata_size = mmu_ptob(CPR_MAXCONTIG + 1);
160 	cpr_pagedata = kmem_alloc(cpr_pagedata_size, KM_NOSLEEP);
161 	if (cpr_pagedata == NULL) {
162 		kmem_free(cpr_buf, cpr_buf_size);
163 		cpr_buf = NULL;
164 		cpr_err(CE_WARN, allocerr);
165 		return (ENOMEM);
166 	}
167 
168 	return (0);
169 }
170 
171 
172 /*
173  * Set bitmap size in bytes based on phys_install.
174  */
175 void
176 cpr_set_bitmap_size(void)
177 {
178 	struct memlist *pmem;
179 	size_t size = 0;
180 
181 	memlist_read_lock();
182 	for (pmem = phys_install; pmem; pmem = pmem->next)
183 		size += pmem->size;
184 	memlist_read_unlock();
185 	cpr_bitmap_size = BITMAP_BYTES(size);
186 }
187 
188 
/*
 * Fill in and write the CPR dump descriptor (cdd_t).
 *
 * The descriptor members set here are:
 *	1. header magic -- unique to cpr state file
 *	2. cpr version and machine type
 *	3. debug level and test mode
 *	4. number of bitmaps allocated
 *	5. number of pages that will be dumped
 * NOTE(review): cdump is an uninitialized local; any cdd_t member not
 * assigned below is written out as found on the stack -- confirm that
 * no other members exist or matter.
 *
 * Side effects: the bitmaps are cleared and re-tagged,
 * cpr_pages_tobe_dumped is set, and STAT->cs_nocomp_statefsz is
 * updated with the uncompressed statefile size.
 *
 * Returns ENOSPC when compression is off and the uncompressed size
 * exceeds STAT->cs_est_statefsz; otherwise returns the result of
 * cpr_write() for the descriptor.
 */
static int
cpr_write_header(vnode_t *vp)
{
	extern ushort_t cpr_mach_type;
	struct cpr_dump_desc cdump;
	pgcnt_t bitmap_pages;
	pgcnt_t kpages, vpages, upages;
	pgcnt_t cpr_count_kpages(int mapflag, bitfunc_t bitfunc);

	cdump.cdd_magic = (uint_t)CPR_DUMP_MAGIC;
	cdump.cdd_version = CPR_VERSION;
	cdump.cdd_machine = cpr_mach_type;
	cdump.cdd_debug = cpr_debug;
	cdump.cdd_test_mode = cpr_test_mode;
	cdump.cdd_bitmaprec = cpr_nbitmaps;

	/* start from empty regular and volatile bitmaps */
	cpr_clear_bitmaps();

	/*
	 * Remember how many pages we plan to save to statefile.
	 * This information will be used for sanity checks.
	 * Untag those pages that will not be saved to statefile.
	 * The order matters: kernel pages are tagged first, volatile
	 * pages are then untagged, and user pages are tagged last.
	 */
	kpages = cpr_count_kpages(REGULAR_BITMAP, cpr_setbit);
	vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);
	upages = cpr_count_upages(REGULAR_BITMAP, cpr_setbit);
	cdump.cdd_dumppgsize = kpages - vpages + upages;
	cpr_pages_tobe_dumped = cdump.cdd_dumppgsize;
	CPR_DEBUG(CPR_DEBUG7,
	    "\ncpr_write_header: kpages %ld - vpages %ld + upages %ld = %d\n",
	    kpages, vpages, upages, cdump.cdd_dumppgsize);

	/*
	 * Some pages contain volatile data (cpr_buf and storage area for
	 * sensitive kpages), which are no longer needed after the statefile
	 * is dumped to disk.  We have already untagged them from regular
	 * bitmaps.  Now tag them into the volatile bitmaps.  The pages in
	 * volatile bitmaps will be claimed during resume, and the resumed
	 * kernel will free them.
	 */
	(void) cpr_count_volatile_pages(VOLATILE_BITMAP, cpr_setbit);

	bitmap_pages = mmu_btopr(cpr_bitmap_size);

	/*
	 * Export accurate statefile size for statefile allocation retry.
	 * statefile_size = all the headers + total pages +
	 * number of pages used by the bitmaps.
	 * Roundup will be done in the file allocation code.
	 */
	STAT->cs_nocomp_statefsz = sizeof (cdd_t) + sizeof (cmd_t) +
	    (sizeof (cbd_t) * cdump.cdd_bitmaprec) +
	    (sizeof (cpd_t) * cdump.cdd_dumppgsize) +
	    mmu_ptob(cdump.cdd_dumppgsize + bitmap_pages);

	/*
	 * If the estimated statefile is not big enough,
	 * go retry now to save unnecessary operations.
	 */
	if (!(CPR->c_flags & C_COMPRESSING) &&
	    (STAT->cs_nocomp_statefsz > STAT->cs_est_statefsz)) {
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
			prom_printf("cpr_write_header: "
			    "STAT->cs_nocomp_statefsz > "
			    "STAT->cs_est_statefsz\n");
		return (ENOSPC);
	}

	/* now write cpr dump descriptor */
	return (cpr_write(vp, (caddr_t)&cdump, sizeof (cdd_t)));
}
269 
270 
271 /*
272  * CPR dump tail record contains the following information:
273  *	1. header magic -- unique to cpr state file
274  *	2. all misc info that needs to be passed to cprboot or resumed kernel
275  */
276 static int
277 cpr_write_terminator(vnode_t *vp)
278 {
279 	cpr_term.magic = (uint_t)CPR_TERM_MAGIC;
280 	cpr_term.va = (cpr_ptr)&cpr_term;
281 	cpr_term.pfn = (cpr_ext)va_to_pfn(&cpr_term);
282 
283 	/* count the last one (flush) */
284 	cpr_term.real_statef_size = STAT->cs_real_statefsz +
285 	    btod(cpr_wptr - cpr_buf) * DEV_BSIZE;
286 
287 	CPR_DEBUG(CPR_DEBUG9, "cpr_dump: Real Statefile Size: %ld\n",
288 	    STAT->cs_real_statefsz);
289 
290 	cpr_tod_get(&cpr_term.tm_shutdown);
291 
292 	return (cpr_write(vp, (caddr_t)&cpr_term, sizeof (cpr_term)));
293 }
294 
295 /*
296  * Write bitmap descriptor array, followed by merged bitmaps.
297  */
298 static int
299 cpr_write_bitmap(vnode_t *vp)
300 {
301 	char *rmap, *vmap, *dst, *tail;
302 	size_t size, bytes;
303 	cbd_t *dp;
304 	int err;
305 
306 	dp = CPR->c_bmda;
307 	if (err = cpr_write(vp, (caddr_t)dp, cpr_nbitmaps * sizeof (*dp)))
308 		return (err);
309 
310 	/*
311 	 * merge regular and volatile bitmaps into tmp space
312 	 * and write to disk
313 	 */
314 	for (; dp->cbd_size; dp++) {
315 		rmap = (char *)dp->cbd_reg_bitmap;
316 		vmap = (char *)dp->cbd_vlt_bitmap;
317 		for (size = dp->cbd_size; size; size -= bytes) {
318 			bytes = min(size, sizeof (cpr_pagecopy));
319 			tail = &cpr_pagecopy[bytes];
320 			for (dst = cpr_pagecopy; dst < tail; dst++)
321 				*dst = *rmap++ | *vmap++;
322 			if (err = cpr_write(vp, cpr_pagecopy, bytes))
323 				break;
324 		}
325 	}
326 
327 	return (err);
328 }
329 
330 
/*
 * Write the page data portion of the statefile: first the saved
 * sensitive kernel pages, then the remaining (regular) pages tagged in
 * the regular bitmap.  Finally verify that the number of pages dumped
 * matches cpr_pages_tobe_dumped.
 *
 * Returns 0 on success, or the error from the first step that fails
 * (including the page count check).
 */
static int
cpr_write_statefile(vnode_t *vp)
{
	uint_t error = 0;
	extern	int	i_cpr_check_pgs_dumped();
	void flush_windows(void);
	pgcnt_t spages;
	char *str;

	flush_windows();

	/*
	 * to get an accurate view of kas, we need to untag sensitive
	 * pages *before* dumping them because the disk driver makes
	 * allocations and changes kas along the way.  The remaining
	 * pages referenced in the bitmaps are dumped out later as
	 * regular kpages.
	 */
	str = "cpr_write_statefile:";
	spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_clrbit);
	CPR_DEBUG(CPR_DEBUG7, "%s untag %ld sens pages\n", str, spages);

	/*
	 * now it's OK to call a driver that makes allocations;
	 * from here on cpr_write() is allowed to call VOP_DUMP()
	 */
	cpr_disk_writes_ok = 1;

	/*
	 * now write out the clean sensitive kpages
	 * according to the sensitive descriptors
	 */
	error = i_cpr_dump_sensitive_kpages(vp);
	if (error) {
		CPR_DEBUG(CPR_DEBUG7,
		    "%s cpr_dump_sensitive_kpages() failed!\n", str);
		return (error);
	}

	/*
	 * cpr_dump_regular_pages() counts cpr_regular_pgs_dumped
	 */
	error = cpr_dump_regular_pages(vp);
	if (error) {
		CPR_DEBUG(CPR_DEBUG7,
		    "%s cpr_dump_regular_pages() failed!\n", str);
		return (error);
	}

	/*
	 * sanity check to verify the right number of pages were dumped
	 */
	error = i_cpr_check_pgs_dumped(cpr_pages_tobe_dumped,
	    cpr_regular_pgs_dumped);

	if (error) {
		prom_printf("\n%s page count mismatch!\n", str);
#ifdef DEBUG
		/* on DEBUG kernels in test mode, stop in the debugger */
		if (cpr_test_mode)
			debug_enter(NULL);
#endif
	}

	return (error);
}
395 #endif
396 
397 
/*
 * creates the CPR state file, the following sections are
 * written out in sequence:
 *    - the cpr dump header (cpr_write_header)
 *    - the platform dependent info (i_cpr_write_machdep)
 *    - the memory usage bitmaps (cpr_write_bitmap)
 *    - the sensitive kernel pages and the remaining tagged pages
 *      (cpr_write_statefile)
 *    - the terminator record (cpr_write_terminator)
 *
 * Returns 0 on success or the error from the first step that fails.
 * When __sparc is not defined the body is compiled out and the
 * function simply returns 0.
 */
#if defined(__x86)
	_NOTE(ARGSUSED(0))
#endif
int
cpr_dump(vnode_t *vp)
{
#if defined(__sparc)
	int error;

	/* buffers are allocated only if cpr_buf is not already set */
	if (cpr_buf == NULL) {
		ASSERT(cpr_pagedata == NULL);
		if (error = cpr_alloc_bufs())
			return (error);
	}
	/* point to top of internal buffer */
	cpr_wptr = cpr_buf;

	/* initialize global variables used by the write operation */
	cpr_file_bn = cpr_statefile_offset();
	cpr_dev_space = 0;

	/* allocate bitmaps */
	if (CPR->c_bmda == NULL) {
		if (error = i_cpr_alloc_bitmaps()) {
			cpr_err(CE_WARN, "cannot allocate bitmaps");
			return (error);
		}
	}

	if (error = i_cpr_prom_pages(CPR_PROM_SAVE))
		return (error);

	if (error = i_cpr_dump_setup(vp))
		return (error);

	/*
	 * set internal cross checking; we don't want to call
	 * a disk driver that makes allocations until after
	 * sensitive pages are saved
	 */
	cpr_disk_writes_ok = 0;

	/*
	 * 1253112: heap corruption due to memory allocation when dumping
	 *	    statefile.
	 * Theoretically on Sun4u only the kernel data nucleus, kvalloc and
	 * kvseg segments can be contaminated should memory allocations happen
	 * during sddump, which is not supposed to happen after the system
	 * is quiesced. Let's call the kernel pages that tend to be affected
	 * 'sensitive kpages' here. To avoid saving inconsistent pages, we
	 * will allocate some storage space to save the clean sensitive pages
	 * aside before statefile dumping takes place. Since there may not be
	 * much memory left at this stage, the sensitive pages will be
	 * compressed before they are saved into the storage area.
	 */
	if (error = i_cpr_save_sensitive_kpages()) {
		CPR_DEBUG(CPR_DEBUG7,
		    "cpr_dump: save_sensitive_kpages failed!\n");
		return (error);
	}

	/*
	 * since all cpr allocations are done (space for sensitive kpages,
	 * bitmaps, cpr_buf), kas is stable, and now we can accurately
	 * count regular and sensitive kpages.
	 */
	if (error = cpr_write_header(vp)) {
		CPR_DEBUG(CPR_DEBUG7,
		    "cpr_dump: cpr_write_header() failed!\n");
		return (error);
	}

	if (error = i_cpr_write_machdep(vp))
		return (error);

	/* no block number or vnode is passed on this first call */
	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, NULL, NULL))
		return (error);

	if (error = cpr_write_bitmap(vp))
		return (error);

	if (error = cpr_write_statefile(vp)) {
		CPR_DEBUG(CPR_DEBUG7,
		    "cpr_dump: cpr_write_statefile() failed!\n");
		return (error);
	}

	if (error = cpr_write_terminator(vp))
		return (error);

	/* push out whatever is still sitting in cpr_buf */
	if (error = cpr_flush_write(vp))
		return (error);

	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, &cpr_file_bn, vp))
		return (error);
#endif

	return (0);
}
506 
507 
508 #if defined(__sparc)
509 /*
510  * cpr_xwalk() is called many 100x with a range within kvseg or kvseg_reloc;
511  * a page-count from each range is accumulated at arg->pages.
512  */
513 static void
514 cpr_xwalk(void *arg, void *base, size_t size)
515 {
516 	struct cpr_walkinfo *cwip = arg;
517 
518 	cwip->pages += cpr_count_pages(base, size,
519 	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
520 	cwip->size += size;
521 	cwip->ranges++;
522 }
523 
524 /*
525  * cpr_walk() is called many 100x with a range within kvseg or kvseg_reloc;
526  * a page-count from each range is accumulated at arg->pages.
527  */
528 static void
529 cpr_walk(void *arg, void *base, size_t size)
530 {
531 	caddr_t addr = base;
532 	caddr_t addr_end = addr + size;
533 
534 	/*
535 	 * If we are about to start walking the range of addresses we
536 	 * carved out of the kernel heap for the large page heap walk
537 	 * heap_lp_arena to find what segments are actually populated
538 	 */
539 	if (SEGKMEM_USE_LARGEPAGES &&
540 	    addr == heap_lp_base && addr_end == heap_lp_end &&
541 	    vmem_size(heap_lp_arena, VMEM_ALLOC) < size) {
542 		vmem_walk(heap_lp_arena, VMEM_ALLOC, cpr_xwalk, arg);
543 	} else {
544 		cpr_xwalk(arg, base, size);
545 	}
546 }
547 
548 
549 /*
550  * faster scan of kvseg using vmem_walk() to visit
551  * allocated ranges.
552  */
553 pgcnt_t
554 cpr_scan_kvseg(int mapflag, bitfunc_t bitfunc, struct seg *seg)
555 {
556 	struct cpr_walkinfo cwinfo;
557 
558 	bzero(&cwinfo, sizeof (cwinfo));
559 	cwinfo.mapflag = mapflag;
560 	cwinfo.bitfunc = bitfunc;
561 
562 	vmem_walk(heap_arena, VMEM_ALLOC, cpr_walk, &cwinfo);
563 
564 	if (cpr_debug & CPR_DEBUG7) {
565 		prom_printf("walked %d sub-ranges, total pages %ld\n",
566 		    cwinfo.ranges, mmu_btop(cwinfo.size));
567 		cpr_show_range(seg->s_base, seg->s_size,
568 		    mapflag, bitfunc, cwinfo.pages);
569 	}
570 
571 	return (cwinfo.pages);
572 }
573 
574 
575 /*
576  * cpr_walk_kpm() is called for every used area within the large
577  * segkpm virtual address window. A page-count is accumulated at
578  * arg->pages.
579  */
580 static void
581 cpr_walk_kpm(void *arg, void *base, size_t size)
582 {
583 	struct cpr_walkinfo *cwip = arg;
584 
585 	cwip->pages += cpr_count_pages(base, size,
586 	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
587 	cwip->size += size;
588 	cwip->ranges++;
589 }
590 
591 
592 /*
593  * faster scan of segkpm using hat_kpm_walk() to visit only used ranges.
594  */
595 /*ARGSUSED*/
596 static pgcnt_t
597 cpr_scan_segkpm(int mapflag, bitfunc_t bitfunc, struct seg *seg)
598 {
599 	struct cpr_walkinfo cwinfo;
600 
601 	if (kpm_enable == 0)
602 		return (0);
603 
604 	bzero(&cwinfo, sizeof (cwinfo));
605 	cwinfo.mapflag = mapflag;
606 	cwinfo.bitfunc = bitfunc;
607 	hat_kpm_walk(cpr_walk_kpm, &cwinfo);
608 
609 	if (cpr_debug & CPR_DEBUG7) {
610 		prom_printf("walked %d sub-ranges, total pages %ld\n",
611 		    cwinfo.ranges, mmu_btop(cwinfo.size));
612 		cpr_show_range(segkpm->s_base, segkpm->s_size,
613 		    mapflag, bitfunc, cwinfo.pages);
614 	}
615 
616 	return (cwinfo.pages);
617 }
618 
619 
/*
 * Sparsely filled kernel segments are registered in kseg_table for
 * easier lookup. See also block comment for cpr_count_seg_pages.
 *
 * Each entry names a segment and the scan function to call for it.
 * st_addrtype says how st_seg must be interpreted (see
 * cpr_sparse_seg_check()): with KSEG_SEG_ADDR the stored value is the
 * address of the struct seg itself, with KSEG_PTR_ADDR it is the
 * address of a pointer to the struct seg.  The table ends with an
 * entry whose st_seg is NULL.
 */

#define	KSEG_SEG_ADDR	0	/* address of struct seg */
#define	KSEG_PTR_ADDR	1	/* address of pointer to struct seg */

typedef struct {
	struct seg **st_seg;		/* segment pointer or segment address */
	pgcnt_t	(*st_fcn)(int, bitfunc_t, struct seg *); /* function to call */
	int	st_addrtype;		/* address type in st_seg */
} ksegtbl_entry_t;

ksegtbl_entry_t kseg_table[] = {
	/* kvseg is cast because its own address is stored here */
	{(struct seg **)&kvseg,		cpr_scan_kvseg,		KSEG_SEG_ADDR},
	{&segkpm,			cpr_scan_segkpm,	KSEG_PTR_ADDR},
	{NULL,				0,			0}	/* end */
};
639 
640 
641 /*
642  * Compare seg with each entry in kseg_table; when there is a match
643  * return the entry pointer, otherwise return NULL.
644  */
645 static ksegtbl_entry_t *
646 cpr_sparse_seg_check(struct seg *seg)
647 {
648 	ksegtbl_entry_t *ste = &kseg_table[0];
649 	struct seg *tseg;
650 
651 	for (; ste->st_seg; ste++) {
652 		tseg = (ste->st_addrtype == KSEG_PTR_ADDR) ?
653 		    *ste->st_seg : (struct seg *)ste->st_seg;
654 
655 		if (seg == tseg)
656 			return (ste);
657 	}
658 
659 	return ((ksegtbl_entry_t *)NULL);
660 }
661 
662 
663 /*
664  * Count pages within each kernel segment; call cpr_sparse_seg_check()
665  * to find out whether a sparsely filled segment needs special
666  * treatment (e.g. kvseg).
667  * Todo: A "SEGOP_CPR" like SEGOP_DUMP should be introduced, the cpr
668  *       module shouldn't need to know segment details like if it is
669  *       sparsely filled or not (makes kseg_table obsolete).
670  */
671 pgcnt_t
672 cpr_count_seg_pages(int mapflag, bitfunc_t bitfunc)
673 {
674 	struct seg *segp;
675 	pgcnt_t pages;
676 	ksegtbl_entry_t *ste;
677 
678 	pages = 0;
679 	for (segp = AS_SEGFIRST(&kas); segp; segp = AS_SEGNEXT(&kas, segp)) {
680 		if (ste = cpr_sparse_seg_check(segp)) {
681 			pages += (ste->st_fcn)(mapflag, bitfunc, segp);
682 		} else {
683 			pages += cpr_count_pages(segp->s_base,
684 			    segp->s_size, mapflag, bitfunc, DBG_SHOWRANGE);
685 		}
686 	}
687 
688 	return (pages);
689 }
690 
691 
692 /*
693  * count kernel pages within kas and any special ranges
694  */
695 pgcnt_t
696 cpr_count_kpages(int mapflag, bitfunc_t bitfunc)
697 {
698 	pgcnt_t kas_cnt;
699 
700 	/*
701 	 * Some pages need to be taken care of differently.
702 	 * eg: panicbuf pages of sun4m are not in kas but they need
703 	 * to be saved.  On sun4u, the physical pages of panicbuf are
704 	 * allocated via prom_retain().
705 	 */
706 	kas_cnt = i_cpr_count_special_kpages(mapflag, bitfunc);
707 	kas_cnt += cpr_count_seg_pages(mapflag, bitfunc);
708 
709 	CPR_DEBUG(CPR_DEBUG9, "cpr_count_kpages: kas_cnt=%ld\n", kas_cnt);
710 	CPR_DEBUG(CPR_DEBUG7, "\ncpr_count_kpages: %ld pages, 0x%lx bytes\n",
711 	    kas_cnt, mmu_ptob(kas_cnt));
712 
713 	return (kas_cnt);
714 }
715 
716 
717 /*
718  * Set a bit corresponding to the arg phys page number;
719  * returns 0 when the ppn is valid and the corresponding
720  * map bit was clear, otherwise returns 1.
721  */
722 int
723 cpr_setbit(pfn_t ppn, int mapflag)
724 {
725 	char *bitmap;
726 	cbd_t *dp;
727 	pfn_t rel;
728 	int clr;
729 
730 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
731 		if (PPN_IN_RANGE(ppn, dp)) {
732 			bitmap = DESC_TO_MAP(dp, mapflag);
733 			rel = ppn - dp->cbd_spfn;
734 			if ((clr = isclr(bitmap, rel)) != 0)
735 				setbit(bitmap, rel);
736 			return (clr == 0);
737 		}
738 	}
739 
740 	return (1);
741 }
742 
743 
744 /*
745  * Clear a bit corresponding to the arg phys page number.
746  */
747 int
748 cpr_clrbit(pfn_t ppn, int mapflag)
749 {
750 	char *bitmap;
751 	cbd_t *dp;
752 	pfn_t rel;
753 	int set;
754 
755 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
756 		if (PPN_IN_RANGE(ppn, dp)) {
757 			bitmap = DESC_TO_MAP(dp, mapflag);
758 			rel = ppn - dp->cbd_spfn;
759 			if ((set = isset(bitmap, rel)) != 0)
760 				clrbit(bitmap, rel);
761 			return (set == 0);
762 		}
763 	}
764 
765 	return (1);
766 }
767 
768 
/*
 * Bit function that touches no bitmap: both arguments are ignored and
 * 0 is always returned, so callers such as cpr_count_pages() count
 * every page they pass in.
 */
/* ARGSUSED */
int
cpr_nobit(pfn_t ppn, int mapflag)
{
	return (0);
}
775 
776 
777 /*
778  * Lookup a bit corresponding to the arg phys page number.
779  */
780 int
781 cpr_isset(pfn_t ppn, int mapflag)
782 {
783 	char *bitmap;
784 	cbd_t *dp;
785 	pfn_t rel;
786 
787 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
788 		if (PPN_IN_RANGE(ppn, dp)) {
789 			bitmap = DESC_TO_MAP(dp, mapflag);
790 			rel = ppn - dp->cbd_spfn;
791 			return (isset(bitmap, rel));
792 		}
793 	}
794 
795 	return (0);
796 }
797 
798 
799 /*
800  * Go thru all pages and pick up any page not caught during the invalidation
801  * stage. This is also used to save pages with cow lock or phys page lock held
802  * (none zero p_lckcnt or p_cowcnt)
803  */
804 static	int
805 cpr_count_upages(int mapflag, bitfunc_t bitfunc)
806 {
807 	page_t *pp, *page0;
808 	pgcnt_t dcnt = 0, tcnt = 0;
809 	pfn_t pfn;
810 
811 	page0 = pp = page_first();
812 
813 	do {
814 #if defined(__sparc)
815 		extern struct vnode prom_ppages;
816 		if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
817 		    pp->p_vnode == &prom_ppages ||
818 		    PP_ISFREE(pp) && PP_ISAGED(pp))
819 #else
820 		if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
821 		    PP_ISFREE(pp) && PP_ISAGED(pp))
822 #endif /* __sparc */
823 			continue;
824 
825 		pfn = page_pptonum(pp);
826 		if (pf_is_memory(pfn)) {
827 			tcnt++;
828 			if ((*bitfunc)(pfn, mapflag) == 0)
829 				dcnt++; /* dirty count */
830 		}
831 	} while ((pp = page_next(pp)) != page0);
832 
833 	STAT->cs_upage2statef = dcnt;
834 	CPR_DEBUG(CPR_DEBUG9, "cpr_count_upages: dirty=%ld total=%ld\n",
835 	    dcnt, tcnt);
836 	CPR_DEBUG(CPR_DEBUG7, "cpr_count_upages: %ld pages, 0x%lx bytes\n",
837 	    dcnt, mmu_ptob(dcnt));
838 
839 	return (dcnt);
840 }
841 
842 
843 /*
844  * try compressing pages based on cflag,
845  * and for DEBUG kernels, verify uncompressed data checksum;
846  *
847  * this routine replaces common code from
848  * i_cpr_compress_and_save() and cpr_compress_and_write()
849  */
850 char *
851 cpr_compress_pages(cpd_t *dp, pgcnt_t pages, int cflag)
852 {
853 	size_t nbytes, clen, len;
854 	uint32_t test_sum;
855 	char *datap;
856 
857 	nbytes = mmu_ptob(pages);
858 
859 	/*
860 	 * set length to the original uncompressed data size;
861 	 * always init cpd_flag to zero
862 	 */
863 	dp->cpd_length = nbytes;
864 	dp->cpd_flag = 0;
865 
866 #ifdef	DEBUG
867 	/*
868 	 * Make a copy of the uncompressed data so we can checksum it.
869 	 * Compress that copy so the checksum works at the other end
870 	 */
871 	cprbcopy(CPR->c_mapping_area, cpr_pagecopy, nbytes);
872 	dp->cpd_usum = checksum32(cpr_pagecopy, nbytes);
873 	dp->cpd_flag |= CPD_USUM;
874 	datap = cpr_pagecopy;
875 #else
876 	datap = CPR->c_mapping_area;
877 	dp->cpd_usum = 0;
878 #endif
879 
880 	/*
881 	 * try compressing the raw data to cpr_pagedata;
882 	 * if there was a size reduction: record the new length,
883 	 * flag the compression, and point to the compressed data.
884 	 */
885 	dp->cpd_csum = 0;
886 	if (cflag) {
887 		clen = compress(datap, cpr_pagedata, nbytes);
888 		if (clen < nbytes) {
889 			dp->cpd_flag |= CPD_COMPRESS;
890 			dp->cpd_length = clen;
891 			datap = cpr_pagedata;
892 #ifdef	DEBUG
893 			dp->cpd_csum = checksum32(datap, clen);
894 			dp->cpd_flag |= CPD_CSUM;
895 
896 			/*
897 			 * decompress the data back to a scratch area
898 			 * and compare the new checksum with the original
899 			 * checksum to verify the compression.
900 			 */
901 			bzero(cpr_pagecopy, sizeof (cpr_pagecopy));
902 			len = decompress(datap, cpr_pagecopy,
903 			    clen, sizeof (cpr_pagecopy));
904 			test_sum = checksum32(cpr_pagecopy, len);
905 			ASSERT(test_sum == dp->cpd_usum);
906 #endif
907 		}
908 	}
909 
910 	return (datap);
911 }
912 
913 
914 /*
915  * 1. Prepare cpr page descriptor and write it to file
916  * 2. Compress page data and write it out
917  */
918 static int
919 cpr_compress_and_write(vnode_t *vp, uint_t va, pfn_t pfn, pgcnt_t npg)
920 {
921 	int error = 0;
922 	char *datap;
923 	cpd_t cpd;	/* cpr page descriptor */
924 	extern void i_cpr_mapin(caddr_t, uint_t, pfn_t);
925 	extern void i_cpr_mapout(caddr_t, uint_t);
926 
927 	i_cpr_mapin(CPR->c_mapping_area, npg, pfn);
928 
929 	CPR_DEBUG(CPR_DEBUG3, "mapped-in %ld pages, vaddr 0x%p, pfn 0x%lx\n",
930 	    npg, (void *)CPR->c_mapping_area, pfn);
931 
932 	/*
933 	 * Fill cpr page descriptor.
934 	 */
935 	cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC;
936 	cpd.cpd_pfn = pfn;
937 	cpd.cpd_pages = npg;
938 
939 	STAT->cs_dumped_statefsz += mmu_ptob(npg);
940 
941 	datap = cpr_compress_pages(&cpd, npg, CPR->c_flags & C_COMPRESSING);
942 
943 	/* Write cpr page descriptor */
944 	error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd_t));
945 
946 	/* Write compressed page data */
947 	error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length);
948 
949 	/*
950 	 * Unmap the pages for tlb and vac flushing
951 	 */
952 	i_cpr_mapout(CPR->c_mapping_area, npg);
953 
954 	if (error) {
955 		CPR_DEBUG(CPR_DEBUG1,
956 		    "cpr_compress_and_write: vp 0x%p va 0x%x ", (void *)vp, va);
957 		CPR_DEBUG(CPR_DEBUG1, "pfn 0x%lx blk %d err %d\n",
958 		    pfn, cpr_file_bn, error);
959 	} else {
960 		cpr_regular_pgs_dumped += npg;
961 	}
962 
963 	return (error);
964 }
965 
966 
/*
 * Buffered write of size bytes from buffer to the statefile.
 *
 * Data is copied into cpr_buf at cpr_wptr.  Each time the buffer
 * fills up, the whole buffer is written with VOP_DUMP() at block
 * cpr_file_bn, cpr_file_bn is advanced and cpr_wptr is reset to the
 * top of the buffer.  Data that does not fill the buffer stays there
 * until a later call (or cpr_flush_write()) pushes it out.
 *
 * Returns 0 on success, ENOSPC when the next full buffer would not fit
 * on the device or in the file, EINVAL when a disk write is attempted
 * while cpr_disk_writes_ok is zero, or the error from VOP_DUMP().
 */
int
cpr_write(vnode_t *vp, caddr_t buffer, size_t size)
{
	caddr_t	fromp = buffer;
	size_t bytes, wbytes;
	int error;

	/* first use: look up the device size for block devices */
	if (cpr_dev_space == 0) {
		if (vp->v_type == VBLK) {
			cpr_dev_space = cpr_get_devsize(vp->v_rdev);
			ASSERT(cpr_dev_space);
		} else
			cpr_dev_space = 1;	/* not used in this case */
	}

	/*
	 * break the write into multiple part if request is large,
	 * calculate count up to buf page boundary, then write it out.
	 * repeat until done.
	 */
	while (size) {
		bytes = MIN(size, cpr_buf_end - cpr_wptr);
		cprbcopy(fromp, cpr_wptr, bytes);
		cpr_wptr += bytes;
		fromp += bytes;
		size -= bytes;
		if (cpr_wptr < cpr_buf_end)
			return (0);	/* buffer not full yet */
		ASSERT(cpr_wptr == cpr_buf_end);

		/* byte offset of the end of the buffer about to be written */
		wbytes = dbtob(cpr_file_bn + cpr_buf_blocks);
		if (vp->v_type == VBLK) {
			if (wbytes > cpr_dev_space)
				return (ENOSPC);
		} else {
			if (wbytes > VTOI(vp)->i_size)
				return (ENOSPC);
		}

		CPR_DEBUG(CPR_DEBUG3,
		    "cpr_write: frmp=%p wptr=%p cnt=%lx...",
		    (void *)fromp, (void *)cpr_wptr, bytes);
		/*
		 * cross check, this should not happen!
		 */
		if (cpr_disk_writes_ok == 0) {
			prom_printf("cpr_write: disk write too early!\n");
			return (EINVAL);
		}

		/* do_polled_io is set only for the duration of the dump */
		do_polled_io = 1;
		error = VOP_DUMP(vp, cpr_buf, cpr_file_bn, cpr_buf_blocks,
		    NULL);
		do_polled_io = 0;
		CPR_DEBUG(CPR_DEBUG3, "done\n");

		/* note: counted before the VOP_DUMP() result is checked */
		STAT->cs_real_statefsz += cpr_buf_size;

		if (error) {
			cpr_err(CE_WARN, "cpr_write error %d", error);
			return (error);
		}
		cpr_file_bn += cpr_buf_blocks;	/* Increment block count */
		cpr_wptr = cpr_buf;		/* back to top of buffer */
	}
	return (0);
}
1034 
1035 
1036 int
1037 cpr_flush_write(vnode_t *vp)
1038 {
1039 	int	nblk;
1040 	int	error;
1041 
1042 	/*
1043 	 * Calculate remaining blocks in buffer, rounded up to nearest
1044 	 * disk block
1045 	 */
1046 	nblk = btod(cpr_wptr - cpr_buf);
1047 
1048 	do_polled_io = 1;
1049 	error = VOP_DUMP(vp, (caddr_t)cpr_buf, cpr_file_bn, nblk, NULL);
1050 	do_polled_io = 0;
1051 
1052 	cpr_file_bn += nblk;
1053 	if (error)
1054 		CPR_DEBUG(CPR_DEBUG2, "cpr_flush_write: error (%d)\n",
1055 		    error);
1056 	return (error);
1057 }
1058 
1059 void
1060 cpr_clear_bitmaps(void)
1061 {
1062 	cbd_t *dp;
1063 
1064 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
1065 		bzero((void *)dp->cbd_reg_bitmap,
1066 		    (size_t)dp->cbd_size * 2);
1067 	}
1068 	CPR_DEBUG(CPR_DEBUG7, "\ncleared reg and vlt bitmaps\n");
1069 }
1070 
/*
 * Walk the regular bitmaps and process each run ("chunk") of up to
 * CPR_MAXCONTIG consecutively tagged pages, depending on flag:
 *	SAVE_TO_STORAGE		pass the chunk to i_cpr_compress_and_save()
 *	WRITE_TO_STATEFILE	pass the chunk to cpr_compress_and_write()
 *	STORAGE_DESC_ALLOC	only count the chunks
 *
 * Returns the number of chunks for STORAGE_DESC_ALLOC.  For the other
 * flags it returns 0 on success or the first error reported while
 * saving or writing a chunk.  vp is used for WRITE_TO_STATEFILE only.
 */
int
cpr_contig_pages(vnode_t *vp, int flag)
{
	int chunks = 0, error = 0;
	pgcnt_t i, j, totbit;
	pfn_t spfn;
	cbd_t *dp;
	uint_t	spin_cnt = 0;
	extern	int i_cpr_compress_and_save();

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		spfn = dp->cbd_spfn;
		totbit = BTOb(dp->cbd_size);
		i = 0; /* Beginning of bitmap */
		j = 0;
		/* i is the bit where the current scan starts, j the run length */
		while (i < totbit) {
			while ((j < CPR_MAXCONTIG) && ((j + i) < totbit)) {
				if (isset((char *)dp->cbd_reg_bitmap, j+i))
					j++;
				else /* not contiguous anymore */
					break;
			}

			if (j) {
				chunks++;
				if (flag == SAVE_TO_STORAGE) {
					error = i_cpr_compress_and_save(
					    chunks, spfn + i, j);
					if (error)
						return (error);
				} else if (flag == WRITE_TO_STATEFILE) {
					error = cpr_compress_and_write(vp, 0,
					    spfn + i, j);
					if (error)
						return (error);
					else {
						/*
						 * NOTE(review): 0x5F is not a
						 * contiguous low-bit mask, so
						 * the bar is not updated at a
						 * fixed interval -- confirm
						 * the mask is intended.
						 */
						spin_cnt++;
						if ((spin_cnt & 0x5F) == 1)
							cpr_spinning_bar();
					}
				}
			}

			/* continue behind the run that was just handled */
			i += j;
			if (j != CPR_MAXCONTIG) {
				/* Stopped on a non-tagged page */
				i++;
			}

			j = 0;
		}
	}

	if (flag == STORAGE_DESC_ALLOC)
		return (chunks);
	else
		return (0);
}
1129 
1130 
1131 void
1132 cpr_show_range(caddr_t vaddr, size_t size,
1133     int mapflag, bitfunc_t bitfunc, pgcnt_t count)
1134 {
1135 	char *action, *bname;
1136 
1137 	bname = (mapflag == REGULAR_BITMAP) ? "regular" : "volatile";
1138 	if (bitfunc == cpr_setbit)
1139 		action = "tag";
1140 	else if (bitfunc == cpr_clrbit)
1141 		action = "untag";
1142 	else
1143 		action = "none";
1144 	prom_printf("range (0x%p, 0x%p), %s bitmap, %s %ld\n",
1145 	    (void *)vaddr, (void *)(vaddr + size), bname, action, count);
1146 }
1147 
1148 
1149 pgcnt_t
1150 cpr_count_pages(caddr_t sva, size_t size,
1151     int mapflag, bitfunc_t bitfunc, int showrange)
1152 {
1153 	caddr_t	va, eva;
1154 	pfn_t pfn;
1155 	pgcnt_t count = 0;
1156 
1157 	eva = sva + PAGE_ROUNDUP(size);
1158 	for (va = sva; va < eva; va += MMU_PAGESIZE) {
1159 		pfn = va_to_pfn(va);
1160 		if (pfn != PFN_INVALID && pf_is_memory(pfn)) {
1161 			if ((*bitfunc)(pfn, mapflag) == 0)
1162 				count++;
1163 		}
1164 	}
1165 
1166 	if ((cpr_debug & CPR_DEBUG7) && showrange == DBG_SHOWRANGE)
1167 		cpr_show_range(sva, size, mapflag, bitfunc, count);
1168 
1169 	return (count);
1170 }
1171 
1172 
1173 pgcnt_t
1174 cpr_count_volatile_pages(int mapflag, bitfunc_t bitfunc)
1175 {
1176 	pgcnt_t count = 0;
1177 
1178 	if (cpr_buf) {
1179 		count += cpr_count_pages(cpr_buf, cpr_buf_size,
1180 		    mapflag, bitfunc, DBG_SHOWRANGE);
1181 	}
1182 	if (cpr_pagedata) {
1183 		count += cpr_count_pages(cpr_pagedata, cpr_pagedata_size,
1184 		    mapflag, bitfunc, DBG_SHOWRANGE);
1185 	}
1186 	count += i_cpr_count_storage_pages(mapflag, bitfunc);
1187 
1188 	CPR_DEBUG(CPR_DEBUG7, "cpr_count_vpages: %ld pages, 0x%lx bytes\n",
1189 	    count, mmu_ptob(count));
1190 	return (count);
1191 }
1192 
1193 
1194 static int
1195 cpr_dump_regular_pages(vnode_t *vp)
1196 {
1197 	int error;
1198 
1199 	cpr_regular_pgs_dumped = 0;
1200 	error = cpr_contig_pages(vp, WRITE_TO_STATEFILE);
1201 	if (!error)
1202 		CPR_DEBUG(CPR_DEBUG7, "cpr_dump_regular_pages() done.\n");
1203 	return (error);
1204 }
1205 #endif
1206