1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Fill in and write out the cpr state file
28 * 1. Allocate and write headers, ELF and cpr dump header
29 * 2. Allocate bitmaps according to phys_install
30 * 3. Tag kernel pages into corresponding bitmap
31 * 4. Write bitmaps to state file
32 * 5. Write actual physical page data to state file
33 */
34
35 #include <sys/types.h>
36 #include <sys/systm.h>
37 #include <sys/vm.h>
38 #include <sys/memlist.h>
39 #include <sys/kmem.h>
40 #include <sys/vnode.h>
41 #include <sys/fs/ufs_inode.h>
42 #include <sys/errno.h>
43 #include <sys/cmn_err.h>
44 #include <sys/debug.h>
45 #include <vm/page.h>
46 #include <vm/seg.h>
47 #include <vm/seg_kmem.h>
48 #include <vm/seg_kpm.h>
49 #include <vm/hat.h>
50 #include <sys/cpr.h>
51 #include <sys/conf.h>
52 #include <sys/ddi.h>
53 #include <sys/panic.h>
54 #include <sys/thread.h>
55 #include <sys/note.h>
56
57 /* Local defines and variables */
58 #define BTOb(bytes) ((bytes) << 3) /* Bytes to bits, log2(NBBY) */
59 #define bTOB(bits) ((bits) >> 3) /* bits to Bytes, log2(NBBY) */
60
61 #if defined(__sparc)
62 static uint_t cpr_pages_tobe_dumped;
63 static uint_t cpr_regular_pgs_dumped;
64 static int cpr_dump_regular_pages(vnode_t *);
65 static int cpr_count_upages(int, bitfunc_t);
66 static int cpr_compress_and_write(vnode_t *, uint_t, pfn_t, pgcnt_t);
67 #endif
68
69 int cpr_flush_write(vnode_t *);
70
71 int cpr_contig_pages(vnode_t *, int);
72
73 void cpr_clear_bitmaps();
74
75 extern size_t cpr_get_devsize(dev_t);
76 extern int i_cpr_dump_setup(vnode_t *);
77 extern int i_cpr_blockzero(char *, char **, int *, vnode_t *);
78 extern int cpr_test_mode;
79 int cpr_setbit(pfn_t, int);
80 int cpr_clrbit(pfn_t, int);
81
82 ctrm_t cpr_term;
83
84 char *cpr_buf, *cpr_buf_end;
85 int cpr_buf_blocks; /* size of cpr_buf in blocks */
86 size_t cpr_buf_size; /* size of cpr_buf in bytes */
87 size_t cpr_bitmap_size;
88 int cpr_nbitmaps;
89
90 char *cpr_pagedata; /* page buffer for compression / tmp copy */
91 size_t cpr_pagedata_size; /* page buffer size in bytes */
92
93 #if defined(__sparc)
94 static char *cpr_wptr; /* keep track of where to write to next */
95 static int cpr_file_bn; /* cpr state-file block offset */
96 static int cpr_disk_writes_ok;
97 static size_t cpr_dev_space = 0;
98 #endif
99
100 char cpr_pagecopy[CPR_MAXCONTIG * MMU_PAGESIZE];
101
102 #if defined(__sparc)
103 /*
104 * On some platforms bcopy may modify the thread structure
105 * during bcopy (eg, to prevent cpu migration). If the
106 * range we are currently writing out includes our own
107 * thread structure then it will be snapshotted by bcopy
108 * including those modified members - and the updates made
109 * on exit from bcopy will no longer be seen when we later
110 * restore the mid-bcopy kthread_t. So if the range we
111 * need to copy overlaps with our thread structure we will
112 * use a simple byte copy.
113 */
114 void
cprbcopy(void * from,void * to,size_t bytes)115 cprbcopy(void *from, void *to, size_t bytes)
116 {
117 extern int curthreadremapped;
118 caddr_t kthrend;
119
120 kthrend = (caddr_t)curthread + sizeof (kthread_t) - 1;
121 if (curthreadremapped || (kthrend >= (caddr_t)from &&
122 kthrend < (caddr_t)from + bytes + sizeof (kthread_t) - 1)) {
123 caddr_t src = from, dst = to;
124
125 while (bytes-- > 0)
126 *dst++ = *src++;
127 } else {
128 bcopy(from, to, bytes);
129 }
130 }
131
132 /*
133 * Allocate pages for buffers used in writing out the statefile
134 */
135 static int
cpr_alloc_bufs(void)136 cpr_alloc_bufs(void)
137 {
138 char *allocerr = "Unable to allocate memory for cpr buffer";
139 size_t size;
140
141 /*
142 * set the cpr write buffer size to at least the historic
143 * size (128k) or large enough to store the both the early
144 * set of statefile structures (well under 0x800) plus the
145 * bitmaps, and roundup to the next pagesize.
146 */
147 size = PAGE_ROUNDUP(dbtob(4) + cpr_bitmap_size);
148 cpr_buf_size = MAX(size, CPRBUFSZ);
149 cpr_buf_blocks = btodb(cpr_buf_size);
150 cpr_buf = kmem_alloc(cpr_buf_size, KM_NOSLEEP);
151 if (cpr_buf == NULL) {
152 cpr_err(CE_WARN, allocerr);
153 return (ENOMEM);
154 }
155 cpr_buf_end = cpr_buf + cpr_buf_size;
156
157 cpr_pagedata_size = mmu_ptob(CPR_MAXCONTIG + 1);
158 cpr_pagedata = kmem_alloc(cpr_pagedata_size, KM_NOSLEEP);
159 if (cpr_pagedata == NULL) {
160 kmem_free(cpr_buf, cpr_buf_size);
161 cpr_buf = NULL;
162 cpr_err(CE_WARN, allocerr);
163 return (ENOMEM);
164 }
165
166 return (0);
167 }
168
169
170 /*
171 * Set bitmap size in bytes based on phys_install.
172 */
173 void
cpr_set_bitmap_size(void)174 cpr_set_bitmap_size(void)
175 {
176 struct memlist *pmem;
177 size_t size = 0;
178
179 memlist_read_lock();
180 for (pmem = phys_install; pmem; pmem = pmem->ml_next)
181 size += pmem->ml_size;
182 memlist_read_unlock();
183 cpr_bitmap_size = BITMAP_BYTES(size);
184 }
185
186
187 /*
188 * CPR dump header contains the following information:
189 * 1. header magic -- unique to cpr state file
190 * 2. kernel return pc & ppn for resume
191 * 3. current thread info
192 * 4. debug level and test mode
193 * 5. number of bitmaps allocated
194 * 6. number of page records
195 */
196 static int
cpr_write_header(vnode_t * vp)197 cpr_write_header(vnode_t *vp)
198 {
199 extern ushort_t cpr_mach_type;
200 struct cpr_dump_desc cdump;
201 pgcnt_t bitmap_pages;
202 pgcnt_t kpages, vpages, upages;
203 pgcnt_t cpr_count_kpages(int mapflag, bitfunc_t bitfunc);
204
205 cdump.cdd_magic = (uint_t)CPR_DUMP_MAGIC;
206 cdump.cdd_version = CPR_VERSION;
207 cdump.cdd_machine = cpr_mach_type;
208 cdump.cdd_debug = cpr_debug;
209 cdump.cdd_test_mode = cpr_test_mode;
210 cdump.cdd_bitmaprec = cpr_nbitmaps;
211
212 cpr_clear_bitmaps();
213
214 /*
215 * Remember how many pages we plan to save to statefile.
216 * This information will be used for sanity checks.
217 * Untag those pages that will not be saved to statefile.
218 */
219 kpages = cpr_count_kpages(REGULAR_BITMAP, cpr_setbit);
220 vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);
221 upages = cpr_count_upages(REGULAR_BITMAP, cpr_setbit);
222 cdump.cdd_dumppgsize = kpages - vpages + upages;
223 cpr_pages_tobe_dumped = cdump.cdd_dumppgsize;
224 CPR_DEBUG(CPR_DEBUG7,
225 "\ncpr_write_header: kpages %ld - vpages %ld + upages %ld = %d\n",
226 kpages, vpages, upages, cdump.cdd_dumppgsize);
227
228 /*
229 * Some pages contain volatile data (cpr_buf and storage area for
230 * sensitive kpages), which are no longer needed after the statefile
231 * is dumped to disk. We have already untagged them from regular
232 * bitmaps. Now tag them into the volatile bitmaps. The pages in
233 * volatile bitmaps will be claimed during resume, and the resumed
234 * kernel will free them.
235 */
236 (void) cpr_count_volatile_pages(VOLATILE_BITMAP, cpr_setbit);
237
238 bitmap_pages = mmu_btopr(cpr_bitmap_size);
239
240 /*
241 * Export accurate statefile size for statefile allocation retry.
242 * statefile_size = all the headers + total pages +
243 * number of pages used by the bitmaps.
244 * Roundup will be done in the file allocation code.
245 */
246 STAT->cs_nocomp_statefsz = sizeof (cdd_t) + sizeof (cmd_t) +
247 (sizeof (cbd_t) * cdump.cdd_bitmaprec) +
248 (sizeof (cpd_t) * cdump.cdd_dumppgsize) +
249 mmu_ptob(cdump.cdd_dumppgsize + bitmap_pages);
250
251 /*
252 * If the estimated statefile is not big enough,
253 * go retry now to save un-necessary operations.
254 */
255 if (!(CPR->c_flags & C_COMPRESSING) &&
256 (STAT->cs_nocomp_statefsz > STAT->cs_est_statefsz)) {
257 if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
258 prom_printf("cpr_write_header: "
259 "STAT->cs_nocomp_statefsz > "
260 "STAT->cs_est_statefsz\n");
261 return (ENOSPC);
262 }
263
264 /* now write cpr dump descriptor */
265 return (cpr_write(vp, (caddr_t)&cdump, sizeof (cdd_t)));
266 }
267
268
269 /*
270 * CPR dump tail record contains the following information:
271 * 1. header magic -- unique to cpr state file
272 * 2. all misc info that needs to be passed to cprboot or resumed kernel
273 */
274 static int
cpr_write_terminator(vnode_t * vp)275 cpr_write_terminator(vnode_t *vp)
276 {
277 cpr_term.magic = (uint_t)CPR_TERM_MAGIC;
278 cpr_term.va = (cpr_ptr)&cpr_term;
279 cpr_term.pfn = (cpr_ext)va_to_pfn(&cpr_term);
280
281 /* count the last one (flush) */
282 cpr_term.real_statef_size = STAT->cs_real_statefsz +
283 btod(cpr_wptr - cpr_buf) * DEV_BSIZE;
284
285 CPR_DEBUG(CPR_DEBUG9, "cpr_dump: Real Statefile Size: %ld\n",
286 STAT->cs_real_statefsz);
287
288 cpr_tod_get(&cpr_term.tm_shutdown);
289
290 return (cpr_write(vp, (caddr_t)&cpr_term, sizeof (cpr_term)));
291 }
292
293 /*
294 * Write bitmap descriptor array, followed by merged bitmaps.
295 */
296 static int
cpr_write_bitmap(vnode_t * vp)297 cpr_write_bitmap(vnode_t *vp)
298 {
299 char *rmap, *vmap, *dst, *tail;
300 size_t size, bytes;
301 cbd_t *dp;
302 int err;
303
304 dp = CPR->c_bmda;
305 if (err = cpr_write(vp, (caddr_t)dp, cpr_nbitmaps * sizeof (*dp)))
306 return (err);
307
308 /*
309 * merge regular and volatile bitmaps into tmp space
310 * and write to disk
311 */
312 for (; dp->cbd_size; dp++) {
313 rmap = (char *)dp->cbd_reg_bitmap;
314 vmap = (char *)dp->cbd_vlt_bitmap;
315 for (size = dp->cbd_size; size; size -= bytes) {
316 bytes = min(size, sizeof (cpr_pagecopy));
317 tail = &cpr_pagecopy[bytes];
318 for (dst = cpr_pagecopy; dst < tail; dst++)
319 *dst = *rmap++ | *vmap++;
320 if (err = cpr_write(vp, cpr_pagecopy, bytes))
321 break;
322 }
323 }
324
325 return (err);
326 }
327
328
329 static int
cpr_write_statefile(vnode_t * vp)330 cpr_write_statefile(vnode_t *vp)
331 {
332 uint_t error = 0;
333 extern int i_cpr_check_pgs_dumped();
334 void flush_windows(void);
335 pgcnt_t spages;
336 char *str;
337
338 flush_windows();
339
340 /*
341 * to get an accurate view of kas, we need to untag sensitive
342 * pages *before* dumping them because the disk driver makes
343 * allocations and changes kas along the way. The remaining
344 * pages referenced in the bitmaps are dumped out later as
345 * regular kpages.
346 */
347 str = "cpr_write_statefile:";
348 spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_clrbit);
349 CPR_DEBUG(CPR_DEBUG7, "%s untag %ld sens pages\n", str, spages);
350
351 /*
352 * now it's OK to call a driver that makes allocations
353 */
354 cpr_disk_writes_ok = 1;
355
356 /*
357 * now write out the clean sensitive kpages
358 * according to the sensitive descriptors
359 */
360 error = i_cpr_dump_sensitive_kpages(vp);
361 if (error) {
362 CPR_DEBUG(CPR_DEBUG7,
363 "%s cpr_dump_sensitive_kpages() failed!\n", str);
364 return (error);
365 }
366
367 /*
368 * cpr_dump_regular_pages() counts cpr_regular_pgs_dumped
369 */
370 error = cpr_dump_regular_pages(vp);
371 if (error) {
372 CPR_DEBUG(CPR_DEBUG7,
373 "%s cpr_dump_regular_pages() failed!\n", str);
374 return (error);
375 }
376
377 /*
378 * sanity check to verify the right number of pages were dumped
379 */
380 error = i_cpr_check_pgs_dumped(cpr_pages_tobe_dumped,
381 cpr_regular_pgs_dumped);
382
383 if (error) {
384 prom_printf("\n%s page count mismatch!\n", str);
385 #ifdef DEBUG
386 if (cpr_test_mode)
387 debug_enter(NULL);
388 #endif
389 }
390
391 return (error);
392 }
393 #endif
394
395
396 /*
397 * creates the CPR state file, the following sections are
398 * written out in sequence:
399 * - writes the cpr dump header
400 * - writes the memory usage bitmaps
401 * - writes the platform dependent info
402 * - writes the remaining user pages
403 * - writes the kernel pages
404 */
405 #if defined(__x86)
406 _NOTE(ARGSUSED(0))
407 #endif
408 int
cpr_dump(vnode_t * vp)409 cpr_dump(vnode_t *vp)
410 {
411 #if defined(__sparc)
412 int error;
413
414 if (cpr_buf == NULL) {
415 ASSERT(cpr_pagedata == NULL);
416 if (error = cpr_alloc_bufs())
417 return (error);
418 }
419 /* point to top of internal buffer */
420 cpr_wptr = cpr_buf;
421
422 /* initialize global variables used by the write operation */
423 cpr_file_bn = cpr_statefile_offset();
424 cpr_dev_space = 0;
425
426 /* allocate bitmaps */
427 if (CPR->c_bmda == NULL) {
428 if (error = i_cpr_alloc_bitmaps()) {
429 cpr_err(CE_WARN, "cannot allocate bitmaps");
430 return (error);
431 }
432 }
433
434 if (error = i_cpr_prom_pages(CPR_PROM_SAVE))
435 return (error);
436
437 if (error = i_cpr_dump_setup(vp))
438 return (error);
439
440 /*
441 * set internal cross checking; we dont want to call
442 * a disk driver that makes allocations until after
443 * sensitive pages are saved
444 */
445 cpr_disk_writes_ok = 0;
446
447 /*
448 * 1253112: heap corruption due to memory allocation when dumpping
449 * statefile.
450 * Theoretically on Sun4u only the kernel data nucleus, kvalloc and
451 * kvseg segments can be contaminated should memory allocations happen
452 * during sddump, which is not supposed to happen after the system
453 * is quiesced. Let's call the kernel pages that tend to be affected
454 * 'sensitive kpages' here. To avoid saving inconsistent pages, we
455 * will allocate some storage space to save the clean sensitive pages
456 * aside before statefile dumping takes place. Since there may not be
457 * much memory left at this stage, the sensitive pages will be
458 * compressed before they are saved into the storage area.
459 */
460 if (error = i_cpr_save_sensitive_kpages()) {
461 CPR_DEBUG(CPR_DEBUG7,
462 "cpr_dump: save_sensitive_kpages failed!\n");
463 return (error);
464 }
465
466 /*
467 * since all cpr allocations are done (space for sensitive kpages,
468 * bitmaps, cpr_buf), kas is stable, and now we can accurately
469 * count regular and sensitive kpages.
470 */
471 if (error = cpr_write_header(vp)) {
472 CPR_DEBUG(CPR_DEBUG7,
473 "cpr_dump: cpr_write_header() failed!\n");
474 return (error);
475 }
476
477 if (error = i_cpr_write_machdep(vp))
478 return (error);
479
480 if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, NULL, NULL))
481 return (error);
482
483 if (error = cpr_write_bitmap(vp))
484 return (error);
485
486 if (error = cpr_write_statefile(vp)) {
487 CPR_DEBUG(CPR_DEBUG7,
488 "cpr_dump: cpr_write_statefile() failed!\n");
489 return (error);
490 }
491
492 if (error = cpr_write_terminator(vp))
493 return (error);
494
495 if (error = cpr_flush_write(vp))
496 return (error);
497
498 if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, &cpr_file_bn, vp))
499 return (error);
500 #endif
501
502 return (0);
503 }
504
505
506 #if defined(__sparc)
507 /*
508 * cpr_xwalk() is called many 100x with a range within kvseg or kvseg_reloc;
509 * a page-count from each range is accumulated at arg->pages.
510 */
511 static void
cpr_xwalk(void * arg,void * base,size_t size)512 cpr_xwalk(void *arg, void *base, size_t size)
513 {
514 struct cpr_walkinfo *cwip = arg;
515
516 cwip->pages += cpr_count_pages(base, size,
517 cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
518 cwip->size += size;
519 cwip->ranges++;
520 }
521
522 /*
523 * cpr_walk() is called many 100x with a range within kvseg or kvseg_reloc;
524 * a page-count from each range is accumulated at arg->pages.
525 */
526 static void
cpr_walk(void * arg,void * base,size_t size)527 cpr_walk(void *arg, void *base, size_t size)
528 {
529 caddr_t addr = base;
530 caddr_t addr_end = addr + size;
531
532 /*
533 * If we are about to start walking the range of addresses we
534 * carved out of the kernel heap for the large page heap walk
535 * heap_lp_arena to find what segments are actually populated
536 */
537 if (SEGKMEM_USE_LARGEPAGES &&
538 addr == heap_lp_base && addr_end == heap_lp_end &&
539 vmem_size(heap_lp_arena, VMEM_ALLOC) < size) {
540 vmem_walk(heap_lp_arena, VMEM_ALLOC, cpr_xwalk, arg);
541 } else {
542 cpr_xwalk(arg, base, size);
543 }
544 }
545
546
547 /*
548 * faster scan of kvseg using vmem_walk() to visit
549 * allocated ranges.
550 */
551 pgcnt_t
cpr_scan_kvseg(int mapflag,bitfunc_t bitfunc,struct seg * seg)552 cpr_scan_kvseg(int mapflag, bitfunc_t bitfunc, struct seg *seg)
553 {
554 struct cpr_walkinfo cwinfo;
555
556 bzero(&cwinfo, sizeof (cwinfo));
557 cwinfo.mapflag = mapflag;
558 cwinfo.bitfunc = bitfunc;
559
560 vmem_walk(heap_arena, VMEM_ALLOC, cpr_walk, &cwinfo);
561
562 if (cpr_debug & CPR_DEBUG7) {
563 prom_printf("walked %d sub-ranges, total pages %ld\n",
564 cwinfo.ranges, mmu_btop(cwinfo.size));
565 cpr_show_range(seg->s_base, seg->s_size,
566 mapflag, bitfunc, cwinfo.pages);
567 }
568
569 return (cwinfo.pages);
570 }
571
572
573 /*
574 * cpr_walk_kpm() is called for every used area within the large
575 * segkpm virtual address window. A page-count is accumulated at
576 * arg->pages.
577 */
578 static void
cpr_walk_kpm(void * arg,void * base,size_t size)579 cpr_walk_kpm(void *arg, void *base, size_t size)
580 {
581 struct cpr_walkinfo *cwip = arg;
582
583 cwip->pages += cpr_count_pages(base, size,
584 cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
585 cwip->size += size;
586 cwip->ranges++;
587 }
588
589
590 /*
591 * faster scan of segkpm using hat_kpm_walk() to visit only used ranges.
592 */
593 /*ARGSUSED*/
594 static pgcnt_t
cpr_scan_segkpm(int mapflag,bitfunc_t bitfunc,struct seg * seg)595 cpr_scan_segkpm(int mapflag, bitfunc_t bitfunc, struct seg *seg)
596 {
597 struct cpr_walkinfo cwinfo;
598
599 if (kpm_enable == 0)
600 return (0);
601
602 bzero(&cwinfo, sizeof (cwinfo));
603 cwinfo.mapflag = mapflag;
604 cwinfo.bitfunc = bitfunc;
605 hat_kpm_walk(cpr_walk_kpm, &cwinfo);
606
607 if (cpr_debug & CPR_DEBUG7) {
608 prom_printf("walked %d sub-ranges, total pages %ld\n",
609 cwinfo.ranges, mmu_btop(cwinfo.size));
610 cpr_show_range(segkpm->s_base, segkpm->s_size,
611 mapflag, bitfunc, cwinfo.pages);
612 }
613
614 return (cwinfo.pages);
615 }
616
617
618 /*
619 * Sparsely filled kernel segments are registered in kseg_table for
620 * easier lookup. See also block comment for cpr_count_seg_pages.
621 */
622
623 #define KSEG_SEG_ADDR 0 /* address of struct seg */
624 #define KSEG_PTR_ADDR 1 /* address of pointer to struct seg */
625
626 typedef struct {
627 struct seg **st_seg; /* segment pointer or segment address */
628 pgcnt_t (*st_fcn)(int, bitfunc_t, struct seg *); /* function to call */
629 int st_addrtype; /* address type in st_seg */
630 } ksegtbl_entry_t;
631
632 ksegtbl_entry_t kseg_table[] = {
633 {(struct seg **)&kvseg, cpr_scan_kvseg, KSEG_SEG_ADDR},
634 {&segkpm, cpr_scan_segkpm, KSEG_PTR_ADDR},
635 {NULL, 0, 0}
636 };
637
638
639 /*
640 * Compare seg with each entry in kseg_table; when there is a match
641 * return the entry pointer, otherwise return NULL.
642 */
643 static ksegtbl_entry_t *
cpr_sparse_seg_check(struct seg * seg)644 cpr_sparse_seg_check(struct seg *seg)
645 {
646 ksegtbl_entry_t *ste = &kseg_table[0];
647 struct seg *tseg;
648
649 for (; ste->st_seg; ste++) {
650 tseg = (ste->st_addrtype == KSEG_PTR_ADDR) ?
651 *ste->st_seg : (struct seg *)ste->st_seg;
652
653 if (seg == tseg)
654 return (ste);
655 }
656
657 return ((ksegtbl_entry_t *)NULL);
658 }
659
660
661 /*
662 * Count pages within each kernel segment; call cpr_sparse_seg_check()
663 * to find out whether a sparsely filled segment needs special
664 * treatment (e.g. kvseg).
665 * Todo: A "SEGOP_CPR" like SEGOP_DUMP should be introduced, the cpr
666 * module shouldn't need to know segment details like if it is
667 * sparsely filled or not (makes kseg_table obsolete).
668 */
669 pgcnt_t
cpr_count_seg_pages(int mapflag,bitfunc_t bitfunc)670 cpr_count_seg_pages(int mapflag, bitfunc_t bitfunc)
671 {
672 struct seg *segp;
673 pgcnt_t pages;
674 ksegtbl_entry_t *ste;
675
676 pages = 0;
677 for (segp = AS_SEGFIRST(&kas); segp; segp = AS_SEGNEXT(&kas, segp)) {
678 if (ste = cpr_sparse_seg_check(segp)) {
679 pages += (ste->st_fcn)(mapflag, bitfunc, segp);
680 } else {
681 pages += cpr_count_pages(segp->s_base,
682 segp->s_size, mapflag, bitfunc, DBG_SHOWRANGE);
683 }
684 }
685
686 return (pages);
687 }
688
689
690 /*
691 * count kernel pages within kas and any special ranges
692 */
693 pgcnt_t
cpr_count_kpages(int mapflag,bitfunc_t bitfunc)694 cpr_count_kpages(int mapflag, bitfunc_t bitfunc)
695 {
696 pgcnt_t kas_cnt;
697
698 /*
699 * Some pages need to be taken care of differently.
700 * eg: panicbuf pages of sun4m are not in kas but they need
701 * to be saved. On sun4u, the physical pages of panicbuf are
702 * allocated via prom_retain().
703 */
704 kas_cnt = i_cpr_count_special_kpages(mapflag, bitfunc);
705 kas_cnt += cpr_count_seg_pages(mapflag, bitfunc);
706
707 CPR_DEBUG(CPR_DEBUG9, "cpr_count_kpages: kas_cnt=%ld\n", kas_cnt);
708 CPR_DEBUG(CPR_DEBUG7, "\ncpr_count_kpages: %ld pages, 0x%lx bytes\n",
709 kas_cnt, mmu_ptob(kas_cnt));
710
711 return (kas_cnt);
712 }
713
714
715 /*
716 * Set a bit corresponding to the arg phys page number;
717 * returns 0 when the ppn is valid and the corresponding
718 * map bit was clear, otherwise returns 1.
719 */
720 int
cpr_setbit(pfn_t ppn,int mapflag)721 cpr_setbit(pfn_t ppn, int mapflag)
722 {
723 char *bitmap;
724 cbd_t *dp;
725 pfn_t rel;
726 int clr;
727
728 for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
729 if (PPN_IN_RANGE(ppn, dp)) {
730 bitmap = DESC_TO_MAP(dp, mapflag);
731 rel = ppn - dp->cbd_spfn;
732 if ((clr = isclr(bitmap, rel)) != 0)
733 setbit(bitmap, rel);
734 return (clr == 0);
735 }
736 }
737
738 return (1);
739 }
740
741
742 /*
743 * Clear a bit corresponding to the arg phys page number.
744 */
745 int
cpr_clrbit(pfn_t ppn,int mapflag)746 cpr_clrbit(pfn_t ppn, int mapflag)
747 {
748 char *bitmap;
749 cbd_t *dp;
750 pfn_t rel;
751 int set;
752
753 for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
754 if (PPN_IN_RANGE(ppn, dp)) {
755 bitmap = DESC_TO_MAP(dp, mapflag);
756 rel = ppn - dp->cbd_spfn;
757 if ((set = isset(bitmap, rel)) != 0)
758 clrbit(bitmap, rel);
759 return (set == 0);
760 }
761 }
762
763 return (1);
764 }
765
766
767 /* ARGSUSED */
768 int
cpr_nobit(pfn_t ppn,int mapflag)769 cpr_nobit(pfn_t ppn, int mapflag)
770 {
771 return (0);
772 }
773
774
775 /*
776 * Lookup a bit corresponding to the arg phys page number.
777 */
778 int
cpr_isset(pfn_t ppn,int mapflag)779 cpr_isset(pfn_t ppn, int mapflag)
780 {
781 char *bitmap;
782 cbd_t *dp;
783 pfn_t rel;
784
785 for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
786 if (PPN_IN_RANGE(ppn, dp)) {
787 bitmap = DESC_TO_MAP(dp, mapflag);
788 rel = ppn - dp->cbd_spfn;
789 return (isset(bitmap, rel));
790 }
791 }
792
793 return (0);
794 }
795
796
797 /*
798 * Go thru all pages and pick up any page not caught during the invalidation
799 * stage. This is also used to save pages with cow lock or phys page lock held
800 * (none zero p_lckcnt or p_cowcnt)
801 */
802 static int
cpr_count_upages(int mapflag,bitfunc_t bitfunc)803 cpr_count_upages(int mapflag, bitfunc_t bitfunc)
804 {
805 page_t *pp, *page0;
806 pgcnt_t dcnt = 0, tcnt = 0;
807 pfn_t pfn;
808
809 page0 = pp = page_first();
810
811 do {
812 if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
813 PP_ISFREE(pp) && PP_ISAGED(pp))
814 continue;
815
816 pfn = page_pptonum(pp);
817 if (pf_is_memory(pfn)) {
818 tcnt++;
819 if ((*bitfunc)(pfn, mapflag) == 0)
820 dcnt++; /* dirty count */
821 }
822 } while ((pp = page_next(pp)) != page0);
823
824 STAT->cs_upage2statef = dcnt;
825 CPR_DEBUG(CPR_DEBUG9, "cpr_count_upages: dirty=%ld total=%ld\n",
826 dcnt, tcnt);
827 CPR_DEBUG(CPR_DEBUG7, "cpr_count_upages: %ld pages, 0x%lx bytes\n",
828 dcnt, mmu_ptob(dcnt));
829 page0 = NULL; /* for Lint */
830 return (dcnt);
831 }
832
833
834 /*
835 * try compressing pages based on cflag,
836 * and for DEBUG kernels, verify uncompressed data checksum;
837 *
838 * this routine replaces common code from
839 * i_cpr_compress_and_save() and cpr_compress_and_write()
840 */
841 char *
cpr_compress_pages(cpd_t * dp,pgcnt_t pages,int cflag)842 cpr_compress_pages(cpd_t *dp, pgcnt_t pages, int cflag)
843 {
844 size_t nbytes, clen, len;
845 uint32_t test_sum;
846 char *datap;
847
848 nbytes = mmu_ptob(pages);
849
850 /*
851 * set length to the original uncompressed data size;
852 * always init cpd_flag to zero
853 */
854 dp->cpd_length = nbytes;
855 dp->cpd_flag = 0;
856
857 #ifdef DEBUG
858 /*
859 * Make a copy of the uncompressed data so we can checksum it.
860 * Compress that copy so the checksum works at the other end
861 */
862 cprbcopy(CPR->c_mapping_area, cpr_pagecopy, nbytes);
863 dp->cpd_usum = checksum32(cpr_pagecopy, nbytes);
864 dp->cpd_flag |= CPD_USUM;
865 datap = cpr_pagecopy;
866 #else
867 datap = CPR->c_mapping_area;
868 dp->cpd_usum = 0;
869 #endif
870
871 /*
872 * try compressing the raw data to cpr_pagedata;
873 * if there was a size reduction: record the new length,
874 * flag the compression, and point to the compressed data.
875 */
876 dp->cpd_csum = 0;
877 if (cflag) {
878 clen = compress(datap, cpr_pagedata, nbytes);
879 if (clen < nbytes) {
880 dp->cpd_flag |= CPD_COMPRESS;
881 dp->cpd_length = clen;
882 datap = cpr_pagedata;
883 #ifdef DEBUG
884 dp->cpd_csum = checksum32(datap, clen);
885 dp->cpd_flag |= CPD_CSUM;
886
887 /*
888 * decompress the data back to a scratch area
889 * and compare the new checksum with the original
890 * checksum to verify the compression.
891 */
892 bzero(cpr_pagecopy, sizeof (cpr_pagecopy));
893 len = decompress(datap, cpr_pagecopy,
894 clen, sizeof (cpr_pagecopy));
895 test_sum = checksum32(cpr_pagecopy, len);
896 ASSERT(test_sum == dp->cpd_usum);
897 #endif
898 }
899 }
900
901 return (datap);
902 }
903
904
905 /*
906 * 1. Prepare cpr page descriptor and write it to file
907 * 2. Compress page data and write it out
908 */
909 static int
cpr_compress_and_write(vnode_t * vp,uint_t va,pfn_t pfn,pgcnt_t npg)910 cpr_compress_and_write(vnode_t *vp, uint_t va, pfn_t pfn, pgcnt_t npg)
911 {
912 int error = 0;
913 char *datap;
914 cpd_t cpd; /* cpr page descriptor */
915 extern void i_cpr_mapin(caddr_t, uint_t, pfn_t);
916 extern void i_cpr_mapout(caddr_t, uint_t);
917
918 i_cpr_mapin(CPR->c_mapping_area, npg, pfn);
919
920 CPR_DEBUG(CPR_DEBUG3, "mapped-in %ld pages, vaddr 0x%p, pfn 0x%lx\n",
921 npg, (void *)CPR->c_mapping_area, pfn);
922
923 /*
924 * Fill cpr page descriptor.
925 */
926 cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC;
927 cpd.cpd_pfn = pfn;
928 cpd.cpd_pages = npg;
929
930 STAT->cs_dumped_statefsz += mmu_ptob(npg);
931
932 datap = cpr_compress_pages(&cpd, npg, CPR->c_flags & C_COMPRESSING);
933
934 /* Write cpr page descriptor */
935 error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd_t));
936
937 /* Write compressed page data */
938 error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length);
939
940 /*
941 * Unmap the pages for tlb and vac flushing
942 */
943 i_cpr_mapout(CPR->c_mapping_area, npg);
944
945 if (error) {
946 CPR_DEBUG(CPR_DEBUG1,
947 "cpr_compress_and_write: vp 0x%p va 0x%x ", (void *)vp, va);
948 CPR_DEBUG(CPR_DEBUG1, "pfn 0x%lx blk %d err %d\n",
949 pfn, cpr_file_bn, error);
950 } else {
951 cpr_regular_pgs_dumped += npg;
952 }
953
954 return (error);
955 }
956
957
958 int
cpr_write(vnode_t * vp,caddr_t buffer,size_t size)959 cpr_write(vnode_t *vp, caddr_t buffer, size_t size)
960 {
961 caddr_t fromp = buffer;
962 size_t bytes, wbytes;
963 int error;
964
965 if (cpr_dev_space == 0) {
966 if (vp->v_type == VBLK) {
967 cpr_dev_space = cpr_get_devsize(vp->v_rdev);
968 ASSERT(cpr_dev_space);
969 } else
970 cpr_dev_space = 1; /* not used in this case */
971 }
972
973 /*
974 * break the write into multiple part if request is large,
975 * calculate count up to buf page boundary, then write it out.
976 * repeat until done.
977 */
978 while (size) {
979 bytes = MIN(size, cpr_buf_end - cpr_wptr);
980 cprbcopy(fromp, cpr_wptr, bytes);
981 cpr_wptr += bytes;
982 fromp += bytes;
983 size -= bytes;
984 if (cpr_wptr < cpr_buf_end)
985 return (0); /* buffer not full yet */
986 ASSERT(cpr_wptr == cpr_buf_end);
987
988 wbytes = dbtob(cpr_file_bn + cpr_buf_blocks);
989 if (vp->v_type == VBLK) {
990 if (wbytes > cpr_dev_space)
991 return (ENOSPC);
992 } else {
993 if (wbytes > VTOI(vp)->i_size)
994 return (ENOSPC);
995 }
996
997 CPR_DEBUG(CPR_DEBUG3,
998 "cpr_write: frmp=%p wptr=%p cnt=%lx...",
999 (void *)fromp, (void *)cpr_wptr, bytes);
1000 /*
1001 * cross check, this should not happen!
1002 */
1003 if (cpr_disk_writes_ok == 0) {
1004 prom_printf("cpr_write: disk write too early!\n");
1005 return (EINVAL);
1006 }
1007
1008 do_polled_io = 1;
1009 error = VOP_DUMP(vp, cpr_buf, cpr_file_bn, cpr_buf_blocks,
1010 NULL);
1011 do_polled_io = 0;
1012 CPR_DEBUG(CPR_DEBUG3, "done\n");
1013
1014 STAT->cs_real_statefsz += cpr_buf_size;
1015
1016 if (error) {
1017 cpr_err(CE_WARN, "cpr_write error %d", error);
1018 return (error);
1019 }
1020 cpr_file_bn += cpr_buf_blocks; /* Increment block count */
1021 cpr_wptr = cpr_buf; /* back to top of buffer */
1022 }
1023 return (0);
1024 }
1025
1026
1027 int
cpr_flush_write(vnode_t * vp)1028 cpr_flush_write(vnode_t *vp)
1029 {
1030 int nblk;
1031 int error;
1032
1033 /*
1034 * Calculate remaining blocks in buffer, rounded up to nearest
1035 * disk block
1036 */
1037 nblk = btod(cpr_wptr - cpr_buf);
1038
1039 do_polled_io = 1;
1040 error = VOP_DUMP(vp, (caddr_t)cpr_buf, cpr_file_bn, nblk, NULL);
1041 do_polled_io = 0;
1042
1043 cpr_file_bn += nblk;
1044 if (error)
1045 CPR_DEBUG(CPR_DEBUG2, "cpr_flush_write: error (%d)\n",
1046 error);
1047 return (error);
1048 }
1049
1050 void
cpr_clear_bitmaps(void)1051 cpr_clear_bitmaps(void)
1052 {
1053 cbd_t *dp;
1054
1055 for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
1056 bzero((void *)dp->cbd_reg_bitmap,
1057 (size_t)dp->cbd_size * 2);
1058 }
1059 CPR_DEBUG(CPR_DEBUG7, "\ncleared reg and vlt bitmaps\n");
1060 }
1061
1062 int
cpr_contig_pages(vnode_t * vp,int flag)1063 cpr_contig_pages(vnode_t *vp, int flag)
1064 {
1065 int chunks = 0, error = 0;
1066 pgcnt_t i, j, totbit;
1067 pfn_t spfn;
1068 cbd_t *dp;
1069 uint_t spin_cnt = 0;
1070 extern int i_cpr_compress_and_save();
1071
1072 for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
1073 spfn = dp->cbd_spfn;
1074 totbit = BTOb(dp->cbd_size);
1075 i = 0; /* Beginning of bitmap */
1076 j = 0;
1077 while (i < totbit) {
1078 while ((j < CPR_MAXCONTIG) && ((j + i) < totbit)) {
1079 if (isset((char *)dp->cbd_reg_bitmap, j+i))
1080 j++;
1081 else /* not contiguous anymore */
1082 break;
1083 }
1084
1085 if (j) {
1086 chunks++;
1087 if (flag == SAVE_TO_STORAGE) {
1088 error = i_cpr_compress_and_save(
1089 chunks, spfn + i, j);
1090 if (error)
1091 return (error);
1092 } else if (flag == WRITE_TO_STATEFILE) {
1093 error = cpr_compress_and_write(vp, 0,
1094 spfn + i, j);
1095 if (error)
1096 return (error);
1097 else {
1098 spin_cnt++;
1099 if ((spin_cnt & 0x5F) == 1)
1100 cpr_spinning_bar();
1101 }
1102 }
1103 }
1104
1105 i += j;
1106 if (j != CPR_MAXCONTIG) {
1107 /* Stopped on a non-tagged page */
1108 i++;
1109 }
1110
1111 j = 0;
1112 }
1113 }
1114
1115 if (flag == STORAGE_DESC_ALLOC)
1116 return (chunks);
1117 else
1118 return (0);
1119 }
1120
1121
1122 void
cpr_show_range(caddr_t vaddr,size_t size,int mapflag,bitfunc_t bitfunc,pgcnt_t count)1123 cpr_show_range(caddr_t vaddr, size_t size,
1124 int mapflag, bitfunc_t bitfunc, pgcnt_t count)
1125 {
1126 char *action, *bname;
1127
1128 bname = (mapflag == REGULAR_BITMAP) ? "regular" : "volatile";
1129 if (bitfunc == cpr_setbit)
1130 action = "tag";
1131 else if (bitfunc == cpr_clrbit)
1132 action = "untag";
1133 else
1134 action = "none";
1135 prom_printf("range (0x%p, 0x%p), %s bitmap, %s %ld\n",
1136 (void *)vaddr, (void *)(vaddr + size), bname, action, count);
1137 }
1138
1139
1140 pgcnt_t
cpr_count_pages(caddr_t sva,size_t size,int mapflag,bitfunc_t bitfunc,int showrange)1141 cpr_count_pages(caddr_t sva, size_t size,
1142 int mapflag, bitfunc_t bitfunc, int showrange)
1143 {
1144 caddr_t va, eva;
1145 pfn_t pfn;
1146 pgcnt_t count = 0;
1147
1148 eva = sva + PAGE_ROUNDUP(size);
1149 for (va = sva; va < eva; va += MMU_PAGESIZE) {
1150 pfn = va_to_pfn(va);
1151 if (pfn != PFN_INVALID && pf_is_memory(pfn)) {
1152 if ((*bitfunc)(pfn, mapflag) == 0)
1153 count++;
1154 }
1155 }
1156
1157 if ((cpr_debug & CPR_DEBUG7) && showrange == DBG_SHOWRANGE)
1158 cpr_show_range(sva, size, mapflag, bitfunc, count);
1159
1160 return (count);
1161 }
1162
1163
1164 pgcnt_t
cpr_count_volatile_pages(int mapflag,bitfunc_t bitfunc)1165 cpr_count_volatile_pages(int mapflag, bitfunc_t bitfunc)
1166 {
1167 pgcnt_t count = 0;
1168
1169 if (cpr_buf) {
1170 count += cpr_count_pages(cpr_buf, cpr_buf_size,
1171 mapflag, bitfunc, DBG_SHOWRANGE);
1172 }
1173 if (cpr_pagedata) {
1174 count += cpr_count_pages(cpr_pagedata, cpr_pagedata_size,
1175 mapflag, bitfunc, DBG_SHOWRANGE);
1176 }
1177 count += i_cpr_count_storage_pages(mapflag, bitfunc);
1178
1179 CPR_DEBUG(CPR_DEBUG7, "cpr_count_vpages: %ld pages, 0x%lx bytes\n",
1180 count, mmu_ptob(count));
1181 return (count);
1182 }
1183
1184
1185 static int
cpr_dump_regular_pages(vnode_t * vp)1186 cpr_dump_regular_pages(vnode_t *vp)
1187 {
1188 int error;
1189
1190 cpr_regular_pgs_dumped = 0;
1191 error = cpr_contig_pages(vp, WRITE_TO_STATEFILE);
1192 if (!error)
1193 CPR_DEBUG(CPR_DEBUG7, "cpr_dump_regular_pages() done.\n");
1194 return (error);
1195 }
1196 #endif
1197