1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Fill in and write out the cpr state file 28 * 1. Allocate and write headers, ELF and cpr dump header 29 * 2. Allocate bitmaps according to phys_install 30 * 3. Tag kernel pages into corresponding bitmap 31 * 4. Write bitmaps to state file 32 * 5. Write actual physical page data to state file 33 */ 34 35 #include <sys/types.h> 36 #include <sys/systm.h> 37 #include <sys/vm.h> 38 #include <sys/memlist.h> 39 #include <sys/kmem.h> 40 #include <sys/vnode.h> 41 #include <sys/fs/ufs_inode.h> 42 #include <sys/errno.h> 43 #include <sys/cmn_err.h> 44 #include <sys/debug.h> 45 #include <vm/page.h> 46 #include <vm/seg.h> 47 #include <vm/seg_kmem.h> 48 #include <vm/seg_kpm.h> 49 #include <vm/hat.h> 50 #include <sys/cpr.h> 51 #include <sys/conf.h> 52 #include <sys/ddi.h> 53 #include <sys/panic.h> 54 #include <sys/thread.h> 55 #include <sys/note.h> 56 57 /* Local defines and variables */ 58 #define BTOb(bytes) ((bytes) << 3) /* Bytes to bits, log2(NBBY) */ 59 #define bTOB(bits) ((bits) >> 3) /* bits to Bytes, log2(NBBY) */ 60 61 #if defined(__sparc) 62 static uint_t cpr_pages_tobe_dumped; 63 static uint_t cpr_regular_pgs_dumped; 64 static int cpr_dump_regular_pages(vnode_t *); 65 static int cpr_count_upages(int, bitfunc_t); 66 static int cpr_compress_and_write(vnode_t *, uint_t, pfn_t, pgcnt_t); 67 #endif 68 69 int cpr_flush_write(vnode_t *); 70 71 int cpr_contig_pages(vnode_t *, int); 72 73 void cpr_clear_bitmaps(); 74 75 extern size_t cpr_get_devsize(dev_t); 76 extern int i_cpr_dump_setup(vnode_t *); 77 extern int i_cpr_blockzero(char *, char **, int *, vnode_t *); 78 extern int cpr_test_mode; 79 int cpr_setbit(pfn_t, int); 80 int cpr_clrbit(pfn_t, int); 81 82 ctrm_t cpr_term; 83 84 char *cpr_buf, *cpr_buf_end; 85 int cpr_buf_blocks; /* size of cpr_buf in blocks */ 86 size_t cpr_buf_size; /* size of cpr_buf in bytes */ 87 size_t cpr_bitmap_size; 88 int cpr_nbitmaps; 89 90 char *cpr_pagedata; /* page buffer for compression / tmp copy */ 91 size_t cpr_pagedata_size; /* page buffer size in bytes */ 92 93 #if defined(__sparc) 94 static char *cpr_wptr; /* keep track of where to write to next */ 95 static int cpr_file_bn; /* cpr state-file block offset */ 96 static int cpr_disk_writes_ok; 97 static size_t cpr_dev_space = 0; 98 #endif 99 100 char cpr_pagecopy[CPR_MAXCONTIG * MMU_PAGESIZE]; 101 102 #if defined(__sparc) 103 /* 104 * On some platforms bcopy may modify the thread structure 105 * during bcopy (eg, to prevent cpu migration). If the 106 * range we are currently writing out includes our own 107 * thread structure then it will be snapshotted by bcopy 108 * including those modified members - and the updates made 109 * on exit from bcopy will no longer be seen when we later 110 * restore the mid-bcopy kthread_t. So if the range we 111 * need to copy overlaps with our thread structure we will 112 * use a simple byte copy. 113 */ 114 void 115 cprbcopy(void *from, void *to, size_t bytes) 116 { 117 extern int curthreadremapped; 118 caddr_t kthrend; 119 120 kthrend = (caddr_t)curthread + sizeof (kthread_t) - 1; 121 if (curthreadremapped || (kthrend >= (caddr_t)from && 122 kthrend < (caddr_t)from + bytes + sizeof (kthread_t) - 1)) { 123 caddr_t src = from, dst = to; 124 125 while (bytes-- > 0) 126 *dst++ = *src++; 127 } else { 128 bcopy(from, to, bytes); 129 } 130 } 131 132 /* 133 * Allocate pages for buffers used in writing out the statefile 134 */ 135 static int 136 cpr_alloc_bufs(void) 137 { 138 char *allocerr = "Unable to allocate memory for cpr buffer"; 139 size_t size; 140 141 /* 142 * set the cpr write buffer size to at least the historic 143 * size (128k) or large enough to store the both the early 144 * set of statefile structures (well under 0x800) plus the 145 * bitmaps, and roundup to the next pagesize. 146 */ 147 size = PAGE_ROUNDUP(dbtob(4) + cpr_bitmap_size); 148 cpr_buf_size = MAX(size, CPRBUFSZ); 149 cpr_buf_blocks = btodb(cpr_buf_size); 150 cpr_buf = kmem_alloc(cpr_buf_size, KM_NOSLEEP); 151 if (cpr_buf == NULL) { 152 cpr_err(CE_WARN, allocerr); 153 return (ENOMEM); 154 } 155 cpr_buf_end = cpr_buf + cpr_buf_size; 156 157 cpr_pagedata_size = mmu_ptob(CPR_MAXCONTIG + 1); 158 cpr_pagedata = kmem_alloc(cpr_pagedata_size, KM_NOSLEEP); 159 if (cpr_pagedata == NULL) { 160 kmem_free(cpr_buf, cpr_buf_size); 161 cpr_buf = NULL; 162 cpr_err(CE_WARN, allocerr); 163 return (ENOMEM); 164 } 165 166 return (0); 167 } 168 169 170 /* 171 * Set bitmap size in bytes based on phys_install. 172 */ 173 void 174 cpr_set_bitmap_size(void) 175 { 176 struct memlist *pmem; 177 size_t size = 0; 178 179 memlist_read_lock(); 180 for (pmem = phys_install; pmem; pmem = pmem->next) 181 size += pmem->size; 182 memlist_read_unlock(); 183 cpr_bitmap_size = BITMAP_BYTES(size); 184 } 185 186 187 /* 188 * CPR dump header contains the following information: 189 * 1. header magic -- unique to cpr state file 190 * 2. kernel return pc & ppn for resume 191 * 3. current thread info 192 * 4. debug level and test mode 193 * 5. number of bitmaps allocated 194 * 6. number of page records 195 */ 196 static int 197 cpr_write_header(vnode_t *vp) 198 { 199 extern ushort_t cpr_mach_type; 200 struct cpr_dump_desc cdump; 201 pgcnt_t bitmap_pages; 202 pgcnt_t kpages, vpages, upages; 203 pgcnt_t cpr_count_kpages(int mapflag, bitfunc_t bitfunc); 204 205 cdump.cdd_magic = (uint_t)CPR_DUMP_MAGIC; 206 cdump.cdd_version = CPR_VERSION; 207 cdump.cdd_machine = cpr_mach_type; 208 cdump.cdd_debug = cpr_debug; 209 cdump.cdd_test_mode = cpr_test_mode; 210 cdump.cdd_bitmaprec = cpr_nbitmaps; 211 212 cpr_clear_bitmaps(); 213 214 /* 215 * Remember how many pages we plan to save to statefile. 216 * This information will be used for sanity checks. 217 * Untag those pages that will not be saved to statefile. 218 */ 219 kpages = cpr_count_kpages(REGULAR_BITMAP, cpr_setbit); 220 vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit); 221 upages = cpr_count_upages(REGULAR_BITMAP, cpr_setbit); 222 cdump.cdd_dumppgsize = kpages - vpages + upages; 223 cpr_pages_tobe_dumped = cdump.cdd_dumppgsize; 224 CPR_DEBUG(CPR_DEBUG7, 225 "\ncpr_write_header: kpages %ld - vpages %ld + upages %ld = %d\n", 226 kpages, vpages, upages, cdump.cdd_dumppgsize); 227 228 /* 229 * Some pages contain volatile data (cpr_buf and storage area for 230 * sensitive kpages), which are no longer needed after the statefile 231 * is dumped to disk. We have already untagged them from regular 232 * bitmaps. Now tag them into the volatile bitmaps. The pages in 233 * volatile bitmaps will be claimed during resume, and the resumed 234 * kernel will free them. 235 */ 236 (void) cpr_count_volatile_pages(VOLATILE_BITMAP, cpr_setbit); 237 238 bitmap_pages = mmu_btopr(cpr_bitmap_size); 239 240 /* 241 * Export accurate statefile size for statefile allocation retry. 242 * statefile_size = all the headers + total pages + 243 * number of pages used by the bitmaps. 244 * Roundup will be done in the file allocation code. 245 */ 246 STAT->cs_nocomp_statefsz = sizeof (cdd_t) + sizeof (cmd_t) + 247 (sizeof (cbd_t) * cdump.cdd_bitmaprec) + 248 (sizeof (cpd_t) * cdump.cdd_dumppgsize) + 249 mmu_ptob(cdump.cdd_dumppgsize + bitmap_pages); 250 251 /* 252 * If the estimated statefile is not big enough, 253 * go retry now to save un-necessary operations. 254 */ 255 if (!(CPR->c_flags & C_COMPRESSING) && 256 (STAT->cs_nocomp_statefsz > STAT->cs_est_statefsz)) { 257 if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7)) 258 prom_printf("cpr_write_header: " 259 "STAT->cs_nocomp_statefsz > " 260 "STAT->cs_est_statefsz\n"); 261 return (ENOSPC); 262 } 263 264 /* now write cpr dump descriptor */ 265 return (cpr_write(vp, (caddr_t)&cdump, sizeof (cdd_t))); 266 } 267 268 269 /* 270 * CPR dump tail record contains the following information: 271 * 1. header magic -- unique to cpr state file 272 * 2. all misc info that needs to be passed to cprboot or resumed kernel 273 */ 274 static int 275 cpr_write_terminator(vnode_t *vp) 276 { 277 cpr_term.magic = (uint_t)CPR_TERM_MAGIC; 278 cpr_term.va = (cpr_ptr)&cpr_term; 279 cpr_term.pfn = (cpr_ext)va_to_pfn(&cpr_term); 280 281 /* count the last one (flush) */ 282 cpr_term.real_statef_size = STAT->cs_real_statefsz + 283 btod(cpr_wptr - cpr_buf) * DEV_BSIZE; 284 285 CPR_DEBUG(CPR_DEBUG9, "cpr_dump: Real Statefile Size: %ld\n", 286 STAT->cs_real_statefsz); 287 288 cpr_tod_get(&cpr_term.tm_shutdown); 289 290 return (cpr_write(vp, (caddr_t)&cpr_term, sizeof (cpr_term))); 291 } 292 293 /* 294 * Write bitmap descriptor array, followed by merged bitmaps. 295 */ 296 static int 297 cpr_write_bitmap(vnode_t *vp) 298 { 299 char *rmap, *vmap, *dst, *tail; 300 size_t size, bytes; 301 cbd_t *dp; 302 int err; 303 304 dp = CPR->c_bmda; 305 if (err = cpr_write(vp, (caddr_t)dp, cpr_nbitmaps * sizeof (*dp))) 306 return (err); 307 308 /* 309 * merge regular and volatile bitmaps into tmp space 310 * and write to disk 311 */ 312 for (; dp->cbd_size; dp++) { 313 rmap = (char *)dp->cbd_reg_bitmap; 314 vmap = (char *)dp->cbd_vlt_bitmap; 315 for (size = dp->cbd_size; size; size -= bytes) { 316 bytes = min(size, sizeof (cpr_pagecopy)); 317 tail = &cpr_pagecopy[bytes]; 318 for (dst = cpr_pagecopy; dst < tail; dst++) 319 *dst = *rmap++ | *vmap++; 320 if (err = cpr_write(vp, cpr_pagecopy, bytes)) 321 break; 322 } 323 } 324 325 return (err); 326 } 327 328 329 static int 330 cpr_write_statefile(vnode_t *vp) 331 { 332 uint_t error = 0; 333 extern int i_cpr_check_pgs_dumped(); 334 void flush_windows(void); 335 pgcnt_t spages; 336 char *str; 337 338 flush_windows(); 339 340 /* 341 * to get an accurate view of kas, we need to untag sensitive 342 * pages *before* dumping them because the disk driver makes 343 * allocations and changes kas along the way. The remaining 344 * pages referenced in the bitmaps are dumped out later as 345 * regular kpages. 346 */ 347 str = "cpr_write_statefile:"; 348 spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_clrbit); 349 CPR_DEBUG(CPR_DEBUG7, "%s untag %ld sens pages\n", str, spages); 350 351 /* 352 * now it's OK to call a driver that makes allocations 353 */ 354 cpr_disk_writes_ok = 1; 355 356 /* 357 * now write out the clean sensitive kpages 358 * according to the sensitive descriptors 359 */ 360 error = i_cpr_dump_sensitive_kpages(vp); 361 if (error) { 362 CPR_DEBUG(CPR_DEBUG7, 363 "%s cpr_dump_sensitive_kpages() failed!\n", str); 364 return (error); 365 } 366 367 /* 368 * cpr_dump_regular_pages() counts cpr_regular_pgs_dumped 369 */ 370 error = cpr_dump_regular_pages(vp); 371 if (error) { 372 CPR_DEBUG(CPR_DEBUG7, 373 "%s cpr_dump_regular_pages() failed!\n", str); 374 return (error); 375 } 376 377 /* 378 * sanity check to verify the right number of pages were dumped 379 */ 380 error = i_cpr_check_pgs_dumped(cpr_pages_tobe_dumped, 381 cpr_regular_pgs_dumped); 382 383 if (error) { 384 prom_printf("\n%s page count mismatch!\n", str); 385 #ifdef DEBUG 386 if (cpr_test_mode) 387 debug_enter(NULL); 388 #endif 389 } 390 391 return (error); 392 } 393 #endif 394 395 396 /* 397 * creates the CPR state file, the following sections are 398 * written out in sequence: 399 * - writes the cpr dump header 400 * - writes the memory usage bitmaps 401 * - writes the platform dependent info 402 * - writes the remaining user pages 403 * - writes the kernel pages 404 */ 405 #if defined(__x86) 406 _NOTE(ARGSUSED(0)) 407 #endif 408 int 409 cpr_dump(vnode_t *vp) 410 { 411 #if defined(__sparc) 412 int error; 413 414 if (cpr_buf == NULL) { 415 ASSERT(cpr_pagedata == NULL); 416 if (error = cpr_alloc_bufs()) 417 return (error); 418 } 419 /* point to top of internal buffer */ 420 cpr_wptr = cpr_buf; 421 422 /* initialize global variables used by the write operation */ 423 cpr_file_bn = cpr_statefile_offset(); 424 cpr_dev_space = 0; 425 426 /* allocate bitmaps */ 427 if (CPR->c_bmda == NULL) { 428 if (error = i_cpr_alloc_bitmaps()) { 429 cpr_err(CE_WARN, "cannot allocate bitmaps"); 430 return (error); 431 } 432 } 433 434 if (error = i_cpr_prom_pages(CPR_PROM_SAVE)) 435 return (error); 436 437 if (error = i_cpr_dump_setup(vp)) 438 return (error); 439 440 /* 441 * set internal cross checking; we dont want to call 442 * a disk driver that makes allocations until after 443 * sensitive pages are saved 444 */ 445 cpr_disk_writes_ok = 0; 446 447 /* 448 * 1253112: heap corruption due to memory allocation when dumpping 449 * statefile. 450 * Theoretically on Sun4u only the kernel data nucleus, kvalloc and 451 * kvseg segments can be contaminated should memory allocations happen 452 * during sddump, which is not supposed to happen after the system 453 * is quiesced. Let's call the kernel pages that tend to be affected 454 * 'sensitive kpages' here. To avoid saving inconsistent pages, we 455 * will allocate some storage space to save the clean sensitive pages 456 * aside before statefile dumping takes place. Since there may not be 457 * much memory left at this stage, the sensitive pages will be 458 * compressed before they are saved into the storage area. 459 */ 460 if (error = i_cpr_save_sensitive_kpages()) { 461 CPR_DEBUG(CPR_DEBUG7, 462 "cpr_dump: save_sensitive_kpages failed!\n"); 463 return (error); 464 } 465 466 /* 467 * since all cpr allocations are done (space for sensitive kpages, 468 * bitmaps, cpr_buf), kas is stable, and now we can accurately 469 * count regular and sensitive kpages. 470 */ 471 if (error = cpr_write_header(vp)) { 472 CPR_DEBUG(CPR_DEBUG7, 473 "cpr_dump: cpr_write_header() failed!\n"); 474 return (error); 475 } 476 477 if (error = i_cpr_write_machdep(vp)) 478 return (error); 479 480 if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, NULL, NULL)) 481 return (error); 482 483 if (error = cpr_write_bitmap(vp)) 484 return (error); 485 486 if (error = cpr_write_statefile(vp)) { 487 CPR_DEBUG(CPR_DEBUG7, 488 "cpr_dump: cpr_write_statefile() failed!\n"); 489 return (error); 490 } 491 492 if (error = cpr_write_terminator(vp)) 493 return (error); 494 495 if (error = cpr_flush_write(vp)) 496 return (error); 497 498 if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, &cpr_file_bn, vp)) 499 return (error); 500 #endif 501 502 return (0); 503 } 504 505 506 #if defined(__sparc) 507 /* 508 * cpr_xwalk() is called many 100x with a range within kvseg or kvseg_reloc; 509 * a page-count from each range is accumulated at arg->pages. 510 */ 511 static void 512 cpr_xwalk(void *arg, void *base, size_t size) 513 { 514 struct cpr_walkinfo *cwip = arg; 515 516 cwip->pages += cpr_count_pages(base, size, 517 cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE); 518 cwip->size += size; 519 cwip->ranges++; 520 } 521 522 /* 523 * cpr_walk() is called many 100x with a range within kvseg or kvseg_reloc; 524 * a page-count from each range is accumulated at arg->pages. 525 */ 526 static void 527 cpr_walk(void *arg, void *base, size_t size) 528 { 529 caddr_t addr = base; 530 caddr_t addr_end = addr + size; 531 532 /* 533 * If we are about to start walking the range of addresses we 534 * carved out of the kernel heap for the large page heap walk 535 * heap_lp_arena to find what segments are actually populated 536 */ 537 if (SEGKMEM_USE_LARGEPAGES && 538 addr == heap_lp_base && addr_end == heap_lp_end && 539 vmem_size(heap_lp_arena, VMEM_ALLOC) < size) { 540 vmem_walk(heap_lp_arena, VMEM_ALLOC, cpr_xwalk, arg); 541 } else { 542 cpr_xwalk(arg, base, size); 543 } 544 } 545 546 547 /* 548 * faster scan of kvseg using vmem_walk() to visit 549 * allocated ranges. 550 */ 551 pgcnt_t 552 cpr_scan_kvseg(int mapflag, bitfunc_t bitfunc, struct seg *seg) 553 { 554 struct cpr_walkinfo cwinfo; 555 556 bzero(&cwinfo, sizeof (cwinfo)); 557 cwinfo.mapflag = mapflag; 558 cwinfo.bitfunc = bitfunc; 559 560 vmem_walk(heap_arena, VMEM_ALLOC, cpr_walk, &cwinfo); 561 562 if (cpr_debug & CPR_DEBUG7) { 563 prom_printf("walked %d sub-ranges, total pages %ld\n", 564 cwinfo.ranges, mmu_btop(cwinfo.size)); 565 cpr_show_range(seg->s_base, seg->s_size, 566 mapflag, bitfunc, cwinfo.pages); 567 } 568 569 return (cwinfo.pages); 570 } 571 572 573 /* 574 * cpr_walk_kpm() is called for every used area within the large 575 * segkpm virtual address window. A page-count is accumulated at 576 * arg->pages. 577 */ 578 static void 579 cpr_walk_kpm(void *arg, void *base, size_t size) 580 { 581 struct cpr_walkinfo *cwip = arg; 582 583 cwip->pages += cpr_count_pages(base, size, 584 cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE); 585 cwip->size += size; 586 cwip->ranges++; 587 } 588 589 590 /* 591 * faster scan of segkpm using hat_kpm_walk() to visit only used ranges. 592 */ 593 /*ARGSUSED*/ 594 static pgcnt_t 595 cpr_scan_segkpm(int mapflag, bitfunc_t bitfunc, struct seg *seg) 596 { 597 struct cpr_walkinfo cwinfo; 598 599 if (kpm_enable == 0) 600 return (0); 601 602 bzero(&cwinfo, sizeof (cwinfo)); 603 cwinfo.mapflag = mapflag; 604 cwinfo.bitfunc = bitfunc; 605 hat_kpm_walk(cpr_walk_kpm, &cwinfo); 606 607 if (cpr_debug & CPR_DEBUG7) { 608 prom_printf("walked %d sub-ranges, total pages %ld\n", 609 cwinfo.ranges, mmu_btop(cwinfo.size)); 610 cpr_show_range(segkpm->s_base, segkpm->s_size, 611 mapflag, bitfunc, cwinfo.pages); 612 } 613 614 return (cwinfo.pages); 615 } 616 617 618 /* 619 * Sparsely filled kernel segments are registered in kseg_table for 620 * easier lookup. See also block comment for cpr_count_seg_pages. 621 */ 622 623 #define KSEG_SEG_ADDR 0 /* address of struct seg */ 624 #define KSEG_PTR_ADDR 1 /* address of pointer to struct seg */ 625 626 typedef struct { 627 struct seg **st_seg; /* segment pointer or segment address */ 628 pgcnt_t (*st_fcn)(int, bitfunc_t, struct seg *); /* function to call */ 629 int st_addrtype; /* address type in st_seg */ 630 } ksegtbl_entry_t; 631 632 ksegtbl_entry_t kseg_table[] = { 633 {(struct seg **)&kvseg, cpr_scan_kvseg, KSEG_SEG_ADDR}, 634 {&segkpm, cpr_scan_segkpm, KSEG_PTR_ADDR}, 635 {NULL, 0, 0} 636 }; 637 638 639 /* 640 * Compare seg with each entry in kseg_table; when there is a match 641 * return the entry pointer, otherwise return NULL. 642 */ 643 static ksegtbl_entry_t * 644 cpr_sparse_seg_check(struct seg *seg) 645 { 646 ksegtbl_entry_t *ste = &kseg_table[0]; 647 struct seg *tseg; 648 649 for (; ste->st_seg; ste++) { 650 tseg = (ste->st_addrtype == KSEG_PTR_ADDR) ? 651 *ste->st_seg : (struct seg *)ste->st_seg; 652 653 if (seg == tseg) 654 return (ste); 655 } 656 657 return ((ksegtbl_entry_t *)NULL); 658 } 659 660 661 /* 662 * Count pages within each kernel segment; call cpr_sparse_seg_check() 663 * to find out whether a sparsely filled segment needs special 664 * treatment (e.g. kvseg). 665 * Todo: A "SEGOP_CPR" like SEGOP_DUMP should be introduced, the cpr 666 * module shouldn't need to know segment details like if it is 667 * sparsely filled or not (makes kseg_table obsolete). 668 */ 669 pgcnt_t 670 cpr_count_seg_pages(int mapflag, bitfunc_t bitfunc) 671 { 672 struct seg *segp; 673 pgcnt_t pages; 674 ksegtbl_entry_t *ste; 675 676 pages = 0; 677 for (segp = AS_SEGFIRST(&kas); segp; segp = AS_SEGNEXT(&kas, segp)) { 678 if (ste = cpr_sparse_seg_check(segp)) { 679 pages += (ste->st_fcn)(mapflag, bitfunc, segp); 680 } else { 681 pages += cpr_count_pages(segp->s_base, 682 segp->s_size, mapflag, bitfunc, DBG_SHOWRANGE); 683 } 684 } 685 686 return (pages); 687 } 688 689 690 /* 691 * count kernel pages within kas and any special ranges 692 */ 693 pgcnt_t 694 cpr_count_kpages(int mapflag, bitfunc_t bitfunc) 695 { 696 pgcnt_t kas_cnt; 697 698 /* 699 * Some pages need to be taken care of differently. 700 * eg: panicbuf pages of sun4m are not in kas but they need 701 * to be saved. On sun4u, the physical pages of panicbuf are 702 * allocated via prom_retain(). 703 */ 704 kas_cnt = i_cpr_count_special_kpages(mapflag, bitfunc); 705 kas_cnt += cpr_count_seg_pages(mapflag, bitfunc); 706 707 CPR_DEBUG(CPR_DEBUG9, "cpr_count_kpages: kas_cnt=%ld\n", kas_cnt); 708 CPR_DEBUG(CPR_DEBUG7, "\ncpr_count_kpages: %ld pages, 0x%lx bytes\n", 709 kas_cnt, mmu_ptob(kas_cnt)); 710 711 return (kas_cnt); 712 } 713 714 715 /* 716 * Set a bit corresponding to the arg phys page number; 717 * returns 0 when the ppn is valid and the corresponding 718 * map bit was clear, otherwise returns 1. 719 */ 720 int 721 cpr_setbit(pfn_t ppn, int mapflag) 722 { 723 char *bitmap; 724 cbd_t *dp; 725 pfn_t rel; 726 int clr; 727 728 for (dp = CPR->c_bmda; dp->cbd_size; dp++) { 729 if (PPN_IN_RANGE(ppn, dp)) { 730 bitmap = DESC_TO_MAP(dp, mapflag); 731 rel = ppn - dp->cbd_spfn; 732 if ((clr = isclr(bitmap, rel)) != 0) 733 setbit(bitmap, rel); 734 return (clr == 0); 735 } 736 } 737 738 return (1); 739 } 740 741 742 /* 743 * Clear a bit corresponding to the arg phys page number. 744 */ 745 int 746 cpr_clrbit(pfn_t ppn, int mapflag) 747 { 748 char *bitmap; 749 cbd_t *dp; 750 pfn_t rel; 751 int set; 752 753 for (dp = CPR->c_bmda; dp->cbd_size; dp++) { 754 if (PPN_IN_RANGE(ppn, dp)) { 755 bitmap = DESC_TO_MAP(dp, mapflag); 756 rel = ppn - dp->cbd_spfn; 757 if ((set = isset(bitmap, rel)) != 0) 758 clrbit(bitmap, rel); 759 return (set == 0); 760 } 761 } 762 763 return (1); 764 } 765 766 767 /* ARGSUSED */ 768 int 769 cpr_nobit(pfn_t ppn, int mapflag) 770 { 771 return (0); 772 } 773 774 775 /* 776 * Lookup a bit corresponding to the arg phys page number. 777 */ 778 int 779 cpr_isset(pfn_t ppn, int mapflag) 780 { 781 char *bitmap; 782 cbd_t *dp; 783 pfn_t rel; 784 785 for (dp = CPR->c_bmda; dp->cbd_size; dp++) { 786 if (PPN_IN_RANGE(ppn, dp)) { 787 bitmap = DESC_TO_MAP(dp, mapflag); 788 rel = ppn - dp->cbd_spfn; 789 return (isset(bitmap, rel)); 790 } 791 } 792 793 return (0); 794 } 795 796 797 /* 798 * Go thru all pages and pick up any page not caught during the invalidation 799 * stage. This is also used to save pages with cow lock or phys page lock held 800 * (none zero p_lckcnt or p_cowcnt) 801 */ 802 static int 803 cpr_count_upages(int mapflag, bitfunc_t bitfunc) 804 { 805 page_t *pp, *page0; 806 pgcnt_t dcnt = 0, tcnt = 0; 807 pfn_t pfn; 808 809 page0 = pp = page_first(); 810 811 do { 812 if (pp->p_vnode == NULL || PP_ISKAS(pp) || 813 PP_ISFREE(pp) && PP_ISAGED(pp)) 814 continue; 815 816 pfn = page_pptonum(pp); 817 if (pf_is_memory(pfn)) { 818 tcnt++; 819 if ((*bitfunc)(pfn, mapflag) == 0) 820 dcnt++; /* dirty count */ 821 } 822 } while ((pp = page_next(pp)) != page0); 823 824 STAT->cs_upage2statef = dcnt; 825 CPR_DEBUG(CPR_DEBUG9, "cpr_count_upages: dirty=%ld total=%ld\n", 826 dcnt, tcnt); 827 CPR_DEBUG(CPR_DEBUG7, "cpr_count_upages: %ld pages, 0x%lx bytes\n", 828 dcnt, mmu_ptob(dcnt)); 829 page0 = NULL; /* for Lint */ 830 return (dcnt); 831 } 832 833 834 /* 835 * try compressing pages based on cflag, 836 * and for DEBUG kernels, verify uncompressed data checksum; 837 * 838 * this routine replaces common code from 839 * i_cpr_compress_and_save() and cpr_compress_and_write() 840 */ 841 char * 842 cpr_compress_pages(cpd_t *dp, pgcnt_t pages, int cflag) 843 { 844 size_t nbytes, clen, len; 845 uint32_t test_sum; 846 char *datap; 847 848 nbytes = mmu_ptob(pages); 849 850 /* 851 * set length to the original uncompressed data size; 852 * always init cpd_flag to zero 853 */ 854 dp->cpd_length = nbytes; 855 dp->cpd_flag = 0; 856 857 #ifdef DEBUG 858 /* 859 * Make a copy of the uncompressed data so we can checksum it. 860 * Compress that copy so the checksum works at the other end 861 */ 862 cprbcopy(CPR->c_mapping_area, cpr_pagecopy, nbytes); 863 dp->cpd_usum = checksum32(cpr_pagecopy, nbytes); 864 dp->cpd_flag |= CPD_USUM; 865 datap = cpr_pagecopy; 866 #else 867 datap = CPR->c_mapping_area; 868 dp->cpd_usum = 0; 869 #endif 870 871 /* 872 * try compressing the raw data to cpr_pagedata; 873 * if there was a size reduction: record the new length, 874 * flag the compression, and point to the compressed data. 875 */ 876 dp->cpd_csum = 0; 877 if (cflag) { 878 clen = compress(datap, cpr_pagedata, nbytes); 879 if (clen < nbytes) { 880 dp->cpd_flag |= CPD_COMPRESS; 881 dp->cpd_length = clen; 882 datap = cpr_pagedata; 883 #ifdef DEBUG 884 dp->cpd_csum = checksum32(datap, clen); 885 dp->cpd_flag |= CPD_CSUM; 886 887 /* 888 * decompress the data back to a scratch area 889 * and compare the new checksum with the original 890 * checksum to verify the compression. 891 */ 892 bzero(cpr_pagecopy, sizeof (cpr_pagecopy)); 893 len = decompress(datap, cpr_pagecopy, 894 clen, sizeof (cpr_pagecopy)); 895 test_sum = checksum32(cpr_pagecopy, len); 896 ASSERT(test_sum == dp->cpd_usum); 897 #endif 898 } 899 } 900 901 return (datap); 902 } 903 904 905 /* 906 * 1. Prepare cpr page descriptor and write it to file 907 * 2. Compress page data and write it out 908 */ 909 static int 910 cpr_compress_and_write(vnode_t *vp, uint_t va, pfn_t pfn, pgcnt_t npg) 911 { 912 int error = 0; 913 char *datap; 914 cpd_t cpd; /* cpr page descriptor */ 915 extern void i_cpr_mapin(caddr_t, uint_t, pfn_t); 916 extern void i_cpr_mapout(caddr_t, uint_t); 917 918 i_cpr_mapin(CPR->c_mapping_area, npg, pfn); 919 920 CPR_DEBUG(CPR_DEBUG3, "mapped-in %ld pages, vaddr 0x%p, pfn 0x%lx\n", 921 npg, (void *)CPR->c_mapping_area, pfn); 922 923 /* 924 * Fill cpr page descriptor. 925 */ 926 cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC; 927 cpd.cpd_pfn = pfn; 928 cpd.cpd_pages = npg; 929 930 STAT->cs_dumped_statefsz += mmu_ptob(npg); 931 932 datap = cpr_compress_pages(&cpd, npg, CPR->c_flags & C_COMPRESSING); 933 934 /* Write cpr page descriptor */ 935 error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd_t)); 936 937 /* Write compressed page data */ 938 error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length); 939 940 /* 941 * Unmap the pages for tlb and vac flushing 942 */ 943 i_cpr_mapout(CPR->c_mapping_area, npg); 944 945 if (error) { 946 CPR_DEBUG(CPR_DEBUG1, 947 "cpr_compress_and_write: vp 0x%p va 0x%x ", (void *)vp, va); 948 CPR_DEBUG(CPR_DEBUG1, "pfn 0x%lx blk %d err %d\n", 949 pfn, cpr_file_bn, error); 950 } else { 951 cpr_regular_pgs_dumped += npg; 952 } 953 954 return (error); 955 } 956 957 958 int 959 cpr_write(vnode_t *vp, caddr_t buffer, size_t size) 960 { 961 caddr_t fromp = buffer; 962 size_t bytes, wbytes; 963 int error; 964 965 if (cpr_dev_space == 0) { 966 if (vp->v_type == VBLK) { 967 cpr_dev_space = cpr_get_devsize(vp->v_rdev); 968 ASSERT(cpr_dev_space); 969 } else 970 cpr_dev_space = 1; /* not used in this case */ 971 } 972 973 /* 974 * break the write into multiple part if request is large, 975 * calculate count up to buf page boundary, then write it out. 976 * repeat until done. 977 */ 978 while (size) { 979 bytes = MIN(size, cpr_buf_end - cpr_wptr); 980 cprbcopy(fromp, cpr_wptr, bytes); 981 cpr_wptr += bytes; 982 fromp += bytes; 983 size -= bytes; 984 if (cpr_wptr < cpr_buf_end) 985 return (0); /* buffer not full yet */ 986 ASSERT(cpr_wptr == cpr_buf_end); 987 988 wbytes = dbtob(cpr_file_bn + cpr_buf_blocks); 989 if (vp->v_type == VBLK) { 990 if (wbytes > cpr_dev_space) 991 return (ENOSPC); 992 } else { 993 if (wbytes > VTOI(vp)->i_size) 994 return (ENOSPC); 995 } 996 997 CPR_DEBUG(CPR_DEBUG3, 998 "cpr_write: frmp=%p wptr=%p cnt=%lx...", 999 (void *)fromp, (void *)cpr_wptr, bytes); 1000 /* 1001 * cross check, this should not happen! 1002 */ 1003 if (cpr_disk_writes_ok == 0) { 1004 prom_printf("cpr_write: disk write too early!\n"); 1005 return (EINVAL); 1006 } 1007 1008 do_polled_io = 1; 1009 error = VOP_DUMP(vp, cpr_buf, cpr_file_bn, cpr_buf_blocks, 1010 NULL); 1011 do_polled_io = 0; 1012 CPR_DEBUG(CPR_DEBUG3, "done\n"); 1013 1014 STAT->cs_real_statefsz += cpr_buf_size; 1015 1016 if (error) { 1017 cpr_err(CE_WARN, "cpr_write error %d", error); 1018 return (error); 1019 } 1020 cpr_file_bn += cpr_buf_blocks; /* Increment block count */ 1021 cpr_wptr = cpr_buf; /* back to top of buffer */ 1022 } 1023 return (0); 1024 } 1025 1026 1027 int 1028 cpr_flush_write(vnode_t *vp) 1029 { 1030 int nblk; 1031 int error; 1032 1033 /* 1034 * Calculate remaining blocks in buffer, rounded up to nearest 1035 * disk block 1036 */ 1037 nblk = btod(cpr_wptr - cpr_buf); 1038 1039 do_polled_io = 1; 1040 error = VOP_DUMP(vp, (caddr_t)cpr_buf, cpr_file_bn, nblk, NULL); 1041 do_polled_io = 0; 1042 1043 cpr_file_bn += nblk; 1044 if (error) 1045 CPR_DEBUG(CPR_DEBUG2, "cpr_flush_write: error (%d)\n", 1046 error); 1047 return (error); 1048 } 1049 1050 void 1051 cpr_clear_bitmaps(void) 1052 { 1053 cbd_t *dp; 1054 1055 for (dp = CPR->c_bmda; dp->cbd_size; dp++) { 1056 bzero((void *)dp->cbd_reg_bitmap, 1057 (size_t)dp->cbd_size * 2); 1058 } 1059 CPR_DEBUG(CPR_DEBUG7, "\ncleared reg and vlt bitmaps\n"); 1060 } 1061 1062 int 1063 cpr_contig_pages(vnode_t *vp, int flag) 1064 { 1065 int chunks = 0, error = 0; 1066 pgcnt_t i, j, totbit; 1067 pfn_t spfn; 1068 cbd_t *dp; 1069 uint_t spin_cnt = 0; 1070 extern int i_cpr_compress_and_save(); 1071 1072 for (dp = CPR->c_bmda; dp->cbd_size; dp++) { 1073 spfn = dp->cbd_spfn; 1074 totbit = BTOb(dp->cbd_size); 1075 i = 0; /* Beginning of bitmap */ 1076 j = 0; 1077 while (i < totbit) { 1078 while ((j < CPR_MAXCONTIG) && ((j + i) < totbit)) { 1079 if (isset((char *)dp->cbd_reg_bitmap, j+i)) 1080 j++; 1081 else /* not contiguous anymore */ 1082 break; 1083 } 1084 1085 if (j) { 1086 chunks++; 1087 if (flag == SAVE_TO_STORAGE) { 1088 error = i_cpr_compress_and_save( 1089 chunks, spfn + i, j); 1090 if (error) 1091 return (error); 1092 } else if (flag == WRITE_TO_STATEFILE) { 1093 error = cpr_compress_and_write(vp, 0, 1094 spfn + i, j); 1095 if (error) 1096 return (error); 1097 else { 1098 spin_cnt++; 1099 if ((spin_cnt & 0x5F) == 1) 1100 cpr_spinning_bar(); 1101 } 1102 } 1103 } 1104 1105 i += j; 1106 if (j != CPR_MAXCONTIG) { 1107 /* Stopped on a non-tagged page */ 1108 i++; 1109 } 1110 1111 j = 0; 1112 } 1113 } 1114 1115 if (flag == STORAGE_DESC_ALLOC) 1116 return (chunks); 1117 else 1118 return (0); 1119 } 1120 1121 1122 void 1123 cpr_show_range(caddr_t vaddr, size_t size, 1124 int mapflag, bitfunc_t bitfunc, pgcnt_t count) 1125 { 1126 char *action, *bname; 1127 1128 bname = (mapflag == REGULAR_BITMAP) ? "regular" : "volatile"; 1129 if (bitfunc == cpr_setbit) 1130 action = "tag"; 1131 else if (bitfunc == cpr_clrbit) 1132 action = "untag"; 1133 else 1134 action = "none"; 1135 prom_printf("range (0x%p, 0x%p), %s bitmap, %s %ld\n", 1136 (void *)vaddr, (void *)(vaddr + size), bname, action, count); 1137 } 1138 1139 1140 pgcnt_t 1141 cpr_count_pages(caddr_t sva, size_t size, 1142 int mapflag, bitfunc_t bitfunc, int showrange) 1143 { 1144 caddr_t va, eva; 1145 pfn_t pfn; 1146 pgcnt_t count = 0; 1147 1148 eva = sva + PAGE_ROUNDUP(size); 1149 for (va = sva; va < eva; va += MMU_PAGESIZE) { 1150 pfn = va_to_pfn(va); 1151 if (pfn != PFN_INVALID && pf_is_memory(pfn)) { 1152 if ((*bitfunc)(pfn, mapflag) == 0) 1153 count++; 1154 } 1155 } 1156 1157 if ((cpr_debug & CPR_DEBUG7) && showrange == DBG_SHOWRANGE) 1158 cpr_show_range(sva, size, mapflag, bitfunc, count); 1159 1160 return (count); 1161 } 1162 1163 1164 pgcnt_t 1165 cpr_count_volatile_pages(int mapflag, bitfunc_t bitfunc) 1166 { 1167 pgcnt_t count = 0; 1168 1169 if (cpr_buf) { 1170 count += cpr_count_pages(cpr_buf, cpr_buf_size, 1171 mapflag, bitfunc, DBG_SHOWRANGE); 1172 } 1173 if (cpr_pagedata) { 1174 count += cpr_count_pages(cpr_pagedata, cpr_pagedata_size, 1175 mapflag, bitfunc, DBG_SHOWRANGE); 1176 } 1177 count += i_cpr_count_storage_pages(mapflag, bitfunc); 1178 1179 CPR_DEBUG(CPR_DEBUG7, "cpr_count_vpages: %ld pages, 0x%lx bytes\n", 1180 count, mmu_ptob(count)); 1181 return (count); 1182 } 1183 1184 1185 static int 1186 cpr_dump_regular_pages(vnode_t *vp) 1187 { 1188 int error; 1189 1190 cpr_regular_pgs_dumped = 0; 1191 error = cpr_contig_pages(vp, WRITE_TO_STATEFILE); 1192 if (!error) 1193 CPR_DEBUG(CPR_DEBUG7, "cpr_dump_regular_pages() done.\n"); 1194 return (error); 1195 } 1196 #endif 1197