1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * Fill in and write out the cpr state file 30 * 1. Allocate and write headers, ELF and cpr dump header 31 * 2. Allocate bitmaps according to phys_install 32 * 3. Tag kernel pages into corresponding bitmap 33 * 4. Write bitmaps to state file 34 * 5. Write actual physical page data to state file 35 */ 36 37 #include <sys/types.h> 38 #include <sys/systm.h> 39 #include <sys/vm.h> 40 #include <sys/memlist.h> 41 #include <sys/kmem.h> 42 #include <sys/vnode.h> 43 #include <sys/fs/ufs_inode.h> 44 #include <sys/errno.h> 45 #include <sys/cmn_err.h> 46 #include <sys/debug.h> 47 #include <vm/page.h> 48 #include <vm/seg.h> 49 #include <vm/seg_kmem.h> 50 #include <vm/seg_kpm.h> 51 #include <vm/hat.h> 52 #include <sys/cpr.h> 53 #include <sys/conf.h> 54 #include <sys/ddi.h> 55 #include <sys/panic.h> 56 #include <sys/thread.h> 57 #include <sys/note.h> 58 59 /* Local defines and variables */ 60 #define BTOb(bytes) ((bytes) << 3) /* Bytes to bits, log2(NBBY) */ 61 #define bTOB(bits) ((bits) >> 3) /* bits to Bytes, log2(NBBY) */ 62 63 #if defined(__sparc) 64 static uint_t cpr_pages_tobe_dumped; 65 static uint_t cpr_regular_pgs_dumped; 66 static int cpr_dump_regular_pages(vnode_t *); 67 static int cpr_count_upages(int, bitfunc_t); 68 static int cpr_compress_and_write(vnode_t *, uint_t, pfn_t, pgcnt_t); 69 #endif 70 71 int cpr_flush_write(vnode_t *); 72 73 int cpr_contig_pages(vnode_t *, int); 74 75 void cpr_clear_bitmaps(); 76 77 extern size_t cpr_get_devsize(dev_t); 78 extern int i_cpr_dump_setup(vnode_t *); 79 extern int i_cpr_blockzero(char *, char **, int *, vnode_t *); 80 extern int cpr_test_mode; 81 int cpr_setbit(pfn_t, int); 82 int cpr_clrbit(pfn_t, int); 83 84 ctrm_t cpr_term; 85 86 char *cpr_buf, *cpr_buf_end; 87 int cpr_buf_blocks; /* size of cpr_buf in blocks */ 88 size_t cpr_buf_size; /* size of cpr_buf in bytes */ 89 size_t cpr_bitmap_size; 90 int cpr_nbitmaps; 91 92 char *cpr_pagedata; /* page buffer for compression / tmp copy */ 93 size_t cpr_pagedata_size; /* page buffer size in bytes */ 94 95 #if defined(__sparc) 96 static char *cpr_wptr; /* keep track of where to write to next */ 97 static int cpr_file_bn; /* cpr state-file block offset */ 98 static int cpr_disk_writes_ok; 99 static size_t cpr_dev_space = 0; 100 #endif 101 102 char cpr_pagecopy[CPR_MAXCONTIG * MMU_PAGESIZE]; 103 104 #if defined(__sparc) 105 /* 106 * On some platforms bcopy may modify the thread structure 107 * during bcopy (eg, to prevent cpu migration). If the 108 * range we are currently writing out includes our own 109 * thread structure then it will be snapshotted by bcopy 110 * including those modified members - and the updates made 111 * on exit from bcopy will no longer be seen when we later 112 * restore the mid-bcopy kthread_t. So if the range we 113 * need to copy overlaps with our thread structure we will 114 * use a simple byte copy. 115 */ 116 void 117 cprbcopy(void *from, void *to, size_t bytes) 118 { 119 extern int curthreadremapped; 120 caddr_t kthrend; 121 122 kthrend = (caddr_t)curthread + sizeof (kthread_t) - 1; 123 if (curthreadremapped || (kthrend >= (caddr_t)from && 124 kthrend < (caddr_t)from + bytes + sizeof (kthread_t) - 1)) { 125 caddr_t src = from, dst = to; 126 127 while (bytes-- > 0) 128 *dst++ = *src++; 129 } else { 130 bcopy(from, to, bytes); 131 } 132 } 133 134 /* 135 * Allocate pages for buffers used in writing out the statefile 136 */ 137 static int 138 cpr_alloc_bufs(void) 139 { 140 char *allocerr = "Unable to allocate memory for cpr buffer"; 141 size_t size; 142 143 /* 144 * set the cpr write buffer size to at least the historic 145 * size (128k) or large enough to store the both the early 146 * set of statefile structures (well under 0x800) plus the 147 * bitmaps, and roundup to the next pagesize. 148 */ 149 size = PAGE_ROUNDUP(dbtob(4) + cpr_bitmap_size); 150 cpr_buf_size = MAX(size, CPRBUFSZ); 151 cpr_buf_blocks = btodb(cpr_buf_size); 152 cpr_buf = kmem_alloc(cpr_buf_size, KM_NOSLEEP); 153 if (cpr_buf == NULL) { 154 cpr_err(CE_WARN, allocerr); 155 return (ENOMEM); 156 } 157 cpr_buf_end = cpr_buf + cpr_buf_size; 158 159 cpr_pagedata_size = mmu_ptob(CPR_MAXCONTIG + 1); 160 cpr_pagedata = kmem_alloc(cpr_pagedata_size, KM_NOSLEEP); 161 if (cpr_pagedata == NULL) { 162 kmem_free(cpr_buf, cpr_buf_size); 163 cpr_buf = NULL; 164 cpr_err(CE_WARN, allocerr); 165 return (ENOMEM); 166 } 167 168 return (0); 169 } 170 171 172 /* 173 * Set bitmap size in bytes based on phys_install. 174 */ 175 void 176 cpr_set_bitmap_size(void) 177 { 178 struct memlist *pmem; 179 size_t size = 0; 180 181 memlist_read_lock(); 182 for (pmem = phys_install; pmem; pmem = pmem->next) 183 size += pmem->size; 184 memlist_read_unlock(); 185 cpr_bitmap_size = BITMAP_BYTES(size); 186 } 187 188 189 /* 190 * CPR dump header contains the following information: 191 * 1. header magic -- unique to cpr state file 192 * 2. kernel return pc & ppn for resume 193 * 3. current thread info 194 * 4. debug level and test mode 195 * 5. number of bitmaps allocated 196 * 6. number of page records 197 */ 198 static int 199 cpr_write_header(vnode_t *vp) 200 { 201 extern ushort_t cpr_mach_type; 202 struct cpr_dump_desc cdump; 203 pgcnt_t bitmap_pages; 204 pgcnt_t kpages, vpages, upages; 205 pgcnt_t cpr_count_kpages(int mapflag, bitfunc_t bitfunc); 206 207 cdump.cdd_magic = (uint_t)CPR_DUMP_MAGIC; 208 cdump.cdd_version = CPR_VERSION; 209 cdump.cdd_machine = cpr_mach_type; 210 cdump.cdd_debug = cpr_debug; 211 cdump.cdd_test_mode = cpr_test_mode; 212 cdump.cdd_bitmaprec = cpr_nbitmaps; 213 214 cpr_clear_bitmaps(); 215 216 /* 217 * Remember how many pages we plan to save to statefile. 218 * This information will be used for sanity checks. 219 * Untag those pages that will not be saved to statefile. 220 */ 221 kpages = cpr_count_kpages(REGULAR_BITMAP, cpr_setbit); 222 vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit); 223 upages = cpr_count_upages(REGULAR_BITMAP, cpr_setbit); 224 cdump.cdd_dumppgsize = kpages - vpages + upages; 225 cpr_pages_tobe_dumped = cdump.cdd_dumppgsize; 226 CPR_DEBUG(CPR_DEBUG7, 227 "\ncpr_write_header: kpages %ld - vpages %ld + upages %ld = %d\n", 228 kpages, vpages, upages, cdump.cdd_dumppgsize); 229 230 /* 231 * Some pages contain volatile data (cpr_buf and storage area for 232 * sensitive kpages), which are no longer needed after the statefile 233 * is dumped to disk. We have already untagged them from regular 234 * bitmaps. Now tag them into the volatile bitmaps. The pages in 235 * volatile bitmaps will be claimed during resume, and the resumed 236 * kernel will free them. 237 */ 238 (void) cpr_count_volatile_pages(VOLATILE_BITMAP, cpr_setbit); 239 240 bitmap_pages = mmu_btopr(cpr_bitmap_size); 241 242 /* 243 * Export accurate statefile size for statefile allocation retry. 244 * statefile_size = all the headers + total pages + 245 * number of pages used by the bitmaps. 246 * Roundup will be done in the file allocation code. 247 */ 248 STAT->cs_nocomp_statefsz = sizeof (cdd_t) + sizeof (cmd_t) + 249 (sizeof (cbd_t) * cdump.cdd_bitmaprec) + 250 (sizeof (cpd_t) * cdump.cdd_dumppgsize) + 251 mmu_ptob(cdump.cdd_dumppgsize + bitmap_pages); 252 253 /* 254 * If the estimated statefile is not big enough, 255 * go retry now to save un-necessary operations. 256 */ 257 if (!(CPR->c_flags & C_COMPRESSING) && 258 (STAT->cs_nocomp_statefsz > STAT->cs_est_statefsz)) { 259 if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7)) 260 prom_printf("cpr_write_header: " 261 "STAT->cs_nocomp_statefsz > " 262 "STAT->cs_est_statefsz\n"); 263 return (ENOSPC); 264 } 265 266 /* now write cpr dump descriptor */ 267 return (cpr_write(vp, (caddr_t)&cdump, sizeof (cdd_t))); 268 } 269 270 271 /* 272 * CPR dump tail record contains the following information: 273 * 1. header magic -- unique to cpr state file 274 * 2. all misc info that needs to be passed to cprboot or resumed kernel 275 */ 276 static int 277 cpr_write_terminator(vnode_t *vp) 278 { 279 cpr_term.magic = (uint_t)CPR_TERM_MAGIC; 280 cpr_term.va = (cpr_ptr)&cpr_term; 281 cpr_term.pfn = (cpr_ext)va_to_pfn(&cpr_term); 282 283 /* count the last one (flush) */ 284 cpr_term.real_statef_size = STAT->cs_real_statefsz + 285 btod(cpr_wptr - cpr_buf) * DEV_BSIZE; 286 287 CPR_DEBUG(CPR_DEBUG9, "cpr_dump: Real Statefile Size: %ld\n", 288 STAT->cs_real_statefsz); 289 290 cpr_tod_get(&cpr_term.tm_shutdown); 291 292 return (cpr_write(vp, (caddr_t)&cpr_term, sizeof (cpr_term))); 293 } 294 295 /* 296 * Write bitmap descriptor array, followed by merged bitmaps. 297 */ 298 static int 299 cpr_write_bitmap(vnode_t *vp) 300 { 301 char *rmap, *vmap, *dst, *tail; 302 size_t size, bytes; 303 cbd_t *dp; 304 int err; 305 306 dp = CPR->c_bmda; 307 if (err = cpr_write(vp, (caddr_t)dp, cpr_nbitmaps * sizeof (*dp))) 308 return (err); 309 310 /* 311 * merge regular and volatile bitmaps into tmp space 312 * and write to disk 313 */ 314 for (; dp->cbd_size; dp++) { 315 rmap = (char *)dp->cbd_reg_bitmap; 316 vmap = (char *)dp->cbd_vlt_bitmap; 317 for (size = dp->cbd_size; size; size -= bytes) { 318 bytes = min(size, sizeof (cpr_pagecopy)); 319 tail = &cpr_pagecopy[bytes]; 320 for (dst = cpr_pagecopy; dst < tail; dst++) 321 *dst = *rmap++ | *vmap++; 322 if (err = cpr_write(vp, cpr_pagecopy, bytes)) 323 break; 324 } 325 } 326 327 return (err); 328 } 329 330 331 static int 332 cpr_write_statefile(vnode_t *vp) 333 { 334 uint_t error = 0; 335 extern int i_cpr_check_pgs_dumped(); 336 void flush_windows(void); 337 pgcnt_t spages; 338 char *str; 339 340 flush_windows(); 341 342 /* 343 * to get an accurate view of kas, we need to untag sensitive 344 * pages *before* dumping them because the disk driver makes 345 * allocations and changes kas along the way. The remaining 346 * pages referenced in the bitmaps are dumped out later as 347 * regular kpages. 348 */ 349 str = "cpr_write_statefile:"; 350 spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_clrbit); 351 CPR_DEBUG(CPR_DEBUG7, "%s untag %ld sens pages\n", str, spages); 352 353 /* 354 * now it's OK to call a driver that makes allocations 355 */ 356 cpr_disk_writes_ok = 1; 357 358 /* 359 * now write out the clean sensitive kpages 360 * according to the sensitive descriptors 361 */ 362 error = i_cpr_dump_sensitive_kpages(vp); 363 if (error) { 364 CPR_DEBUG(CPR_DEBUG7, 365 "%s cpr_dump_sensitive_kpages() failed!\n", str); 366 return (error); 367 } 368 369 /* 370 * cpr_dump_regular_pages() counts cpr_regular_pgs_dumped 371 */ 372 error = cpr_dump_regular_pages(vp); 373 if (error) { 374 CPR_DEBUG(CPR_DEBUG7, 375 "%s cpr_dump_regular_pages() failed!\n", str); 376 return (error); 377 } 378 379 /* 380 * sanity check to verify the right number of pages were dumped 381 */ 382 error = i_cpr_check_pgs_dumped(cpr_pages_tobe_dumped, 383 cpr_regular_pgs_dumped); 384 385 if (error) { 386 prom_printf("\n%s page count mismatch!\n", str); 387 #ifdef DEBUG 388 if (cpr_test_mode) 389 debug_enter(NULL); 390 #endif 391 } 392 393 return (error); 394 } 395 #endif 396 397 398 /* 399 * creates the CPR state file, the following sections are 400 * written out in sequence: 401 * - writes the cpr dump header 402 * - writes the memory usage bitmaps 403 * - writes the platform dependent info 404 * - writes the remaining user pages 405 * - writes the kernel pages 406 */ 407 #if defined(__x86) 408 _NOTE(ARGSUSED(0)) 409 #endif 410 int 411 cpr_dump(vnode_t *vp) 412 { 413 #if defined(__sparc) 414 int error; 415 416 if (cpr_buf == NULL) { 417 ASSERT(cpr_pagedata == NULL); 418 if (error = cpr_alloc_bufs()) 419 return (error); 420 } 421 /* point to top of internal buffer */ 422 cpr_wptr = cpr_buf; 423 424 /* initialize global variables used by the write operation */ 425 cpr_file_bn = cpr_statefile_offset(); 426 cpr_dev_space = 0; 427 428 /* allocate bitmaps */ 429 if (CPR->c_bmda == NULL) { 430 if (error = i_cpr_alloc_bitmaps()) { 431 cpr_err(CE_WARN, "cannot allocate bitmaps"); 432 return (error); 433 } 434 } 435 436 if (error = i_cpr_prom_pages(CPR_PROM_SAVE)) 437 return (error); 438 439 if (error = i_cpr_dump_setup(vp)) 440 return (error); 441 442 /* 443 * set internal cross checking; we dont want to call 444 * a disk driver that makes allocations until after 445 * sensitive pages are saved 446 */ 447 cpr_disk_writes_ok = 0; 448 449 /* 450 * 1253112: heap corruption due to memory allocation when dumpping 451 * statefile. 452 * Theoretically on Sun4u only the kernel data nucleus, kvalloc and 453 * kvseg segments can be contaminated should memory allocations happen 454 * during sddump, which is not supposed to happen after the system 455 * is quiesced. Let's call the kernel pages that tend to be affected 456 * 'sensitive kpages' here. To avoid saving inconsistent pages, we 457 * will allocate some storage space to save the clean sensitive pages 458 * aside before statefile dumping takes place. Since there may not be 459 * much memory left at this stage, the sensitive pages will be 460 * compressed before they are saved into the storage area. 461 */ 462 if (error = i_cpr_save_sensitive_kpages()) { 463 CPR_DEBUG(CPR_DEBUG7, 464 "cpr_dump: save_sensitive_kpages failed!\n"); 465 return (error); 466 } 467 468 /* 469 * since all cpr allocations are done (space for sensitive kpages, 470 * bitmaps, cpr_buf), kas is stable, and now we can accurately 471 * count regular and sensitive kpages. 472 */ 473 if (error = cpr_write_header(vp)) { 474 CPR_DEBUG(CPR_DEBUG7, 475 "cpr_dump: cpr_write_header() failed!\n"); 476 return (error); 477 } 478 479 if (error = i_cpr_write_machdep(vp)) 480 return (error); 481 482 if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, NULL, NULL)) 483 return (error); 484 485 if (error = cpr_write_bitmap(vp)) 486 return (error); 487 488 if (error = cpr_write_statefile(vp)) { 489 CPR_DEBUG(CPR_DEBUG7, 490 "cpr_dump: cpr_write_statefile() failed!\n"); 491 return (error); 492 } 493 494 if (error = cpr_write_terminator(vp)) 495 return (error); 496 497 if (error = cpr_flush_write(vp)) 498 return (error); 499 500 if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, &cpr_file_bn, vp)) 501 return (error); 502 #endif 503 504 return (0); 505 } 506 507 508 #if defined(__sparc) 509 /* 510 * cpr_xwalk() is called many 100x with a range within kvseg or kvseg_reloc; 511 * a page-count from each range is accumulated at arg->pages. 512 */ 513 static void 514 cpr_xwalk(void *arg, void *base, size_t size) 515 { 516 struct cpr_walkinfo *cwip = arg; 517 518 cwip->pages += cpr_count_pages(base, size, 519 cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE); 520 cwip->size += size; 521 cwip->ranges++; 522 } 523 524 /* 525 * cpr_walk() is called many 100x with a range within kvseg or kvseg_reloc; 526 * a page-count from each range is accumulated at arg->pages. 527 */ 528 static void 529 cpr_walk(void *arg, void *base, size_t size) 530 { 531 caddr_t addr = base; 532 caddr_t addr_end = addr + size; 533 534 /* 535 * If we are about to start walking the range of addresses we 536 * carved out of the kernel heap for the large page heap walk 537 * heap_lp_arena to find what segments are actually populated 538 */ 539 if (SEGKMEM_USE_LARGEPAGES && 540 addr == heap_lp_base && addr_end == heap_lp_end && 541 vmem_size(heap_lp_arena, VMEM_ALLOC) < size) { 542 vmem_walk(heap_lp_arena, VMEM_ALLOC, cpr_xwalk, arg); 543 } else { 544 cpr_xwalk(arg, base, size); 545 } 546 } 547 548 549 /* 550 * faster scan of kvseg using vmem_walk() to visit 551 * allocated ranges. 552 */ 553 pgcnt_t 554 cpr_scan_kvseg(int mapflag, bitfunc_t bitfunc, struct seg *seg) 555 { 556 struct cpr_walkinfo cwinfo; 557 558 bzero(&cwinfo, sizeof (cwinfo)); 559 cwinfo.mapflag = mapflag; 560 cwinfo.bitfunc = bitfunc; 561 562 vmem_walk(heap_arena, VMEM_ALLOC, cpr_walk, &cwinfo); 563 564 if (cpr_debug & CPR_DEBUG7) { 565 prom_printf("walked %d sub-ranges, total pages %ld\n", 566 cwinfo.ranges, mmu_btop(cwinfo.size)); 567 cpr_show_range(seg->s_base, seg->s_size, 568 mapflag, bitfunc, cwinfo.pages); 569 } 570 571 return (cwinfo.pages); 572 } 573 574 575 /* 576 * cpr_walk_kpm() is called for every used area within the large 577 * segkpm virtual address window. A page-count is accumulated at 578 * arg->pages. 579 */ 580 static void 581 cpr_walk_kpm(void *arg, void *base, size_t size) 582 { 583 struct cpr_walkinfo *cwip = arg; 584 585 cwip->pages += cpr_count_pages(base, size, 586 cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE); 587 cwip->size += size; 588 cwip->ranges++; 589 } 590 591 592 /* 593 * faster scan of segkpm using hat_kpm_walk() to visit only used ranges. 594 */ 595 /*ARGSUSED*/ 596 static pgcnt_t 597 cpr_scan_segkpm(int mapflag, bitfunc_t bitfunc, struct seg *seg) 598 { 599 struct cpr_walkinfo cwinfo; 600 601 if (kpm_enable == 0) 602 return (0); 603 604 bzero(&cwinfo, sizeof (cwinfo)); 605 cwinfo.mapflag = mapflag; 606 cwinfo.bitfunc = bitfunc; 607 hat_kpm_walk(cpr_walk_kpm, &cwinfo); 608 609 if (cpr_debug & CPR_DEBUG7) { 610 prom_printf("walked %d sub-ranges, total pages %ld\n", 611 cwinfo.ranges, mmu_btop(cwinfo.size)); 612 cpr_show_range(segkpm->s_base, segkpm->s_size, 613 mapflag, bitfunc, cwinfo.pages); 614 } 615 616 return (cwinfo.pages); 617 } 618 619 620 /* 621 * Sparsely filled kernel segments are registered in kseg_table for 622 * easier lookup. See also block comment for cpr_count_seg_pages. 623 */ 624 625 #define KSEG_SEG_ADDR 0 /* address of struct seg */ 626 #define KSEG_PTR_ADDR 1 /* address of pointer to struct seg */ 627 628 typedef struct { 629 struct seg **st_seg; /* segment pointer or segment address */ 630 pgcnt_t (*st_fcn)(int, bitfunc_t, struct seg *); /* function to call */ 631 int st_addrtype; /* address type in st_seg */ 632 } ksegtbl_entry_t; 633 634 ksegtbl_entry_t kseg_table[] = { 635 {(struct seg **)&kvseg, cpr_scan_kvseg, KSEG_SEG_ADDR}, 636 {&segkpm, cpr_scan_segkpm, KSEG_PTR_ADDR}, 637 {NULL, 0, 0} 638 }; 639 640 641 /* 642 * Compare seg with each entry in kseg_table; when there is a match 643 * return the entry pointer, otherwise return NULL. 644 */ 645 static ksegtbl_entry_t * 646 cpr_sparse_seg_check(struct seg *seg) 647 { 648 ksegtbl_entry_t *ste = &kseg_table[0]; 649 struct seg *tseg; 650 651 for (; ste->st_seg; ste++) { 652 tseg = (ste->st_addrtype == KSEG_PTR_ADDR) ? 653 *ste->st_seg : (struct seg *)ste->st_seg; 654 655 if (seg == tseg) 656 return (ste); 657 } 658 659 return ((ksegtbl_entry_t *)NULL); 660 } 661 662 663 /* 664 * Count pages within each kernel segment; call cpr_sparse_seg_check() 665 * to find out whether a sparsely filled segment needs special 666 * treatment (e.g. kvseg). 667 * Todo: A "SEGOP_CPR" like SEGOP_DUMP should be introduced, the cpr 668 * module shouldn't need to know segment details like if it is 669 * sparsely filled or not (makes kseg_table obsolete). 670 */ 671 pgcnt_t 672 cpr_count_seg_pages(int mapflag, bitfunc_t bitfunc) 673 { 674 struct seg *segp; 675 pgcnt_t pages; 676 ksegtbl_entry_t *ste; 677 678 pages = 0; 679 for (segp = AS_SEGFIRST(&kas); segp; segp = AS_SEGNEXT(&kas, segp)) { 680 if (ste = cpr_sparse_seg_check(segp)) { 681 pages += (ste->st_fcn)(mapflag, bitfunc, segp); 682 } else { 683 pages += cpr_count_pages(segp->s_base, 684 segp->s_size, mapflag, bitfunc, DBG_SHOWRANGE); 685 } 686 } 687 688 return (pages); 689 } 690 691 692 /* 693 * count kernel pages within kas and any special ranges 694 */ 695 pgcnt_t 696 cpr_count_kpages(int mapflag, bitfunc_t bitfunc) 697 { 698 pgcnt_t kas_cnt; 699 700 /* 701 * Some pages need to be taken care of differently. 702 * eg: panicbuf pages of sun4m are not in kas but they need 703 * to be saved. On sun4u, the physical pages of panicbuf are 704 * allocated via prom_retain(). 705 */ 706 kas_cnt = i_cpr_count_special_kpages(mapflag, bitfunc); 707 kas_cnt += cpr_count_seg_pages(mapflag, bitfunc); 708 709 CPR_DEBUG(CPR_DEBUG9, "cpr_count_kpages: kas_cnt=%ld\n", kas_cnt); 710 CPR_DEBUG(CPR_DEBUG7, "\ncpr_count_kpages: %ld pages, 0x%lx bytes\n", 711 kas_cnt, mmu_ptob(kas_cnt)); 712 713 return (kas_cnt); 714 } 715 716 717 /* 718 * Set a bit corresponding to the arg phys page number; 719 * returns 0 when the ppn is valid and the corresponding 720 * map bit was clear, otherwise returns 1. 721 */ 722 int 723 cpr_setbit(pfn_t ppn, int mapflag) 724 { 725 char *bitmap; 726 cbd_t *dp; 727 pfn_t rel; 728 int clr; 729 730 for (dp = CPR->c_bmda; dp->cbd_size; dp++) { 731 if (PPN_IN_RANGE(ppn, dp)) { 732 bitmap = DESC_TO_MAP(dp, mapflag); 733 rel = ppn - dp->cbd_spfn; 734 if ((clr = isclr(bitmap, rel)) != 0) 735 setbit(bitmap, rel); 736 return (clr == 0); 737 } 738 } 739 740 return (1); 741 } 742 743 744 /* 745 * Clear a bit corresponding to the arg phys page number. 746 */ 747 int 748 cpr_clrbit(pfn_t ppn, int mapflag) 749 { 750 char *bitmap; 751 cbd_t *dp; 752 pfn_t rel; 753 int set; 754 755 for (dp = CPR->c_bmda; dp->cbd_size; dp++) { 756 if (PPN_IN_RANGE(ppn, dp)) { 757 bitmap = DESC_TO_MAP(dp, mapflag); 758 rel = ppn - dp->cbd_spfn; 759 if ((set = isset(bitmap, rel)) != 0) 760 clrbit(bitmap, rel); 761 return (set == 0); 762 } 763 } 764 765 return (1); 766 } 767 768 769 /* ARGSUSED */ 770 int 771 cpr_nobit(pfn_t ppn, int mapflag) 772 { 773 return (0); 774 } 775 776 777 /* 778 * Lookup a bit corresponding to the arg phys page number. 779 */ 780 int 781 cpr_isset(pfn_t ppn, int mapflag) 782 { 783 char *bitmap; 784 cbd_t *dp; 785 pfn_t rel; 786 787 for (dp = CPR->c_bmda; dp->cbd_size; dp++) { 788 if (PPN_IN_RANGE(ppn, dp)) { 789 bitmap = DESC_TO_MAP(dp, mapflag); 790 rel = ppn - dp->cbd_spfn; 791 return (isset(bitmap, rel)); 792 } 793 } 794 795 return (0); 796 } 797 798 799 /* 800 * Go thru all pages and pick up any page not caught during the invalidation 801 * stage. This is also used to save pages with cow lock or phys page lock held 802 * (none zero p_lckcnt or p_cowcnt) 803 */ 804 static int 805 cpr_count_upages(int mapflag, bitfunc_t bitfunc) 806 { 807 page_t *pp, *page0; 808 pgcnt_t dcnt = 0, tcnt = 0; 809 pfn_t pfn; 810 811 page0 = pp = page_first(); 812 813 do { 814 #if defined(__sparc) 815 extern struct vnode prom_ppages; 816 if (pp->p_vnode == NULL || PP_ISKAS(pp) || 817 pp->p_vnode == &prom_ppages || 818 PP_ISFREE(pp) && PP_ISAGED(pp)) 819 #else 820 if (pp->p_vnode == NULL || PP_ISKAS(pp) || 821 PP_ISFREE(pp) && PP_ISAGED(pp)) 822 #endif /* __sparc */ 823 continue; 824 825 pfn = page_pptonum(pp); 826 if (pf_is_memory(pfn)) { 827 tcnt++; 828 if ((*bitfunc)(pfn, mapflag) == 0) 829 dcnt++; /* dirty count */ 830 } 831 } while ((pp = page_next(pp)) != page0); 832 833 STAT->cs_upage2statef = dcnt; 834 CPR_DEBUG(CPR_DEBUG9, "cpr_count_upages: dirty=%ld total=%ld\n", 835 dcnt, tcnt); 836 CPR_DEBUG(CPR_DEBUG7, "cpr_count_upages: %ld pages, 0x%lx bytes\n", 837 dcnt, mmu_ptob(dcnt)); 838 839 return (dcnt); 840 } 841 842 843 /* 844 * try compressing pages based on cflag, 845 * and for DEBUG kernels, verify uncompressed data checksum; 846 * 847 * this routine replaces common code from 848 * i_cpr_compress_and_save() and cpr_compress_and_write() 849 */ 850 char * 851 cpr_compress_pages(cpd_t *dp, pgcnt_t pages, int cflag) 852 { 853 size_t nbytes, clen, len; 854 uint32_t test_sum; 855 char *datap; 856 857 nbytes = mmu_ptob(pages); 858 859 /* 860 * set length to the original uncompressed data size; 861 * always init cpd_flag to zero 862 */ 863 dp->cpd_length = nbytes; 864 dp->cpd_flag = 0; 865 866 #ifdef DEBUG 867 /* 868 * Make a copy of the uncompressed data so we can checksum it. 869 * Compress that copy so the checksum works at the other end 870 */ 871 cprbcopy(CPR->c_mapping_area, cpr_pagecopy, nbytes); 872 dp->cpd_usum = checksum32(cpr_pagecopy, nbytes); 873 dp->cpd_flag |= CPD_USUM; 874 datap = cpr_pagecopy; 875 #else 876 datap = CPR->c_mapping_area; 877 dp->cpd_usum = 0; 878 #endif 879 880 /* 881 * try compressing the raw data to cpr_pagedata; 882 * if there was a size reduction: record the new length, 883 * flag the compression, and point to the compressed data. 884 */ 885 dp->cpd_csum = 0; 886 if (cflag) { 887 clen = compress(datap, cpr_pagedata, nbytes); 888 if (clen < nbytes) { 889 dp->cpd_flag |= CPD_COMPRESS; 890 dp->cpd_length = clen; 891 datap = cpr_pagedata; 892 #ifdef DEBUG 893 dp->cpd_csum = checksum32(datap, clen); 894 dp->cpd_flag |= CPD_CSUM; 895 896 /* 897 * decompress the data back to a scratch area 898 * and compare the new checksum with the original 899 * checksum to verify the compression. 900 */ 901 bzero(cpr_pagecopy, sizeof (cpr_pagecopy)); 902 len = decompress(datap, cpr_pagecopy, 903 clen, sizeof (cpr_pagecopy)); 904 test_sum = checksum32(cpr_pagecopy, len); 905 ASSERT(test_sum == dp->cpd_usum); 906 #endif 907 } 908 } 909 910 return (datap); 911 } 912 913 914 /* 915 * 1. Prepare cpr page descriptor and write it to file 916 * 2. Compress page data and write it out 917 */ 918 static int 919 cpr_compress_and_write(vnode_t *vp, uint_t va, pfn_t pfn, pgcnt_t npg) 920 { 921 int error = 0; 922 char *datap; 923 cpd_t cpd; /* cpr page descriptor */ 924 extern void i_cpr_mapin(caddr_t, uint_t, pfn_t); 925 extern void i_cpr_mapout(caddr_t, uint_t); 926 927 i_cpr_mapin(CPR->c_mapping_area, npg, pfn); 928 929 CPR_DEBUG(CPR_DEBUG3, "mapped-in %ld pages, vaddr 0x%p, pfn 0x%lx\n", 930 npg, (void *)CPR->c_mapping_area, pfn); 931 932 /* 933 * Fill cpr page descriptor. 934 */ 935 cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC; 936 cpd.cpd_pfn = pfn; 937 cpd.cpd_pages = npg; 938 939 STAT->cs_dumped_statefsz += mmu_ptob(npg); 940 941 datap = cpr_compress_pages(&cpd, npg, CPR->c_flags & C_COMPRESSING); 942 943 /* Write cpr page descriptor */ 944 error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd_t)); 945 946 /* Write compressed page data */ 947 error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length); 948 949 /* 950 * Unmap the pages for tlb and vac flushing 951 */ 952 i_cpr_mapout(CPR->c_mapping_area, npg); 953 954 if (error) { 955 CPR_DEBUG(CPR_DEBUG1, 956 "cpr_compress_and_write: vp 0x%p va 0x%x ", (void *)vp, va); 957 CPR_DEBUG(CPR_DEBUG1, "pfn 0x%lx blk %d err %d\n", 958 pfn, cpr_file_bn, error); 959 } else { 960 cpr_regular_pgs_dumped += npg; 961 } 962 963 return (error); 964 } 965 966 967 int 968 cpr_write(vnode_t *vp, caddr_t buffer, size_t size) 969 { 970 caddr_t fromp = buffer; 971 size_t bytes, wbytes; 972 int error; 973 974 if (cpr_dev_space == 0) { 975 if (vp->v_type == VBLK) { 976 cpr_dev_space = cpr_get_devsize(vp->v_rdev); 977 ASSERT(cpr_dev_space); 978 } else 979 cpr_dev_space = 1; /* not used in this case */ 980 } 981 982 /* 983 * break the write into multiple part if request is large, 984 * calculate count up to buf page boundary, then write it out. 985 * repeat until done. 986 */ 987 while (size) { 988 bytes = MIN(size, cpr_buf_end - cpr_wptr); 989 cprbcopy(fromp, cpr_wptr, bytes); 990 cpr_wptr += bytes; 991 fromp += bytes; 992 size -= bytes; 993 if (cpr_wptr < cpr_buf_end) 994 return (0); /* buffer not full yet */ 995 ASSERT(cpr_wptr == cpr_buf_end); 996 997 wbytes = dbtob(cpr_file_bn + cpr_buf_blocks); 998 if (vp->v_type == VBLK) { 999 if (wbytes > cpr_dev_space) 1000 return (ENOSPC); 1001 } else { 1002 if (wbytes > VTOI(vp)->i_size) 1003 return (ENOSPC); 1004 } 1005 1006 CPR_DEBUG(CPR_DEBUG3, 1007 "cpr_write: frmp=%p wptr=%p cnt=%lx...", 1008 (void *)fromp, (void *)cpr_wptr, bytes); 1009 /* 1010 * cross check, this should not happen! 1011 */ 1012 if (cpr_disk_writes_ok == 0) { 1013 prom_printf("cpr_write: disk write too early!\n"); 1014 return (EINVAL); 1015 } 1016 1017 do_polled_io = 1; 1018 error = VOP_DUMP(vp, cpr_buf, cpr_file_bn, cpr_buf_blocks, 1019 NULL); 1020 do_polled_io = 0; 1021 CPR_DEBUG(CPR_DEBUG3, "done\n"); 1022 1023 STAT->cs_real_statefsz += cpr_buf_size; 1024 1025 if (error) { 1026 cpr_err(CE_WARN, "cpr_write error %d", error); 1027 return (error); 1028 } 1029 cpr_file_bn += cpr_buf_blocks; /* Increment block count */ 1030 cpr_wptr = cpr_buf; /* back to top of buffer */ 1031 } 1032 return (0); 1033 } 1034 1035 1036 int 1037 cpr_flush_write(vnode_t *vp) 1038 { 1039 int nblk; 1040 int error; 1041 1042 /* 1043 * Calculate remaining blocks in buffer, rounded up to nearest 1044 * disk block 1045 */ 1046 nblk = btod(cpr_wptr - cpr_buf); 1047 1048 do_polled_io = 1; 1049 error = VOP_DUMP(vp, (caddr_t)cpr_buf, cpr_file_bn, nblk, NULL); 1050 do_polled_io = 0; 1051 1052 cpr_file_bn += nblk; 1053 if (error) 1054 CPR_DEBUG(CPR_DEBUG2, "cpr_flush_write: error (%d)\n", 1055 error); 1056 return (error); 1057 } 1058 1059 void 1060 cpr_clear_bitmaps(void) 1061 { 1062 cbd_t *dp; 1063 1064 for (dp = CPR->c_bmda; dp->cbd_size; dp++) { 1065 bzero((void *)dp->cbd_reg_bitmap, 1066 (size_t)dp->cbd_size * 2); 1067 } 1068 CPR_DEBUG(CPR_DEBUG7, "\ncleared reg and vlt bitmaps\n"); 1069 } 1070 1071 int 1072 cpr_contig_pages(vnode_t *vp, int flag) 1073 { 1074 int chunks = 0, error = 0; 1075 pgcnt_t i, j, totbit; 1076 pfn_t spfn; 1077 cbd_t *dp; 1078 uint_t spin_cnt = 0; 1079 extern int i_cpr_compress_and_save(); 1080 1081 for (dp = CPR->c_bmda; dp->cbd_size; dp++) { 1082 spfn = dp->cbd_spfn; 1083 totbit = BTOb(dp->cbd_size); 1084 i = 0; /* Beginning of bitmap */ 1085 j = 0; 1086 while (i < totbit) { 1087 while ((j < CPR_MAXCONTIG) && ((j + i) < totbit)) { 1088 if (isset((char *)dp->cbd_reg_bitmap, j+i)) 1089 j++; 1090 else /* not contiguous anymore */ 1091 break; 1092 } 1093 1094 if (j) { 1095 chunks++; 1096 if (flag == SAVE_TO_STORAGE) { 1097 error = i_cpr_compress_and_save( 1098 chunks, spfn + i, j); 1099 if (error) 1100 return (error); 1101 } else if (flag == WRITE_TO_STATEFILE) { 1102 error = cpr_compress_and_write(vp, 0, 1103 spfn + i, j); 1104 if (error) 1105 return (error); 1106 else { 1107 spin_cnt++; 1108 if ((spin_cnt & 0x5F) == 1) 1109 cpr_spinning_bar(); 1110 } 1111 } 1112 } 1113 1114 i += j; 1115 if (j != CPR_MAXCONTIG) { 1116 /* Stopped on a non-tagged page */ 1117 i++; 1118 } 1119 1120 j = 0; 1121 } 1122 } 1123 1124 if (flag == STORAGE_DESC_ALLOC) 1125 return (chunks); 1126 else 1127 return (0); 1128 } 1129 1130 1131 void 1132 cpr_show_range(caddr_t vaddr, size_t size, 1133 int mapflag, bitfunc_t bitfunc, pgcnt_t count) 1134 { 1135 char *action, *bname; 1136 1137 bname = (mapflag == REGULAR_BITMAP) ? "regular" : "volatile"; 1138 if (bitfunc == cpr_setbit) 1139 action = "tag"; 1140 else if (bitfunc == cpr_clrbit) 1141 action = "untag"; 1142 else 1143 action = "none"; 1144 prom_printf("range (0x%p, 0x%p), %s bitmap, %s %ld\n", 1145 (void *)vaddr, (void *)(vaddr + size), bname, action, count); 1146 } 1147 1148 1149 pgcnt_t 1150 cpr_count_pages(caddr_t sva, size_t size, 1151 int mapflag, bitfunc_t bitfunc, int showrange) 1152 { 1153 caddr_t va, eva; 1154 pfn_t pfn; 1155 pgcnt_t count = 0; 1156 1157 eva = sva + PAGE_ROUNDUP(size); 1158 for (va = sva; va < eva; va += MMU_PAGESIZE) { 1159 pfn = va_to_pfn(va); 1160 if (pfn != PFN_INVALID && pf_is_memory(pfn)) { 1161 if ((*bitfunc)(pfn, mapflag) == 0) 1162 count++; 1163 } 1164 } 1165 1166 if ((cpr_debug & CPR_DEBUG7) && showrange == DBG_SHOWRANGE) 1167 cpr_show_range(sva, size, mapflag, bitfunc, count); 1168 1169 return (count); 1170 } 1171 1172 1173 pgcnt_t 1174 cpr_count_volatile_pages(int mapflag, bitfunc_t bitfunc) 1175 { 1176 pgcnt_t count = 0; 1177 1178 if (cpr_buf) { 1179 count += cpr_count_pages(cpr_buf, cpr_buf_size, 1180 mapflag, bitfunc, DBG_SHOWRANGE); 1181 } 1182 if (cpr_pagedata) { 1183 count += cpr_count_pages(cpr_pagedata, cpr_pagedata_size, 1184 mapflag, bitfunc, DBG_SHOWRANGE); 1185 } 1186 count += i_cpr_count_storage_pages(mapflag, bitfunc); 1187 1188 CPR_DEBUG(CPR_DEBUG7, "cpr_count_vpages: %ld pages, 0x%lx bytes\n", 1189 count, mmu_ptob(count)); 1190 return (count); 1191 } 1192 1193 1194 static int 1195 cpr_dump_regular_pages(vnode_t *vp) 1196 { 1197 int error; 1198 1199 cpr_regular_pgs_dumped = 0; 1200 error = cpr_contig_pages(vp, WRITE_TO_STATEFILE); 1201 if (!error) 1202 CPR_DEBUG(CPR_DEBUG7, "cpr_dump_regular_pages() done.\n"); 1203 return (error); 1204 } 1205 #endif 1206