/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Fill in and write out the cpr state file
 *	1. Allocate and write headers, ELF and cpr dump header
 *	2. Allocate bitmaps according to phys_install
 *	3. Tag kernel pages into corresponding bitmap
 *	4. Write bitmaps to state file
 *	5. Write actual physical page data to state file
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/vm.h>
#include <sys/memlist.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/fs/ufs_inode.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>
#include <sys/cpr.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/panic.h>
#include <sys/thread.h>

/* Local defines and variables */
#define	BTOb(bytes)	((bytes) << 3)		/* Bytes to bits, log2(NBBY) */
#define	bTOB(bits)	((bits) >> 3)		/* bits to Bytes, log2(NBBY) */

static uint_t cpr_pages_tobe_dumped;
static uint_t cpr_regular_pgs_dumped;

static int cpr_dump_regular_pages(vnode_t *);
static int cpr_count_upages(int, bitfunc_t);
static int cpr_compress_and_write(vnode_t *, uint_t, pfn_t, pgcnt_t);
int cpr_flush_write(vnode_t *);

int cpr_contig_pages(vnode_t *, int);

void cpr_clear_bitmaps();

extern size_t cpr_get_devsize(dev_t);
extern int i_cpr_dump_setup(vnode_t *);
extern int i_cpr_blockzero(char *, char **, int *, vnode_t *);
extern int cpr_test_mode;

ctrm_t cpr_term;

char *cpr_buf, *cpr_buf_end;
int cpr_buf_blocks;		/* size of cpr_buf in blocks */
size_t cpr_buf_size;		/* size of cpr_buf in bytes */
size_t cpr_bitmap_size;
int cpr_nbitmaps;

char *cpr_pagedata;		/* page buffer for compression / tmp copy */
size_t cpr_pagedata_size;	/* page buffer size in bytes */

static char *cpr_wptr;		/* keep track of where to write to next */
static int cpr_file_bn;		/* cpr state-file block offset */
static int cpr_disk_writes_ok;
static size_t cpr_dev_space = 0;

char cpr_pagecopy[CPR_MAXCONTIG * MMU_PAGESIZE];

/*
 * On some platforms bcopy may modify the thread structure
 * during bcopy (eg, to prevent cpu migration).  If the
 * range we are currently writing out includes our own
 * thread structure then it will be snapshotted by bcopy
 * including those modified members - and the updates made
 * on exit from bcopy will no longer be seen when we later
 * restore the mid-bcopy kthread_t.  So if the range we
 * need to copy overlaps with our thread structure we will
 * use a simple byte copy.
 */
void
cprbcopy(void *from, void *to, size_t bytes)
{
	extern int curthreadremapped;
	caddr_t kthrend;

	kthrend = (caddr_t)curthread + sizeof (kthread_t) - 1;
	if (curthreadremapped || (kthrend >= (caddr_t)from &&
	    kthrend < (caddr_t)from + bytes + sizeof (kthread_t) - 1)) {
		caddr_t src = from, dst = to;

		while (bytes-- > 0)
			*dst++ = *src++;
	} else {
		bcopy(from, to, bytes);
	}
}

/*
 * Allocate pages for buffers used in writing out the statefile
 */
static int
cpr_alloc_bufs(void)
{
	char *allocerr = "Unable to allocate memory for cpr buffer";
	size_t size;

	/*
	 * set the cpr write buffer size to at least the historic
	 * size (128k) or large enough to store both the early
	 * set of statefile structures (well under 0x800) plus the
	 * bitmaps, and round up to the next pagesize.
	 */
	size = PAGE_ROUNDUP(dbtob(4) + cpr_bitmap_size);
	cpr_buf_size = MAX(size, CPRBUFSZ);
	cpr_buf_blocks = btodb(cpr_buf_size);
	cpr_buf = kmem_alloc(cpr_buf_size, KM_NOSLEEP);
	if (cpr_buf == NULL) {
		cpr_err(CE_WARN, allocerr);
		return (ENOMEM);
	}
	cpr_buf_end = cpr_buf + cpr_buf_size;

	cpr_pagedata_size = mmu_ptob(CPR_MAXCONTIG + 1);
	cpr_pagedata = kmem_alloc(cpr_pagedata_size, KM_NOSLEEP);
	if (cpr_pagedata == NULL) {
		kmem_free(cpr_buf, cpr_buf_size);
		cpr_buf = NULL;
		cpr_err(CE_WARN, allocerr);
		return (ENOMEM);
	}

	return (0);
}


/*
 * Set bitmap size in bytes based on phys_install.
 */
void
cpr_set_bitmap_size(void)
{
	struct memlist *pmem;
	size_t size = 0;

	memlist_read_lock();
	for (pmem = phys_install; pmem; pmem = pmem->next)
		size += pmem->size;
	memlist_read_unlock();
	cpr_bitmap_size = BITMAP_BYTES(size);
}
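
/*
 * Worked sizing example (illustrative only, not part of the original
 * source): BITMAP_BYTES() is defined in sys/cpr.h; assuming it reserves
 * one bit per physical page, a machine with 4 GB of phys_install memory
 * and 8 KB pages has 512K pages, so cpr_bitmap_size comes to roughly
 * 512K / NBBY = 64 KB.  cpr_alloc_bufs() above then rounds
 * dbtob(4) + cpr_bitmap_size up to a page boundary and uses the larger
 * of that and the historic CPRBUFSZ (128 KB) for cpr_buf.
 */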

/*
 * CPR dump header contains the following information:
 *	1. header magic -- unique to cpr state file
 *	2. kernel return pc & ppn for resume
 *	3. current thread info
 *	4. debug level and test mode
 *	5. number of bitmaps allocated
 *	6. number of page records
 */
static int
cpr_write_header(vnode_t *vp)
{
	extern ushort_t cpr_mach_type;
	struct cpr_dump_desc cdump;
	pgcnt_t bitmap_pages;
	pgcnt_t kpages, vpages, upages;

	cdump.cdd_magic = (uint_t)CPR_DUMP_MAGIC;
	cdump.cdd_version = CPR_VERSION;
	cdump.cdd_machine = cpr_mach_type;
	cdump.cdd_debug = cpr_debug;
	cdump.cdd_test_mode = cpr_test_mode;
	cdump.cdd_bitmaprec = cpr_nbitmaps;

	cpr_clear_bitmaps();

	/*
	 * Remember how many pages we plan to save to statefile.
	 * This information will be used for sanity checks.
	 * Untag those pages that will not be saved to statefile.
	 */
	kpages = cpr_count_kpages(REGULAR_BITMAP, cpr_setbit);
	vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);
	upages = cpr_count_upages(REGULAR_BITMAP, cpr_setbit);
	cdump.cdd_dumppgsize = kpages - vpages + upages;
	cpr_pages_tobe_dumped = cdump.cdd_dumppgsize;
	DEBUG7(errp(
	    "\ncpr_write_header: kpages %ld - vpages %ld + upages %ld = %d\n",
	    kpages, vpages, upages, cdump.cdd_dumppgsize));

	/*
	 * Some pages contain volatile data (cpr_buf and storage area for
	 * sensitive kpages), which are no longer needed after the statefile
	 * is dumped to disk.  We have already untagged them from regular
	 * bitmaps.  Now tag them into the volatile bitmaps.  The pages in
	 * volatile bitmaps will be claimed during resume, and the resumed
	 * kernel will free them.
	 */
	(void) cpr_count_volatile_pages(VOLATILE_BITMAP, cpr_setbit);

	bitmap_pages = mmu_btopr(cpr_bitmap_size);

	/*
	 * Export accurate statefile size for statefile allocation retry.
	 * statefile_size = all the headers + total pages +
	 * number of pages used by the bitmaps.
	 * Roundup will be done in the file allocation code.
	 */
	STAT->cs_nocomp_statefsz = sizeof (cdd_t) + sizeof (cmd_t) +
	    (sizeof (cbd_t) * cdump.cdd_bitmaprec) +
	    (sizeof (cpd_t) * cdump.cdd_dumppgsize) +
	    mmu_ptob(cdump.cdd_dumppgsize + bitmap_pages);

	/*
	 * If the estimated statefile is not big enough,
	 * go retry now to save unnecessary operations.
	 */
	if (!(CPR->c_flags & C_COMPRESSING) &&
	    (STAT->cs_nocomp_statefsz > STAT->cs_est_statefsz)) {
		if (cpr_debug & (LEVEL1 | LEVEL7))
			errp("cpr_write_header: STAT->cs_nocomp_statefsz > "
			    "STAT->cs_est_statefsz\n");
		return (ENOSPC);
	}

	/* now write cpr dump descriptor */
	return (cpr_write(vp, (caddr_t)&cdump, sizeof (cdd_t)));
}


/*
 * CPR dump tail record contains the following information:
 *	1. header magic -- unique to cpr state file
 *	2. all misc info that needs to be passed to cprboot or resumed kernel
 */
static int
cpr_write_terminator(vnode_t *vp)
{
	cpr_term.magic = (uint_t)CPR_TERM_MAGIC;
	cpr_term.va = (cpr_ptr)&cpr_term;
	cpr_term.pfn = (cpr_ext)va_to_pfn(&cpr_term);

	/* count the last one (flush) */
	cpr_term.real_statef_size = STAT->cs_real_statefsz +
	    btod(cpr_wptr - cpr_buf) * DEV_BSIZE;

	DEBUG9(errp("cpr_dump: Real Statefile Size: %ld\n",
	    STAT->cs_real_statefsz));

	cpr_tod_get(&cpr_term.tm_shutdown);

	return (cpr_write(vp, (caddr_t)&cpr_term, sizeof (cpr_term)));
}

/*
 * Write bitmap descriptor array, followed by merged bitmaps.
 */
static int
cpr_write_bitmap(vnode_t *vp)
{
	char *rmap, *vmap, *dst, *tail;
	size_t size, bytes;
	cbd_t *dp;
	int err;

	dp = CPR->c_bmda;
	if (err = cpr_write(vp, (caddr_t)dp, cpr_nbitmaps * sizeof (*dp)))
		return (err);

	/*
	 * merge regular and volatile bitmaps into tmp space
	 * and write to disk
	 */
	for (; dp->cbd_size; dp++) {
		rmap = (char *)dp->cbd_reg_bitmap;
		vmap = (char *)dp->cbd_vlt_bitmap;
		for (size = dp->cbd_size; size; size -= bytes) {
			bytes = min(size, sizeof (cpr_pagecopy));
			tail = &cpr_pagecopy[bytes];
			for (dst = cpr_pagecopy; dst < tail; dst++)
				*dst = *rmap++ | *vmap++;
			if (err = cpr_write(vp, cpr_pagecopy, bytes))
				break;
		}
	}

	return (err);
}


static int
cpr_write_statefile(vnode_t *vp)
{
	uint_t error = 0;
	extern int i_cpr_check_pgs_dumped();
	void flush_windows(void);
	pgcnt_t spages;
	char *str;

	flush_windows();

	/*
	 * to get an accurate view of kas, we need to untag sensitive
	 * pages *before* dumping them because the disk driver makes
	 * allocations and changes kas along the way.  The remaining
	 * pages referenced in the bitmaps are dumped out later as
	 * regular kpages.
	 */
	str = "cpr_write_statefile:";
	spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_clrbit);
	DEBUG7(errp("%s untag %ld sens pages\n", str, spages));

	/*
	 * now it's OK to call a driver that makes allocations
	 */
	cpr_disk_writes_ok = 1;

	/*
	 * now write out the clean sensitive kpages
	 * according to the sensitive descriptors
	 */
	error = i_cpr_dump_sensitive_kpages(vp);
	if (error) {
		DEBUG7(errp("%s cpr_dump_sensitive_kpages() failed!\n", str));
		return (error);
	}

	/*
	 * cpr_dump_regular_pages() counts cpr_regular_pgs_dumped
	 */
	error = cpr_dump_regular_pages(vp);
	if (error) {
		DEBUG7(errp("%s cpr_dump_regular_pages() failed!\n", str));
		return (error);
	}

	/*
	 * sanity check to verify the right number of pages were dumped
	 */
	error = i_cpr_check_pgs_dumped(cpr_pages_tobe_dumped,
	    cpr_regular_pgs_dumped);

	if (error) {
		errp("\n%s page count mismatch!\n", str);
#ifdef DEBUG
		if (cpr_test_mode)
			debug_enter(NULL);
#endif
	}

	return (error);
}

/*
 * Creates the CPR state file; the following sections are
 * written out in sequence:
 *    - writes the cpr dump header
 *    - writes the memory usage bitmaps
 *    - writes the platform dependent info
 *    - writes the remaining user pages
 *    - writes the kernel pages
 */
int
cpr_dump(vnode_t *vp)
{
	int error;

	if (cpr_buf == NULL) {
		ASSERT(cpr_pagedata == NULL);
		if (error = cpr_alloc_bufs())
			return (error);
	}
	/* point to top of internal buffer */
	cpr_wptr = cpr_buf;

	/* initialize global variables used by the write operation */
	cpr_file_bn = cpr_statefile_offset();
	cpr_dev_space = 0;

	/* allocate bitmaps */
	if (CPR->c_bmda == NULL) {
		if (error = i_cpr_alloc_bitmaps()) {
			cpr_err(CE_WARN, "cannot allocate bitmaps");
			return (error);
		}
	}

	if (error = i_cpr_prom_pages(CPR_PROM_SAVE))
		return (error);

	if (error = i_cpr_dump_setup(vp))
		return (error);

	/*
	 * set internal cross checking; we don't want to call
	 * a disk driver that makes allocations until after
	 * sensitive pages are saved
	 */
	cpr_disk_writes_ok = 0;

	/*
	 * 1253112: heap corruption due to memory allocation when dumping
	 * statefile.
	 * Theoretically on Sun4u only the kernel data nucleus, kvalloc and
	 * kvseg segments can be contaminated should memory allocations happen
	 * during sddump, which is not supposed to happen after the system
	 * is quiesced.  Let's call the kernel pages that tend to be affected
	 * 'sensitive kpages' here.  To avoid saving inconsistent pages, we
	 * will allocate some storage space to save the clean sensitive pages
	 * aside before statefile dumping takes place.  Since there may not be
	 * much memory left at this stage, the sensitive pages will be
	 * compressed before they are saved into the storage area.
	 */
	if (error = i_cpr_save_sensitive_kpages()) {
		DEBUG7(errp("cpr_dump: save_sensitive_kpages failed!\n"));
		return (error);
	}

	/*
	 * since all cpr allocations are done (space for sensitive kpages,
	 * bitmaps, cpr_buf), kas is stable, and now we can accurately
	 * count regular and sensitive kpages.
	 */
	if (error = cpr_write_header(vp)) {
		DEBUG7(errp("cpr_dump: cpr_write_header() failed!\n"));
		return (error);
	}

	if (error = i_cpr_write_machdep(vp))
		return (error);

	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, NULL, NULL))
		return (error);

	if (error = cpr_write_bitmap(vp))
		return (error);

	if (error = cpr_write_statefile(vp)) {
		DEBUG7(errp("cpr_dump: cpr_write_statefile() failed!\n"));
		return (error);
	}

	if (error = cpr_write_terminator(vp))
		return (error);

	if (error = cpr_flush_write(vp))
		return (error);

	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, &cpr_file_bn, vp))
		return (error);

	return (0);
}


/*
 * cpr_xwalk() is called many hundreds of times with a range within kvseg
 * or kvseg_reloc; a page-count from each range is accumulated at arg->pages.
 */
static void
cpr_xwalk(void *arg, void *base, size_t size)
{
	struct cpr_walkinfo *cwip = arg;

	cwip->pages += cpr_count_pages(base, size,
	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
	cwip->size += size;
	cwip->ranges++;
}

/*
 * cpr_walk() is called many hundreds of times with a range within kvseg
 * or kvseg_reloc; a page-count from each range is accumulated at arg->pages.
 */
static void
cpr_walk(void *arg, void *base, size_t size)
{
	caddr_t addr = base;
	caddr_t addr_end = addr + size;

	/*
	 * If we are about to start walking the range of addresses we
	 * carved out of the kernel heap for the large page heap,
	 * walk heap_lp_arena to find which segments are actually populated.
	 */
	if (SEGKMEM_USE_LARGEPAGES &&
	    addr == heap_lp_base && addr_end == heap_lp_end &&
	    vmem_size(heap_lp_arena, VMEM_ALLOC) < size) {
		vmem_walk(heap_lp_arena, VMEM_ALLOC, cpr_xwalk, arg);
	} else {
		cpr_xwalk(arg, base, size);
	}
}

/*
 * faster scan of kvseg using vmem_walk() to visit
 * allocated ranges.
 */
pgcnt_t
cpr_scan_kvseg(int mapflag, bitfunc_t bitfunc, struct seg *seg)
{
	struct cpr_walkinfo cwinfo;

	bzero(&cwinfo, sizeof (cwinfo));
	cwinfo.mapflag = mapflag;
	cwinfo.bitfunc = bitfunc;

	vmem_walk(heap_arena, VMEM_ALLOC, cpr_walk, &cwinfo);

	if (cpr_debug & LEVEL7) {
		errp("walked %d sub-ranges, total pages %ld\n",
		    cwinfo.ranges, mmu_btop(cwinfo.size));
		cpr_show_range(seg->s_base, seg->s_size,
		    mapflag, bitfunc, cwinfo.pages);
	}

	return (cwinfo.pages);
}


/*
 * cpr_walk_kpm() is called for every used area within the large
 * segkpm virtual address window.  A page-count is accumulated at
 * arg->pages.
 */
static void
cpr_walk_kpm(void *arg, void *base, size_t size)
{
	struct cpr_walkinfo *cwip = arg;

	cwip->pages += cpr_count_pages(base, size,
	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
	cwip->size += size;
	cwip->ranges++;
}


/*
 * faster scan of segkpm using hat_kpm_walk() to visit only used ranges.
 */
/*ARGSUSED*/
static pgcnt_t
cpr_scan_segkpm(int mapflag, bitfunc_t bitfunc, struct seg *seg)
{
	struct cpr_walkinfo cwinfo;

	if (kpm_enable == 0)
		return (0);

	bzero(&cwinfo, sizeof (cwinfo));
	cwinfo.mapflag = mapflag;
	cwinfo.bitfunc = bitfunc;
	hat_kpm_walk(cpr_walk_kpm, &cwinfo);

	if (cpr_debug & LEVEL7) {
		errp("walked %d sub-ranges, total pages %ld\n",
		    cwinfo.ranges, mmu_btop(cwinfo.size));
		cpr_show_range(segkpm->s_base, segkpm->s_size,
		    mapflag, bitfunc, cwinfo.pages);
	}

	return (cwinfo.pages);
}


/*
 * Sparsely filled kernel segments are registered in kseg_table for
 * easier lookup.  See also block comment for cpr_count_seg_pages.
 */

#define	KSEG_SEG_ADDR	0	/* address of struct seg */
#define	KSEG_PTR_ADDR	1	/* address of pointer to struct seg */

typedef struct {
	struct seg **st_seg;	/* segment pointer or segment address */
	pgcnt_t (*st_fcn)(int, bitfunc_t, struct seg *); /* function to call */
	int st_addrtype;	/* address type in st_seg */
} ksegtbl_entry_t;

ksegtbl_entry_t kseg_table[] = {
	{(struct seg **)&kvseg,	cpr_scan_kvseg,		KSEG_SEG_ADDR},
	{&segkpm,		cpr_scan_segkpm,	KSEG_PTR_ADDR},
	{NULL,			0,			0}
};


/*
 * Compare seg with each entry in kseg_table; when there is a match
 * return the entry pointer, otherwise return NULL.
 */
static ksegtbl_entry_t *
cpr_sparse_seg_check(struct seg *seg)
{
	ksegtbl_entry_t *ste = &kseg_table[0];
	struct seg *tseg;

	for (; ste->st_seg; ste++) {
		tseg = (ste->st_addrtype == KSEG_PTR_ADDR) ?
		    *ste->st_seg : (struct seg *)ste->st_seg;
		if (seg == tseg)
			return (ste);
	}

	return ((ksegtbl_entry_t *)NULL);
}

/*
 * Count pages within each kernel segment; call cpr_sparse_seg_check()
 * to find out whether a sparsely filled segment needs special
 * treatment (e.g. kvseg).
 * Todo: A "SEGOP_CPR" like SEGOP_DUMP should be introduced; the cpr
 *	 module shouldn't need to know segment details such as whether
 *	 a segment is sparsely filled (makes kseg_table obsolete).
 */
pgcnt_t
cpr_count_seg_pages(int mapflag, bitfunc_t bitfunc)
{
	struct seg *segp;
	pgcnt_t pages;
	ksegtbl_entry_t *ste;

	pages = 0;
	for (segp = AS_SEGFIRST(&kas); segp; segp = AS_SEGNEXT(&kas, segp)) {
		if (ste = cpr_sparse_seg_check(segp)) {
			pages += (ste->st_fcn)(mapflag, bitfunc, segp);
		} else {
			pages += cpr_count_pages(segp->s_base,
			    segp->s_size, mapflag, bitfunc, DBG_SHOWRANGE);
		}
	}

	return (pages);
}


/*
 * count kernel pages within kas and any special ranges
 */
pgcnt_t
cpr_count_kpages(int mapflag, bitfunc_t bitfunc)
{
	pgcnt_t kas_cnt;

	/*
	 * Some pages need to be taken care of differently.
	 * e.g. panicbuf pages of sun4m are not in kas but they need
	 * to be saved.  On sun4u, the physical pages of panicbuf are
	 * allocated via prom_retain().
	 */
	kas_cnt = i_cpr_count_special_kpages(mapflag, bitfunc);
	kas_cnt += cpr_count_seg_pages(mapflag, bitfunc);

	DEBUG9(errp("cpr_count_kpages: kas_cnt=%ld\n", kas_cnt));
	DEBUG7(errp("\ncpr_count_kpages: %ld pages, 0x%lx bytes\n",
	    kas_cnt, mmu_ptob(kas_cnt)));
	return (kas_cnt);
}


/*
 * Set a bit corresponding to the arg phys page number;
 * returns 0 when the ppn is valid and the corresponding
 * map bit was clear, otherwise returns 1.
 */
int
cpr_setbit(pfn_t ppn, int mapflag)
{
	char *bitmap;
	cbd_t *dp;
	pfn_t rel;
	int clr;

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		if (PPN_IN_RANGE(ppn, dp)) {
			bitmap = DESC_TO_MAP(dp, mapflag);
			rel = ppn - dp->cbd_spfn;
			if ((clr = isclr(bitmap, rel)) != 0)
				setbit(bitmap, rel);
			return (clr == 0);
		}
	}

	return (1);
}


/*
 * Clear a bit corresponding to the arg phys page number.
 */
int
cpr_clrbit(pfn_t ppn, int mapflag)
{
	char *bitmap;
	cbd_t *dp;
	pfn_t rel;
	int set;

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		if (PPN_IN_RANGE(ppn, dp)) {
			bitmap = DESC_TO_MAP(dp, mapflag);
			rel = ppn - dp->cbd_spfn;
			if ((set = isset(bitmap, rel)) != 0)
				clrbit(bitmap, rel);
			return (set == 0);
		}
	}

	return (1);
}


/* ARGSUSED */
int
cpr_nobit(pfn_t ppn, int mapflag)
{
	return (0);
}


/*
 * Lookup a bit corresponding to the arg phys page number.
 */
int
cpr_isset(pfn_t ppn, int mapflag)
{
	char *bitmap;
	cbd_t *dp;
	pfn_t rel;

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		if (PPN_IN_RANGE(ppn, dp)) {
			bitmap = DESC_TO_MAP(dp, mapflag);
			rel = ppn - dp->cbd_spfn;
			return (isset(bitmap, rel));
		}
	}

	return (0);
}
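
/*
 * Note (descriptive comment added for clarity): cpr_setbit() and
 * cpr_clrbit() return 0 only when they actually changed the bit of a
 * valid page (set a clear bit, or cleared a set bit), while cpr_nobit()
 * always returns 0 so it can be used purely for counting.  The counting
 * routines below (cpr_count_pages(), cpr_count_upages()) rely on this
 * convention and count a page only on a 0 return; with cpr_setbit this
 * means a page already tagged in the bitmap is not counted twice.
 */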

/*
 * Go through all pages and pick up any page not caught during the
 * invalidation stage.  This is also used to save pages with cow lock
 * or phys page lock held (non-zero p_lckcnt or p_cowcnt).
 */
static int
cpr_count_upages(int mapflag, bitfunc_t bitfunc)
{
	page_t *pp, *page0;
	pgcnt_t dcnt = 0, tcnt = 0;
	pfn_t pfn;

	page0 = pp = page_first();

	do {
#if defined(__sparc)
		extern struct vnode prom_ppages;
		if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
		    pp->p_vnode == &prom_ppages ||
		    PP_ISFREE(pp) && PP_ISAGED(pp))
#else
		if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
		    PP_ISFREE(pp) && PP_ISAGED(pp))
#endif /* __sparc */
			continue;

		pfn = page_pptonum(pp);
		if (pf_is_memory(pfn)) {
			tcnt++;
			if ((*bitfunc)(pfn, mapflag) == 0)
				dcnt++; /* dirty count */
		}
	} while ((pp = page_next(pp)) != page0);

	STAT->cs_upage2statef = dcnt;
	DEBUG9(errp("cpr_count_upages: dirty=%ld total=%ld\n",
	    dcnt, tcnt));
	DEBUG7(errp("cpr_count_upages: %ld pages, 0x%lx bytes\n",
	    dcnt, mmu_ptob(dcnt)));
	return (dcnt);
}


/*
 * try compressing pages based on cflag,
 * and for DEBUG kernels, verify uncompressed data checksum;
 *
 * this routine replaces common code from
 * i_cpr_compress_and_save() and cpr_compress_and_write()
 */
char *
cpr_compress_pages(cpd_t *dp, pgcnt_t pages, int cflag)
{
	size_t nbytes, clen, len;
	uint32_t test_sum;
	char *datap;

	nbytes = mmu_ptob(pages);

	/*
	 * set length to the original uncompressed data size;
	 * always init cpd_flag to zero
	 */
	dp->cpd_length = nbytes;
	dp->cpd_flag = 0;

#ifdef DEBUG
	/*
	 * Make a copy of the uncompressed data so we can checksum it.
	 * Compress that copy so the checksum works at the other end
	 */
	cprbcopy(CPR->c_mapping_area, cpr_pagecopy, nbytes);
	dp->cpd_usum = checksum32(cpr_pagecopy, nbytes);
	dp->cpd_flag |= CPD_USUM;
	datap = cpr_pagecopy;
#else
	datap = CPR->c_mapping_area;
	dp->cpd_usum = 0;
#endif

	/*
	 * try compressing the raw data to cpr_pagedata;
	 * if there was a size reduction: record the new length,
	 * flag the compression, and point to the compressed data.
	 */
	dp->cpd_csum = 0;
	if (cflag) {
		clen = compress(datap, cpr_pagedata, nbytes);
		if (clen < nbytes) {
			dp->cpd_flag |= CPD_COMPRESS;
			dp->cpd_length = clen;
			datap = cpr_pagedata;
#ifdef DEBUG
			dp->cpd_csum = checksum32(datap, clen);
			dp->cpd_flag |= CPD_CSUM;

			/*
			 * decompress the data back to a scratch area
			 * and compare the new checksum with the original
			 * checksum to verify the compression.
			 */
			bzero(cpr_pagecopy, sizeof (cpr_pagecopy));
			len = decompress(datap, cpr_pagecopy,
			    clen, sizeof (cpr_pagecopy));
			test_sum = checksum32(cpr_pagecopy, len);
			ASSERT(test_sum == dp->cpd_usum);
#endif
		}
	}

	return (datap);
}

/*
 * 1. Prepare cpr page descriptor and write it to file
 * 2. Compress page data and write it out
 */
static int
cpr_compress_and_write(vnode_t *vp, uint_t va, pfn_t pfn, pgcnt_t npg)
{
	int error = 0;
	char *datap;
	cpd_t cpd;	/* cpr page descriptor */
	extern void i_cpr_mapin(caddr_t, uint_t, pfn_t);
	extern void i_cpr_mapout(caddr_t, uint_t);

	i_cpr_mapin(CPR->c_mapping_area, npg, pfn);

	DEBUG3(errp("mapped-in %ld pages, vaddr 0x%p, pfn 0x%lx\n",
	    npg, CPR->c_mapping_area, pfn));

	/*
	 * Fill cpr page descriptor.
	 */
	cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC;
	cpd.cpd_pfn = pfn;
	cpd.cpd_pages = npg;

	STAT->cs_dumped_statefsz += mmu_ptob(npg);

	datap = cpr_compress_pages(&cpd, npg, CPR->c_flags & C_COMPRESSING);

	/* Write cpr page descriptor */
	error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd_t));

	/* Write compressed page data */
	error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length);

	/*
	 * Unmap the pages for tlb and vac flushing
	 */
	i_cpr_mapout(CPR->c_mapping_area, npg);

	if (error) {
		DEBUG1(errp("cpr_compress_and_write: vp 0x%p va 0x%x ",
		    vp, va));
		DEBUG1(errp("pfn 0x%lx blk %d err %d\n",
		    pfn, cpr_file_bn, error));
	} else {
		cpr_regular_pgs_dumped += npg;
	}

	return (error);
}
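
/*
 * Illustrative example (not from the original source; the block size is
 * assumed to be the usual 512-byte DEV_BSIZE): with a 128 KB cpr_buf,
 * cpr_buf_blocks is btodb(128K) = 256.  cpr_write() below copies
 * callers' data into cpr_buf until cpr_wptr reaches cpr_buf_end, then
 * issues one VOP_DUMP of cpr_buf_blocks blocks at cpr_file_bn and
 * advances cpr_file_bn by that count; any partially filled buffer is
 * pushed out later by cpr_flush_write().
 */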

int
cpr_write(vnode_t *vp, caddr_t buffer, size_t size)
{
	caddr_t fromp = buffer;
	size_t bytes, wbytes;
	int error;

	if (cpr_dev_space == 0) {
		if (vp->v_type == VBLK) {
			cpr_dev_space = cpr_get_devsize(vp->v_rdev);
			ASSERT(cpr_dev_space);
		} else
			cpr_dev_space = 1;	/* not used in this case */
	}

	/*
	 * break the write into multiple parts if the request is large;
	 * calculate the count up to the buf page boundary, then write it
	 * out.  Repeat until done.
	 */
	while (size) {
		bytes = MIN(size, cpr_buf_end - cpr_wptr);
		cprbcopy(fromp, cpr_wptr, bytes);
		cpr_wptr += bytes;
		fromp += bytes;
		size -= bytes;
		if (cpr_wptr < cpr_buf_end)
			return (0);	/* buffer not full yet */
		ASSERT(cpr_wptr == cpr_buf_end);

		wbytes = dbtob(cpr_file_bn + cpr_buf_blocks);
		if (vp->v_type == VBLK) {
			if (wbytes > cpr_dev_space)
				return (ENOSPC);
		} else {
			if (wbytes > VTOI(vp)->i_size)
				return (ENOSPC);
		}

		DEBUG3(errp("cpr_write: frmp=%p wptr=%p cnt=%lx...",
		    fromp, cpr_wptr, bytes));
		/*
		 * cross check, this should not happen!
		 */
		if (cpr_disk_writes_ok == 0) {
			errp("cpr_write: disk write too early!\n");
			return (EINVAL);
		}

		do_polled_io = 1;
		error = VOP_DUMP(vp, cpr_buf, cpr_file_bn, cpr_buf_blocks);
		do_polled_io = 0;
		DEBUG3(errp("done\n"));

		STAT->cs_real_statefsz += cpr_buf_size;

		if (error) {
			cpr_err(CE_WARN, "cpr_write error %d", error);
			return (error);
		}
		cpr_file_bn += cpr_buf_blocks;	/* Increment block count */
		cpr_wptr = cpr_buf;		/* back to top of buffer */
	}
	return (0);
}


int
cpr_flush_write(vnode_t *vp)
{
	int nblk;
	int error;

	/*
	 * Calculate remaining blocks in buffer, rounded up to nearest
	 * disk block
	 */
	nblk = btod(cpr_wptr - cpr_buf);

	do_polled_io = 1;
	error = VOP_DUMP(vp, (caddr_t)cpr_buf, cpr_file_bn, nblk);
	do_polled_io = 0;

	cpr_file_bn += nblk;
	if (error)
		DEBUG2(errp("cpr_flush_write: error (%d)\n", error));
	return (error);
}

void
cpr_clear_bitmaps(void)
{
	cbd_t *dp;

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		bzero((void *)dp->cbd_reg_bitmap,
		    (size_t)dp->cbd_size * 2);
	}
	DEBUG7(errp("\ncleared reg and vlt bitmaps\n"));
}

int
cpr_contig_pages(vnode_t *vp, int flag)
{
	int chunks = 0, error = 0;
	pgcnt_t i, j, totbit;
	pfn_t spfn;
	cbd_t *dp;
	uint_t	spin_cnt = 0;
	extern	int i_cpr_compress_and_save();

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		spfn = dp->cbd_spfn;
		totbit = BTOb(dp->cbd_size);
		i = 0; /* Beginning of bitmap */
		j = 0;
		while (i < totbit) {
			while ((j < CPR_MAXCONTIG) && ((j + i) < totbit)) {
				if (isset((char *)dp->cbd_reg_bitmap, j+i))
					j++;
				else /* not contiguous anymore */
					break;
			}

			if (j) {
				chunks++;
				if (flag == SAVE_TO_STORAGE) {
					error = i_cpr_compress_and_save(
					    chunks, spfn + i, j);
					if (error)
						return (error);
				} else if (flag == WRITE_TO_STATEFILE) {
					error = cpr_compress_and_write(vp, 0,
					    spfn + i, j);
					if (error)
						return (error);
					else {
						spin_cnt++;
						if ((spin_cnt & 0x5F) == 1)
							cpr_spinning_bar();
					}
				}
			}

			i += j;
			if (j != CPR_MAXCONTIG) {
				/* Stopped on a non-tagged page */
				i++;
			}

			j = 0;
		}
	}

	if (flag == STORAGE_DESC_ALLOC)
		return (chunks);
	else
		return (0);
}


void
cpr_show_range(caddr_t vaddr, size_t size,
    int mapflag, bitfunc_t bitfunc, pgcnt_t count)
{
	char *action, *bname;

	bname = (mapflag == REGULAR_BITMAP) ? "regular" : "volatile";
	if (bitfunc == cpr_setbit)
		action = "tag";
	else if (bitfunc == cpr_clrbit)
		action = "untag";
	else
		action = "none";
	errp("range (0x%p, 0x%p), %s bitmap, %s %ld\n",
	    vaddr, vaddr + size, bname, action, count);
}


pgcnt_t
cpr_count_pages(caddr_t sva, size_t size,
    int mapflag, bitfunc_t bitfunc, int showrange)
{
	caddr_t va, eva;
	pfn_t pfn;
	pgcnt_t count = 0;

	eva = sva + PAGE_ROUNDUP(size);
	for (va = sva; va < eva; va += MMU_PAGESIZE) {
		pfn = va_to_pfn(va);
		if (pfn != PFN_INVALID && pf_is_memory(pfn)) {
			if ((*bitfunc)(pfn, mapflag) == 0)
				count++;
		}
	}

	if ((cpr_debug & LEVEL7) && showrange == DBG_SHOWRANGE)
		cpr_show_range(sva, size, mapflag, bitfunc, count);

	return (count);
}


pgcnt_t
cpr_count_volatile_pages(int mapflag, bitfunc_t bitfunc)
{
	pgcnt_t count = 0;

	if (cpr_buf) {
		count += cpr_count_pages(cpr_buf, cpr_buf_size,
		    mapflag, bitfunc, DBG_SHOWRANGE);
	}
	if (cpr_pagedata) {
		count += cpr_count_pages(cpr_pagedata, cpr_pagedata_size,
		    mapflag, bitfunc, DBG_SHOWRANGE);
	}
	count += i_cpr_count_storage_pages(mapflag, bitfunc);

	DEBUG7(errp("cpr_count_vpages: %ld pages, 0x%lx bytes\n",
	    count, mmu_ptob(count)));
	return (count);
}


static int
cpr_dump_regular_pages(vnode_t *vp)
{
	int error;

	cpr_regular_pgs_dumped = 0;
	error = cpr_contig_pages(vp, WRITE_TO_STATEFILE);
	if (!error)
		DEBUG7(errp("cpr_dump_regular_pages() done.\n"));
	return (error);
}