/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Fill in and write out the cpr state file:
 *	1. Allocate and write headers, ELF and cpr dump header
 *	2. Allocate bitmaps according to phys_install
 *	3. Tag kernel pages into the corresponding bitmap
 *	4. Write bitmaps to the state file
 *	5. Write actual physical page data to the state file
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/vm.h>
#include <sys/memlist.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/fs/ufs_inode.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>
#include <sys/cpr.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/panic.h>
#include <sys/thread.h>

/* Local defines and variables */
#define	BTOb(bytes)	((bytes) << 3)		/* bytes to bits, log2(NBBY) */
#define	bTOB(bits)	((bits) >> 3)		/* bits to bytes, log2(NBBY) */

static uint_t cpr_pages_tobe_dumped;
static uint_t cpr_regular_pgs_dumped;

static int cpr_dump_regular_pages(vnode_t *);
static int cpr_count_upages(int, bitfunc_t);
static int cpr_compress_and_write(vnode_t *, uint_t, pfn_t, pgcnt_t);
int cpr_flush_write(vnode_t *);

int cpr_contig_pages(vnode_t *, int);

void cpr_clear_bitmaps(void);

extern size_t cpr_get_devsize(dev_t);
extern int i_cpr_dump_setup(vnode_t *);
extern int i_cpr_blockzero(char *, char **, int *, vnode_t *);
extern int cpr_test_mode;

ctrm_t cpr_term;

char *cpr_buf, *cpr_buf_end;
int cpr_buf_blocks;		/* size of cpr_buf in blocks */
size_t cpr_buf_size;		/* size of cpr_buf in bytes */
size_t cpr_bitmap_size;
int cpr_nbitmaps;

char *cpr_pagedata;		/* page buffer for compression / tmp copy */
size_t cpr_pagedata_size;	/* page buffer size in bytes */

static char *cpr_wptr;		/* keep track of where to write to next */
static int cpr_file_bn;		/* cpr state-file block offset */
static int cpr_disk_writes_ok;
static size_t cpr_dev_space = 0;

char cpr_pagecopy[CPR_MAXCONTIG * MMU_PAGESIZE];
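
/*
 * Sketch of the statefile layout the routines below produce, in write
 * order (derived from the cpr_dump() sequence in this file; record
 * sizes and the machdep section are platform dependent):
 *
 *	+------------------------------+
 *	| cpr dump header (cdd_t)      |  cpr_write_header()
 *	| machdep section (cmd_t)      |  i_cpr_write_machdep()
 *	| bitmap descriptors (cbd_t[]) |  cpr_write_bitmap()
 *	| merged bitmaps               |  cpr_write_bitmap()
 *	| page records (cpd_t + data)  |  cpr_write_statefile()
 *	| terminator (cpr_term)        |  cpr_write_terminator()
 *	+------------------------------+
 */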
/*
 * On some platforms bcopy may modify the thread structure
 * during bcopy (e.g., to prevent cpu migration).  If the
 * range we are currently writing out includes our own
 * thread structure, then it will be snapshotted by bcopy
 * including those modified members -- and the updates made
 * on exit from bcopy will no longer be seen when we later
 * restore the mid-bcopy kthread_t.  So if the range we
 * need to copy overlaps with our thread structure, we use
 * a simple byte copy.
 */
void
cprbcopy(void *from, void *to, size_t bytes)
{
	extern int curthreadremapped;
	caddr_t kthrend;

	kthrend = (caddr_t)curthread + sizeof (kthread_t) - 1;
	if (curthreadremapped || (kthrend >= (caddr_t)from &&
	    kthrend < (caddr_t)from + bytes + sizeof (kthread_t) - 1)) {
		caddr_t src = from, dst = to;

		while (bytes-- > 0)
			*dst++ = *src++;
	} else {
		bcopy(from, to, bytes);
	}
}

/*
 * Allocate pages for buffers used in writing out the statefile
 */
static int
cpr_alloc_bufs(void)
{
	char *allocerr = "Unable to allocate memory for cpr buffer";
	size_t size;

	/*
	 * Set the cpr write buffer size to at least the historic
	 * size (128k), or large enough to store both the early
	 * set of statefile structures (well under 0x800) plus the
	 * bitmaps, and round up to the next pagesize.
	 */
	size = PAGE_ROUNDUP(dbtob(4) + cpr_bitmap_size);
	cpr_buf_size = MAX(size, CPRBUFSZ);
	cpr_buf_blocks = btodb(cpr_buf_size);
	cpr_buf = kmem_alloc(cpr_buf_size, KM_NOSLEEP);
	if (cpr_buf == NULL) {
		cpr_err(CE_WARN, allocerr);
		return (ENOMEM);
	}
	cpr_buf_end = cpr_buf + cpr_buf_size;

	cpr_pagedata_size = mmu_ptob(CPR_MAXCONTIG + 1);
	cpr_pagedata = kmem_alloc(cpr_pagedata_size, KM_NOSLEEP);
	if (cpr_pagedata == NULL) {
		kmem_free(cpr_buf, cpr_buf_size);
		cpr_buf = NULL;
		cpr_err(CE_WARN, allocerr);
		return (ENOMEM);
	}

	return (0);
}


/*
 * Set bitmap size in bytes based on phys_install.
 */
void
cpr_set_bitmap_size(void)
{
	struct memlist *pmem;
	size_t size = 0;

	memlist_read_lock();
	for (pmem = phys_install; pmem; pmem = pmem->next)
		size += pmem->size;
	memlist_read_unlock();
	cpr_bitmap_size = BITMAP_BYTES(size);
}
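
/*
 * An illustration of the sizing above, assuming one bitmap bit per
 * physical page: with 8K MMU pages, 4 GB of phys_install is 512K
 * pages, so BITMAP_BYTES() would come to 64 KB for that range.  The
 * numbers are an example only; the real values come from the platform
 * mmu constants and the BITMAP_BYTES definition.
 */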
/*
 * The CPR dump header contains the following information:
 *	1. header magic -- unique to the cpr state file
 *	2. kernel return pc & ppn for resume
 *	3. current thread info
 *	4. debug level and test mode
 *	5. number of bitmaps allocated
 *	6. number of page records
 */
static int
cpr_write_header(vnode_t *vp)
{
	extern ushort_t cpr_mach_type;
	struct cpr_dump_desc cdump;
	pgcnt_t bitmap_pages;
	pgcnt_t kpages, vpages, upages;

	cdump.cdd_magic = (uint_t)CPR_DUMP_MAGIC;
	cdump.cdd_version = CPR_VERSION;
	cdump.cdd_machine = cpr_mach_type;
	cdump.cdd_debug = cpr_debug;
	cdump.cdd_test_mode = cpr_test_mode;
	cdump.cdd_bitmaprec = cpr_nbitmaps;

	cpr_clear_bitmaps();

	/*
	 * Remember how many pages we plan to save to the statefile.
	 * This information will be used for sanity checks.
	 * Untag those pages that will not be saved to the statefile.
	 */
	kpages = cpr_count_kpages(REGULAR_BITMAP, cpr_setbit);
	vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);
	upages = cpr_count_upages(REGULAR_BITMAP, cpr_setbit);
	cdump.cdd_dumppgsize = kpages - vpages + upages;
	cpr_pages_tobe_dumped = cdump.cdd_dumppgsize;
	CPR_DEBUG(CPR_DEBUG7,
	    "\ncpr_write_header: kpages %ld - vpages %ld + upages %ld = %d\n",
	    kpages, vpages, upages, cdump.cdd_dumppgsize);

	/*
	 * Some pages contain volatile data (cpr_buf and the storage area
	 * for sensitive kpages), which are no longer needed after the
	 * statefile is dumped to disk.  We have already untagged them from
	 * the regular bitmaps.  Now tag them into the volatile bitmaps.
	 * The pages in the volatile bitmaps will be claimed during resume,
	 * and the resumed kernel will free them.
	 */
	(void) cpr_count_volatile_pages(VOLATILE_BITMAP, cpr_setbit);

	bitmap_pages = mmu_btopr(cpr_bitmap_size);

	/*
	 * Export an accurate statefile size for statefile allocation retry.
	 * statefile_size = all the headers + total pages +
	 * number of pages used by the bitmaps.
	 * Rounding up is done in the file allocation code.
	 */
	STAT->cs_nocomp_statefsz = sizeof (cdd_t) + sizeof (cmd_t) +
	    (sizeof (cbd_t) * cdump.cdd_bitmaprec) +
	    (sizeof (cpd_t) * cdump.cdd_dumppgsize) +
	    mmu_ptob(cdump.cdd_dumppgsize + bitmap_pages);

	/*
	 * If the estimated statefile is not big enough,
	 * retry now to save unnecessary operations.
	 */
	if (!(CPR->c_flags & C_COMPRESSING) &&
	    (STAT->cs_nocomp_statefsz > STAT->cs_est_statefsz)) {
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
			prom_printf(
			    "cpr_write_header: STAT->cs_nocomp_statefsz > "
			    "STAT->cs_est_statefsz\n");
		return (ENOSPC);
	}

	/* now write the cpr dump descriptor */
	return (cpr_write(vp, (caddr_t)&cdump, sizeof (cdd_t)));
}


/*
 * The CPR dump tail record contains the following information:
 *	1. header magic -- unique to the cpr state file
 *	2. all misc info that needs to be passed to cprboot or the
 *	   resumed kernel
 */
static int
cpr_write_terminator(vnode_t *vp)
{
	cpr_term.magic = (uint_t)CPR_TERM_MAGIC;
	cpr_term.va = (cpr_ptr)&cpr_term;
	cpr_term.pfn = (cpr_ext)va_to_pfn(&cpr_term);

	/* count the last one (flush) */
	cpr_term.real_statef_size = STAT->cs_real_statefsz +
	    btod(cpr_wptr - cpr_buf) * DEV_BSIZE;

	CPR_DEBUG(CPR_DEBUG9, "cpr_dump: Real Statefile Size: %ld\n",
	    STAT->cs_real_statefsz);

	cpr_tod_get(&cpr_term.tm_shutdown);

	return (cpr_write(vp, (caddr_t)&cpr_term, sizeof (cpr_term)));
}
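
/*
 * Note on the terminator arithmetic above: btod() rounds bytes up to
 * whole disk blocks, so the pending data still sitting in cpr_buf is
 * counted as the blocks cpr_flush_write() will eventually write; e.g.
 * with 512-byte blocks, 700 buffered bytes count as 1024 (illustrative
 * numbers, assuming the usual DEV_BSIZE of 512).
 */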
/*
 * Write the bitmap descriptor array, followed by the merged bitmaps.
 */
static int
cpr_write_bitmap(vnode_t *vp)
{
	char *rmap, *vmap, *dst, *tail;
	size_t size, bytes;
	cbd_t *dp;
	int err;

	dp = CPR->c_bmda;
	if (err = cpr_write(vp, (caddr_t)dp, cpr_nbitmaps * sizeof (*dp)))
		return (err);

	/*
	 * Merge the regular and volatile bitmaps into tmp space
	 * and write to disk.
	 */
	for (; dp->cbd_size; dp++) {
		rmap = (char *)dp->cbd_reg_bitmap;
		vmap = (char *)dp->cbd_vlt_bitmap;
		for (size = dp->cbd_size; size; size -= bytes) {
			bytes = min(size, sizeof (cpr_pagecopy));
			tail = &cpr_pagecopy[bytes];
			for (dst = cpr_pagecopy; dst < tail; dst++)
				*dst = *rmap++ | *vmap++;
			if (err = cpr_write(vp, cpr_pagecopy, bytes))
				break;
		}
	}

	return (err);
}


static int
cpr_write_statefile(vnode_t *vp)
{
	uint_t error = 0;
	extern int i_cpr_check_pgs_dumped();
	void flush_windows(void);
	pgcnt_t spages;
	char *str;

	flush_windows();

	/*
	 * To get an accurate view of kas, we need to untag sensitive
	 * pages *before* dumping them because the disk driver makes
	 * allocations and changes kas along the way.  The remaining
	 * pages referenced in the bitmaps are dumped out later as
	 * regular kpages.
	 */
	str = "cpr_write_statefile:";
	spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_clrbit);
	CPR_DEBUG(CPR_DEBUG7, "%s untag %ld sens pages\n", str, spages);

	/*
	 * Now it's OK to call a driver that makes allocations.
	 */
	cpr_disk_writes_ok = 1;

	/*
	 * Now write out the clean sensitive kpages
	 * according to the sensitive descriptors.
	 */
	error = i_cpr_dump_sensitive_kpages(vp);
	if (error) {
		CPR_DEBUG(CPR_DEBUG7,
		    "%s cpr_dump_sensitive_kpages() failed!\n", str);
		return (error);
	}

	/*
	 * cpr_dump_regular_pages() counts cpr_regular_pgs_dumped
	 */
	error = cpr_dump_regular_pages(vp);
	if (error) {
		CPR_DEBUG(CPR_DEBUG7,
		    "%s cpr_dump_regular_pages() failed!\n", str);
		return (error);
	}

	/*
	 * Sanity check to verify the right number of pages were dumped.
	 */
	error = i_cpr_check_pgs_dumped(cpr_pages_tobe_dumped,
	    cpr_regular_pgs_dumped);

	if (error) {
		prom_printf("\n%s page count mismatch!\n", str);
#ifdef DEBUG
		if (cpr_test_mode)
			debug_enter(NULL);
#endif
	}

	return (error);
}
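
/*
 * A note on the i_cpr_blockzero() calls in cpr_dump() below: one is
 * made before the bitmaps are written and another after the final
 * flush.  The apparent intent (an inference from the call sites here,
 * not from the platform-specific definition) is that statefile block
 * zero is only finalized once everything else is safely on disk, so a
 * partially written statefile cannot be mistaken for a complete one.
 */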
/*
 * Create the CPR state file; the following sections are
 * written out in sequence:
 *	- the cpr dump header
 *	- the memory usage bitmaps
 *	- the platform dependent info
 *	- the remaining user pages
 *	- the kernel pages
 */
int
cpr_dump(vnode_t *vp)
{
	int error;

	if (cpr_buf == NULL) {
		ASSERT(cpr_pagedata == NULL);
		if (error = cpr_alloc_bufs())
			return (error);
	}
	/* point to the top of the internal buffer */
	cpr_wptr = cpr_buf;

	/* initialize global variables used by the write operation */
	cpr_file_bn = cpr_statefile_offset();
	cpr_dev_space = 0;

	/* allocate bitmaps */
	if (CPR->c_bmda == NULL) {
		if (error = i_cpr_alloc_bitmaps()) {
			cpr_err(CE_WARN, "cannot allocate bitmaps");
			return (error);
		}
	}

	if (error = i_cpr_prom_pages(CPR_PROM_SAVE))
		return (error);

	if (error = i_cpr_dump_setup(vp))
		return (error);

	/*
	 * Set internal cross checking; we don't want to call
	 * a disk driver that makes allocations until after
	 * the sensitive pages are saved.
	 */
	cpr_disk_writes_ok = 0;

	/*
	 * 1253112: heap corruption due to memory allocation when dumping
	 * the statefile.
	 * Theoretically on Sun4u only the kernel data nucleus, kvalloc and
	 * kvseg segments can be contaminated should memory allocations
	 * happen during sddump, which is not supposed to happen after the
	 * system is quiesced.  Let's call the kernel pages that tend to be
	 * affected 'sensitive kpages' here.  To avoid saving inconsistent
	 * pages, we will allocate some storage space to save the clean
	 * sensitive pages aside before statefile dumping takes place.
	 * Since there may not be much memory left at this stage, the
	 * sensitive pages will be compressed before they are saved into
	 * the storage area.
	 */
	if (error = i_cpr_save_sensitive_kpages()) {
		CPR_DEBUG(CPR_DEBUG7,
		    "cpr_dump: save_sensitive_kpages failed!\n");
		return (error);
	}

	/*
	 * Since all cpr allocations are done (space for sensitive kpages,
	 * bitmaps, cpr_buf), kas is stable, and now we can accurately
	 * count regular and sensitive kpages.
	 */
	if (error = cpr_write_header(vp)) {
		CPR_DEBUG(CPR_DEBUG7,
		    "cpr_dump: cpr_write_header() failed!\n");
		return (error);
	}

	if (error = i_cpr_write_machdep(vp))
		return (error);

	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, NULL, NULL))
		return (error);

	if (error = cpr_write_bitmap(vp))
		return (error);

	if (error = cpr_write_statefile(vp)) {
		CPR_DEBUG(CPR_DEBUG7,
		    "cpr_dump: cpr_write_statefile() failed!\n");
		return (error);
	}

	if (error = cpr_write_terminator(vp))
		return (error);

	if (error = cpr_flush_write(vp))
		return (error);

	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, &cpr_file_bn, vp))
		return (error);

	return (0);
}


/*
 * cpr_xwalk() is the vmem_walk() callback; it is called many hundreds
 * of times with a range within kvseg or kvseg_reloc, and a page count
 * from each range is accumulated at arg->pages.
 */
static void
cpr_xwalk(void *arg, void *base, size_t size)
{
	struct cpr_walkinfo *cwip = arg;

	cwip->pages += cpr_count_pages(base, size,
	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
	cwip->size += size;
	cwip->ranges++;
}

/*
 * cpr_walk() is called many hundreds of times with a range within
 * kvseg or kvseg_reloc; a page count from each range is accumulated
 * at arg->pages.
 */
static void
cpr_walk(void *arg, void *base, size_t size)
{
	caddr_t addr = base;
	caddr_t addr_end = addr + size;

	/*
	 * If we are about to start walking the range of addresses we
	 * carved out of the kernel heap for the large page heap, walk
	 * heap_lp_arena to find out what segments are actually populated.
	 */
	if (SEGKMEM_USE_LARGEPAGES &&
	    addr == heap_lp_base && addr_end == heap_lp_end &&
	    vmem_size(heap_lp_arena, VMEM_ALLOC) < size) {
		vmem_walk(heap_lp_arena, VMEM_ALLOC, cpr_xwalk, arg);
	} else {
		cpr_xwalk(arg, base, size);
	}
}
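
/*
 * In cpr_walk() above, vmem_size(heap_lp_arena, VMEM_ALLOC) < size
 * means the large-page heap carve-out is only partially populated, so
 * walking just the allocated spans of heap_lp_arena is cheaper than
 * scanning the whole [heap_lp_base, heap_lp_end) range page by page.
 */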
/*
 * Faster scan of kvseg using vmem_walk() to visit only
 * allocated ranges.
 */
pgcnt_t
cpr_scan_kvseg(int mapflag, bitfunc_t bitfunc, struct seg *seg)
{
	struct cpr_walkinfo cwinfo;

	bzero(&cwinfo, sizeof (cwinfo));
	cwinfo.mapflag = mapflag;
	cwinfo.bitfunc = bitfunc;

	vmem_walk(heap_arena, VMEM_ALLOC, cpr_walk, &cwinfo);

	if (cpr_debug & CPR_DEBUG7) {
		prom_printf("walked %d sub-ranges, total pages %ld\n",
		    cwinfo.ranges, mmu_btop(cwinfo.size));
		cpr_show_range(seg->s_base, seg->s_size,
		    mapflag, bitfunc, cwinfo.pages);
	}

	return (cwinfo.pages);
}


/*
 * cpr_walk_kpm() is called for every used area within the large
 * segkpm virtual address window.  A page count is accumulated at
 * arg->pages.
 */
static void
cpr_walk_kpm(void *arg, void *base, size_t size)
{
	struct cpr_walkinfo *cwip = arg;

	cwip->pages += cpr_count_pages(base, size,
	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
	cwip->size += size;
	cwip->ranges++;
}


/*
 * Faster scan of segkpm using hat_kpm_walk() to visit only used ranges.
 */
/*ARGSUSED*/
static pgcnt_t
cpr_scan_segkpm(int mapflag, bitfunc_t bitfunc, struct seg *seg)
{
	struct cpr_walkinfo cwinfo;

	if (kpm_enable == 0)
		return (0);

	bzero(&cwinfo, sizeof (cwinfo));
	cwinfo.mapflag = mapflag;
	cwinfo.bitfunc = bitfunc;
	hat_kpm_walk(cpr_walk_kpm, &cwinfo);

	if (cpr_debug & CPR_DEBUG7) {
		prom_printf("walked %d sub-ranges, total pages %ld\n",
		    cwinfo.ranges, mmu_btop(cwinfo.size));
		cpr_show_range(segkpm->s_base, segkpm->s_size,
		    mapflag, bitfunc, cwinfo.pages);
	}

	return (cwinfo.pages);
}


/*
 * Sparsely filled kernel segments are registered in kseg_table for
 * easier lookup.  See also the block comment for cpr_count_seg_pages.
 */

#define	KSEG_SEG_ADDR	0	/* address of struct seg */
#define	KSEG_PTR_ADDR	1	/* address of pointer to struct seg */

typedef struct {
	struct seg **st_seg;	/* segment pointer or segment address */
	pgcnt_t (*st_fcn)(int, bitfunc_t, struct seg *); /* function to call */
	int st_addrtype;	/* address type in st_seg */
} ksegtbl_entry_t;

ksegtbl_entry_t kseg_table[] = {
	{(struct seg **)&kvseg,	cpr_scan_kvseg,		KSEG_SEG_ADDR},
	{&segkpm,		cpr_scan_segkpm,	KSEG_PTR_ADDR},
	{NULL,			0,			0}
};


/*
 * Compare seg with each entry in kseg_table; when there is a match,
 * return the entry pointer, otherwise return NULL.
 */
static ksegtbl_entry_t *
cpr_sparse_seg_check(struct seg *seg)
{
	ksegtbl_entry_t *ste = &kseg_table[0];
	struct seg *tseg;

	for (; ste->st_seg; ste++) {
		tseg = (ste->st_addrtype == KSEG_PTR_ADDR) ?
		    *ste->st_seg : (struct seg *)ste->st_seg;
		if (seg == tseg)
			return (ste);
	}

	return ((ksegtbl_entry_t *)NULL);
}
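
/*
 * To register another sparsely filled segment, one would add an entry
 * ahead of the NULL terminator of kseg_table above, e.g. (hypothetical
 * segment and scan function):
 *
 *	{&segfoo,	cpr_scan_segfoo,	KSEG_PTR_ADDR},
 *
 * using KSEG_SEG_ADDR instead when the table stores the address of the
 * segment itself rather than a pointer to it.
 */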
/*
 * Count pages within each kernel segment; call cpr_sparse_seg_check()
 * to find out whether a sparsely filled segment needs special
 * treatment (e.g. kvseg).
 * Todo: A "SEGOP_CPR" like SEGOP_DUMP should be introduced; the cpr
 *	module shouldn't need to know segment details such as whether
 *	it is sparsely filled or not (that would make kseg_table
 *	obsolete).
 */
pgcnt_t
cpr_count_seg_pages(int mapflag, bitfunc_t bitfunc)
{
	struct seg *segp;
	pgcnt_t pages;
	ksegtbl_entry_t *ste;

	pages = 0;
	for (segp = AS_SEGFIRST(&kas); segp; segp = AS_SEGNEXT(&kas, segp)) {
		if (ste = cpr_sparse_seg_check(segp)) {
			pages += (ste->st_fcn)(mapflag, bitfunc, segp);
		} else {
			pages += cpr_count_pages(segp->s_base,
			    segp->s_size, mapflag, bitfunc, DBG_SHOWRANGE);
		}
	}

	return (pages);
}


/*
 * Count kernel pages within kas and any special ranges.
 */
pgcnt_t
cpr_count_kpages(int mapflag, bitfunc_t bitfunc)
{
	pgcnt_t kas_cnt;

	/*
	 * Some pages need to be taken care of differently;
	 * e.g. the panicbuf pages of sun4m are not in kas, but they
	 * need to be saved.  On sun4u, the physical pages of panicbuf
	 * are allocated via prom_retain().
	 */
	kas_cnt = i_cpr_count_special_kpages(mapflag, bitfunc);
	kas_cnt += cpr_count_seg_pages(mapflag, bitfunc);

	CPR_DEBUG(CPR_DEBUG9, "cpr_count_kpages: kas_cnt=%ld\n", kas_cnt);
	CPR_DEBUG(CPR_DEBUG7, "\ncpr_count_kpages: %ld pages, 0x%lx bytes\n",
	    kas_cnt, mmu_ptob(kas_cnt));
	return (kas_cnt);
}


/*
 * Set the bit corresponding to the arg phys page number;
 * returns 0 when the ppn is valid and the corresponding
 * map bit was clear, otherwise returns 1.
 */
int
cpr_setbit(pfn_t ppn, int mapflag)
{
	char *bitmap;
	cbd_t *dp;
	pfn_t rel;
	int clr;

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		if (PPN_IN_RANGE(ppn, dp)) {
			bitmap = DESC_TO_MAP(dp, mapflag);
			rel = ppn - dp->cbd_spfn;
			if ((clr = isclr(bitmap, rel)) != 0)
				setbit(bitmap, rel);
			return (clr == 0);
		}
	}

	return (1);
}


/*
 * Clear the bit corresponding to the arg phys page number.
 */
int
cpr_clrbit(pfn_t ppn, int mapflag)
{
	char *bitmap;
	cbd_t *dp;
	pfn_t rel;
	int set;

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		if (PPN_IN_RANGE(ppn, dp)) {
			bitmap = DESC_TO_MAP(dp, mapflag);
			rel = ppn - dp->cbd_spfn;
			if ((set = isset(bitmap, rel)) != 0)
				clrbit(bitmap, rel);
			return (set == 0);
		}
	}

	return (1);
}


/* ARGSUSED */
int
cpr_nobit(pfn_t ppn, int mapflag)
{
	return (0);
}


/*
 * Lookup the bit corresponding to the arg phys page number.
 */
int
cpr_isset(pfn_t ppn, int mapflag)
{
	char *bitmap;
	cbd_t *dp;
	pfn_t rel;

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		if (PPN_IN_RANGE(ppn, dp)) {
			bitmap = DESC_TO_MAP(dp, mapflag);
			rel = ppn - dp->cbd_spfn;
			return (isset(bitmap, rel));
		}
	}

	return (0);
}
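
/*
 * Return convention shared by cpr_setbit() and cpr_clrbit() above:
 * 0 means the ppn was in range and the bit actually changed state;
 * 1 means the bit was already in the requested state, or the ppn fell
 * outside every descriptor range.  The counting loops in this file
 * rely on this, treating (*bitfunc)(pfn, mapflag) == 0 as "newly
 * counted", so a page visited more than once is only counted once.
 */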
/*
 * Go through all pages and pick up any page not caught during the
 * invalidation stage.  This is also used to save pages with a cow
 * lock or phys page lock held (nonzero p_lckcnt or p_cowcnt).
 */
static int
cpr_count_upages(int mapflag, bitfunc_t bitfunc)
{
	page_t *pp, *page0;
	pgcnt_t dcnt = 0, tcnt = 0;
	pfn_t pfn;

	page0 = pp = page_first();

	do {
#if defined(__sparc)
		extern struct vnode prom_ppages;
		if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
		    pp->p_vnode == &prom_ppages ||
		    (PP_ISFREE(pp) && PP_ISAGED(pp)))
#else
		if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
		    (PP_ISFREE(pp) && PP_ISAGED(pp)))
#endif /* __sparc */
			continue;

		pfn = page_pptonum(pp);
		if (pf_is_memory(pfn)) {
			tcnt++;
			if ((*bitfunc)(pfn, mapflag) == 0)
				dcnt++; /* dirty count */
		}
	} while ((pp = page_next(pp)) != page0);

	STAT->cs_upage2statef = dcnt;
	CPR_DEBUG(CPR_DEBUG9, "cpr_count_upages: dirty=%ld total=%ld\n",
	    dcnt, tcnt);
	CPR_DEBUG(CPR_DEBUG7, "cpr_count_upages: %ld pages, 0x%lx bytes\n",
	    dcnt, mmu_ptob(dcnt));
	return (dcnt);
}


/*
 * Try compressing pages based on cflag,
 * and for DEBUG kernels, verify the uncompressed data checksum.
 *
 * This routine replaces common code from
 * i_cpr_compress_and_save() and cpr_compress_and_write().
 */
char *
cpr_compress_pages(cpd_t *dp, pgcnt_t pages, int cflag)
{
	size_t nbytes, clen, len;
	uint32_t test_sum;
	char *datap;

	nbytes = mmu_ptob(pages);

	/*
	 * Set length to the original uncompressed data size;
	 * always init cpd_flag to zero.
	 */
	dp->cpd_length = nbytes;
	dp->cpd_flag = 0;

#ifdef DEBUG
	/*
	 * Make a copy of the uncompressed data so we can checksum it.
	 * Compress that copy so the checksum works at the other end.
	 */
	cprbcopy(CPR->c_mapping_area, cpr_pagecopy, nbytes);
	dp->cpd_usum = checksum32(cpr_pagecopy, nbytes);
	dp->cpd_flag |= CPD_USUM;
	datap = cpr_pagecopy;
#else
	datap = CPR->c_mapping_area;
	dp->cpd_usum = 0;
#endif

	/*
	 * Try compressing the raw data to cpr_pagedata;
	 * if there was a size reduction: record the new length,
	 * flag the compression, and point to the compressed data.
	 */
	dp->cpd_csum = 0;
	if (cflag) {
		clen = compress(datap, cpr_pagedata, nbytes);
		if (clen < nbytes) {
			dp->cpd_flag |= CPD_COMPRESS;
			dp->cpd_length = clen;
			datap = cpr_pagedata;
#ifdef DEBUG
			dp->cpd_csum = checksum32(datap, clen);
			dp->cpd_flag |= CPD_CSUM;

			/*
			 * Decompress the data back to a scratch area
			 * and compare the new checksum with the original
			 * checksum to verify the compression.
			 */
			bzero(cpr_pagecopy, sizeof (cpr_pagecopy));
			len = decompress(datap, cpr_pagecopy,
			    clen, sizeof (cpr_pagecopy));
			test_sum = checksum32(cpr_pagecopy, len);
			ASSERT(test_sum == dp->cpd_usum);
#endif
		}
	}

	return (datap);
}
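
/*
 * Each page record written by cpr_compress_and_write() below is a
 * cpd_t descriptor followed by cpd_length bytes of page data: the
 * compressed image when CPD_COMPRESS is set, the raw pages otherwise.
 * On DEBUG kernels, cpd_usum and cpd_csum (flagged by CPD_USUM and
 * CPD_CSUM) checksum the raw and the compressed bytes respectively,
 * so the restore path can verify both.
 */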
/*
 * 1. Prepare the cpr page descriptor and write it to the file.
 * 2. Compress the page data and write it out.
 */
static int
cpr_compress_and_write(vnode_t *vp, uint_t va, pfn_t pfn, pgcnt_t npg)
{
	int error = 0;
	char *datap;
	cpd_t cpd;	/* cpr page descriptor */
	extern void i_cpr_mapin(caddr_t, uint_t, pfn_t);
	extern void i_cpr_mapout(caddr_t, uint_t);

	i_cpr_mapin(CPR->c_mapping_area, npg, pfn);

	CPR_DEBUG(CPR_DEBUG3, "mapped-in %ld pages, vaddr 0x%p, pfn 0x%lx\n",
	    npg, CPR->c_mapping_area, pfn);

	/*
	 * Fill in the cpr page descriptor.
	 */
	cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC;
	cpd.cpd_pfn = pfn;
	cpd.cpd_pages = npg;

	STAT->cs_dumped_statefsz += mmu_ptob(npg);

	datap = cpr_compress_pages(&cpd, npg, CPR->c_flags & C_COMPRESSING);

	/* write the cpr page descriptor */
	error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd_t));

	/* write the compressed page data; don't mask a descriptor error */
	if (error == 0)
		error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length);

	/*
	 * Unmap the pages for tlb and vac flushing.
	 */
	i_cpr_mapout(CPR->c_mapping_area, npg);

	if (error) {
		CPR_DEBUG(CPR_DEBUG1,
		    "cpr_compress_and_write: vp 0x%p va 0x%x ", vp, va);
		CPR_DEBUG(CPR_DEBUG1, "pfn 0x%lx blk %d err %d\n",
		    pfn, cpr_file_bn, error);
	} else {
		cpr_regular_pgs_dumped += npg;
	}

	return (error);
}
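
/*
 * cpr_write() below only touches the device when its staging buffer
 * fills; short trailing data is pushed out later by cpr_flush_write().
 * As an illustration only: with the historic 128K cpr_buf, a stream of
 * 3K page records would cost one VOP_DUMP per roughly 43 records.
 */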
int
cpr_write(vnode_t *vp, caddr_t buffer, size_t size)
{
	caddr_t fromp = buffer;
	size_t bytes, wbytes;
	int error;

	if (cpr_dev_space == 0) {
		if (vp->v_type == VBLK) {
			cpr_dev_space = cpr_get_devsize(vp->v_rdev);
			ASSERT(cpr_dev_space);
		} else
			cpr_dev_space = 1;	/* not used in this case */
	}

	/*
	 * Break the write into multiple parts if the request is large:
	 * calculate the count up to the buf page boundary, then write
	 * it out.  Repeat until done.
	 */
	while (size) {
		bytes = MIN(size, cpr_buf_end - cpr_wptr);
		cprbcopy(fromp, cpr_wptr, bytes);
		cpr_wptr += bytes;
		fromp += bytes;
		size -= bytes;
		if (cpr_wptr < cpr_buf_end)
			return (0);	/* buffer not full yet */
		ASSERT(cpr_wptr == cpr_buf_end);

		wbytes = dbtob(cpr_file_bn + cpr_buf_blocks);
		if (vp->v_type == VBLK) {
			if (wbytes > cpr_dev_space)
				return (ENOSPC);
		} else {
			if (wbytes > VTOI(vp)->i_size)
				return (ENOSPC);
		}

		CPR_DEBUG(CPR_DEBUG3,
		    "cpr_write: frmp=%p wptr=%p cnt=%lx...",
		    fromp, cpr_wptr, bytes);
		/*
		 * Cross check; this should not happen!
		 */
		if (cpr_disk_writes_ok == 0) {
			prom_printf("cpr_write: disk write too early!\n");
			return (EINVAL);
		}

		do_polled_io = 1;
		error = VOP_DUMP(vp, cpr_buf, cpr_file_bn, cpr_buf_blocks);
		do_polled_io = 0;
		CPR_DEBUG(CPR_DEBUG3, "done\n");

		STAT->cs_real_statefsz += cpr_buf_size;

		if (error) {
			cpr_err(CE_WARN, "cpr_write error %d", error);
			return (error);
		}
		cpr_file_bn += cpr_buf_blocks;	/* increment block count */
		cpr_wptr = cpr_buf;		/* back to top of buffer */
	}
	return (0);
}


int
cpr_flush_write(vnode_t *vp)
{
	int nblk;
	int error;

	/*
	 * Calculate the remaining blocks in the buffer, rounded up
	 * to the nearest disk block.
	 */
	nblk = btod(cpr_wptr - cpr_buf);

	do_polled_io = 1;
	error = VOP_DUMP(vp, (caddr_t)cpr_buf, cpr_file_bn, nblk);
	do_polled_io = 0;

	cpr_file_bn += nblk;
	if (error)
		CPR_DEBUG(CPR_DEBUG2, "cpr_flush_write: error (%d)\n",
		    error);
	return (error);
}

void
cpr_clear_bitmaps(void)
{
	cbd_t *dp;

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		bzero((void *)dp->cbd_reg_bitmap,
		    (size_t)dp->cbd_size * 2);
	}
	CPR_DEBUG(CPR_DEBUG7, "\ncleared reg and vlt bitmaps\n");
}

int
cpr_contig_pages(vnode_t *vp, int flag)
{
	int chunks = 0, error = 0;
	pgcnt_t i, j, totbit;
	pfn_t spfn;
	cbd_t *dp;
	uint_t spin_cnt = 0;
	extern int i_cpr_compress_and_save();

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		spfn = dp->cbd_spfn;
		totbit = BTOb(dp->cbd_size);
		i = 0;	/* beginning of bitmap */
		j = 0;
		while (i < totbit) {
			while ((j < CPR_MAXCONTIG) && ((j + i) < totbit)) {
				if (isset((char *)dp->cbd_reg_bitmap, j+i))
					j++;
				else	/* not contiguous anymore */
					break;
			}

			if (j) {
				chunks++;
				if (flag == SAVE_TO_STORAGE) {
					error = i_cpr_compress_and_save(
					    chunks, spfn + i, j);
					if (error)
						return (error);
				} else if (flag == WRITE_TO_STATEFILE) {
					error = cpr_compress_and_write(vp, 0,
					    spfn + i, j);
					if (error)
						return (error);
					else {
						spin_cnt++;
						if ((spin_cnt & 0x5F) == 1)
							cpr_spinning_bar();
					}
				}
			}

			i += j;
			if (j != CPR_MAXCONTIG) {
				/* stopped on a non-tagged page */
				i++;
			}

			j = 0;
		}
	}

	if (flag == STORAGE_DESC_ALLOC)
		return (chunks);
	else
		return (0);
}
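
/*
 * An illustration of the chunking in cpr_contig_pages() above, with a
 * hypothetical CPR_MAXCONTIG of 4 and a bitmap run of 111011111: the
 * scan emits chunks of 3, 4 and 1 pages -- a clear bit ends a chunk,
 * and runs longer than CPR_MAXCONTIG are split at the limit.
 */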
"regular" : "volatile"; 1117 if (bitfunc == cpr_setbit) 1118 action = "tag"; 1119 else if (bitfunc == cpr_clrbit) 1120 action = "untag"; 1121 else 1122 action = "none"; 1123 prom_printf("range (0x%p, 0x%p), %s bitmap, %s %ld\n", 1124 vaddr, vaddr + size, bname, action, count); 1125 } 1126 1127 1128 pgcnt_t 1129 cpr_count_pages(caddr_t sva, size_t size, 1130 int mapflag, bitfunc_t bitfunc, int showrange) 1131 { 1132 caddr_t va, eva; 1133 pfn_t pfn; 1134 pgcnt_t count = 0; 1135 1136 eva = sva + PAGE_ROUNDUP(size); 1137 for (va = sva; va < eva; va += MMU_PAGESIZE) { 1138 pfn = va_to_pfn(va); 1139 if (pfn != PFN_INVALID && pf_is_memory(pfn)) { 1140 if ((*bitfunc)(pfn, mapflag) == 0) 1141 count++; 1142 } 1143 } 1144 1145 if ((cpr_debug & CPR_DEBUG7) && showrange == DBG_SHOWRANGE) 1146 cpr_show_range(sva, size, mapflag, bitfunc, count); 1147 1148 return (count); 1149 } 1150 1151 1152 pgcnt_t 1153 cpr_count_volatile_pages(int mapflag, bitfunc_t bitfunc) 1154 { 1155 pgcnt_t count = 0; 1156 1157 if (cpr_buf) { 1158 count += cpr_count_pages(cpr_buf, cpr_buf_size, 1159 mapflag, bitfunc, DBG_SHOWRANGE); 1160 } 1161 if (cpr_pagedata) { 1162 count += cpr_count_pages(cpr_pagedata, cpr_pagedata_size, 1163 mapflag, bitfunc, DBG_SHOWRANGE); 1164 } 1165 count += i_cpr_count_storage_pages(mapflag, bitfunc); 1166 1167 CPR_DEBUG(CPR_DEBUG7, "cpr_count_vpages: %ld pages, 0x%lx bytes\n", 1168 count, mmu_ptob(count)); 1169 return (count); 1170 } 1171 1172 1173 static int 1174 cpr_dump_regular_pages(vnode_t *vp) 1175 { 1176 int error; 1177 1178 cpr_regular_pgs_dumped = 0; 1179 error = cpr_contig_pages(vp, WRITE_TO_STATEFILE); 1180 if (!error) 1181 CPR_DEBUG(CPR_DEBUG7, "cpr_dump_regular_pages() done.\n"); 1182 return (error); 1183 } 1184