/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Fill in and write out the cpr state file
 *	1. Allocate and write headers, ELF and cpr dump header
 *	2. Allocate bitmaps according to phys_install
 *	3. Tag kernel pages into corresponding bitmap
 *	4. Write bitmaps to state file
 *	5. Write actual physical page data to state file
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/vm.h>
#include <sys/memlist.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/fs/ufs_inode.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>
#include <sys/cpr.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/panic.h>
#include <sys/thread.h>

/* Local defines and variables */
#define	BTOb(bytes)	((bytes) << 3)		/* Bytes to bits, log2(NBBY) */
#define	bTOB(bits)	((bits) >> 3)		/* bits to Bytes, log2(NBBY) */

static uint_t cpr_pages_tobe_dumped;
static uint_t cpr_regular_pgs_dumped;

static int cpr_dump_regular_pages(vnode_t *);
static int cpr_count_upages(int, bitfunc_t);
static int cpr_compress_and_write(vnode_t *, uint_t, pfn_t, pgcnt_t);
int cpr_flush_write(vnode_t *);

int cpr_contig_pages(vnode_t *, int);

void cpr_clear_bitmaps();

extern size_t cpr_get_devsize(dev_t);
extern int i_cpr_dump_setup(vnode_t *);
extern int i_cpr_blockzero(char *, char **, int *, vnode_t *);
extern int cpr_test_mode;

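/* statefile tail record; filled in and written out by cpr_write_terminator() */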
ctrm_t cpr_term;

char *cpr_buf, *cpr_buf_end;
int cpr_buf_blocks;		/* size of cpr_buf in blocks */
size_t cpr_buf_size;		/* size of cpr_buf in bytes */
size_t cpr_bitmap_size;
int cpr_nbitmaps;

char *cpr_pagedata;		/* page buffer for compression / tmp copy */
size_t cpr_pagedata_size;	/* page buffer size in bytes */

static char *cpr_wptr;		/* keep track of where to write to next */
static int cpr_file_bn;		/* cpr state-file block offset */
static int cpr_disk_writes_ok;
static size_t cpr_dev_space = 0;

char cpr_pagecopy[CPR_MAXCONTIG * MMU_PAGESIZE];

/*
 * On some platforms bcopy may modify the thread structure
 * during bcopy (eg, to prevent cpu migration).  If the
 * range we are currently writing out includes our own
 * thread structure then it will be snapshotted by bcopy
 * including those modified members - and the updates made
 * on exit from bcopy will no longer be seen when we later
 * restore the mid-bcopy kthread_t.  So if the range we
 * need to copy overlaps with our thread structure we will
 * use a simple byte copy.
 */
void
cprbcopy(void *from, void *to, size_t bytes)
{
	extern int curthreadremapped;
	caddr_t kthrend;

	kthrend = (caddr_t)curthread + sizeof (kthread_t) - 1;
	if (curthreadremapped || (kthrend >= (caddr_t)from &&
	    kthrend < (caddr_t)from + bytes + sizeof (kthread_t) - 1)) {
		caddr_t src = from, dst = to;

		while (bytes-- > 0)
			*dst++ = *src++;
	} else {
		bcopy(from, to, bytes);
	}
}

/*
 * Allocate pages for buffers used in writing out the statefile
 */
static int
cpr_alloc_bufs(void)
{
	char *allocerr = "Unable to allocate memory for cpr buffer";
	size_t size;

	/*
	 * set the cpr write buffer size to at least the historic
	 * size (128k) or large enough to store both the early
	 * set of statefile structures (well under 0x800) and the
	 * bitmaps, and round up to the next pagesize.
	 */
	size = PAGE_ROUNDUP(dbtob(4) + cpr_bitmap_size);
	cpr_buf_size = MAX(size, CPRBUFSZ);
	cpr_buf_blocks = btodb(cpr_buf_size);
	cpr_buf = kmem_alloc(cpr_buf_size, KM_NOSLEEP);
	if (cpr_buf == NULL) {
		cpr_err(CE_WARN, allocerr);
		return (ENOMEM);
	}
	cpr_buf_end = cpr_buf + cpr_buf_size;

	cpr_pagedata_size = mmu_ptob(CPR_MAXCONTIG + 1);
	cpr_pagedata = kmem_alloc(cpr_pagedata_size, KM_NOSLEEP);
	if (cpr_pagedata == NULL) {
		kmem_free(cpr_buf, cpr_buf_size);
		cpr_buf = NULL;
		cpr_err(CE_WARN, allocerr);
		return (ENOMEM);
	}

	return (0);
}


/*
 * Set bitmap size in bytes based on phys_install.
 */
void
cpr_set_bitmap_size(void)
{
	struct memlist *pmem;
	size_t size = 0;

	memlist_read_lock();
	for (pmem = phys_install; pmem; pmem = pmem->next)
		size += pmem->size;
	memlist_read_unlock();
	cpr_bitmap_size = BITMAP_BYTES(size);
}


/*
 * CPR dump header contains the following information:
 *	1. header magic -- unique to cpr state file
 *	2. kernel return pc & ppn for resume
 *	3. current thread info
 *	4. debug level and test mode
 *	5. number of bitmaps allocated
 *	6. number of page records
 */
static int
cpr_write_header(vnode_t *vp)
{
	extern ushort_t cpr_mach_type;
	struct cpr_dump_desc cdump;
	pgcnt_t bitmap_pages;
	pgcnt_t kpages, vpages, upages;

	cdump.cdd_magic = (uint_t)CPR_DUMP_MAGIC;
	cdump.cdd_version = CPR_VERSION;
	cdump.cdd_machine = cpr_mach_type;
	cdump.cdd_debug = cpr_debug;
	cdump.cdd_test_mode = cpr_test_mode;
	cdump.cdd_bitmaprec = cpr_nbitmaps;

	cpr_clear_bitmaps();

	/*
	 * Remember how many pages we plan to save to statefile.
	 * This information will be used for sanity checks.
	 * Untag those pages that will not be saved to statefile.
	 */
	kpages = cpr_count_kpages(REGULAR_BITMAP, cpr_setbit);
	vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);
	upages = cpr_count_upages(REGULAR_BITMAP, cpr_setbit);
	cdump.cdd_dumppgsize = kpages - vpages + upages;
	cpr_pages_tobe_dumped = cdump.cdd_dumppgsize;
	DEBUG7(errp(
	    "\ncpr_write_header: kpages %ld - vpages %ld + upages %ld = %d\n",
	    kpages, vpages, upages, cdump.cdd_dumppgsize));

	/*
	 * Some pages contain volatile data (cpr_buf and storage area for
	 * sensitive kpages), which are no longer needed after the statefile
	 * is dumped to disk.  We have already untagged them from regular
	 * bitmaps.  Now tag them into the volatile bitmaps.  The pages in
	 * volatile bitmaps will be claimed during resume, and the resumed
	 * kernel will free them.
	 */
	(void) cpr_count_volatile_pages(VOLATILE_BITMAP, cpr_setbit);

	bitmap_pages = mmu_btopr(cpr_bitmap_size);

	/*
	 * Export accurate statefile size for statefile allocation retry.
	 * statefile_size = all the headers + total pages +
	 * number of pages used by the bitmaps.
	 * Roundup will be done in the file allocation code.
	 */
	STAT->cs_nocomp_statefsz = sizeof (cdd_t) + sizeof (cmd_t) +
	    (sizeof (cbd_t) * cdump.cdd_bitmaprec) +
	    (sizeof (cpd_t) * cdump.cdd_dumppgsize) +
	    mmu_ptob(cdump.cdd_dumppgsize + bitmap_pages);

	/*
	 * If the estimated statefile is not big enough,
	 * go retry now to save unnecessary operations.
	 */
	if (!(CPR->c_flags & C_COMPRESSING) &&
	    (STAT->cs_nocomp_statefsz > STAT->cs_est_statefsz)) {
		if (cpr_debug & (LEVEL1 | LEVEL7))
			errp("cpr_write_header: STAT->cs_nocomp_statefsz > "
			    "STAT->cs_est_statefsz\n");
		return (ENOSPC);
	}

	/* now write cpr dump descriptor */
	return (cpr_write(vp, (caddr_t)&cdump, sizeof (cdd_t)));
}


/*
 * CPR dump tail record contains the following information:
 *	1. header magic -- unique to cpr state file
 *	2. all misc info that needs to be passed to cprboot or resumed kernel
 */
static int
cpr_write_terminator(vnode_t *vp)
{
	cpr_term.magic = (uint_t)CPR_TERM_MAGIC;
	cpr_term.va = (cpr_ptr)&cpr_term;
	cpr_term.pfn = (cpr_ext)va_to_pfn(&cpr_term);

	/* count the last one (flush) */
	cpr_term.real_statef_size = STAT->cs_real_statefsz +
	    btod(cpr_wptr - cpr_buf) * DEV_BSIZE;

	DEBUG9(errp("cpr_dump: Real Statefile Size: %d\n",
	    STAT->cs_real_statefsz));

	cpr_tod_get(&cpr_term.tm_shutdown);

	return (cpr_write(vp, (caddr_t)&cpr_term, sizeof (cpr_term)));
}

/*
 * Write bitmap descriptor array, followed by merged bitmaps.
 */
static int
cpr_write_bitmap(vnode_t *vp)
{
	char *rmap, *vmap, *dst, *tail;
	size_t size, bytes;
	cbd_t *dp;
	int err;

	dp = CPR->c_bmda;
	if (err = cpr_write(vp, (caddr_t)dp, cpr_nbitmaps * sizeof (*dp)))
		return (err);

	/*
	 * merge regular and volatile bitmaps into tmp space
	 * and write to disk
	 */
	for (; dp->cbd_size; dp++) {
		rmap = (char *)dp->cbd_reg_bitmap;
		vmap = (char *)dp->cbd_vlt_bitmap;
		for (size = dp->cbd_size; size; size -= bytes) {
			bytes = min(size, sizeof (cpr_pagecopy));
			tail = &cpr_pagecopy[bytes];
			for (dst = cpr_pagecopy; dst < tail; dst++)
				*dst = *rmap++ | *vmap++;
			if (err = cpr_write(vp, cpr_pagecopy, bytes))
				break;
		}
	}

	return (err);
}

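/*
 * Write the kernel memory contents to the statefile: untag the
 * sensitive kpages first, dump their pre-saved copies, then dump
 * all remaining tagged pages, and finally cross-check that the
 * number of pages written matches the count from cpr_write_header().
 */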
static int
cpr_write_statefile(vnode_t *vp)
{
	uint_t error = 0;
	extern	int	i_cpr_check_pgs_dumped();
	void flush_windows(void);
	pgcnt_t spages;
	char *str;

	flush_windows();

	/*
	 * to get an accurate view of kas, we need to untag sensitive
	 * pages *before* dumping them because the disk driver makes
	 * allocations and changes kas along the way.  The remaining
	 * pages referenced in the bitmaps are dumped out later as
	 * regular kpages.
	 */
	str = "cpr_write_statefile:";
	spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_clrbit);
	DEBUG7(errp("%s untag %ld sens pages\n", str, spages));

	/*
	 * now it's OK to call a driver that makes allocations
	 */
	cpr_disk_writes_ok = 1;

	/*
	 * now write out the clean sensitive kpages
	 * according to the sensitive descriptors
	 */
	error = i_cpr_dump_sensitive_kpages(vp);
	if (error) {
		DEBUG7(errp("%s cpr_dump_sensitive_kpages() failed!\n", str));
		return (error);
	}

	/*
	 * cpr_dump_regular_pages() counts cpr_regular_pgs_dumped
	 */
	error = cpr_dump_regular_pages(vp);
	if (error) {
		DEBUG7(errp("%s cpr_dump_regular_pages() failed!\n", str));
		return (error);
	}

	/*
	 * sanity check to verify the right number of pages were dumped
	 */
	error = i_cpr_check_pgs_dumped(cpr_pages_tobe_dumped,
	    cpr_regular_pgs_dumped);

	if (error) {
		errp("\n%s page count mismatch!\n", str);
#ifdef DEBUG
		if (cpr_test_mode)
			debug_enter(NULL);
#endif
	}

	return (error);
}


/*
 * creates the CPR state file; the following sections are
 * written out in sequence:
 *    - writes the cpr dump header
 *    - writes the memory usage bitmaps
 *    - writes the platform dependent info
 *    - writes the remaining user pages
 *    - writes the kernel pages
 */
int
cpr_dump(vnode_t *vp)
{
	int error;

	if (cpr_buf == NULL) {
		ASSERT(cpr_pagedata == NULL);
		if (error = cpr_alloc_bufs())
			return (error);
	}
	/* point to top of internal buffer */
	cpr_wptr = cpr_buf;

	/* initialize global variables used by the write operation */
	cpr_file_bn = cpr_statefile_offset();
	cpr_dev_space = 0;

	/* allocate bitmaps */
	if (CPR->c_bmda == NULL) {
		if (error = i_cpr_alloc_bitmaps()) {
			cpr_err(CE_WARN, "cannot allocate bitmaps");
			return (error);
		}
	}

	if (error = i_cpr_prom_pages(CPR_PROM_SAVE))
		return (error);

	if (error = i_cpr_dump_setup(vp))
		return (error);

	/*
	 * set internal cross checking; we don't want to call
	 * a disk driver that makes allocations until after
	 * sensitive pages are saved
	 */
	cpr_disk_writes_ok = 0;

	/*
	 * 1253112: heap corruption due to memory allocation when dumping
	 * statefile.
	 * Theoretically on Sun4u only the kernel data nucleus, kvalloc and
	 * kvseg segments can be contaminated should memory allocations happen
	 * during sddump, which is not supposed to happen after the system
	 * is quiesced.  Let's call the kernel pages that tend to be affected
	 * 'sensitive kpages' here.  To avoid saving inconsistent pages, we
	 * will allocate some storage space to save the clean sensitive pages
	 * aside before statefile dumping takes place.  Since there may not be
	 * much memory left at this stage, the sensitive pages will be
	 * compressed before they are saved into the storage area.
	 */
	if (error = i_cpr_save_sensitive_kpages()) {
		DEBUG7(errp("cpr_dump: save_sensitive_kpages failed!\n"));
		return (error);
	}

	/*
	 * since all cpr allocations are done (space for sensitive kpages,
	 * bitmaps, cpr_buf), kas is stable, and now we can accurately
	 * count regular and sensitive kpages.
	 */
	if (error = cpr_write_header(vp)) {
		DEBUG7(errp("cpr_dump: cpr_write_header() failed!\n"));
		return (error);
	}

	if (error = i_cpr_write_machdep(vp))
		return (error);

	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, NULL, NULL))
		return (error);

	if (error = cpr_write_bitmap(vp))
		return (error);

	if (error = cpr_write_statefile(vp)) {
		DEBUG7(errp("cpr_dump: cpr_write_statefile() failed!\n"));
		return (error);
	}

	if (error = cpr_write_terminator(vp))
		return (error);

	if (error = cpr_flush_write(vp))
		return (error);

	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, &cpr_file_bn, vp))
		return (error);

	return (0);
}


/*
 * cpr_xwalk() is called many 100x with a range within kvseg or kvseg_reloc;
 * a page-count from each range is accumulated at arg->pages.
 */
static void
cpr_xwalk(void *arg, void *base, size_t size)
{
	struct cpr_walkinfo *cwip = arg;

	cwip->pages += cpr_count_pages(base, size,
	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
	cwip->size += size;
	cwip->ranges++;
}

/*
 * cpr_walk() is called many 100x with a range within kvseg or kvseg_reloc;
 * a page-count from each range is accumulated at arg->pages.
 */
static void
cpr_walk(void *arg, void *base, size_t size)
{
	caddr_t addr = base;
	caddr_t addr_end = addr + size;

	/*
	 * If we are about to start walking the range of addresses we
	 * carved out of the kernel heap for the large page heap,
	 * walk heap_lp_arena to find what segments are actually populated.
	 */
	if (SEGKMEM_USE_LARGEPAGES &&
	    addr == heap_lp_base && addr_end == heap_lp_end &&
	    vmem_size(heap_lp_arena, VMEM_ALLOC) < size) {
		vmem_walk(heap_lp_arena, VMEM_ALLOC, cpr_xwalk, arg);
	} else {
		cpr_xwalk(arg, base, size);
	}
}

/*
 * faster scan of kvseg using vmem_walk() to visit
 * allocated ranges.
 */
pgcnt_t
cpr_scan_kvseg(int mapflag, bitfunc_t bitfunc, struct seg *seg)
{
	struct cpr_walkinfo cwinfo;

	bzero(&cwinfo, sizeof (cwinfo));
	cwinfo.mapflag = mapflag;
	cwinfo.bitfunc = bitfunc;

	vmem_walk(heap_arena, VMEM_ALLOC, cpr_walk, &cwinfo);

	if (cpr_debug & LEVEL7) {
		errp("walked %d sub-ranges, total pages %ld\n",
		    cwinfo.ranges, mmu_btop(cwinfo.size));
		cpr_show_range(seg->s_base, seg->s_size,
		    mapflag, bitfunc, cwinfo.pages);
	}

	return (cwinfo.pages);
}


/*
 * cpr_walk_kpm() is called for every used area within the large
 * segkpm virtual address window.  A page-count is accumulated at
 * arg->pages.
 */
static void
cpr_walk_kpm(void *arg, void *base, size_t size)
{
	struct cpr_walkinfo *cwip = arg;

	cwip->pages += cpr_count_pages(base, size,
	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
	cwip->size += size;
	cwip->ranges++;
}


/*
 * faster scan of segkpm using hat_kpm_walk() to visit only used ranges.
 */
/*ARGSUSED*/
static pgcnt_t
cpr_scan_segkpm(int mapflag, bitfunc_t bitfunc, struct seg *seg)
{
	struct cpr_walkinfo cwinfo;

	if (kpm_enable == 0)
		return (0);

	bzero(&cwinfo, sizeof (cwinfo));
	cwinfo.mapflag = mapflag;
	cwinfo.bitfunc = bitfunc;
	hat_kpm_walk(cpr_walk_kpm, &cwinfo);

	if (cpr_debug & LEVEL7) {
		errp("walked %d sub-ranges, total pages %ld\n",
		    cwinfo.ranges, mmu_btop(cwinfo.size));
		cpr_show_range(segkpm->s_base, segkpm->s_size,
		    mapflag, bitfunc, cwinfo.pages);
	}

	return (cwinfo.pages);
}


/*
 * Sparsely filled kernel segments are registered in kseg_table for
 * easier lookup.  See also block comment for cpr_count_seg_pages.
 */

#define	KSEG_SEG_ADDR	0	/* address of struct seg */
#define	KSEG_PTR_ADDR	1	/* address of pointer to struct seg */

typedef struct {
	struct seg **st_seg;		/* segment pointer or segment address */
	pgcnt_t (*st_fcn)(int, bitfunc_t, struct seg *); /* function to call */
	int st_addrtype;		/* address type in st_seg */
} ksegtbl_entry_t;

ksegtbl_entry_t kseg_table[] = {
	{(struct seg **)&kvseg,		cpr_scan_kvseg,		KSEG_SEG_ADDR},
	{&segkpm,			cpr_scan_segkpm,	KSEG_PTR_ADDR},
	{NULL,				0,			0}
};


/*
 * Compare seg with each entry in kseg_table; when there is a match
 * return the entry pointer, otherwise return NULL.
 */
static ksegtbl_entry_t *
cpr_sparse_seg_check(struct seg *seg)
{
	ksegtbl_entry_t *ste = &kseg_table[0];
	struct seg *tseg;

	for (; ste->st_seg; ste++) {
		tseg = (ste->st_addrtype == KSEG_PTR_ADDR) ?
		    *ste->st_seg : (struct seg *)ste->st_seg;
		if (seg == tseg)
			return (ste);
	}

	return ((ksegtbl_entry_t *)NULL);
}


/*
 * Count pages within each kernel segment; call cpr_sparse_seg_check()
 * to find out whether a sparsely filled segment needs special
 * treatment (e.g. kvseg).
 * Todo: A "SEGOP_CPR" like SEGOP_DUMP should be introduced; the cpr
 *	module shouldn't need to know segment details like whether it is
 *	sparsely filled or not (makes kseg_table obsolete).
 */
pgcnt_t
cpr_count_seg_pages(int mapflag, bitfunc_t bitfunc)
{
	struct seg *segp;
	pgcnt_t pages;
	ksegtbl_entry_t *ste;

	pages = 0;
	for (segp = AS_SEGFIRST(&kas); segp; segp = AS_SEGNEXT(&kas, segp)) {
		if (ste = cpr_sparse_seg_check(segp)) {
			pages += (ste->st_fcn)(mapflag, bitfunc, segp);
		} else {
			pages += cpr_count_pages(segp->s_base,
			    segp->s_size, mapflag, bitfunc, DBG_SHOWRANGE);
		}
	}

	return (pages);
}


/*
 * count kernel pages within kas and any special ranges
 */
pgcnt_t
cpr_count_kpages(int mapflag, bitfunc_t bitfunc)
{
	pgcnt_t kas_cnt;

	/*
	 * Some pages need to be taken care of differently.
	 * eg: panicbuf pages of sun4m are not in kas but they need
	 * to be saved.  On sun4u, the physical pages of panicbuf are
	 * allocated via prom_retain().
	 */
	kas_cnt = i_cpr_count_special_kpages(mapflag, bitfunc);
	kas_cnt += cpr_count_seg_pages(mapflag, bitfunc);

	DEBUG9(errp("cpr_count_kpages: kas_cnt=%d\n", kas_cnt));
	DEBUG7(errp("\ncpr_count_kpages: %ld pages, 0x%lx bytes\n",
	    kas_cnt, mmu_ptob(kas_cnt)));
	return (kas_cnt);
}


/*
 * Set a bit corresponding to the arg phys page number;
 * returns 0 when the ppn is valid and the corresponding
 * map bit was clear, otherwise returns 1.
 */
int
cpr_setbit(pfn_t ppn, int mapflag)
{
	char *bitmap;
	cbd_t *dp;
	pfn_t rel;
	int clr;

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		if (PPN_IN_RANGE(ppn, dp)) {
			bitmap = DESC_TO_MAP(dp, mapflag);
			rel = ppn - dp->cbd_spfn;
			if ((clr = isclr(bitmap, rel)) != 0)
				setbit(bitmap, rel);
			return (clr == 0);
		}
	}

	return (1);
}


/*
 * Clear a bit corresponding to the arg phys page number.
 */
int
cpr_clrbit(pfn_t ppn, int mapflag)
{
	char *bitmap;
	cbd_t *dp;
	pfn_t rel;
	int set;

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		if (PPN_IN_RANGE(ppn, dp)) {
			bitmap = DESC_TO_MAP(dp, mapflag);
			rel = ppn - dp->cbd_spfn;
			if ((set = isset(bitmap, rel)) != 0)
				clrbit(bitmap, rel);
			return (set == 0);
		}
	}

	return (1);
}

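/*
 * No-op bit function: always reports the bit as previously clear, so
 * callers that pass cpr_nobit simply count pages without modifying
 * any bitmap.
 */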
/* ARGSUSED */
int
cpr_nobit(pfn_t ppn, int mapflag)
{
	return (0);
}


/*
 * Lookup a bit corresponding to the arg phys page number.
 */
int
cpr_isset(pfn_t ppn, int mapflag)
{
	char *bitmap;
	cbd_t *dp;
	pfn_t rel;

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		if (PPN_IN_RANGE(ppn, dp)) {
			bitmap = DESC_TO_MAP(dp, mapflag);
			rel = ppn - dp->cbd_spfn;
			return (isset(bitmap, rel));
		}
	}

	return (0);
}


/*
 * Go thru all pages and pick up any page not caught during the invalidation
 * stage.  This is also used to save pages with cow lock or phys page lock held
 * (non-zero p_lckcnt or p_cowcnt).
 */
static int
cpr_count_upages(int mapflag, bitfunc_t bitfunc)
{
	page_t *pp, *page0;
	pgcnt_t dcnt = 0, tcnt = 0;
	pfn_t pfn;

	page0 = pp = page_first();

	do {
#if defined(__sparc)
		extern struct vnode prom_ppages;
		if (pp->p_vnode == NULL || pp->p_vnode == &kvp ||
		    pp->p_vnode == &prom_ppages ||
		    PP_ISFREE(pp) && PP_ISAGED(pp))
#else
		if (pp->p_vnode == NULL || pp->p_vnode == &kvp ||
		    PP_ISFREE(pp) && PP_ISAGED(pp))
#endif /* __sparc */
			continue;

		pfn = page_pptonum(pp);
		if (pf_is_memory(pfn)) {
			tcnt++;
			if ((*bitfunc)(pfn, mapflag) == 0)
				dcnt++; /* dirty count */
		}
	} while ((pp = page_next(pp)) != page0);

	STAT->cs_upage2statef = dcnt;
	DEBUG9(errp("cpr_count_upages: dirty=%ld total=%ld\n",
	    dcnt, tcnt));
	DEBUG7(errp("cpr_count_upages: %ld pages, 0x%lx bytes\n",
	    dcnt, mmu_ptob(dcnt)));
	return (dcnt);
}


/*
 * try compressing pages based on cflag,
 * and for DEBUG kernels, verify uncompressed data checksum;
 *
 * this routine replaces common code from
 * i_cpr_compress_and_save() and cpr_compress_and_write()
 */
char *
cpr_compress_pages(cpd_t *dp, pgcnt_t pages, int cflag)
{
	size_t nbytes, clen, len;
	uint32_t test_sum;
	char *datap;

	nbytes = mmu_ptob(pages);

	/*
	 * set length to the original uncompressed data size;
	 * always init cpd_flag to zero
	 */
	dp->cpd_length = nbytes;
	dp->cpd_flag = 0;

#ifdef DEBUG
	/*
	 * Make a copy of the uncompressed data so we can checksum it.
	 * Compress that copy so the checksum works at the other end
	 */
	cprbcopy(CPR->c_mapping_area, cpr_pagecopy, nbytes);
	dp->cpd_usum = checksum32(cpr_pagecopy, nbytes);
	dp->cpd_flag |= CPD_USUM;
	datap = cpr_pagecopy;
#else
	datap = CPR->c_mapping_area;
	dp->cpd_usum = 0;
#endif

	/*
	 * try compressing the raw data to cpr_pagedata;
	 * if there was a size reduction: record the new length,
	 * flag the compression, and point to the compressed data.
	 */
	dp->cpd_csum = 0;
	if (cflag) {
		clen = compress(datap, cpr_pagedata, nbytes);
		if (clen < nbytes) {
			dp->cpd_flag |= CPD_COMPRESS;
			dp->cpd_length = clen;
			datap = cpr_pagedata;
#ifdef DEBUG
			dp->cpd_csum = checksum32(datap, clen);
			dp->cpd_flag |= CPD_CSUM;

			/*
			 * decompress the data back to a scratch area
			 * and compare the new checksum with the original
			 * checksum to verify the compression.
			 */
			bzero(cpr_pagecopy, sizeof (cpr_pagecopy));
			len = decompress(datap, cpr_pagecopy,
			    clen, sizeof (cpr_pagecopy));
			test_sum = checksum32(cpr_pagecopy, len);
			ASSERT(test_sum == dp->cpd_usum);
#endif
		}
	}

	return (datap);
}


/*
 * 1. Prepare cpr page descriptor and write it to file
 * 2. Compress page data and write it out
 */
static int
cpr_compress_and_write(vnode_t *vp, uint_t va, pfn_t pfn, pgcnt_t npg)
{
	int error = 0;
	char *datap;
	cpd_t cpd;	/* cpr page descriptor */
	extern void i_cpr_mapin(caddr_t, uint_t, pfn_t);
	extern void i_cpr_mapout(caddr_t, uint_t);

	i_cpr_mapin(CPR->c_mapping_area, npg, pfn);

	DEBUG3(errp("mapped-in %d pages, vaddr 0x%p, pfn 0x%x\n",
	    npg, CPR->c_mapping_area, pfn));

	/*
	 * Fill cpr page descriptor.
	 */
	cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC;
	cpd.cpd_pfn = pfn;
	cpd.cpd_pages = npg;

	STAT->cs_dumped_statefsz += mmu_ptob(npg);

	datap = cpr_compress_pages(&cpd, npg, CPR->c_flags & C_COMPRESSING);

	/* Write cpr page descriptor */
	error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd_t));

	/* Write compressed page data */
	error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length);

	/*
	 * Unmap the pages for tlb and vac flushing
	 */
	i_cpr_mapout(CPR->c_mapping_area, npg);

	if (error) {
		DEBUG1(errp("cpr_compress_and_write: vp 0x%p va 0x%x ",
		    vp, va));
		DEBUG1(errp("pfn 0x%lx blk %d err %d\n",
		    pfn, cpr_file_bn, error));
	} else {
		cpr_regular_pgs_dumped += npg;
	}

	return (error);
}

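/*
 * Buffered write to the statefile: copy the caller's data into cpr_buf
 * and, each time the buffer fills, write it out with polled I/O via
 * VOP_DUMP, checking that the write stays within the device or file
 * size.  Partial buffers are flushed later by cpr_flush_write().
 */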
int
cpr_write(vnode_t *vp, caddr_t buffer, size_t size)
{
	caddr_t fromp = buffer;
	size_t bytes, wbytes;
	int error;

	if (cpr_dev_space == 0) {
		if (vp->v_type == VBLK) {
			cpr_dev_space = cpr_get_devsize(vp->v_rdev);
			ASSERT(cpr_dev_space);
		} else
			cpr_dev_space = 1;	/* not used in this case */
	}

	/*
	 * break the write into multiple parts if the request is large,
	 * calculate count up to buf page boundary, then write it out.
	 * repeat until done.
	 */
	while (size) {
		bytes = MIN(size, cpr_buf_end - cpr_wptr);
		cprbcopy(fromp, cpr_wptr, bytes);
		cpr_wptr += bytes;
		fromp += bytes;
		size -= bytes;
		if (cpr_wptr < cpr_buf_end)
			return (0);	/* buffer not full yet */
		ASSERT(cpr_wptr == cpr_buf_end);

		wbytes = dbtob(cpr_file_bn + cpr_buf_blocks);
		if (vp->v_type == VBLK) {
			if (wbytes > cpr_dev_space)
				return (ENOSPC);
		} else {
			if (wbytes > VTOI(vp)->i_size)
				return (ENOSPC);
		}

		DEBUG3(errp("cpr_write: frmp=%x wptr=%x cnt=%x...",
		    fromp, cpr_wptr, bytes));
		/*
		 * cross check, this should not happen!
		 */
		if (cpr_disk_writes_ok == 0) {
			errp("cpr_write: disk write too early!\n");
			return (EINVAL);
		}

		do_polled_io = 1;
		error = VOP_DUMP(vp, cpr_buf, cpr_file_bn, cpr_buf_blocks);
		do_polled_io = 0;
		DEBUG3(errp("done\n"));

		STAT->cs_real_statefsz += cpr_buf_size;

		if (error) {
			cpr_err(CE_WARN, "cpr_write error %d", error);
			return (error);
		}
		cpr_file_bn += cpr_buf_blocks;	/* Increment block count */
		cpr_wptr = cpr_buf;		/* back to top of buffer */
	}
	return (0);
}


int
cpr_flush_write(vnode_t *vp)
{
	int	nblk;
	int	error;

	/*
	 * Calculate remaining blocks in buffer, rounded up to nearest
	 * disk block
	 */
	nblk = btod(cpr_wptr - cpr_buf);

	do_polled_io = 1;
	error = VOP_DUMP(vp, (caddr_t)cpr_buf, cpr_file_bn, nblk);
	do_polled_io = 0;

	cpr_file_bn += nblk;
	if (error)
		DEBUG2(errp("cpr_flush_write: error (%d)\n", error));
	return (error);
}

void
cpr_clear_bitmaps(void)
{
	cbd_t *dp;

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		bzero((void *)dp->cbd_reg_bitmap,
		    (size_t)dp->cbd_size * 2);
	}
	DEBUG7(errp("\ncleared reg and vlt bitmaps\n"));
}

int
cpr_contig_pages(vnode_t *vp, int flag)
{
	int chunks = 0, error = 0;
	pgcnt_t i, j, totbit;
	pfn_t spfn;
	cbd_t *dp;
	uint_t	spin_cnt = 0;
	extern	int i_cpr_compress_and_save();

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		spfn = dp->cbd_spfn;
		totbit = BTOb(dp->cbd_size);
		i = 0; /* Beginning of bitmap */
		j = 0;
		while (i < totbit) {
			while ((j < CPR_MAXCONTIG) && ((j + i) < totbit)) {
				if (isset((char *)dp->cbd_reg_bitmap, j+i))
					j++;
				else /* not contiguous anymore */
					break;
			}

			if (j) {
				chunks++;
				if (flag == SAVE_TO_STORAGE) {
					error = i_cpr_compress_and_save(
					    chunks, spfn + i, j);
					if (error)
						return (error);
				} else if (flag == WRITE_TO_STATEFILE) {
					error = cpr_compress_and_write(vp, 0,
					    spfn + i, j);
					if (error)
						return (error);
					else {
						spin_cnt++;
						if ((spin_cnt & 0x5F) == 1)
							cpr_spinning_bar();
					}
				}
			}

			i += j;
			if (j != CPR_MAXCONTIG) {
				/* Stopped on a non-tagged page */
				i++;
			}

			j = 0;
		}
	}

	if (flag == STORAGE_DESC_ALLOC)
		return (chunks);
	else
		return (0);
}


void
cpr_show_range(caddr_t vaddr, size_t size,
    int mapflag, bitfunc_t bitfunc, pgcnt_t count)
{
	char *action, *bname;

	bname = (mapflag == REGULAR_BITMAP) ? "regular" : "volatile";
	if (bitfunc == cpr_setbit)
		action = "tag";
	else if (bitfunc == cpr_clrbit)
		action = "untag";
	else
		action = "none";
	errp("range (0x%p, 0x%p), %s bitmap, %s %ld\n",
	    vaddr, vaddr + size, bname, action, count);
}


pgcnt_t
cpr_count_pages(caddr_t sva, size_t size,
    int mapflag, bitfunc_t bitfunc, int showrange)
{
	caddr_t va, eva;
	pfn_t pfn;
	pgcnt_t count = 0;

	eva = sva + PAGE_ROUNDUP(size);
	for (va = sva; va < eva; va += MMU_PAGESIZE) {
		pfn = va_to_pfn(va);
		if (pfn != PFN_INVALID && pf_is_memory(pfn)) {
			if ((*bitfunc)(pfn, mapflag) == 0)
				count++;
		}
	}

	if ((cpr_debug & LEVEL7) && showrange == DBG_SHOWRANGE)
		cpr_show_range(sva, size, mapflag, bitfunc, count);

	return (count);
}


pgcnt_t
cpr_count_volatile_pages(int mapflag, bitfunc_t bitfunc)
{
	pgcnt_t count = 0;

	if (cpr_buf) {
		count += cpr_count_pages(cpr_buf, cpr_buf_size,
		    mapflag, bitfunc, DBG_SHOWRANGE);
	}
	if (cpr_pagedata) {
		count += cpr_count_pages(cpr_pagedata, cpr_pagedata_size,
		    mapflag, bitfunc, DBG_SHOWRANGE);
	}
	count += i_cpr_count_storage_pages(mapflag, bitfunc);

	DEBUG7(errp("cpr_count_vpages: %ld pages, 0x%lx bytes\n",
	    count, mmu_ptob(count)));
	return (count);
}


static int
cpr_dump_regular_pages(vnode_t *vp)
{
	int error;

	cpr_regular_pgs_dumped = 0;
	error = cpr_contig_pages(vp, WRITE_TO_STATEFILE);
	if (!error)
		DEBUG7(errp("cpr_dump_regular_pages() done.\n"));
	return (error);
}