1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * Platform specific implementation code 30 */ 31 32 #define SUNDDI_IMPL 33 34 #include <sys/types.h> 35 #include <sys/promif.h> 36 #include <sys/prom_isa.h> 37 #include <sys/prom_plat.h> 38 #include <sys/mmu.h> 39 #include <vm/hat_sfmmu.h> 40 #include <sys/iommu.h> 41 #include <sys/scb.h> 42 #include <sys/cpuvar.h> 43 #include <sys/intreg.h> 44 #include <sys/pte.h> 45 #include <vm/hat.h> 46 #include <vm/page.h> 47 #include <vm/as.h> 48 #include <sys/cpr.h> 49 #include <sys/kmem.h> 50 #include <sys/clock.h> 51 #include <sys/kmem.h> 52 #include <sys/panic.h> 53 #include <vm/seg_kmem.h> 54 #include <sys/cpu_module.h> 55 #include <sys/callb.h> 56 #include <sys/machsystm.h> 57 #include <sys/vmsystm.h> 58 #include <sys/systm.h> 59 #include <sys/archsystm.h> 60 #include <sys/stack.h> 61 #include <sys/fs/ufs_fs.h> 62 #include <sys/memlist.h> 63 #include <sys/bootconf.h> 64 #include <sys/thread.h> 65 #include <vm/vm_dep.h> 66 67 extern void cpr_clear_bitmaps(void); 68 extern void dtlb_wr_entry(uint_t, tte_t *, uint64_t *); 69 extern void itlb_wr_entry(uint_t, tte_t *, uint64_t *); 70 71 static int i_cpr_storage_desc_alloc(csd_t **, pgcnt_t *, csd_t **, int); 72 static void i_cpr_storage_desc_init(csd_t *, pgcnt_t, csd_t *); 73 static caddr_t i_cpr_storage_data_alloc(pgcnt_t, pgcnt_t *, int); 74 static int cpr_dump_sensitive(vnode_t *, csd_t *); 75 static void i_cpr_clear_entries(uint64_t, uint64_t); 76 static void i_cpr_xcall(xcfunc_t); 77 78 void i_cpr_storage_free(void); 79 80 extern void *i_cpr_data_page; 81 extern int cpr_test_mode; 82 extern int cpr_nbitmaps; 83 extern char cpr_default_path[]; 84 extern caddr_t textva, datava; 85 86 static struct cpr_map_info cpr_prom_retain[CPR_PROM_RETAIN_CNT]; 87 caddr_t cpr_vaddr = NULL; 88 89 static uint_t sensitive_pages_saved; 90 static uint_t sensitive_size_saved; 91 92 caddr_t i_cpr_storage_data_base; 93 caddr_t i_cpr_storage_data_end; 94 csd_t *i_cpr_storage_desc_base; 95 csd_t *i_cpr_storage_desc_end; /* one byte beyond last used descp */ 96 csd_t *i_cpr_storage_desc_last_used; /* last used descriptor */ 97 caddr_t sensitive_write_ptr; /* position for next storage write */ 98 99 size_t i_cpr_sensitive_bytes_dumped; 100 pgcnt_t i_cpr_sensitive_pgs_dumped; 101 pgcnt_t i_cpr_storage_data_sz; /* in pages */ 102 pgcnt_t i_cpr_storage_desc_pgcnt; /* in pages */ 103 104 ushort_t cpr_mach_type = CPR_MACHTYPE_4U; 105 static csu_md_t m_info; 106 107 108 #define MAX_STORAGE_RETRY 3 109 #define MAX_STORAGE_ALLOC_RETRY 3 110 #define INITIAL_ALLOC_PCNT 40 /* 
starting allocation percentage */
#define	INTEGRAL		100	/* to get 1% precision */

#define	EXTRA_RATE		2	/* add EXTRA_RATE% extra space */
#define	EXTRA_DESCS		10

#define	CPR_NO_STORAGE_DESC	1
#define	CPR_NO_STORAGE_DATA	2

#define	CIF_SPLICE		0
#define	CIF_UNLINK		1


/*
 * CPR miscellaneous support routines
 */
#define	cpr_open(path, mode, vpp)	(vn_open(path, UIO_SYSSPACE, \
		mode, 0600, vpp, CRCREAT, 0))
#define	cpr_rdwr(rw, vp, basep, cnt)	(vn_rdwr(rw, vp, (caddr_t)(basep), \
		cnt, 0LL, UIO_SYSSPACE, 0, (rlim64_t)MAXOFF_T, CRED(), \
		(ssize_t *)NULL))

/*
 * definitions for saving/restoring prom pages
 */
static void	*ppage_buf;
static pgcnt_t	ppage_count;
static pfn_t	*pphys_list;
static size_t	pphys_list_size;

typedef void (*tlb_rw_t)(uint_t, tte_t *, uint64_t *);
typedef void (*tlb_filter_t)(int, tte_t *, uint64_t, void *);

/*
 * private struct for tlb handling
 */
struct cpr_trans_info {
	sutlb_t		*dst;
	sutlb_t		*tail;
	tlb_rw_t	reader;
	tlb_rw_t	writer;
	tlb_filter_t	filter;
	int		index;
	uint64_t	skip;		/* assumes TLB <= 64 locked entries */
};
typedef struct cpr_trans_info cti_t;


/*
 * special handling for tlb info
 */
#define	WITHIN_OFW(va) \
	(((va) > (uint64_t)OFW_START_ADDR) && ((va) < (uint64_t)OFW_END_ADDR))

#define	WITHIN_NUCLEUS(va, base) \
	(((va) >= (base)) && \
	(((va) + MMU_PAGESIZE) <= ((base) + MMU_PAGESIZE4M)))

#define	IS_BIGKTSB(va) \
	(enable_bigktsb && \
	((va) >= (uint64_t)ktsb_base) && \
	((va) < (uint64_t)(ktsb_base + ktsb_sz)))


/*
 * WARNING:
 * the text from this file is linked to follow cpr_resume_setup.o;
 * only add text between here and i_cpr_end_jumpback when it needs
 * to be called during resume before we switch back to the kernel
 * trap table.  all the text in this range must fit within a page.
 */


/*
 * each time a machine is reset, the prom uses an inconsistent set of phys
 * pages and the cif cookie may differ as well.  so prior to restoring the
 * original prom, we have to use the new/tmp prom's translations
 * when requesting prom services.
 *
 * cif_handler starts out as the original prom cookie, and that gets used
 * by client_handler() to jump into the prom.  here we splice-in a wrapper
 * routine by writing cif_handler; client_handler() will now jump to the
 * wrapper which switches the %tba to the new/tmp prom's trap table then
 * jumps to the new cookie.
 */
void
i_cpr_cif_setup(int action)
{
	extern void *i_cpr_orig_cif, *cif_handler;
	extern int i_cpr_cif_wrapper(void *);

	/*
	 * save the original cookie and change the current cookie to the
	 * wrapper routine.  later we just restore the original cookie.
	 */
	if (action == CIF_SPLICE) {
		i_cpr_orig_cif = cif_handler;
		cif_handler = (void *)i_cpr_cif_wrapper;
	} else if (action == CIF_UNLINK)
		cif_handler = i_cpr_orig_cif;
}


/*
 * launch slave cpus into kernel text, pause them,
 * and restore the original prom pages
 */
void
i_cpr_mp_setup(void)
{
	extern void restart_other_cpu(int);
	ihandle_t tmpout = 0;
	char *str;
	cpu_t *cp;

	uint64_t kctx = kcontextreg;

	/*
	 * Do not allow setting page size codes in the MMU primary context
	 * register while using the cif wrapper.  This is needed to work
	 * around OBP's incorrect handling of this MMU register.
	 */
	kcontextreg = 0;

	/*
	 * reset cpu_ready_set so x_calls work properly
	 */
	CPUSET_ZERO(cpu_ready_set);
	CPUSET_ADD(cpu_ready_set, getprocessorid());

	/*
	 * setup cif to use the cookie from the new/tmp prom
	 * and setup tmp handling for calling prom services.
	 */
	i_cpr_cif_setup(CIF_SPLICE);

	/*
	 * at this point, only the nucleus and a few cpr pages are
	 * mapped in.  once we switch to the kernel trap table,
	 * we can access the rest of kernel space.
	 */
	prom_set_traptable(&trap_table);

	if (ncpus > 1) {
		sfmmu_init_tsbs();

		if (cpr_debug & CPR_DEBUG1) {
			prom_interpret("stdout @ swap l!", (uintptr_t)&tmpout,
			    0, 0, 0, 0);
			str = "MP startup...\r\n";
			(void) prom_write(tmpout, str, strlen(str), 0, 0);
		}

		mutex_enter(&cpu_lock);
		/*
		 * The slave cpus are not ready at this time,
		 * yet the cpu structures have various cpu_flags set;
		 * clear cpu_flags and mutex_ready.
		 * Since we are coming up from a CPU suspend, the slave cpus
		 * are frozen.
		 */
		for (cp = CPU->cpu_next; cp != CPU; cp = cp->cpu_next) {
			cp->cpu_flags = CPU_FROZEN;
			cp->cpu_m.mutex_ready = 0;
		}

		for (cp = CPU->cpu_next; cp != CPU; cp = cp->cpu_next)
			restart_other_cpu(cp->cpu_id);

		pause_cpus(NULL);
		mutex_exit(&cpu_lock);

		if (cpr_debug & CPR_DEBUG1) {
			str = "MP paused...\r\n";
			(void) prom_write(tmpout, str, strlen(str), 0, 0);
		}

		i_cpr_xcall(i_cpr_clear_entries);
	} else
		i_cpr_clear_entries(0, 0);

	/*
	 * now unlink the cif wrapper; WARNING: do not call any
	 * prom_xxx() routines until after prom pages are restored.
	 */
	i_cpr_cif_setup(CIF_UNLINK);

	(void) i_cpr_prom_pages(CPR_PROM_RESTORE);

	/* allow setting page size codes in MMU primary context register */
	kcontextreg = kctx;
}


/*
 * end marker for jumpback page;
 * this symbol is used to check the size of i_cpr_resume_setup()
 * and the above text.  For simplicity, the Makefile needs to
 * link i_cpr_resume_setup.o and cpr_impl.o consecutively.
 */
void
i_cpr_end_jumpback(void)
{
}


/*
 * scan tlb entries with reader; when valid entries are found,
 * the filter routine will selectively save/clear them
 */
static void
i_cpr_scan_tlb(cti_t *ctip)
{
	uint64_t va_tag;
	int tlb_index;
	tte_t tte;

	for (tlb_index = ctip->index; tlb_index >= 0; tlb_index--) {
		(*ctip->reader)((uint_t)tlb_index, &tte, &va_tag);
		if (va_tag && TTE_IS_VALID(&tte))
			(*ctip->filter)(tlb_index, &tte, va_tag, ctip);
	}
}


/*
 * filter for locked tlb entries that reference the text/data nucleus
 * and any bigktsb's; these will be reinstalled by cprboot on all cpus
 */
/* ARGSUSED */
static void
i_cpr_lnb(int index, tte_t *ttep, uint64_t va_tag, void *ctrans)
{
	cti_t *ctip;

	/*
	 * record tlb data at ctip->dst; the target tlb index starts
	 * at the highest tlb offset and moves towards 0.  the prom
	 * reserves both dtlb and itlb index 0.
any selected entry 349 * also gets marked to prevent being flushed during resume 350 */ 351 if (TTE_IS_LOCKED(ttep) && (va_tag == (uint64_t)textva || 352 va_tag == (uint64_t)datava || IS_BIGKTSB(va_tag))) { 353 ctip = ctrans; 354 while ((1 << ctip->index) & ctip->skip) 355 ctip->index--; 356 ASSERT(ctip->index > 0); 357 ASSERT(ctip->dst < ctip->tail); 358 ctip->dst->tte.ll = ttep->ll; 359 ctip->dst->va_tag = va_tag; 360 ctip->dst->index = ctip->index--; 361 ctip->dst->tmp = 0; 362 ctip->dst++; 363 } 364 } 365 366 367 /* 368 * some tlb entries are stale, filter for unlocked entries 369 * within the prom virt range and clear them 370 */ 371 static void 372 i_cpr_ufw(int index, tte_t *ttep, uint64_t va_tag, void *ctrans) 373 { 374 sutlb_t clr; 375 cti_t *ctip; 376 377 if (!TTE_IS_LOCKED(ttep) && WITHIN_OFW(va_tag)) { 378 ctip = ctrans; 379 bzero(&clr, sizeof (clr)); 380 (*ctip->writer)((uint_t)index, &clr.tte, &clr.va_tag); 381 } 382 } 383 384 385 /* 386 * some of the entries installed by cprboot are needed only on a 387 * short-term basis and need to be flushed to avoid clogging the tlbs. 388 * scan the dtte/itte arrays for items marked as temporary and clear 389 * dtlb/itlb entries using wrfunc. 390 */ 391 static void 392 i_cpr_clear_tmp(sutlb_t *listp, int max, tlb_rw_t wrfunc) 393 { 394 sutlb_t clr, *tail; 395 396 bzero(&clr, sizeof (clr)); 397 for (tail = listp + max; listp < tail && listp->va_tag; listp++) { 398 if (listp->tmp) 399 (*wrfunc)((uint_t)listp->index, &clr.tte, &clr.va_tag); 400 } 401 } 402 403 404 /* ARGSUSED */ 405 static void 406 i_cpr_clear_entries(uint64_t arg1, uint64_t arg2) 407 { 408 extern void demap_all(void); 409 cti_t cti; 410 411 i_cpr_clear_tmp(m_info.dtte, CPR_MAX_TLB, dtlb_wr_entry); 412 i_cpr_clear_tmp(m_info.itte, CPR_MAX_TLB, itlb_wr_entry); 413 414 /* 415 * for newer cpus that implement DEMAP_ALL_TYPE, demap_all is 416 * a second label for vtag_flushall. the call is made using 417 * vtag_flushall() instead of demap_all() due to runtime and 418 * krtld results with both older and newer cpu modules. 419 */ 420 if (&demap_all != 0) { 421 vtag_flushall(); 422 return; 423 } 424 425 /* 426 * for older V9 cpus, scan tlbs and clear stale entries 427 */ 428 bzero(&cti, sizeof (cti)); 429 cti.filter = i_cpr_ufw; 430 431 cti.index = cpunodes[CPU->cpu_id].dtlb_size - 1; 432 cti.reader = dtlb_rd_entry; 433 cti.writer = dtlb_wr_entry; 434 i_cpr_scan_tlb(&cti); 435 436 cti.index = cpunodes[CPU->cpu_id].itlb_size - 1; 437 cti.reader = itlb_rd_entry; 438 cti.writer = itlb_wr_entry; 439 i_cpr_scan_tlb(&cti); 440 } 441 442 443 /* 444 * craft tlb info for tmp use during resume; this data gets used by 445 * cprboot to install tlb entries. we also mark each struct as tmp 446 * so those tlb entries will get flushed after switching to the kernel 447 * trap table. no data needs to be recorded for vaddr when it falls 448 * within the nucleus since we've already recorded nucleus ttes and 449 * a 8K tte would conflict with a 4MB tte. eg: the cpr module 450 * text/data may have been loaded into the text/data nucleus. 
451 */ 452 static void 453 i_cpr_make_tte(cti_t *ctip, void *vaddr, caddr_t nbase) 454 { 455 pfn_t ppn; 456 uint_t rw; 457 458 if (WITHIN_NUCLEUS((caddr_t)vaddr, nbase)) 459 return; 460 461 while ((1 << ctip->index) & ctip->skip) 462 ctip->index--; 463 ASSERT(ctip->index > 0); 464 ASSERT(ctip->dst < ctip->tail); 465 466 /* 467 * without any global service available to lookup 468 * a tte by vaddr, we craft our own here: 469 */ 470 ppn = va_to_pfn(vaddr); 471 rw = (nbase == datava) ? TTE_HWWR_INT : 0; 472 ctip->dst->tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(ppn); 473 ctip->dst->tte.tte_intlo = TTE_PFN_INTLO(ppn) | TTE_LCK_INT | 474 TTE_CP_INT | TTE_PRIV_INT | rw; 475 ctip->dst->va_tag = ((uintptr_t)vaddr & MMU_PAGEMASK); 476 ctip->dst->index = ctip->index--; 477 ctip->dst->tmp = 1; 478 ctip->dst++; 479 } 480 481 482 static void 483 i_cpr_xcall(xcfunc_t func) 484 { 485 uint_t pil, reset_pil; 486 487 pil = getpil(); 488 if (pil < XCALL_PIL) 489 reset_pil = 0; 490 else { 491 reset_pil = 1; 492 setpil(XCALL_PIL - 1); 493 } 494 xc_some(cpu_ready_set, func, 0, 0); 495 if (reset_pil) 496 setpil(pil); 497 } 498 499 500 /* 501 * restart paused slave cpus 502 */ 503 void 504 i_cpr_machdep_setup(void) 505 { 506 if (ncpus > 1) { 507 CPR_DEBUG(CPR_DEBUG1, "MP restarted...\n"); 508 mutex_enter(&cpu_lock); 509 start_cpus(); 510 mutex_exit(&cpu_lock); 511 } 512 } 513 514 515 /* 516 * Stop all interrupt activities in the system 517 */ 518 void 519 i_cpr_stop_intr(void) 520 { 521 (void) spl7(); 522 } 523 524 /* 525 * Set machine up to take interrupts 526 */ 527 void 528 i_cpr_enable_intr(void) 529 { 530 (void) spl0(); 531 } 532 533 534 /* 535 * record cpu nodes and ids 536 */ 537 static void 538 i_cpr_save_cpu_info(void) 539 { 540 struct sun4u_cpu_info *scip; 541 cpu_t *cp; 542 543 scip = m_info.sci; 544 cp = CPU; 545 do { 546 ASSERT(scip < &m_info.sci[NCPU]); 547 scip->cpu_id = cp->cpu_id; 548 scip->node = cpunodes[cp->cpu_id].nodeid; 549 scip++; 550 } while ((cp = cp->cpu_next) != CPU); 551 } 552 553 554 /* 555 * Write necessary machine dependent information to cpr state file, 556 * eg. sun4u mmu ctx secondary for the current running process (cpr) ... 557 */ 558 int 559 i_cpr_write_machdep(vnode_t *vp) 560 { 561 extern uint_t getpstate(), getwstate(); 562 extern uint_t i_cpr_tstack_size; 563 const char ustr[] = ": unix-tte 2drop false ;"; 564 uintptr_t tinfo; 565 label_t *ltp; 566 cmd_t cmach; 567 char *fmt; 568 int rc; 569 570 /* 571 * ustr[] is used as temporary forth words during 572 * slave startup sequence, see sfmmu_mp_startup() 573 */ 574 575 cmach.md_magic = (uint_t)CPR_MACHDEP_MAGIC; 576 cmach.md_size = sizeof (m_info) + sizeof (ustr); 577 578 if (rc = cpr_write(vp, (caddr_t)&cmach, sizeof (cmach))) { 579 cpr_err(CE_WARN, "Failed to write descriptor."); 580 return (rc); 581 } 582 583 /* 584 * m_info is now cleared in i_cpr_dump_setup() 585 */ 586 m_info.ksb = (uint32_t)STACK_BIAS; 587 m_info.kpstate = (uint16_t)getpstate(); 588 m_info.kwstate = (uint16_t)getwstate(); 589 CPR_DEBUG(CPR_DEBUG1, "stack bias 0x%x, pstate 0x%x, wstate 0x%x\n", 590 m_info.ksb, m_info.kpstate, m_info.kwstate); 591 592 ltp = &ttolwp(curthread)->lwp_qsav; 593 m_info.qsav_pc = (cpr_ext)ltp->val[0]; 594 m_info.qsav_sp = (cpr_ext)ltp->val[1]; 595 596 /* 597 * Set secondary context to INVALID_CONTEXT to force the HAT 598 * to re-setup the MMU registers and locked TTEs it needs for 599 * TLB miss handling. 
600 */ 601 m_info.mmu_ctx_sec = INVALID_CONTEXT; 602 m_info.mmu_ctx_pri = KCONTEXT; 603 604 tinfo = (uintptr_t)curthread; 605 m_info.thrp = (cpr_ptr)tinfo; 606 607 tinfo = (uintptr_t)i_cpr_resume_setup; 608 m_info.func = (cpr_ptr)tinfo; 609 610 /* 611 * i_cpr_data_page is comprised of a 4K stack area and a few 612 * trailing data symbols; the page is shared by the prom and 613 * kernel during resume. the stack size is recorded here 614 * and used by cprboot to set %sp 615 */ 616 tinfo = (uintptr_t)&i_cpr_data_page; 617 m_info.tmp_stack = (cpr_ptr)tinfo; 618 m_info.tmp_stacksize = i_cpr_tstack_size; 619 620 m_info.test_mode = cpr_test_mode; 621 622 i_cpr_save_cpu_info(); 623 624 if (rc = cpr_write(vp, (caddr_t)&m_info, sizeof (m_info))) { 625 cpr_err(CE_WARN, "Failed to write machdep info."); 626 return (rc); 627 } 628 629 fmt = "error writing %s forth info"; 630 if (rc = cpr_write(vp, (caddr_t)ustr, sizeof (ustr))) 631 cpr_err(CE_WARN, fmt, "unix-tte"); 632 633 return (rc); 634 } 635 636 637 /* 638 * Save miscellaneous information which needs to be written to the 639 * state file. This information is required to re-initialize 640 * kernel/prom handshaking. 641 */ 642 void 643 i_cpr_save_machdep_info(void) 644 { 645 CPR_DEBUG(CPR_DEBUG5, "jumpback size = 0x%lx\n", 646 (uintptr_t)&i_cpr_end_jumpback - 647 (uintptr_t)i_cpr_resume_setup); 648 649 /* 650 * Verify the jumpback code all falls in one page. 651 */ 652 if (((uintptr_t)&i_cpr_end_jumpback & MMU_PAGEMASK) != 653 ((uintptr_t)i_cpr_resume_setup & MMU_PAGEMASK)) 654 cpr_err(CE_PANIC, "jumpback code exceeds one page."); 655 } 656 657 658 void 659 i_cpr_set_tbr(void) 660 { 661 } 662 663 664 /* 665 * cpu0 should contain bootcpu info 666 */ 667 cpu_t * 668 i_cpr_bootcpu(void) 669 { 670 return (&cpu0); 671 } 672 673 674 /* 675 * Return the virtual address of the mapping area 676 */ 677 caddr_t 678 i_cpr_map_setup(void) 679 { 680 /* 681 * Allocate a virtual memory range spanned by an hmeblk. 682 * This would be 8 hments or 64k bytes. Starting VA 683 * must be 64k (8-page) aligned. 
684 */ 685 cpr_vaddr = vmem_xalloc(heap_arena, 686 mmu_ptob(NHMENTS), mmu_ptob(NHMENTS), 687 0, 0, NULL, NULL, VM_NOSLEEP); 688 return (cpr_vaddr); 689 } 690 691 /* 692 * create tmp locked tlb entries for a group of phys pages; 693 * 694 * i_cpr_mapin/i_cpr_mapout should always be called in pairs, 695 * otherwise would fill up a tlb with locked entries 696 */ 697 void 698 i_cpr_mapin(caddr_t vaddr, uint_t pages, pfn_t ppn) 699 { 700 tte_t tte; 701 extern pfn_t curthreadpfn; 702 extern int curthreadremapped; 703 704 curthreadremapped = (ppn <= curthreadpfn && curthreadpfn < ppn + pages); 705 706 for (; pages--; ppn++, vaddr += MMU_PAGESIZE) { 707 tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(ppn); 708 tte.tte_intlo = TTE_PFN_INTLO(ppn) | TTE_LCK_INT | 709 TTE_CP_INT | TTE_PRIV_INT | TTE_HWWR_INT; 710 sfmmu_dtlb_ld_kva(vaddr, &tte); 711 } 712 } 713 714 void 715 i_cpr_mapout(caddr_t vaddr, uint_t pages) 716 { 717 extern int curthreadremapped; 718 719 if (curthreadremapped && vaddr <= (caddr_t)curthread && 720 (caddr_t)curthread < vaddr + pages * MMU_PAGESIZE) 721 curthreadremapped = 0; 722 723 for (; pages--; vaddr += MMU_PAGESIZE) 724 vtag_flushpage(vaddr, (uint64_t)ksfmmup); 725 } 726 727 /* 728 * We're done using the mapping area; release virtual space 729 */ 730 void 731 i_cpr_map_destroy(void) 732 { 733 vmem_free(heap_arena, cpr_vaddr, mmu_ptob(NHMENTS)); 734 cpr_vaddr = NULL; 735 } 736 737 /* ARGSUSED */ 738 void 739 i_cpr_handle_xc(int flag) 740 { 741 } 742 743 744 /* 745 * This function takes care of pages which are not in kas or need to be 746 * taken care of in a special way. For example, panicbuf pages are not 747 * in kas and their pages are allocated via prom_retain(). 748 */ 749 pgcnt_t 750 i_cpr_count_special_kpages(int mapflag, bitfunc_t bitfunc) 751 { 752 struct cpr_map_info *pri, *tail; 753 pgcnt_t pages, total = 0; 754 pfn_t pfn; 755 756 /* 757 * Save information about prom retained panicbuf pages 758 */ 759 if (bitfunc == cpr_setbit) { 760 pri = &cpr_prom_retain[CPR_PANICBUF]; 761 pri->virt = (cpr_ptr)panicbuf; 762 pri->phys = va_to_pa(panicbuf); 763 pri->size = sizeof (panicbuf); 764 } 765 766 /* 767 * Go through the prom_retain array to tag those pages. 768 */ 769 tail = &cpr_prom_retain[CPR_PROM_RETAIN_CNT]; 770 for (pri = cpr_prom_retain; pri < tail; pri++) { 771 pages = mmu_btopr(pri->size); 772 for (pfn = ADDR_TO_PN(pri->phys); pages--; pfn++) { 773 if (pf_is_memory(pfn)) { 774 if (bitfunc == cpr_setbit) { 775 if ((*bitfunc)(pfn, mapflag) == 0) 776 total++; 777 } else 778 total++; 779 } 780 } 781 } 782 783 return (total); 784 } 785 786 787 /* 788 * Free up memory-related resources here. We start by freeing buffers 789 * allocated during suspend initialization. Also, free up the mapping 790 * resources allocated in cpr_init(). 791 */ 792 void 793 i_cpr_free_memory_resources(void) 794 { 795 (void) i_cpr_prom_pages(CPR_PROM_FREE); 796 i_cpr_map_destroy(); 797 i_cpr_storage_free(); 798 } 799 800 801 /* 802 * Derived from cpr_write_statefile(). 803 * Save the sensitive pages to the storage area and do bookkeeping 804 * using the sensitive descriptors. Each descriptor will contain no more 805 * than CPR_MAXCONTIG amount of contiguous pages to match the max amount 806 * of pages that statefile gets written to disk at each write. 807 * XXX The CPR_MAXCONTIG can be changed to the size of the compression 808 * scratch area. 
809 */ 810 static int 811 i_cpr_save_to_storage(void) 812 { 813 sensitive_size_saved = 0; 814 sensitive_pages_saved = 0; 815 sensitive_write_ptr = i_cpr_storage_data_base; 816 return (cpr_contig_pages(NULL, SAVE_TO_STORAGE)); 817 } 818 819 820 /* 821 * This routine allocates space to save the sensitive kernel pages, 822 * i.e. kernel data nucleus, kvalloc and kvseg segments. 823 * It's assumed that those segments are the only areas that can be 824 * contaminated by memory allocations during statefile dumping. 825 * The space allocated here contains: 826 * A list of descriptors describing the saved sensitive pages. 827 * The storage area for saving the compressed sensitive kernel pages. 828 * Since storage pages are allocated from segkmem, they need to be 829 * excluded when saving. 830 */ 831 int 832 i_cpr_save_sensitive_kpages(void) 833 { 834 static const char pages_fmt[] = "\n%s %s allocs\n" 835 " spages %ld, vpages %ld, diff %ld\n"; 836 int retry_cnt; 837 int error = 0; 838 pgcnt_t pages, spages, vpages; 839 caddr_t addr; 840 char *str; 841 842 /* 843 * Tag sensitive kpages. Allocate space for storage descriptors 844 * and storage data area based on the resulting bitmaps. 845 * Note: The storage space will be part of the sensitive 846 * segment, so we need to tag kpages here before the storage 847 * is actually allocated just so their space won't be accounted 848 * for. They will not be part of the statefile although those 849 * pages will be claimed by cprboot. 850 */ 851 cpr_clear_bitmaps(); 852 853 spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_setbit); 854 vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit); 855 pages = spages - vpages; 856 857 str = "i_cpr_save_sensitive_kpages:"; 858 CPR_DEBUG(CPR_DEBUG7, pages_fmt, "before", str, spages, vpages, pages); 859 860 /* 861 * Allocate space to save the clean sensitive kpages 862 */ 863 for (retry_cnt = 0; retry_cnt < MAX_STORAGE_ALLOC_RETRY; retry_cnt++) { 864 /* 865 * Alloc on first pass or realloc if we are retrying because 866 * of insufficient storage for sensitive pages 867 */ 868 if (retry_cnt == 0 || error == ENOMEM) { 869 if (i_cpr_storage_data_base) { 870 kmem_free(i_cpr_storage_data_base, 871 mmu_ptob(i_cpr_storage_data_sz)); 872 i_cpr_storage_data_base = NULL; 873 i_cpr_storage_data_sz = 0; 874 } 875 addr = i_cpr_storage_data_alloc(pages, 876 &i_cpr_storage_data_sz, retry_cnt); 877 if (addr == NULL) { 878 CPR_DEBUG(CPR_DEBUG7, 879 "\n%s can't allocate data storage space!\n", 880 str); 881 return (ENOMEM); 882 } 883 i_cpr_storage_data_base = addr; 884 i_cpr_storage_data_end = 885 addr + mmu_ptob(i_cpr_storage_data_sz); 886 } 887 888 /* 889 * Allocate on first pass, only realloc if retry is because of 890 * insufficient descriptors, but reset contents on each pass 891 * (desc_alloc resets contents as well) 892 */ 893 if (retry_cnt == 0 || error == -1) { 894 error = i_cpr_storage_desc_alloc( 895 &i_cpr_storage_desc_base, &i_cpr_storage_desc_pgcnt, 896 &i_cpr_storage_desc_end, retry_cnt); 897 if (error != 0) 898 return (error); 899 } else { 900 i_cpr_storage_desc_init(i_cpr_storage_desc_base, 901 i_cpr_storage_desc_pgcnt, i_cpr_storage_desc_end); 902 } 903 904 /* 905 * We are ready to save the sensitive kpages to storage. 906 * We cannot trust what's tagged in the bitmaps anymore 907 * after storage allocations. Clear up the bitmaps and 908 * retag the sensitive kpages again. The storage pages 909 * should be untagged. 
910 */ 911 cpr_clear_bitmaps(); 912 913 spages = 914 i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_setbit); 915 vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit); 916 917 CPR_DEBUG(CPR_DEBUG7, pages_fmt, "after ", str, 918 spages, vpages, spages - vpages); 919 920 /* 921 * Returns 0 on success, -1 if too few descriptors, and 922 * ENOMEM if not enough space to save sensitive pages 923 */ 924 CPR_DEBUG(CPR_DEBUG1, "compressing pages to storage...\n"); 925 error = i_cpr_save_to_storage(); 926 if (error == 0) { 927 /* Saving to storage succeeded */ 928 CPR_DEBUG(CPR_DEBUG1, "compressed %d pages\n", 929 sensitive_pages_saved); 930 break; 931 } else if (error == -1) 932 CPR_DEBUG(CPR_DEBUG1, "%s too few descriptors\n", str); 933 } 934 if (error == -1) 935 error = ENOMEM; 936 return (error); 937 } 938 939 940 /* 941 * Estimate how much memory we will need to save 942 * the sensitive pages with compression. 943 */ 944 static caddr_t 945 i_cpr_storage_data_alloc(pgcnt_t pages, pgcnt_t *alloc_pages, int retry_cnt) 946 { 947 pgcnt_t alloc_pcnt, last_pcnt; 948 caddr_t addr; 949 char *str; 950 951 str = "i_cpr_storage_data_alloc:"; 952 if (retry_cnt == 0) { 953 /* 954 * common compression ratio is about 3:1 955 * initial storage allocation is estimated at 40% 956 * to cover the majority of cases 957 */ 958 alloc_pcnt = INITIAL_ALLOC_PCNT; 959 *alloc_pages = (pages * alloc_pcnt) / INTEGRAL; 960 CPR_DEBUG(CPR_DEBUG7, "%s sensitive pages: %ld\n", str, pages); 961 CPR_DEBUG(CPR_DEBUG7, 962 "%s initial est pages: %ld, alloc %ld%%\n", 963 str, *alloc_pages, alloc_pcnt); 964 } else { 965 /* 966 * calculate the prior compression percentage (x100) 967 * from the last attempt to save sensitive pages 968 */ 969 ASSERT(sensitive_pages_saved != 0); 970 last_pcnt = (mmu_btopr(sensitive_size_saved) * INTEGRAL) / 971 sensitive_pages_saved; 972 CPR_DEBUG(CPR_DEBUG7, "%s last ratio %ld%%\n", str, last_pcnt); 973 974 /* 975 * new estimated storage size is based on 976 * the larger ratio + 5% for each retry: 977 * pages * (last + [5%, 10%]) 978 */ 979 alloc_pcnt = MAX(last_pcnt, INITIAL_ALLOC_PCNT) + 980 (retry_cnt * 5); 981 *alloc_pages = (pages * alloc_pcnt) / INTEGRAL; 982 CPR_DEBUG(CPR_DEBUG7, "%s Retry est pages: %ld, alloc %ld%%\n", 983 str, *alloc_pages, alloc_pcnt); 984 } 985 986 addr = kmem_alloc(mmu_ptob(*alloc_pages), KM_NOSLEEP); 987 CPR_DEBUG(CPR_DEBUG7, "%s alloc %ld pages\n", str, *alloc_pages); 988 return (addr); 989 } 990 991 992 void 993 i_cpr_storage_free(void) 994 { 995 /* Free descriptors */ 996 if (i_cpr_storage_desc_base) { 997 kmem_free(i_cpr_storage_desc_base, 998 mmu_ptob(i_cpr_storage_desc_pgcnt)); 999 i_cpr_storage_desc_base = NULL; 1000 i_cpr_storage_desc_pgcnt = 0; 1001 } 1002 1003 1004 /* Data storage */ 1005 if (i_cpr_storage_data_base) { 1006 kmem_free(i_cpr_storage_data_base, 1007 mmu_ptob(i_cpr_storage_data_sz)); 1008 i_cpr_storage_data_base = NULL; 1009 i_cpr_storage_data_sz = 0; 1010 } 1011 } 1012 1013 1014 /* 1015 * This routine is derived from cpr_compress_and_write(). 1016 * 1. Do bookkeeping in the descriptor for the contiguous sensitive chunk. 1017 * 2. Compress and save the clean sensitive pages into the storage area. 
1018 */ 1019 int 1020 i_cpr_compress_and_save(int chunks, pfn_t spfn, pgcnt_t pages) 1021 { 1022 extern char *cpr_compress_pages(cpd_t *, pgcnt_t, int); 1023 extern caddr_t i_cpr_storage_data_end; 1024 uint_t remaining, datalen; 1025 uint32_t test_usum; 1026 char *datap; 1027 csd_t *descp; 1028 cpd_t cpd; 1029 int error; 1030 1031 /* 1032 * Fill next empty storage descriptor 1033 */ 1034 descp = i_cpr_storage_desc_base + chunks - 1; 1035 if (descp >= i_cpr_storage_desc_end) { 1036 CPR_DEBUG(CPR_DEBUG1, "ran out of descriptors, base 0x%p, " 1037 "chunks %d, end 0x%p, descp 0x%p\n", 1038 i_cpr_storage_desc_base, chunks, 1039 i_cpr_storage_desc_end, descp); 1040 return (-1); 1041 } 1042 ASSERT(descp->csd_dirty_spfn == (uint_t)-1); 1043 i_cpr_storage_desc_last_used = descp; 1044 1045 descp->csd_dirty_spfn = spfn; 1046 descp->csd_dirty_npages = pages; 1047 1048 i_cpr_mapin(CPR->c_mapping_area, pages, spfn); 1049 1050 /* 1051 * try compressing pages and copy cpd fields 1052 * pfn is copied for debug use 1053 */ 1054 cpd.cpd_pfn = spfn; 1055 datap = cpr_compress_pages(&cpd, pages, C_COMPRESSING); 1056 datalen = cpd.cpd_length; 1057 descp->csd_clean_compressed = (cpd.cpd_flag & CPD_COMPRESS); 1058 #ifdef DEBUG 1059 descp->csd_usum = cpd.cpd_usum; 1060 descp->csd_csum = cpd.cpd_csum; 1061 #endif 1062 1063 error = 0; 1064 1065 /* 1066 * Save the raw or compressed data to the storage area pointed to by 1067 * sensitive_write_ptr. Make sure the storage space is big enough to 1068 * hold the result. Otherwise roll back to increase the storage space. 1069 */ 1070 descp->csd_clean_sva = (cpr_ptr)sensitive_write_ptr; 1071 descp->csd_clean_sz = datalen; 1072 if ((sensitive_write_ptr + datalen) < i_cpr_storage_data_end) { 1073 extern void cprbcopy(void *, void *, size_t); 1074 1075 cprbcopy(datap, sensitive_write_ptr, datalen); 1076 sensitive_size_saved += datalen; 1077 sensitive_pages_saved += descp->csd_dirty_npages; 1078 sensitive_write_ptr += datalen; 1079 } else { 1080 remaining = (i_cpr_storage_data_end - sensitive_write_ptr); 1081 CPR_DEBUG(CPR_DEBUG1, "i_cpr_compress_and_save: The storage " 1082 "space is too small!\ngot %d, want %d\n\n", 1083 remaining, (remaining + datalen)); 1084 #ifdef DEBUG 1085 /* 1086 * Check to see if the content of the sensitive pages that we 1087 * just copied have changed during this small time window. 1088 */ 1089 test_usum = checksum32(CPR->c_mapping_area, mmu_ptob(pages)); 1090 descp->csd_usum = cpd.cpd_usum; 1091 if (test_usum != descp->csd_usum) { 1092 CPR_DEBUG(CPR_DEBUG1, "\nWARNING: " 1093 "i_cpr_compress_and_save: " 1094 "Data in the range of pfn 0x%lx to pfn " 1095 "0x%lx has changed after they are saved " 1096 "into storage.", spfn, (spfn + pages - 1)); 1097 } 1098 #endif 1099 error = ENOMEM; 1100 } 1101 1102 i_cpr_mapout(CPR->c_mapping_area, pages); 1103 return (error); 1104 } 1105 1106 1107 /* 1108 * This routine is derived from cpr_count_kpages(). 1109 * It goes through kernel data nucleus and segkmem segments to select 1110 * pages in use and mark them in the corresponding bitmap. 
1111 */ 1112 pgcnt_t 1113 i_cpr_count_sensitive_kpages(int mapflag, bitfunc_t bitfunc) 1114 { 1115 pgcnt_t kdata_cnt = 0, segkmem_cnt = 0; 1116 extern caddr_t e_moddata; 1117 extern struct seg kvalloc; 1118 extern struct seg kmem64; 1119 size_t size; 1120 1121 /* 1122 * Kernel data nucleus pages 1123 */ 1124 size = e_moddata - s_data; 1125 kdata_cnt += cpr_count_pages(s_data, size, 1126 mapflag, bitfunc, DBG_SHOWRANGE); 1127 1128 /* 1129 * kvseg and kvalloc pages 1130 */ 1131 segkmem_cnt += cpr_scan_kvseg(mapflag, bitfunc, &kvseg); 1132 segkmem_cnt += cpr_count_pages(kvalloc.s_base, kvalloc.s_size, 1133 mapflag, bitfunc, DBG_SHOWRANGE); 1134 1135 /* segment to support kernel memory usage above 32-bit space (4GB) */ 1136 if (kmem64.s_base) 1137 segkmem_cnt += cpr_count_pages(kmem64.s_base, kmem64.s_size, 1138 mapflag, bitfunc, DBG_SHOWRANGE); 1139 1140 CPR_DEBUG(CPR_DEBUG7, "\ni_cpr_count_sensitive_kpages:\n" 1141 "\tkdata_cnt %ld + segkmem_cnt %ld = %ld pages\n", 1142 kdata_cnt, segkmem_cnt, kdata_cnt + segkmem_cnt); 1143 1144 return (kdata_cnt + segkmem_cnt); 1145 } 1146 1147 1148 pgcnt_t 1149 i_cpr_count_storage_pages(int mapflag, bitfunc_t bitfunc) 1150 { 1151 pgcnt_t count = 0; 1152 1153 if (i_cpr_storage_desc_base) { 1154 count += cpr_count_pages((caddr_t)i_cpr_storage_desc_base, 1155 (size_t)mmu_ptob(i_cpr_storage_desc_pgcnt), 1156 mapflag, bitfunc, DBG_SHOWRANGE); 1157 } 1158 if (i_cpr_storage_data_base) { 1159 count += cpr_count_pages(i_cpr_storage_data_base, 1160 (size_t)mmu_ptob(i_cpr_storage_data_sz), 1161 mapflag, bitfunc, DBG_SHOWRANGE); 1162 } 1163 return (count); 1164 } 1165 1166 1167 /* 1168 * Derived from cpr_write_statefile(). 1169 * Allocate (or reallocate after exhausting the supply) descriptors for each 1170 * chunk of contiguous sensitive kpages. 1171 */ 1172 static int 1173 i_cpr_storage_desc_alloc(csd_t **basepp, pgcnt_t *pgsp, csd_t **endpp, 1174 int retry) 1175 { 1176 pgcnt_t npages; 1177 int chunks; 1178 csd_t *descp, *end; 1179 size_t len; 1180 char *str = "i_cpr_storage_desc_alloc:"; 1181 1182 /* 1183 * On initial allocation, add some extra to cover overhead caused 1184 * by the allocation for the storage area later. 1185 */ 1186 if (retry == 0) { 1187 chunks = cpr_contig_pages(NULL, STORAGE_DESC_ALLOC) + 1188 EXTRA_DESCS; 1189 npages = mmu_btopr(sizeof (**basepp) * (pgcnt_t)chunks); 1190 CPR_DEBUG(CPR_DEBUG7, "%s chunks %d, ", str, chunks); 1191 } else { 1192 CPR_DEBUG(CPR_DEBUG7, "%s retry %d: ", str, retry); 1193 npages = *pgsp + 1; 1194 } 1195 /* Free old descriptors, if any */ 1196 if (*basepp) 1197 kmem_free((caddr_t)*basepp, mmu_ptob(*pgsp)); 1198 1199 descp = *basepp = kmem_alloc(mmu_ptob(npages), KM_NOSLEEP); 1200 if (descp == NULL) { 1201 CPR_DEBUG(CPR_DEBUG7, "%s no space for descriptors!\n", str); 1202 return (ENOMEM); 1203 } 1204 1205 *pgsp = npages; 1206 len = mmu_ptob(npages); 1207 end = *endpp = descp + (len / (sizeof (**basepp))); 1208 CPR_DEBUG(CPR_DEBUG7, "npages 0x%lx, len 0x%lx, items 0x%lx\n\t*basepp " 1209 "%p, *endpp %p\n", npages, len, (len / (sizeof (**basepp))), 1210 *basepp, *endpp); 1211 i_cpr_storage_desc_init(descp, npages, end); 1212 return (0); 1213 } 1214 1215 static void 1216 i_cpr_storage_desc_init(csd_t *descp, pgcnt_t npages, csd_t *end) 1217 { 1218 size_t len = mmu_ptob(npages); 1219 1220 /* Initialize the descriptors to something impossible. 
 */
	bzero(descp, len);
#ifdef	DEBUG
	/*
	 * This condition is tested by an ASSERT
	 */
	for (; descp < end; descp++)
		descp->csd_dirty_spfn = (uint_t)-1;
#endif
}

int
i_cpr_dump_sensitive_kpages(vnode_t *vp)
{
	int	error = 0;
	uint_t	spin_cnt = 0;
	csd_t	*descp;

	/*
	 * The following two variables need to be reinitialized
	 * for each cpr cycle.
	 */
	i_cpr_sensitive_bytes_dumped = 0;
	i_cpr_sensitive_pgs_dumped = 0;

	if (i_cpr_storage_desc_base) {
		for (descp = i_cpr_storage_desc_base;
		    descp <= i_cpr_storage_desc_last_used; descp++) {
			if (error = cpr_dump_sensitive(vp, descp))
				return (error);
			spin_cnt++;
			if ((spin_cnt & 0x5F) == 1)
				cpr_spinning_bar();
		}
		prom_printf(" \b");
	}

	CPR_DEBUG(CPR_DEBUG7, "\ni_cpr_dump_sensitive_kpages: dumped %ld\n",
	    i_cpr_sensitive_pgs_dumped);
	return (0);
}


/*
 * 1. Fill the cpr page descriptor with the info of the dirty pages and
 *    write the descriptor out.  It will be used at resume.
 * 2. Write the clean data out instead of the dirty data.
 *    Note: to save space, the clean data is already compressed.
 */
static int
cpr_dump_sensitive(vnode_t *vp, csd_t *descp)
{
	int error = 0;
	caddr_t datap;
	cpd_t cpd;	/* cpr page descriptor */
	pfn_t	dirty_spfn;
	pgcnt_t dirty_npages;
	size_t clean_sz;
	caddr_t	clean_sva;
	int	clean_compressed;
	extern uchar_t cpr_pagecopy[];

	dirty_spfn = descp->csd_dirty_spfn;
	dirty_npages = descp->csd_dirty_npages;
	clean_sva = (caddr_t)descp->csd_clean_sva;
	clean_sz = descp->csd_clean_sz;
	clean_compressed = descp->csd_clean_compressed;

	/* Fill cpr page descriptor. */
	cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC;
	cpd.cpd_pfn = dirty_spfn;
	cpd.cpd_flag = 0;	/* must init to zero */
	cpd.cpd_pages = dirty_npages;

#ifdef	DEBUG
	if ((cpd.cpd_usum = descp->csd_usum) != 0)
		cpd.cpd_flag |= CPD_USUM;
	if ((cpd.cpd_csum = descp->csd_csum) != 0)
		cpd.cpd_flag |= CPD_CSUM;
#endif

	STAT->cs_dumped_statefsz += mmu_ptob(dirty_npages);

	/*
	 * The sensitive kpages are usually saved with compression
	 * unless compression could not reduce the size of the data.
	 * If the user chooses not to have the statefile compressed,
	 * we need to decompress the data back before dumping it to disk.
1309 */ 1310 if (CPR->c_flags & C_COMPRESSING) { 1311 cpd.cpd_length = clean_sz; 1312 datap = clean_sva; 1313 if (clean_compressed) 1314 cpd.cpd_flag |= CPD_COMPRESS; 1315 } else { 1316 if (clean_compressed) { 1317 cpd.cpd_length = decompress(clean_sva, cpr_pagecopy, 1318 clean_sz, mmu_ptob(dirty_npages)); 1319 datap = (caddr_t)cpr_pagecopy; 1320 ASSERT(cpd.cpd_length == mmu_ptob(dirty_npages)); 1321 } else { 1322 cpd.cpd_length = clean_sz; 1323 datap = clean_sva; 1324 } 1325 cpd.cpd_csum = 0; 1326 } 1327 1328 /* Write cpr page descriptor */ 1329 error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd)); 1330 if (error) { 1331 CPR_DEBUG(CPR_DEBUG7, "descp: %p\n", descp); 1332 #ifdef DEBUG 1333 debug_enter("cpr_dump_sensitive: cpr_write() page " 1334 "descriptor failed!\n"); 1335 #endif 1336 return (error); 1337 } 1338 1339 i_cpr_sensitive_bytes_dumped += sizeof (cpd_t); 1340 1341 /* Write page data */ 1342 error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length); 1343 if (error) { 1344 CPR_DEBUG(CPR_DEBUG7, "error: %x\n", error); 1345 CPR_DEBUG(CPR_DEBUG7, "descp: %p\n", descp); 1346 CPR_DEBUG(CPR_DEBUG7, "cpr_write(%p, %p , %lx)\n", vp, datap, 1347 cpd.cpd_length); 1348 #ifdef DEBUG 1349 debug_enter("cpr_dump_sensitive: cpr_write() data failed!\n"); 1350 #endif 1351 return (error); 1352 } 1353 1354 i_cpr_sensitive_bytes_dumped += cpd.cpd_length; 1355 i_cpr_sensitive_pgs_dumped += dirty_npages; 1356 1357 return (error); 1358 } 1359 1360 1361 /* 1362 * Sanity check to make sure that we have dumped right amount 1363 * of pages from different sources to statefile. 1364 */ 1365 int 1366 i_cpr_check_pgs_dumped(uint_t pgs_expected, uint_t regular_pgs_dumped) 1367 { 1368 uint_t total_pgs_dumped; 1369 1370 total_pgs_dumped = regular_pgs_dumped + i_cpr_sensitive_pgs_dumped; 1371 1372 CPR_DEBUG(CPR_DEBUG7, "\ncheck_pgs: reg %d + sens %ld = %d, " 1373 "expect %d\n\n", regular_pgs_dumped, i_cpr_sensitive_pgs_dumped, 1374 total_pgs_dumped, pgs_expected); 1375 1376 if (pgs_expected == total_pgs_dumped) 1377 return (0); 1378 1379 return (EINVAL); 1380 } 1381 1382 1383 int 1384 i_cpr_reusefini(void) 1385 { 1386 struct vnode *vp; 1387 cdef_t *cdef; 1388 size_t size; 1389 char *bufp; 1390 int rc; 1391 1392 if (cpr_reusable_mode) 1393 cpr_reusable_mode = 0; 1394 1395 if (rc = cpr_open_deffile(FREAD|FWRITE, &vp)) { 1396 if (rc == EROFS) { 1397 cpr_err(CE_CONT, "uadmin A_FREEZE AD_REUSEFINI " 1398 "(uadmin %d %d)\nmust be done with / mounted " 1399 "writeable.\n", A_FREEZE, AD_REUSEFINI); 1400 } 1401 return (rc); 1402 } 1403 1404 cdef = kmem_alloc(sizeof (*cdef), KM_SLEEP); 1405 rc = cpr_rdwr(UIO_READ, vp, cdef, sizeof (*cdef)); 1406 1407 if (rc) { 1408 cpr_err(CE_WARN, "Failed reading %s, errno = %d", 1409 cpr_default_path, rc); 1410 } else if (cdef->mini.magic != CPR_DEFAULT_MAGIC) { 1411 cpr_err(CE_WARN, "bad magic number in %s, cannot restore " 1412 "prom values for %s", cpr_default_path, 1413 cpr_enumerate_promprops(&bufp, &size)); 1414 kmem_free(bufp, size); 1415 rc = EINVAL; 1416 } else { 1417 /* 1418 * clean up prom properties 1419 */ 1420 rc = cpr_update_nvram(cdef->props); 1421 if (rc == 0) { 1422 /* 1423 * invalidate the disk copy and turn off reusable 1424 */ 1425 cdef->mini.magic = 0; 1426 cdef->mini.reusable = 0; 1427 if (rc = cpr_rdwr(UIO_WRITE, vp, 1428 &cdef->mini, sizeof (cdef->mini))) { 1429 cpr_err(CE_WARN, "Failed writing %s, errno %d", 1430 cpr_default_path, rc); 1431 } 1432 } 1433 } 1434 1435 (void) VOP_CLOSE(vp, FREAD|FWRITE, 1, (offset_t)0, CRED()); 1436 VN_RELE(vp); 1437 kmem_free(cdef, sizeof 
(*cdef)); 1438 1439 return (rc); 1440 } 1441 1442 1443 int 1444 i_cpr_reuseinit(void) 1445 { 1446 int rc = 0; 1447 1448 if (rc = cpr_default_setup(1)) 1449 return (rc); 1450 1451 /* 1452 * We need to validate default file 1453 */ 1454 rc = cpr_validate_definfo(1); 1455 if (rc == 0) 1456 cpr_reusable_mode = 1; 1457 else if (rc == EROFS) { 1458 cpr_err(CE_NOTE, "reuseinit must be performed " 1459 "while / is mounted writeable"); 1460 } 1461 1462 (void) cpr_default_setup(0); 1463 1464 return (rc); 1465 } 1466 1467 1468 int 1469 i_cpr_check_cprinfo(void) 1470 { 1471 struct vnode *vp; 1472 cmini_t mini; 1473 int rc = 0; 1474 1475 if (rc = cpr_open_deffile(FREAD, &vp)) { 1476 if (rc == ENOENT) 1477 cpr_err(CE_NOTE, "cprinfo file does not " 1478 "exist. You must run 'uadmin %d %d' " 1479 "command while / is mounted writeable,\n" 1480 "then reboot and run 'uadmin %d %d' " 1481 "to create a reusable statefile", 1482 A_FREEZE, AD_REUSEINIT, A_FREEZE, AD_REUSABLE); 1483 return (rc); 1484 } 1485 1486 rc = cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini)); 1487 (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED()); 1488 VN_RELE(vp); 1489 1490 if (rc) { 1491 cpr_err(CE_WARN, "Failed reading %s, errno = %d", 1492 cpr_default_path, rc); 1493 } else if (mini.magic != CPR_DEFAULT_MAGIC) { 1494 cpr_err(CE_CONT, "bad magic number in cprinfo file.\n" 1495 "You must run 'uadmin %d %d' while / is mounted " 1496 "writeable, then reboot and run 'uadmin %d %d' " 1497 "to create a reusable statefile\n", 1498 A_FREEZE, AD_REUSEINIT, A_FREEZE, AD_REUSABLE); 1499 rc = EINVAL; 1500 } 1501 1502 return (rc); 1503 } 1504 1505 1506 int 1507 i_cpr_reusable_supported(void) 1508 { 1509 return (1); 1510 } 1511 1512 1513 /* 1514 * find prom phys pages and alloc space for a tmp copy 1515 */ 1516 static int 1517 i_cpr_find_ppages(void) 1518 { 1519 extern struct vnode prom_ppages; 1520 struct page *pp; 1521 struct memlist *pmem; 1522 pgcnt_t npages, pcnt, scnt, vcnt; 1523 pfn_t ppn, plast, *dst; 1524 int mapflag; 1525 1526 cpr_clear_bitmaps(); 1527 mapflag = REGULAR_BITMAP; 1528 1529 /* 1530 * there should be a page_t for each phys page used by the kernel; 1531 * set a bit for each phys page not tracked by a page_t 1532 */ 1533 pcnt = 0; 1534 memlist_read_lock(); 1535 for (pmem = phys_install; pmem; pmem = pmem->next) { 1536 npages = mmu_btop(pmem->size); 1537 ppn = mmu_btop(pmem->address); 1538 for (plast = ppn + npages; ppn < plast; ppn++) { 1539 if (page_numtopp_nolock(ppn)) 1540 continue; 1541 (void) cpr_setbit(ppn, mapflag); 1542 pcnt++; 1543 } 1544 } 1545 memlist_read_unlock(); 1546 1547 /* 1548 * clear bits for phys pages in each segment 1549 */ 1550 scnt = cpr_count_seg_pages(mapflag, cpr_clrbit); 1551 1552 /* 1553 * set bits for phys pages referenced by the prom_ppages vnode; 1554 * these pages are mostly comprised of forthdebug words 1555 */ 1556 vcnt = 0; 1557 for (pp = prom_ppages.v_pages; pp; ) { 1558 if (cpr_setbit(pp->p_offset, mapflag) == 0) 1559 vcnt++; 1560 pp = pp->p_vpnext; 1561 if (pp == prom_ppages.v_pages) 1562 break; 1563 } 1564 1565 /* 1566 * total number of prom pages are: 1567 * (non-page_t pages - seg pages + vnode pages) 1568 */ 1569 ppage_count = pcnt - scnt + vcnt; 1570 CPR_DEBUG(CPR_DEBUG1, 1571 "find_ppages: pcnt %ld - scnt %ld + vcnt %ld = %ld\n", 1572 pcnt, scnt, vcnt, ppage_count); 1573 1574 /* 1575 * alloc array of pfn_t to store phys page list 1576 */ 1577 pphys_list_size = ppage_count * sizeof (pfn_t); 1578 pphys_list = kmem_alloc(pphys_list_size, KM_NOSLEEP); 1579 if (pphys_list == NULL) { 1580 
cpr_err(CE_WARN, "cannot alloc pphys_list"); 1581 return (ENOMEM); 1582 } 1583 1584 /* 1585 * phys pages referenced in the bitmap should be 1586 * those used by the prom; scan bitmap and save 1587 * a list of prom phys page numbers 1588 */ 1589 dst = pphys_list; 1590 memlist_read_lock(); 1591 for (pmem = phys_install; pmem; pmem = pmem->next) { 1592 npages = mmu_btop(pmem->size); 1593 ppn = mmu_btop(pmem->address); 1594 for (plast = ppn + npages; ppn < plast; ppn++) { 1595 if (cpr_isset(ppn, mapflag)) { 1596 ASSERT(dst < (pphys_list + ppage_count)); 1597 *dst++ = ppn; 1598 } 1599 } 1600 } 1601 memlist_read_unlock(); 1602 1603 /* 1604 * allocate space to store prom pages 1605 */ 1606 ppage_buf = kmem_alloc(mmu_ptob(ppage_count), KM_NOSLEEP); 1607 if (ppage_buf == NULL) { 1608 kmem_free(pphys_list, pphys_list_size); 1609 pphys_list = NULL; 1610 cpr_err(CE_WARN, "cannot alloc ppage_buf"); 1611 return (ENOMEM); 1612 } 1613 1614 return (0); 1615 } 1616 1617 1618 /* 1619 * save prom pages to kmem pages 1620 */ 1621 static void 1622 i_cpr_save_ppages(void) 1623 { 1624 pfn_t *pphys, *plast; 1625 caddr_t dst; 1626 1627 /* 1628 * map in each prom page and copy to a kmem page 1629 */ 1630 dst = ppage_buf; 1631 plast = pphys_list + ppage_count; 1632 for (pphys = pphys_list; pphys < plast; pphys++) { 1633 i_cpr_mapin(cpr_vaddr, 1, *pphys); 1634 bcopy(cpr_vaddr, dst, MMU_PAGESIZE); 1635 i_cpr_mapout(cpr_vaddr, 1); 1636 dst += MMU_PAGESIZE; 1637 } 1638 1639 CPR_DEBUG(CPR_DEBUG1, "saved %ld prom pages\n", ppage_count); 1640 } 1641 1642 1643 /* 1644 * restore prom pages from kmem pages 1645 */ 1646 static void 1647 i_cpr_restore_ppages(void) 1648 { 1649 pfn_t *pphys, *plast; 1650 caddr_t src; 1651 1652 dcache_flushall(); 1653 1654 /* 1655 * map in each prom page and copy from a kmem page 1656 */ 1657 src = ppage_buf; 1658 plast = pphys_list + ppage_count; 1659 for (pphys = pphys_list; pphys < plast; pphys++) { 1660 i_cpr_mapin(cpr_vaddr, 1, *pphys); 1661 bcopy(src, cpr_vaddr, MMU_PAGESIZE); 1662 i_cpr_mapout(cpr_vaddr, 1); 1663 src += MMU_PAGESIZE; 1664 } 1665 1666 dcache_flushall(); 1667 1668 CPR_DEBUG(CPR_DEBUG1, "restored %ld prom pages\n", ppage_count); 1669 } 1670 1671 1672 /* 1673 * save/restore prom pages or free related allocs 1674 */ 1675 int 1676 i_cpr_prom_pages(int action) 1677 { 1678 int error; 1679 1680 if (action == CPR_PROM_SAVE) { 1681 if (ppage_buf == NULL) { 1682 ASSERT(pphys_list == NULL); 1683 if (error = i_cpr_find_ppages()) 1684 return (error); 1685 i_cpr_save_ppages(); 1686 } 1687 } else if (action == CPR_PROM_RESTORE) { 1688 i_cpr_restore_ppages(); 1689 } else if (action == CPR_PROM_FREE) { 1690 if (pphys_list) { 1691 ASSERT(pphys_list_size); 1692 kmem_free(pphys_list, pphys_list_size); 1693 pphys_list = NULL; 1694 pphys_list_size = 0; 1695 } 1696 if (ppage_buf) { 1697 ASSERT(ppage_count); 1698 kmem_free(ppage_buf, mmu_ptob(ppage_count)); 1699 CPR_DEBUG(CPR_DEBUG1, "freed %ld prom pages\n", 1700 ppage_count); 1701 ppage_buf = NULL; 1702 ppage_count = 0; 1703 } 1704 } 1705 return (0); 1706 } 1707 1708 1709 /* 1710 * record tlb data for the nucleus, bigktsb's, and the cpr module; 1711 * this data is later used by cprboot to install dtlb/itlb entries. 1712 * when we jump into the cpr module during the resume phase, those 1713 * mappings are needed until switching to the kernel trap table. 1714 * to make the dtte/itte info available during resume, we need 1715 * the info recorded prior to saving sensitive pages, otherwise 1716 * all the data would appear as NULLs. 
1717 */ 1718 static void 1719 i_cpr_save_tlbinfo(void) 1720 { 1721 cti_t cti = {0}; 1722 1723 /* 1724 * during resume - shortly after jumping into the cpr module, 1725 * sfmmu_load_mmustate() will overwrite any dtlb entry at any 1726 * index used for TSBs; skip is set so that any saved tte will 1727 * target other tlb offsets and prevent being lost during 1728 * resume. now scan the dtlb and save locked entries, 1729 * then add entries for the tmp stack / data page and the 1730 * cpr thread structure. 1731 */ 1732 cti.dst = m_info.dtte; 1733 cti.tail = cti.dst + CPR_MAX_TLB; 1734 cti.reader = dtlb_rd_entry; 1735 cti.writer = NULL; 1736 cti.filter = i_cpr_lnb; 1737 cti.index = cpunodes[CPU->cpu_id].dtlb_size - 1; 1738 1739 if (utsb_dtlb_ttenum != -1) 1740 cti.skip = (1 << utsb_dtlb_ttenum); 1741 1742 if (utsb4m_dtlb_ttenum != -1) 1743 cti.skip |= (1 << utsb4m_dtlb_ttenum); 1744 1745 i_cpr_scan_tlb(&cti); 1746 i_cpr_make_tte(&cti, &i_cpr_data_page, datava); 1747 i_cpr_make_tte(&cti, curthread, datava); 1748 1749 /* 1750 * scan itlb and save locked entries; add an entry for 1751 * the first text page of the cpr module; cprboot will 1752 * jump to that page after restoring kernel pages. 1753 */ 1754 cti.dst = m_info.itte; 1755 cti.tail = cti.dst + CPR_MAX_TLB; 1756 cti.reader = itlb_rd_entry; 1757 cti.index = cpunodes[CPU->cpu_id].itlb_size - 1; 1758 cti.skip = 0; 1759 i_cpr_scan_tlb(&cti); 1760 i_cpr_make_tte(&cti, (void *)i_cpr_resume_setup, textva); 1761 } 1762 1763 1764 /* ARGSUSED */ 1765 int 1766 i_cpr_dump_setup(vnode_t *vp) 1767 { 1768 /* 1769 * zero out m_info and add info to dtte/itte arrays 1770 */ 1771 bzero(&m_info, sizeof (m_info)); 1772 i_cpr_save_tlbinfo(); 1773 return (0); 1774 } 1775 1776 1777 int 1778 i_cpr_is_supported(void) 1779 { 1780 char es_prop[] = "energystar-v2"; 1781 pnode_t node; 1782 int last; 1783 extern int cpr_supported_override; 1784 extern int cpr_platform_enable; 1785 1786 /* 1787 * The next statement tests if a specific platform has turned off 1788 * cpr support. 1789 */ 1790 if (cpr_supported_override) 1791 return (0); 1792 1793 /* 1794 * Do not inspect energystar-v* property if a platform has 1795 * specifically turned on cpr support 1796 */ 1797 if (cpr_platform_enable) 1798 return (1); 1799 1800 node = prom_rootnode(); 1801 if (prom_getproplen(node, es_prop) != -1) 1802 return (1); 1803 last = strlen(es_prop) - 1; 1804 es_prop[last] = '3'; 1805 return (prom_getproplen(node, es_prop) != -1); 1806 } 1807 1808 1809 /* 1810 * the actual size of the statefile data isn't known until after all the 1811 * compressed pages are written; even the inode size doesn't reflect the 1812 * data size since there are usually many extra fs blocks. for recording 1813 * the actual data size, the first sector of the statefile is copied to 1814 * a tmp buf, and the copy is later updated and flushed to disk. 1815 */ 1816 int 1817 i_cpr_blockzero(char *base, char **bufpp, int *blkno, vnode_t *vp) 1818 { 1819 extern int cpr_flush_write(vnode_t *); 1820 static char cpr_sector[DEV_BSIZE]; 1821 cpr_ext bytes, *dst; 1822 1823 /* 1824 * this routine is called after cdd_t and csu_md_t are copied 1825 * to cpr_buf; mini-hack alert: the save/update method creates 1826 * a dependency on the combined struct size being >= one sector 1827 * or DEV_BSIZE; since introduction in Sol2.7, csu_md_t size is 1828 * over 1K bytes and will probably grow with any changes. 
1829 * 1830 * copy when vp is NULL, flush when non-NULL 1831 */ 1832 if (vp == NULL) { 1833 ASSERT((*bufpp - base) >= DEV_BSIZE); 1834 bcopy(base, cpr_sector, sizeof (cpr_sector)); 1835 return (0); 1836 } else { 1837 bytes = dbtob(*blkno); 1838 dst = &((cdd_t *)cpr_sector)->cdd_filesize; 1839 bcopy(&bytes, dst, sizeof (bytes)); 1840 bcopy(cpr_sector, base, sizeof (cpr_sector)); 1841 *bufpp = base + sizeof (cpr_sector); 1842 *blkno = cpr_statefile_offset(); 1843 CPR_DEBUG(CPR_DEBUG1, "statefile data size: %ld\n\n", bytes); 1844 return (cpr_flush_write(vp)); 1845 } 1846 } 1847 1848 1849 /* 1850 * Allocate bitmaps according to the phys_install list. 1851 */ 1852 static int 1853 i_cpr_bitmap_setup(void) 1854 { 1855 struct memlist *pmem; 1856 cbd_t *dp, *tail; 1857 void *space; 1858 size_t size; 1859 1860 /* 1861 * The number of bitmap descriptors will be the count of 1862 * phys_install ranges plus 1 for a trailing NULL struct. 1863 */ 1864 cpr_nbitmaps = 1; 1865 for (pmem = phys_install; pmem; pmem = pmem->next) 1866 cpr_nbitmaps++; 1867 1868 if (cpr_nbitmaps > (CPR_MAX_BMDESC - 1)) { 1869 cpr_err(CE_WARN, "too many physical memory ranges %d, max %d", 1870 cpr_nbitmaps, CPR_MAX_BMDESC - 1); 1871 return (EFBIG); 1872 } 1873 1874 /* Alloc an array of bitmap descriptors. */ 1875 dp = kmem_zalloc(cpr_nbitmaps * sizeof (*dp), KM_NOSLEEP); 1876 if (dp == NULL) { 1877 cpr_nbitmaps = 0; 1878 return (ENOMEM); 1879 } 1880 tail = dp + cpr_nbitmaps; 1881 1882 CPR->c_bmda = dp; 1883 for (pmem = phys_install; pmem; pmem = pmem->next) { 1884 size = BITMAP_BYTES(pmem->size); 1885 space = kmem_zalloc(size * 2, KM_NOSLEEP); 1886 if (space == NULL) 1887 return (ENOMEM); 1888 ASSERT(dp < tail); 1889 dp->cbd_magic = CPR_BITMAP_MAGIC; 1890 dp->cbd_spfn = mmu_btop(pmem->address); 1891 dp->cbd_epfn = mmu_btop(pmem->address + pmem->size) - 1; 1892 dp->cbd_size = size; 1893 dp->cbd_reg_bitmap = (cpr_ptr)space; 1894 dp->cbd_vlt_bitmap = (cpr_ptr)((caddr_t)space + size); 1895 dp++; 1896 } 1897 1898 /* set magic for the last descriptor */ 1899 ASSERT(dp == (tail - 1)); 1900 dp->cbd_magic = CPR_BITMAP_MAGIC; 1901 1902 return (0); 1903 } 1904 1905 1906 void 1907 i_cpr_bitmap_cleanup(void) 1908 { 1909 cbd_t *dp; 1910 1911 if (CPR->c_bmda == NULL) 1912 return; 1913 for (dp = CPR->c_bmda; dp->cbd_size; dp++) 1914 kmem_free((void *)dp->cbd_reg_bitmap, dp->cbd_size * 2); 1915 kmem_free(CPR->c_bmda, cpr_nbitmaps * sizeof (*CPR->c_bmda)); 1916 CPR->c_bmda = NULL; 1917 cpr_nbitmaps = 0; 1918 } 1919 1920 1921 /* 1922 * A "regular" and "volatile" bitmap are created for each range of 1923 * physical memory. The volatile maps are used to count and track pages 1924 * susceptible to heap corruption - caused by drivers that allocate mem 1925 * during VOP_DUMP(); the regular maps are used for all the other non- 1926 * susceptible pages. Before writing the bitmaps to the statefile, 1927 * each bitmap pair gets merged to simplify handling within cprboot. 1928 */ 1929 int 1930 i_cpr_alloc_bitmaps(void) 1931 { 1932 int err; 1933 1934 memlist_read_lock(); 1935 err = i_cpr_bitmap_setup(); 1936 memlist_read_unlock(); 1937 if (err) 1938 i_cpr_bitmap_cleanup(); 1939 return (err); 1940 } 1941
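
/*
 * Illustrative sketch only, not part of the kernel build (kept under
 * "#if 0"): a standalone user-level model of the bitmap sizing performed
 * by i_cpr_bitmap_setup() above.  One descriptor is created per
 * phys_install range, and each range gets a "regular" plus a "volatile"
 * bitmap with one bit per 8K page.  The names demo_range_t and
 * demo_bitmap_bytes() are hypothetical; the real code uses BITMAP_BYTES()
 * and kmem_zalloc() as shown in i_cpr_bitmap_setup().
 */
#if 0
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define	DEMO_PAGESHIFT	13	/* 8K pages, as with MMU_PAGESIZE on sun4u */

typedef struct {
	uint64_t addr;		/* base address of a physical memory range */
	uint64_t size;		/* size of the range in bytes */
} demo_range_t;

/* bytes needed for one bitmap covering "size" bytes, one bit per page */
static size_t
demo_bitmap_bytes(uint64_t size)
{
	uint64_t pages;

	pages = (size + (1ULL << DEMO_PAGESHIFT) - 1) >> DEMO_PAGESHIFT;
	return ((size_t)((pages + 7) / 8));
}

int
main(void)
{
	demo_range_t ranges[] = {
		{ 0x000000000ULL, 0x40000000ULL },	/* 1 GB range */
		{ 0x100000000ULL, 0x20000000ULL },	/* 512 MB range */
	};
	size_t total = 0;
	int i;

	for (i = 0; i < 2; i++) {
		size_t one = demo_bitmap_bytes(ranges[i].size);

		/* each range allocates a regular + volatile bitmap pair */
		total += one * 2;
		(void) printf("range %d: %zu bytes per bitmap\n", i, one);
	}
	(void) printf("total bitmap space: %zu bytes\n", total);
	return (0);
}
#endif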