/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Platform specific implementation code
 */

#define	SUNDDI_IMPL

#include <sys/types.h>
#include <sys/promif.h>
#include <sys/prom_isa.h>
#include <sys/prom_plat.h>
#include <sys/mmu.h>
#include <vm/hat_sfmmu.h>
#include <sys/iommu.h>
#include <sys/scb.h>
#include <sys/cpuvar.h>
#include <sys/intreg.h>
#include <sys/pte.h>
#include <vm/hat.h>
#include <vm/page.h>
#include <vm/as.h>
#include <sys/cpr.h>
#include <sys/kmem.h>
#include <sys/clock.h>
#include <sys/kmem.h>
#include <sys/panic.h>
#include <vm/seg_kmem.h>
#include <sys/cpu_module.h>
#include <sys/callb.h>
#include <sys/machsystm.h>
#include <sys/vmsystm.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/stack.h>
#include <sys/fs/ufs_fs.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>
#include <sys/thread.h>
#include <vm/vm_dep.h>

extern void cpr_clear_bitmaps(void);
extern void dtlb_wr_entry(uint_t, tte_t *, uint64_t *);
extern void itlb_wr_entry(uint_t, tte_t *, uint64_t *);

static int i_cpr_storage_desc_alloc(csd_t **, pgcnt_t *, csd_t **, int);
static void i_cpr_storage_desc_init(csd_t *, pgcnt_t, csd_t *);
static caddr_t i_cpr_storage_data_alloc(pgcnt_t, pgcnt_t *, int);
static int cpr_dump_sensitive(vnode_t *, csd_t *);
static void i_cpr_clear_entries(uint64_t, uint64_t);
static void i_cpr_xcall(xcfunc_t);

void i_cpr_storage_free(void);

extern void *i_cpr_data_page;
extern int cpr_test_mode;
extern int cpr_nbitmaps;
extern char cpr_default_path[];
extern caddr_t textva, datava;

static struct cpr_map_info cpr_prom_retain[CPR_PROM_RETAIN_CNT];
caddr_t cpr_vaddr = NULL;

static uint_t sensitive_pages_saved;
static uint_t sensitive_size_saved;

caddr_t	i_cpr_storage_data_base;
caddr_t	i_cpr_storage_data_end;
csd_t	*i_cpr_storage_desc_base;
csd_t	*i_cpr_storage_desc_end;	/* one byte beyond last used descp */
csd_t	*i_cpr_storage_desc_last_used;	/* last used descriptor */
caddr_t	sensitive_write_ptr;		/* position for next storage write */

size_t	i_cpr_sensitive_bytes_dumped;
pgcnt_t	i_cpr_sensitive_pgs_dumped;
pgcnt_t	i_cpr_storage_data_sz;		/* in pages */
pgcnt_t	i_cpr_storage_desc_pgcnt;	/* in pages */

ushort_t cpr_mach_type = CPR_MACHTYPE_4U;
static	csu_md_t m_info;


#define	MAX_STORAGE_RETRY	3
#define	MAX_STORAGE_ALLOC_RETRY	3
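/*
 * initial sizing for the sensitive-page storage area; see
 * i_cpr_storage_data_alloc(), which estimates the data storage as
 * (pages * alloc_pcnt) / INTEGRAL and bumps the percentage on each retry
 */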
#define	INITIAL_ALLOC_PCNT	40	/* starting allocation percentage */
#define	INTEGRAL		100	/* to get 1% precision */

#define	EXTRA_RATE		2	/* add EXTRA_RATE% extra space */
#define	EXTRA_DESCS		10

#define	CPR_NO_STORAGE_DESC	1
#define	CPR_NO_STORAGE_DATA	2

#define	CIF_SPLICE		0
#define	CIF_UNLINK		1


/*
 * CPR miscellaneous support routines
 */
#define	cpr_open(path, mode, vpp)	(vn_open(path, UIO_SYSSPACE, \
		mode, 0600, vpp, CRCREAT, 0))
#define	cpr_rdwr(rw, vp, basep, cnt)	(vn_rdwr(rw, vp, (caddr_t)(basep), \
		cnt, 0LL, UIO_SYSSPACE, 0, (rlim64_t)MAXOFF_T, CRED(), \
		(ssize_t *)NULL))

/*
 * definitions for saving/restoring prom pages
 */
static void	*ppage_buf;
static pgcnt_t	ppage_count;
static pfn_t	*pphys_list;
static size_t	pphys_list_size;

typedef void (*tlb_rw_t)(uint_t, tte_t *, uint64_t *);
typedef void (*tlb_filter_t)(int, tte_t *, uint64_t, void *);

/*
 * private struct for tlb handling
 */
struct cpr_trans_info {
	sutlb_t		*dst;
	sutlb_t		*tail;
	tlb_rw_t	reader;
	tlb_rw_t	writer;
	tlb_filter_t	filter;
	int		index;
	uint64_t	skip;		/* assumes TLB <= 64 locked entries */
};
typedef struct cpr_trans_info cti_t;


/*
 * special handling for tlb info
 */
#define	WITHIN_OFW(va) \
	(((va) > (uint64_t)OFW_START_ADDR) && ((va) < (uint64_t)OFW_END_ADDR))

#define	WITHIN_NUCLEUS(va, base) \
	(((va) >= (base)) && \
	(((va) + MMU_PAGESIZE) <= ((base) + MMU_PAGESIZE4M)))

#define	IS_BIGKTSB(va) \
	(enable_bigktsb && \
	((va) >= (uint64_t)ktsb_base) && \
	((va) < (uint64_t)(ktsb_base + ktsb_sz)))


/*
 * WARNING:
 * the text from this file is linked to follow cpr_resume_setup.o;
 * only add text between here and i_cpr_end_jumpback when it needs
 * to be called during resume before we switch back to the kernel
 * trap table.  all the text in this range must fit within a page.
 */


/*
 * each time a machine is reset, the prom uses an inconsistent set of phys
 * pages and the cif cookie may differ as well.  so prior to restoring the
 * original prom, we have to use the new/tmp prom's translations
 * when requesting prom services.
 *
 * cif_handler starts out as the original prom cookie, and that gets used
 * by client_handler() to jump into the prom.  here we splice-in a wrapper
 * routine by writing cif_handler; client_handler() will now jump to the
 * wrapper which switches the %tba to the new/tmp prom's trap table then
 * jumps to the new cookie.
 */
void
i_cpr_cif_setup(int action)
{
	extern void *i_cpr_orig_cif, *cif_handler;
	extern int i_cpr_cif_wrapper(void *);

	/*
	 * save the original cookie and change the current cookie to the
	 * wrapper routine.  later we just restore the original cookie.
	 */
	if (action == CIF_SPLICE) {
		i_cpr_orig_cif = cif_handler;
		cif_handler = (void *)i_cpr_cif_wrapper;
	} else if (action == CIF_UNLINK)
		cif_handler = i_cpr_orig_cif;
}


/*
 * launch slave cpus into kernel text, pause them,
 * and restore the original prom pages
 */
void
i_cpr_mp_setup(void)
{
	extern void restart_other_cpu(int);
	cpu_t *cp;

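	/*
	 * stash the current kcontextreg value; it is restored at the
	 * end of this routine, after the cif wrapper has been unlinked
	 * and the prom pages have been restored
	 */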
	uint64_t kctx = kcontextreg;

	/*
	 * Do not allow setting page size codes in MMU primary context
	 * register while using cif wrapper.  This is needed to work
	 * around OBP's incorrect handling of this MMU register.
	 */
	kcontextreg = 0;

	/*
	 * reset cpu_ready_set so x_calls work properly
	 */
	CPUSET_ZERO(cpu_ready_set);
	CPUSET_ADD(cpu_ready_set, getprocessorid());

	/*
	 * setup cif to use the cookie from the new/tmp prom
	 * and setup tmp handling for calling prom services.
	 */
	i_cpr_cif_setup(CIF_SPLICE);

	/*
	 * at this point, only the nucleus and a few cpr pages are
	 * mapped in.  once we switch to the kernel trap table,
	 * we can access the rest of kernel space.
	 */
	prom_set_traptable(&trap_table);

	if (ncpus > 1) {
		sfmmu_init_tsbs();

		mutex_enter(&cpu_lock);
		/*
		 * None of the slave cpus are ready at this time,
		 * yet the cpu structures have various cpu_flags set;
		 * clear cpu_flags and mutex_ready.
		 * Since we are coming up from a CPU suspend, the slave cpus
		 * are frozen.
		 */
		for (cp = CPU->cpu_next; cp != CPU; cp = cp->cpu_next) {
			cp->cpu_flags = CPU_FROZEN;
			cp->cpu_m.mutex_ready = 0;
		}

		for (cp = CPU->cpu_next; cp != CPU; cp = cp->cpu_next)
			restart_other_cpu(cp->cpu_id);

		pause_cpus(NULL);
		mutex_exit(&cpu_lock);

		i_cpr_xcall(i_cpr_clear_entries);
	} else
		i_cpr_clear_entries(0, 0);

	/*
	 * now unlink the cif wrapper;  WARNING: do not call any
	 * prom_xxx() routines until after prom pages are restored.
	 */
	i_cpr_cif_setup(CIF_UNLINK);

	(void) i_cpr_prom_pages(CPR_PROM_RESTORE);

	/* allow setting page size codes in MMU primary context register */
	kcontextreg = kctx;
}


/*
 * end marker for jumpback page;
 * this symbol is used to check the size of i_cpr_resume_setup()
 * and the above text.  For simplicity, the Makefile needs to
 * link i_cpr_resume_setup.o and cpr_impl.o consecutively.
 */
void
i_cpr_end_jumpback(void)
{
}


/*
 * scan tlb entries with reader; when valid entries are found,
 * the filter routine will selectively save/clear them
 */
static void
i_cpr_scan_tlb(cti_t *ctip)
{
	uint64_t va_tag;
	int tlb_index;
	tte_t tte;

	for (tlb_index = ctip->index; tlb_index >= 0; tlb_index--) {
		(*ctip->reader)((uint_t)tlb_index, &tte, &va_tag);
		if (va_tag && TTE_IS_VALID(&tte))
			(*ctip->filter)(tlb_index, &tte, va_tag, ctip);
	}
}


/*
 * filter for locked tlb entries that reference the text/data nucleus
 * and any bigktsb's; these will be reinstalled by cprboot on all cpus
 */
/* ARGSUSED */
static void
i_cpr_lnb(int index, tte_t *ttep, uint64_t va_tag, void *ctrans)
{
	cti_t *ctip;

	/*
	 * record tlb data at ctip->dst; the target tlb index starts
	 * at the highest tlb offset and moves towards 0.  the prom
	 * reserves both dtlb and itlb index 0.
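	 * (since index 0 is skipped, ctip->index always stays above 0,
	 * which the ASSERTs below expect.)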
	 * any selected entry also gets marked to prevent being flushed
	 * during resume
	 */
	if (TTE_IS_LOCKED(ttep) && (va_tag == (uint64_t)textva ||
	    va_tag == (uint64_t)datava || IS_BIGKTSB(va_tag))) {
		ctip = ctrans;
		while ((1 << ctip->index) & ctip->skip)
			ctip->index--;
		ASSERT(ctip->index > 0);
		ASSERT(ctip->dst < ctip->tail);
		ctip->dst->tte.ll = ttep->ll;
		ctip->dst->va_tag = va_tag;
		ctip->dst->index = ctip->index--;
		ctip->dst->tmp = 0;
		ctip->dst++;
	}
}


/*
 * some tlb entries are stale, filter for unlocked entries
 * within the prom virt range and clear them
 */
static void
i_cpr_ufw(int index, tte_t *ttep, uint64_t va_tag, void *ctrans)
{
	sutlb_t clr;
	cti_t *ctip;

	if (!TTE_IS_LOCKED(ttep) && WITHIN_OFW(va_tag)) {
		ctip = ctrans;
		bzero(&clr, sizeof (clr));
		(*ctip->writer)((uint_t)index, &clr.tte, &clr.va_tag);
	}
}


/*
 * some of the entries installed by cprboot are needed only on a
 * short-term basis and need to be flushed to avoid clogging the tlbs.
 * scan the dtte/itte arrays for items marked as temporary and clear
 * dtlb/itlb entries using wrfunc.
 */
static void
i_cpr_clear_tmp(sutlb_t *listp, int max, tlb_rw_t wrfunc)
{
	sutlb_t clr, *tail;

	bzero(&clr, sizeof (clr));
	for (tail = listp + max; listp < tail && listp->va_tag; listp++) {
		if (listp->tmp)
			(*wrfunc)((uint_t)listp->index, &clr.tte, &clr.va_tag);
	}
}


/* ARGSUSED */
static void
i_cpr_clear_entries(uint64_t arg1, uint64_t arg2)
{
	extern void demap_all(void);
	cti_t cti;

	i_cpr_clear_tmp(m_info.dtte, CPR_MAX_TLB, dtlb_wr_entry);
	i_cpr_clear_tmp(m_info.itte, CPR_MAX_TLB, itlb_wr_entry);

	/*
	 * for newer cpus that implement DEMAP_ALL_TYPE, demap_all is
	 * a second label for vtag_flushall.  the call is made using
	 * vtag_flushall() instead of demap_all() due to runtime and
	 * krtld results with both older and newer cpu modules.
	 */
	if (&demap_all != 0) {
		vtag_flushall();
		return;
	}

	/*
	 * for older V9 cpus, scan tlbs and clear stale entries
	 */
	bzero(&cti, sizeof (cti));
	cti.filter = i_cpr_ufw;

	cti.index = cpunodes[CPU->cpu_id].dtlb_size - 1;
	cti.reader = dtlb_rd_entry;
	cti.writer = dtlb_wr_entry;
	i_cpr_scan_tlb(&cti);

	cti.index = cpunodes[CPU->cpu_id].itlb_size - 1;
	cti.reader = itlb_rd_entry;
	cti.writer = itlb_wr_entry;
	i_cpr_scan_tlb(&cti);
}


/*
 * craft tlb info for tmp use during resume; this data gets used by
 * cprboot to install tlb entries.  we also mark each struct as tmp
 * so those tlb entries will get flushed after switching to the kernel
 * trap table.  no data needs to be recorded for vaddr when it falls
 * within the nucleus since we've already recorded nucleus ttes and
 * an 8K tte would conflict with a 4MB tte.  eg: the cpr module
 * text/data may have been loaded into the text/data nucleus.
 */
static void
i_cpr_make_tte(cti_t *ctip, void *vaddr, caddr_t nbase)
{
	pfn_t ppn;
	uint_t rw;

	if (WITHIN_NUCLEUS((caddr_t)vaddr, nbase))
		return;

	while ((1 << ctip->index) & ctip->skip)
		ctip->index--;
	ASSERT(ctip->index > 0);
	ASSERT(ctip->dst < ctip->tail);

	/*
	 * without any global service available to lookup
	 * a tte by vaddr, we craft our own here:
	 */
	ppn = va_to_pfn(vaddr);
	rw = (nbase == datava) ? TTE_HWWR_INT : 0;
	ctip->dst->tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(ppn);
	ctip->dst->tte.tte_intlo = TTE_PFN_INTLO(ppn) | TTE_LCK_INT |
	    TTE_CP_INT | TTE_PRIV_INT | rw;
	ctip->dst->va_tag = ((uintptr_t)vaddr & MMU_PAGEMASK);
	ctip->dst->index = ctip->index--;
	ctip->dst->tmp = 1;
	ctip->dst++;
}


static void
i_cpr_xcall(xcfunc_t func)
{
	uint_t pil, reset_pil;

	pil = getpil();
	if (pil < XCALL_PIL)
		reset_pil = 0;
	else {
		reset_pil = 1;
		setpil(XCALL_PIL - 1);
	}
	xc_some(cpu_ready_set, func, 0, 0);
	if (reset_pil)
		setpil(pil);
}


/*
 * restart paused slave cpus
 */
void
i_cpr_machdep_setup(void)
{
	if (ncpus > 1) {
		CPR_DEBUG(CPR_DEBUG1, "MP restarted...\n");
		mutex_enter(&cpu_lock);
		start_cpus();
		mutex_exit(&cpu_lock);
	}
}


/*
 * Stop all interrupt activities in the system
 */
void
i_cpr_stop_intr(void)
{
	(void) spl7();
}

/*
 * Set machine up to take interrupts
 */
void
i_cpr_enable_intr(void)
{
	(void) spl0();
}


/*
 * record cpu nodes and ids
 */
static void
i_cpr_save_cpu_info(void)
{
	struct sun4u_cpu_info *scip;
	cpu_t *cp;

	scip = m_info.sci;
	cp = CPU;
	do {
		ASSERT(scip < &m_info.sci[NCPU]);
		scip->cpu_id = cp->cpu_id;
		scip->node = cpunodes[cp->cpu_id].nodeid;
		scip++;
	} while ((cp = cp->cpu_next) != CPU);
}


/*
 * Write necessary machine dependent information to cpr state file,
 * eg. sun4u mmu ctx secondary for the current running process (cpr) ...
 */
int
i_cpr_write_machdep(vnode_t *vp)
{
	extern uint_t getpstate(), getwstate();
	extern uint_t i_cpr_tstack_size;
	const char ustr[] = ": unix-tte 2drop false ;";
	uintptr_t tinfo;
	label_t *ltp;
	cmd_t cmach;
	char *fmt;
	int rc;

	/*
	 * ustr[] is used as temporary forth words during
	 * slave startup sequence, see sfmmu_mp_startup()
	 */

	cmach.md_magic = (uint_t)CPR_MACHDEP_MAGIC;
	cmach.md_size = sizeof (m_info) + sizeof (ustr);

	if (rc = cpr_write(vp, (caddr_t)&cmach, sizeof (cmach))) {
		cpr_err(CE_WARN, "Failed to write descriptor.");
		return (rc);
	}

	/*
	 * m_info is now cleared in i_cpr_dump_setup()
	 */
	m_info.ksb = (uint32_t)STACK_BIAS;
	m_info.kpstate = (uint16_t)getpstate();
	m_info.kwstate = (uint16_t)getwstate();
	CPR_DEBUG(CPR_DEBUG1, "stack bias 0x%x, pstate 0x%x, wstate 0x%x\n",
	    m_info.ksb, m_info.kpstate, m_info.kwstate);

	ltp = &ttolwp(curthread)->lwp_qsav;
	m_info.qsav_pc = (cpr_ext)ltp->val[0];
	m_info.qsav_sp = (cpr_ext)ltp->val[1];

	/*
	 * Set secondary context to INVALID_CONTEXT to force the HAT
	 * to re-setup the MMU registers and locked TTEs it needs for
	 * TLB miss handling.
	 */
	m_info.mmu_ctx_sec = INVALID_CONTEXT;
	m_info.mmu_ctx_pri = KCONTEXT;

	tinfo = (uintptr_t)curthread;
	m_info.thrp = (cpr_ptr)tinfo;

	tinfo = (uintptr_t)i_cpr_resume_setup;
	m_info.func = (cpr_ptr)tinfo;

	/*
	 * i_cpr_data_page is comprised of a 4K stack area and a few
	 * trailing data symbols; the page is shared by the prom and
	 * kernel during resume.  the stack size is recorded here
	 * and used by cprboot to set %sp
	 */
	tinfo = (uintptr_t)&i_cpr_data_page;
	m_info.tmp_stack = (cpr_ptr)tinfo;
	m_info.tmp_stacksize = i_cpr_tstack_size;

	m_info.test_mode = cpr_test_mode;

	i_cpr_save_cpu_info();

	if (rc = cpr_write(vp, (caddr_t)&m_info, sizeof (m_info))) {
		cpr_err(CE_WARN, "Failed to write machdep info.");
		return (rc);
	}

	fmt = "error writing %s forth info";
	if (rc = cpr_write(vp, (caddr_t)ustr, sizeof (ustr)))
		cpr_err(CE_WARN, fmt, "unix-tte");

	return (rc);
}


/*
 * Save miscellaneous information which needs to be written to the
 * state file.  This information is required to re-initialize
 * kernel/prom handshaking.
 */
void
i_cpr_save_machdep_info(void)
{
	CPR_DEBUG(CPR_DEBUG5, "jumpback size = 0x%lx\n",
	    (uintptr_t)&i_cpr_end_jumpback -
	    (uintptr_t)i_cpr_resume_setup);

	/*
	 * Verify the jumpback code all falls in one page.
	 */
	if (((uintptr_t)&i_cpr_end_jumpback & MMU_PAGEMASK) !=
	    ((uintptr_t)i_cpr_resume_setup & MMU_PAGEMASK))
		cpr_err(CE_PANIC, "jumpback code exceeds one page.");
}


void
i_cpr_set_tbr(void)
{
}


/*
 * cpu0 should contain bootcpu info
 */
cpu_t *
i_cpr_bootcpu(void)
{
	return (&cpu0);
}


/*
 * Return the virtual address of the mapping area
 */
caddr_t
i_cpr_map_setup(void)
{
	/*
	 * Allocate a virtual memory range spanned by an hmeblk.
	 * This would be 8 hments or 64k bytes.  Starting VA
	 * must be 64k (8-page) aligned.
	 */
	cpr_vaddr = vmem_xalloc(heap_arena,
	    mmu_ptob(NHMENTS), mmu_ptob(NHMENTS),
	    0, 0, NULL, NULL, VM_NOSLEEP);
	return (cpr_vaddr);
}

/*
 * create tmp locked tlb entries for a group of phys pages;
 *
 * i_cpr_mapin/i_cpr_mapout should always be called in pairs,
 * otherwise the tlb would fill up with locked entries
 */
void
i_cpr_mapin(caddr_t vaddr, uint_t pages, pfn_t ppn)
{
	tte_t tte;
	extern pfn_t curthreadpfn;
	extern int curthreadremapped;

	curthreadremapped = (ppn <= curthreadpfn && curthreadpfn < ppn + pages);

	for (; pages--; ppn++, vaddr += MMU_PAGESIZE) {
		tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(ppn);
		tte.tte_intlo = TTE_PFN_INTLO(ppn) | TTE_LCK_INT |
		    TTE_CP_INT | TTE_PRIV_INT | TTE_HWWR_INT;
		sfmmu_dtlb_ld_kva(vaddr, &tte);
	}
}

void
i_cpr_mapout(caddr_t vaddr, uint_t pages)
{
	extern int curthreadremapped;

	if (curthreadremapped && vaddr <= (caddr_t)curthread &&
	    (caddr_t)curthread < vaddr + pages * MMU_PAGESIZE)
		curthreadremapped = 0;

	for (; pages--; vaddr += MMU_PAGESIZE)
		vtag_flushpage(vaddr, (uint64_t)ksfmmup);
}

/*
 * We're done using the mapping area; release virtual space
 */
void
i_cpr_map_destroy(void)
{
	vmem_free(heap_arena, cpr_vaddr, mmu_ptob(NHMENTS));
	cpr_vaddr = NULL;
}

/* ARGSUSED */
void
i_cpr_handle_xc(int flag)
{
}


/*
 * This function takes care of pages which are not in kas or need to be
 * taken care of in a special way.  For example, panicbuf pages are not
 * in kas and their pages are allocated via prom_retain().
 */
pgcnt_t
i_cpr_count_special_kpages(int mapflag, bitfunc_t bitfunc)
{
	struct cpr_map_info *pri, *tail;
	pgcnt_t pages, total = 0;
	pfn_t pfn;

	/*
	 * Save information about prom retained panicbuf pages
	 */
	if (bitfunc == cpr_setbit) {
		pri = &cpr_prom_retain[CPR_PANICBUF];
		pri->virt = (cpr_ptr)panicbuf;
		pri->phys = va_to_pa(panicbuf);
		pri->size = sizeof (panicbuf);
	}

	/*
	 * Go through the prom_retain array to tag those pages.
	 */
	tail = &cpr_prom_retain[CPR_PROM_RETAIN_CNT];
	for (pri = cpr_prom_retain; pri < tail; pri++) {
		pages = mmu_btopr(pri->size);
		for (pfn = ADDR_TO_PN(pri->phys); pages--; pfn++) {
			if (pf_is_memory(pfn)) {
				if (bitfunc == cpr_setbit) {
					if ((*bitfunc)(pfn, mapflag) == 0)
						total++;
				} else
					total++;
			}
		}
	}

	return (total);
}


/*
 * Free up memory-related resources here.  We start by freeing buffers
 * allocated during suspend initialization.  Also, free up the mapping
 * resources allocated in cpr_init().
 */
void
i_cpr_free_memory_resources(void)
{
	(void) i_cpr_prom_pages(CPR_PROM_FREE);
	i_cpr_map_destroy();
	i_cpr_storage_free();
}


/*
 * Derived from cpr_write_statefile().
 * Save the sensitive pages to the storage area and do bookkeeping
 * using the sensitive descriptors.  Each descriptor will contain no more
 * than CPR_MAXCONTIG contiguous pages to match the maximum amount of
 * pages that the statefile gets written to disk at each write.
 * XXX The CPR_MAXCONTIG can be changed to the size of the compression
 * scratch area.
 */
static int
i_cpr_save_to_storage(void)
{
	sensitive_size_saved = 0;
	sensitive_pages_saved = 0;
	sensitive_write_ptr = i_cpr_storage_data_base;
	return (cpr_contig_pages(NULL, SAVE_TO_STORAGE));
}


/*
 * This routine allocates space to save the sensitive kernel pages,
 * i.e. kernel data nucleus, kvalloc and kvseg segments.
 * It's assumed that those segments are the only areas that can be
 * contaminated by memory allocations during statefile dumping.
 * The space allocated here contains:
 *	A list of descriptors describing the saved sensitive pages.
 *	The storage area for saving the compressed sensitive kernel pages.
 * Since storage pages are allocated from segkmem, they need to be
 * excluded when saving.
 */
int
i_cpr_save_sensitive_kpages(void)
{
	static const char pages_fmt[] = "\n%s %s allocs\n"
	    "	spages %ld, vpages %ld, diff %ld\n";
	int retry_cnt;
	int error = 0;
	pgcnt_t pages, spages, vpages;
	caddr_t	addr;
	char *str;

	/*
	 * Tag sensitive kpages.  Allocate space for storage descriptors
	 * and storage data area based on the resulting bitmaps.
	 * Note: The storage space will be part of the sensitive
	 * segment, so we need to tag kpages here before the storage
	 * is actually allocated just so their space won't be accounted
	 * for.  They will not be part of the statefile although those
	 * pages will be claimed by cprboot.
	 */
	cpr_clear_bitmaps();

	spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_setbit);
	vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);
	pages = spages - vpages;

	str = "i_cpr_save_sensitive_kpages:";
	CPR_DEBUG(CPR_DEBUG7, pages_fmt, "before", str, spages, vpages, pages);

	/*
	 * Allocate space to save the clean sensitive kpages
	 */
	for (retry_cnt = 0; retry_cnt < MAX_STORAGE_ALLOC_RETRY; retry_cnt++) {
		/*
		 * Alloc on first pass or realloc if we are retrying because
		 * of insufficient storage for sensitive pages
		 */
		if (retry_cnt == 0 || error == ENOMEM) {
			if (i_cpr_storage_data_base) {
				kmem_free(i_cpr_storage_data_base,
				    mmu_ptob(i_cpr_storage_data_sz));
				i_cpr_storage_data_base = NULL;
				i_cpr_storage_data_sz = 0;
			}
			addr = i_cpr_storage_data_alloc(pages,
			    &i_cpr_storage_data_sz, retry_cnt);
			if (addr == NULL) {
				CPR_DEBUG(CPR_DEBUG7,
				    "\n%s can't allocate data storage space!\n",
				    str);
				return (ENOMEM);
			}
			i_cpr_storage_data_base = addr;
			i_cpr_storage_data_end =
			    addr + mmu_ptob(i_cpr_storage_data_sz);
		}

		/*
		 * Allocate on first pass, only realloc if retry is because of
		 * insufficient descriptors, but reset contents on each pass
		 * (desc_alloc resets contents as well)
		 */
		if (retry_cnt == 0 || error == -1) {
			error = i_cpr_storage_desc_alloc(
			    &i_cpr_storage_desc_base,
			    &i_cpr_storage_desc_pgcnt,
			    &i_cpr_storage_desc_end, retry_cnt);
			if (error != 0)
				return (error);
		} else {
			i_cpr_storage_desc_init(i_cpr_storage_desc_base,
			    i_cpr_storage_desc_pgcnt, i_cpr_storage_desc_end);
		}

		/*
		 * We are ready to save the sensitive kpages to storage.
		 * We cannot trust what's tagged in the bitmaps anymore
		 * after storage allocations.  Clear up the bitmaps and
		 * retag the sensitive kpages again.  The storage pages
		 * should be untagged.
		 */
		cpr_clear_bitmaps();

		spages =
		    i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_setbit);
		vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);

		CPR_DEBUG(CPR_DEBUG7, pages_fmt, "after ", str,
		    spages, vpages, spages - vpages);

		/*
		 * Returns 0 on success, -1 if too few descriptors, and
		 * ENOMEM if not enough space to save sensitive pages
		 */
		CPR_DEBUG(CPR_DEBUG1, "compressing pages to storage...\n");
		error = i_cpr_save_to_storage();
		if (error == 0) {
			/* Saving to storage succeeded */
			CPR_DEBUG(CPR_DEBUG1, "compressed %d pages\n",
			    sensitive_pages_saved);
			break;
		} else if (error == -1)
			CPR_DEBUG(CPR_DEBUG1, "%s too few descriptors\n", str);
	}
	if (error == -1)
		error = ENOMEM;
	return (error);
}


/*
 * Estimate how much memory we will need to save
 * the sensitive pages with compression.
 */
static caddr_t
i_cpr_storage_data_alloc(pgcnt_t pages, pgcnt_t *alloc_pages, int retry_cnt)
{
	pgcnt_t alloc_pcnt, last_pcnt;
	caddr_t addr;
	char *str;

	str = "i_cpr_storage_data_alloc:";
	if (retry_cnt == 0) {
		/*
		 * common compression ratio is about 3:1
		 * initial storage allocation is estimated at 40%
		 * to cover the majority of cases
		 */
		alloc_pcnt = INITIAL_ALLOC_PCNT;
		*alloc_pages = (pages * alloc_pcnt) / INTEGRAL;
		CPR_DEBUG(CPR_DEBUG7, "%s sensitive pages: %ld\n", str, pages);
		CPR_DEBUG(CPR_DEBUG7,
		    "%s initial est pages: %ld, alloc %ld%%\n",
		    str, *alloc_pages, alloc_pcnt);
	} else {
		/*
		 * calculate the prior compression percentage (x100)
		 * from the last attempt to save sensitive pages
		 */
		ASSERT(sensitive_pages_saved != 0);
		last_pcnt = (mmu_btopr(sensitive_size_saved) * INTEGRAL) /
		    sensitive_pages_saved;
		CPR_DEBUG(CPR_DEBUG7, "%s last ratio %ld%%\n", str, last_pcnt);

		/*
		 * new estimated storage size is based on
		 * the larger ratio + 5% for each retry:
		 * pages * (last + [5%, 10%])
		 */
		alloc_pcnt = MAX(last_pcnt, INITIAL_ALLOC_PCNT) +
		    (retry_cnt * 5);
		*alloc_pages = (pages * alloc_pcnt) / INTEGRAL;
		CPR_DEBUG(CPR_DEBUG7, "%s Retry est pages: %ld, alloc %ld%%\n",
		    str, *alloc_pages, alloc_pcnt);
	}

	addr = kmem_alloc(mmu_ptob(*alloc_pages), KM_NOSLEEP);
	CPR_DEBUG(CPR_DEBUG7, "%s alloc %ld pages\n", str, *alloc_pages);
	return (addr);
}


void
i_cpr_storage_free(void)
{
	/* Free descriptors */
	if (i_cpr_storage_desc_base) {
		kmem_free(i_cpr_storage_desc_base,
		    mmu_ptob(i_cpr_storage_desc_pgcnt));
		i_cpr_storage_desc_base = NULL;
		i_cpr_storage_desc_pgcnt = 0;
	}


	/* Data storage */
	if (i_cpr_storage_data_base) {
		kmem_free(i_cpr_storage_data_base,
		    mmu_ptob(i_cpr_storage_data_sz));
		i_cpr_storage_data_base = NULL;
		i_cpr_storage_data_sz = 0;
	}
}


/*
 * This routine is derived from cpr_compress_and_write().
 * 1. Do bookkeeping in the descriptor for the contiguous sensitive chunk.
 * 2. Compress and save the clean sensitive pages into the storage area.
 */
int
i_cpr_compress_and_save(int chunks, pfn_t spfn, pgcnt_t pages)
{
	extern char *cpr_compress_pages(cpd_t *, pgcnt_t, int);
	extern caddr_t i_cpr_storage_data_end;
	uint_t remaining, datalen;
	uint32_t test_usum;
	char *datap;
	csd_t *descp;
	cpd_t cpd;
	int error;

	/*
	 * Fill next empty storage descriptor
	 */
	descp = i_cpr_storage_desc_base + chunks - 1;
	if (descp >= i_cpr_storage_desc_end) {
		CPR_DEBUG(CPR_DEBUG1, "ran out of descriptors, base 0x%p, "
		    "chunks %d, end 0x%p, descp 0x%p\n",
		    i_cpr_storage_desc_base, chunks,
		    i_cpr_storage_desc_end, descp);
		return (-1);
	}
	ASSERT(descp->csd_dirty_spfn == (uint_t)-1);
	i_cpr_storage_desc_last_used = descp;

	descp->csd_dirty_spfn = spfn;
	descp->csd_dirty_npages = pages;

	i_cpr_mapin(CPR->c_mapping_area, pages, spfn);

	/*
	 * try compressing pages and copy cpd fields
	 * pfn is copied for debug use
	 */
	cpd.cpd_pfn = spfn;
	datap = cpr_compress_pages(&cpd, pages, C_COMPRESSING);
	datalen = cpd.cpd_length;
	descp->csd_clean_compressed = (cpd.cpd_flag & CPD_COMPRESS);
#ifdef DEBUG
	descp->csd_usum = cpd.cpd_usum;
	descp->csd_csum = cpd.cpd_csum;
#endif

	error = 0;

	/*
	 * Save the raw or compressed data to the storage area pointed to by
	 * sensitive_write_ptr.  Make sure the storage space is big enough to
	 * hold the result.  Otherwise roll back to increase the storage space.
	 */
	descp->csd_clean_sva = (cpr_ptr)sensitive_write_ptr;
	descp->csd_clean_sz = datalen;
	if ((sensitive_write_ptr + datalen) < i_cpr_storage_data_end) {
		extern void cprbcopy(void *, void *, size_t);

		cprbcopy(datap, sensitive_write_ptr, datalen);
		sensitive_size_saved += datalen;
		sensitive_pages_saved += descp->csd_dirty_npages;
		sensitive_write_ptr += datalen;
	} else {
		remaining = (i_cpr_storage_data_end - sensitive_write_ptr);
		CPR_DEBUG(CPR_DEBUG1, "i_cpr_compress_and_save: The storage "
		    "space is too small!\ngot %d, want %d\n\n",
		    remaining, (remaining + datalen));
#ifdef DEBUG
		/*
		 * Check to see if the content of the sensitive pages that we
		 * just copied have changed during this small time window.
		 */
		test_usum = checksum32(CPR->c_mapping_area, mmu_ptob(pages));
		descp->csd_usum = cpd.cpd_usum;
		if (test_usum != descp->csd_usum) {
			CPR_DEBUG(CPR_DEBUG1, "\nWARNING: "
			    "i_cpr_compress_and_save: "
			    "Data in the range of pfn 0x%lx to pfn "
			    "0x%lx has changed after they are saved "
			    "into storage.", spfn, (spfn + pages - 1));
		}
#endif
		error = ENOMEM;
	}

	i_cpr_mapout(CPR->c_mapping_area, pages);
	return (error);
}


/*
 * This routine is derived from cpr_count_kpages().
 * It goes through kernel data nucleus and segkmem segments to select
 * pages in use and mark them in the corresponding bitmap.
 */
pgcnt_t
i_cpr_count_sensitive_kpages(int mapflag, bitfunc_t bitfunc)
{
	pgcnt_t kdata_cnt = 0, segkmem_cnt = 0;
	extern caddr_t e_moddata;
	extern struct seg kvalloc;
	extern struct seg kmem64;
	size_t size;

	/*
	 * Kernel data nucleus pages
	 */
	size = e_moddata - s_data;
	kdata_cnt += cpr_count_pages(s_data, size,
	    mapflag, bitfunc, DBG_SHOWRANGE);

	/*
	 * kvseg and kvalloc pages
	 */
	segkmem_cnt += cpr_scan_kvseg(mapflag, bitfunc, &kvseg);
	segkmem_cnt += cpr_count_pages(kvalloc.s_base, kvalloc.s_size,
	    mapflag, bitfunc, DBG_SHOWRANGE);

	/* segment to support kernel memory usage above 32-bit space (4GB) */
	if (kmem64.s_base)
		segkmem_cnt += cpr_count_pages(kmem64.s_base, kmem64.s_size,
		    mapflag, bitfunc, DBG_SHOWRANGE);

	CPR_DEBUG(CPR_DEBUG7, "\ni_cpr_count_sensitive_kpages:\n"
	    "\tkdata_cnt %ld + segkmem_cnt %ld = %ld pages\n",
	    kdata_cnt, segkmem_cnt, kdata_cnt + segkmem_cnt);

	return (kdata_cnt + segkmem_cnt);
}


pgcnt_t
i_cpr_count_storage_pages(int mapflag, bitfunc_t bitfunc)
{
	pgcnt_t count = 0;

	if (i_cpr_storage_desc_base) {
		count += cpr_count_pages((caddr_t)i_cpr_storage_desc_base,
		    (size_t)mmu_ptob(i_cpr_storage_desc_pgcnt),
		    mapflag, bitfunc, DBG_SHOWRANGE);
	}
	if (i_cpr_storage_data_base) {
		count += cpr_count_pages(i_cpr_storage_data_base,
		    (size_t)mmu_ptob(i_cpr_storage_data_sz),
		    mapflag, bitfunc, DBG_SHOWRANGE);
	}
	return (count);
}


/*
 * Derived from cpr_write_statefile().
 * Allocate (or reallocate after exhausting the supply) descriptors for each
 * chunk of contiguous sensitive kpages.
 */
static int
i_cpr_storage_desc_alloc(csd_t **basepp, pgcnt_t *pgsp, csd_t **endpp,
    int retry)
{
	pgcnt_t npages;
	int chunks;
	csd_t	*descp, *end;
	size_t	len;
	char *str = "i_cpr_storage_desc_alloc:";

	/*
	 * On initial allocation, add some extra to cover overhead caused
	 * by the allocation for the storage area later.
	 */
	if (retry == 0) {
		chunks = cpr_contig_pages(NULL, STORAGE_DESC_ALLOC) +
		    EXTRA_DESCS;
		npages = mmu_btopr(sizeof (**basepp) * (pgcnt_t)chunks);
		CPR_DEBUG(CPR_DEBUG7, "%s chunks %d, ", str, chunks);
	} else {
		CPR_DEBUG(CPR_DEBUG7, "%s retry %d: ", str, retry);
		npages = *pgsp + 1;
	}
	/* Free old descriptors, if any */
	if (*basepp)
		kmem_free((caddr_t)*basepp, mmu_ptob(*pgsp));

	descp = *basepp = kmem_alloc(mmu_ptob(npages), KM_NOSLEEP);
	if (descp == NULL) {
		CPR_DEBUG(CPR_DEBUG7, "%s no space for descriptors!\n", str);
		return (ENOMEM);
	}

	*pgsp = npages;
	len = mmu_ptob(npages);
	end = *endpp = descp + (len / (sizeof (**basepp)));
	CPR_DEBUG(CPR_DEBUG7, "npages 0x%lx, len 0x%lx, items 0x%lx\n\t*basepp "
	    "%p, *endpp %p\n", npages, len, (len / (sizeof (**basepp))),
	    *basepp, *endpp);
	i_cpr_storage_desc_init(descp, npages, end);
	return (0);
}

static void
i_cpr_storage_desc_init(csd_t *descp, pgcnt_t npages, csd_t *end)
{
	size_t	len = mmu_ptob(npages);

	/* Initialize the descriptors to something impossible. */
	bzero(descp, len);
#ifdef	DEBUG
	/*
	 * This condition is tested by an ASSERT
	 */
	for (; descp < end; descp++)
		descp->csd_dirty_spfn = (uint_t)-1;
#endif
}

int
i_cpr_dump_sensitive_kpages(vnode_t *vp)
{
	int	error = 0;
	uint_t	spin_cnt = 0;
	csd_t	*descp;

	/*
	 * These following two variables need to be reinitialized
	 * for each cpr cycle.
	 */
	i_cpr_sensitive_bytes_dumped = 0;
	i_cpr_sensitive_pgs_dumped = 0;

	if (i_cpr_storage_desc_base) {
		for (descp = i_cpr_storage_desc_base;
		    descp <= i_cpr_storage_desc_last_used; descp++) {
			if (error = cpr_dump_sensitive(vp, descp))
				return (error);
			spin_cnt++;
			if ((spin_cnt & 0x5F) == 1)
				cpr_spinning_bar();
		}
		prom_printf(" \b");
	}

	CPR_DEBUG(CPR_DEBUG7, "\ni_cpr_dump_sensitive_kpages: dumped %ld\n",
	    i_cpr_sensitive_pgs_dumped);
	return (0);
}


/*
 * 1. Fill the cpr page descriptor with the info of the dirty pages
 *    and write the descriptor out.  It will be used at resume.
 * 2. Write the clean data out instead of the dirty data.
 *    Note: to save space, the clean data is already compressed.
 */
static int
cpr_dump_sensitive(vnode_t *vp, csd_t *descp)
{
	int error = 0;
	caddr_t datap;
	cpd_t cpd;	/* cpr page descriptor */
	pfn_t	dirty_spfn;
	pgcnt_t	dirty_npages;
	size_t clean_sz;
	caddr_t	clean_sva;
	int	clean_compressed;
	extern uchar_t cpr_pagecopy[];

	dirty_spfn = descp->csd_dirty_spfn;
	dirty_npages = descp->csd_dirty_npages;
	clean_sva = (caddr_t)descp->csd_clean_sva;
	clean_sz = descp->csd_clean_sz;
	clean_compressed = descp->csd_clean_compressed;

	/* Fill cpr page descriptor. */
	cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC;
	cpd.cpd_pfn = dirty_spfn;
	cpd.cpd_flag = 0;	/* must init to zero */
	cpd.cpd_pages = dirty_npages;

#ifdef	DEBUG
	if ((cpd.cpd_usum = descp->csd_usum) != 0)
		cpd.cpd_flag |= CPD_USUM;
	if ((cpd.cpd_csum = descp->csd_csum) != 0)
		cpd.cpd_flag |= CPD_CSUM;
#endif

	STAT->cs_dumped_statefsz += mmu_ptob(dirty_npages);

	/*
	 * The sensitive kpages are usually saved with compression
	 * unless compression could not reduce the size of the data.
	 * If the user chooses not to have the statefile compressed,
	 * we need to decompress the data back before dumping it to disk.
	 */
	if (CPR->c_flags & C_COMPRESSING) {
		cpd.cpd_length = clean_sz;
		datap = clean_sva;
		if (clean_compressed)
			cpd.cpd_flag |= CPD_COMPRESS;
	} else {
		if (clean_compressed) {
			cpd.cpd_length = decompress(clean_sva, cpr_pagecopy,
			    clean_sz, mmu_ptob(dirty_npages));
			datap = (caddr_t)cpr_pagecopy;
			ASSERT(cpd.cpd_length == mmu_ptob(dirty_npages));
		} else {
			cpd.cpd_length = clean_sz;
			datap = clean_sva;
		}
		cpd.cpd_csum = 0;
	}

	/* Write cpr page descriptor */
	error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd));
	if (error) {
		CPR_DEBUG(CPR_DEBUG7, "descp: %p\n", descp);
#ifdef DEBUG
		debug_enter("cpr_dump_sensitive: cpr_write() page "
		    "descriptor failed!\n");
#endif
		return (error);
	}

	i_cpr_sensitive_bytes_dumped += sizeof (cpd_t);

	/* Write page data */
	error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length);
	if (error) {
		CPR_DEBUG(CPR_DEBUG7, "error: %x\n", error);
		CPR_DEBUG(CPR_DEBUG7, "descp: %p\n", descp);
		CPR_DEBUG(CPR_DEBUG7, "cpr_write(%p, %p , %lx)\n", vp, datap,
		    cpd.cpd_length);
#ifdef DEBUG
		debug_enter("cpr_dump_sensitive: cpr_write() data failed!\n");
#endif
		return (error);
	}

	i_cpr_sensitive_bytes_dumped += cpd.cpd_length;
	i_cpr_sensitive_pgs_dumped += dirty_npages;

	return (error);
}


/*
 * Sanity check to make sure that we have dumped the right amount
 * of pages from the different sources to the statefile.
 */
int
i_cpr_check_pgs_dumped(uint_t pgs_expected, uint_t regular_pgs_dumped)
{
	uint_t total_pgs_dumped;

	total_pgs_dumped = regular_pgs_dumped + i_cpr_sensitive_pgs_dumped;

	CPR_DEBUG(CPR_DEBUG7, "\ncheck_pgs: reg %d + sens %ld = %d, "
	    "expect %d\n\n", regular_pgs_dumped, i_cpr_sensitive_pgs_dumped,
	    total_pgs_dumped, pgs_expected);

	if (pgs_expected == total_pgs_dumped)
		return (0);

	return (EINVAL);
}


int
i_cpr_reusefini(void)
{
	struct vnode *vp;
	cdef_t *cdef;
	size_t size;
	char *bufp;
	int rc;

	if (cpr_reusable_mode)
		cpr_reusable_mode = 0;

	if (rc = cpr_open_deffile(FREAD|FWRITE, &vp)) {
		if (rc == EROFS) {
			cpr_err(CE_CONT, "uadmin A_FREEZE AD_REUSEFINI "
			    "(uadmin %d %d)\nmust be done with / mounted "
			    "writeable.\n", A_FREEZE, AD_REUSEFINI);
		}
		return (rc);
	}

	cdef = kmem_alloc(sizeof (*cdef), KM_SLEEP);
	rc = cpr_rdwr(UIO_READ, vp, cdef, sizeof (*cdef));

	if (rc) {
		cpr_err(CE_WARN, "Failed reading %s, errno = %d",
		    cpr_default_path, rc);
	} else if (cdef->mini.magic != CPR_DEFAULT_MAGIC) {
		cpr_err(CE_WARN, "bad magic number in %s, cannot restore "
		    "prom values for %s", cpr_default_path,
		    cpr_enumerate_promprops(&bufp, &size));
		kmem_free(bufp, size);
		rc = EINVAL;
	} else {
		/*
		 * clean up prom properties
		 */
		rc = cpr_update_nvram(cdef->props);
		if (rc == 0) {
			/*
			 * invalidate the disk copy and turn off reusable
			 */
			cdef->mini.magic = 0;
			cdef->mini.reusable = 0;
			if (rc = cpr_rdwr(UIO_WRITE, vp,
			    &cdef->mini, sizeof (cdef->mini))) {
				cpr_err(CE_WARN, "Failed writing %s, errno %d",
				    cpr_default_path, rc);
			}
		}
	}

	(void) VOP_CLOSE(vp, FREAD|FWRITE, 1, (offset_t)0, CRED());
	VN_RELE(vp);
	kmem_free(cdef, sizeof (*cdef));

	return (rc);
}


int
i_cpr_reuseinit(void)
{
	int rc = 0;

	if (rc = cpr_default_setup(1))
		return (rc);

	/*
	 * We need to validate the default file
	 */
	rc = cpr_validate_definfo(1);
	if (rc == 0)
		cpr_reusable_mode = 1;
	else if (rc == EROFS) {
		cpr_err(CE_NOTE, "reuseinit must be performed "
		    "while / is mounted writeable");
	}

	(void) cpr_default_setup(0);

	return (rc);
}


int
i_cpr_check_cprinfo(void)
{
	struct vnode *vp;
	cmini_t mini;
	int rc = 0;

	if (rc = cpr_open_deffile(FREAD, &vp)) {
		if (rc == ENOENT)
			cpr_err(CE_NOTE, "cprinfo file does not "
			    "exist.  You must run 'uadmin %d %d' "
			    "command while / is mounted writeable,\n"
			    "then reboot and run 'uadmin %d %d' "
			    "to create a reusable statefile",
			    A_FREEZE, AD_REUSEINIT, A_FREEZE, AD_REUSABLE);
		return (rc);
	}

	rc = cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini));
	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
	VN_RELE(vp);

	if (rc) {
		cpr_err(CE_WARN, "Failed reading %s, errno = %d",
		    cpr_default_path, rc);
	} else if (mini.magic != CPR_DEFAULT_MAGIC) {
		cpr_err(CE_CONT, "bad magic number in cprinfo file.\n"
		    "You must run 'uadmin %d %d' while / is mounted "
		    "writeable, then reboot and run 'uadmin %d %d' "
		    "to create a reusable statefile\n",
		    A_FREEZE, AD_REUSEINIT, A_FREEZE, AD_REUSABLE);
		rc = EINVAL;
	}

	return (rc);
}


int
i_cpr_reusable_supported(void)
{
	return (1);
}


/*
 * find prom phys pages and alloc space for a tmp copy
 */
static int
i_cpr_find_ppages(void)
{
	extern struct vnode prom_ppages;
	struct page *pp;
	struct memlist *pmem;
	pgcnt_t npages, pcnt, scnt, vcnt;
	pfn_t ppn, plast, *dst;
	int mapflag;

	cpr_clear_bitmaps();
	mapflag = REGULAR_BITMAP;

	/*
	 * there should be a page_t for each phys page used by the kernel;
	 * set a bit for each phys page not tracked by a page_t
	 */
	pcnt = 0;
	memlist_read_lock();
	for (pmem = phys_install; pmem; pmem = pmem->next) {
		npages = mmu_btop(pmem->size);
		ppn = mmu_btop(pmem->address);
		for (plast = ppn + npages; ppn < plast; ppn++) {
			if (page_numtopp_nolock(ppn))
				continue;
			(void) cpr_setbit(ppn, mapflag);
			pcnt++;
		}
	}
	memlist_read_unlock();

	/*
	 * clear bits for phys pages in each segment
	 */
	scnt = cpr_count_seg_pages(mapflag, cpr_clrbit);

	/*
	 * set bits for phys pages referenced by the prom_ppages vnode;
	 * these pages are mostly comprised of forthdebug words
	 */
	vcnt = 0;
	for (pp = prom_ppages.v_pages; pp; ) {
		if (cpr_setbit(pp->p_offset, mapflag) == 0)
			vcnt++;
		pp = pp->p_vpnext;
		if (pp == prom_ppages.v_pages)
			break;
	}

	/*
	 * the total number of prom pages is:
	 * (non-page_t pages - seg pages + vnode pages)
	 */
	ppage_count = pcnt - scnt + vcnt;
	CPR_DEBUG(CPR_DEBUG1,
	    "find_ppages: pcnt %ld - scnt %ld + vcnt %ld = %ld\n",
	    pcnt, scnt, vcnt, ppage_count);

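	/*
	 * both pphys_list and ppage_buf allocated below are later
	 * released by i_cpr_prom_pages(CPR_PROM_FREE)
	 */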
	/*
	 * alloc array of pfn_t to store phys page list
	 */
	pphys_list_size = ppage_count * sizeof (pfn_t);
	pphys_list = kmem_alloc(pphys_list_size, KM_NOSLEEP);
	if (pphys_list == NULL) {
		cpr_err(CE_WARN, "cannot alloc pphys_list");
		return (ENOMEM);
	}

	/*
	 * phys pages referenced in the bitmap should be
	 * those used by the prom; scan bitmap and save
	 * a list of prom phys page numbers
	 */
	dst = pphys_list;
	memlist_read_lock();
	for (pmem = phys_install; pmem; pmem = pmem->next) {
		npages = mmu_btop(pmem->size);
		ppn = mmu_btop(pmem->address);
		for (plast = ppn + npages; ppn < plast; ppn++) {
			if (cpr_isset(ppn, mapflag)) {
				ASSERT(dst < (pphys_list + ppage_count));
				*dst++ = ppn;
			}
		}
	}
	memlist_read_unlock();

	/*
	 * allocate space to store prom pages
	 */
	ppage_buf = kmem_alloc(mmu_ptob(ppage_count), KM_NOSLEEP);
	if (ppage_buf == NULL) {
		kmem_free(pphys_list, pphys_list_size);
		pphys_list = NULL;
		cpr_err(CE_WARN, "cannot alloc ppage_buf");
		return (ENOMEM);
	}

	return (0);
}


/*
 * save prom pages to kmem pages
 */
static void
i_cpr_save_ppages(void)
{
	pfn_t *pphys, *plast;
	caddr_t dst;

	/*
	 * map in each prom page and copy to a kmem page
	 */
	dst = ppage_buf;
	plast = pphys_list + ppage_count;
	for (pphys = pphys_list; pphys < plast; pphys++) {
		i_cpr_mapin(cpr_vaddr, 1, *pphys);
		bcopy(cpr_vaddr, dst, MMU_PAGESIZE);
		i_cpr_mapout(cpr_vaddr, 1);
		dst += MMU_PAGESIZE;
	}

	CPR_DEBUG(CPR_DEBUG1, "saved %ld prom pages\n", ppage_count);
}


/*
 * restore prom pages from kmem pages
 */
static void
i_cpr_restore_ppages(void)
{
	pfn_t *pphys, *plast;
	caddr_t src;

	dcache_flushall();

	/*
	 * map in each prom page and copy from a kmem page
	 */
	src = ppage_buf;
	plast = pphys_list + ppage_count;
	for (pphys = pphys_list; pphys < plast; pphys++) {
		i_cpr_mapin(cpr_vaddr, 1, *pphys);
		bcopy(src, cpr_vaddr, MMU_PAGESIZE);
		i_cpr_mapout(cpr_vaddr, 1);
		src += MMU_PAGESIZE;
	}

	dcache_flushall();

	CPR_DEBUG(CPR_DEBUG1, "restored %ld prom pages\n", ppage_count);
}


/*
 * save/restore prom pages or free related allocs
 */
int
i_cpr_prom_pages(int action)
{
	int error;

	if (action == CPR_PROM_SAVE) {
		if (ppage_buf == NULL) {
			ASSERT(pphys_list == NULL);
			if (error = i_cpr_find_ppages())
				return (error);
			i_cpr_save_ppages();
		}
	} else if (action == CPR_PROM_RESTORE) {
		i_cpr_restore_ppages();
	} else if (action == CPR_PROM_FREE) {
		if (pphys_list) {
			ASSERT(pphys_list_size);
			kmem_free(pphys_list, pphys_list_size);
			pphys_list = NULL;
			pphys_list_size = 0;
		}
		if (ppage_buf) {
			ASSERT(ppage_count);
			kmem_free(ppage_buf, mmu_ptob(ppage_count));
			CPR_DEBUG(CPR_DEBUG1, "freed %ld prom pages\n",
			    ppage_count);
			ppage_buf = NULL;
			ppage_count = 0;
		}
	}
	return (0);
}


/*
 * record tlb data for the nucleus, bigktsb's, and the cpr module;
 * this data is later used by cprboot to install dtlb/itlb entries.
 * when we jump into the cpr module during the resume phase, those
 * mappings are needed until switching to the kernel trap table.
 * to make the dtte/itte info available during resume, we need
 * the info recorded prior to saving sensitive pages, otherwise
 * all the data would appear as NULLs.
 */
static void
i_cpr_save_tlbinfo(void)
{
	cti_t cti = {0};

	/*
	 * during resume - shortly after jumping into the cpr module,
	 * sfmmu_load_mmustate() will overwrite any dtlb entry at any
	 * index used for TSBs; skip is set so that any saved tte will
	 * target other tlb offsets and prevent being lost during
	 * resume.  now scan the dtlb and save locked entries,
	 * then add entries for the tmp stack / data page and the
	 * cpr thread structure.
	 */
	cti.dst = m_info.dtte;
	cti.tail = cti.dst + CPR_MAX_TLB;
	cti.reader = dtlb_rd_entry;
	cti.writer = NULL;
	cti.filter = i_cpr_lnb;
	cti.index = cpunodes[CPU->cpu_id].dtlb_size - 1;

	if (utsb_dtlb_ttenum != -1)
		cti.skip = (1 << utsb_dtlb_ttenum);

	if (utsb4m_dtlb_ttenum != -1)
		cti.skip |= (1 << utsb4m_dtlb_ttenum);

	i_cpr_scan_tlb(&cti);
	i_cpr_make_tte(&cti, &i_cpr_data_page, datava);
	i_cpr_make_tte(&cti, curthread, datava);

	/*
	 * scan itlb and save locked entries; add an entry for
	 * the first text page of the cpr module; cprboot will
	 * jump to that page after restoring kernel pages.
	 */
	cti.dst = m_info.itte;
	cti.tail = cti.dst + CPR_MAX_TLB;
	cti.reader = itlb_rd_entry;
	cti.index = cpunodes[CPU->cpu_id].itlb_size - 1;
	cti.skip = 0;
	i_cpr_scan_tlb(&cti);
	i_cpr_make_tte(&cti, (void *)i_cpr_resume_setup, textva);
}


/* ARGSUSED */
int
i_cpr_dump_setup(vnode_t *vp)
{
	/*
	 * zero out m_info and add info to dtte/itte arrays
	 */
	bzero(&m_info, sizeof (m_info));
	i_cpr_save_tlbinfo();
	return (0);
}


int
i_cpr_is_supported(void)
{
	char es_prop[] = "energystar-v2";
	pnode_t node;
	int last;
	extern int cpr_supported_override;
	extern int cpr_platform_enable;

	/*
	 * The next statement tests if a specific platform has turned off
	 * cpr support.
	 */
	if (cpr_supported_override)
		return (0);

	/*
	 * Do not inspect energystar-v* property if a platform has
	 * specifically turned on cpr support
	 */
	if (cpr_platform_enable)
		return (1);

	node = prom_rootnode();
	if (prom_getproplen(node, es_prop) != -1)
		return (1);
	last = strlen(es_prop) - 1;
	es_prop[last] = '3';
	return (prom_getproplen(node, es_prop) != -1);
}


/*
 * the actual size of the statefile data isn't known until after all the
 * compressed pages are written; even the inode size doesn't reflect the
 * data size since there are usually many extra fs blocks.  for recording
 * the actual data size, the first sector of the statefile is copied to
 * a tmp buf, and the copy is later updated and flushed to disk.
 */
int
i_cpr_blockzero(char *base, char **bufpp, int *blkno, vnode_t *vp)
{
	extern int cpr_flush_write(vnode_t *);
	static char cpr_sector[DEV_BSIZE];
	cpr_ext bytes, *dst;

	/*
	 * this routine is called after cdd_t and csu_md_t are copied
	 * to cpr_buf; mini-hack alert: the save/update method creates
	 * a dependency on the combined struct size being >= one sector
	 * or DEV_BSIZE; since introduction in Sol2.7, csu_md_t size is
	 * over 1K bytes and will probably grow with any changes.
	 *
	 * copy when vp is NULL, flush when non-NULL
	 */
	if (vp == NULL) {
		ASSERT((*bufpp - base) >= DEV_BSIZE);
		bcopy(base, cpr_sector, sizeof (cpr_sector));
		return (0);
	} else {
		bytes = dbtob(*blkno);
		dst = &((cdd_t *)cpr_sector)->cdd_filesize;
		bcopy(&bytes, dst, sizeof (bytes));
		bcopy(cpr_sector, base, sizeof (cpr_sector));
		*bufpp = base + sizeof (cpr_sector);
		*blkno = cpr_statefile_offset();
		CPR_DEBUG(CPR_DEBUG1, "statefile data size: %ld\n\n", bytes);
		return (cpr_flush_write(vp));
	}
}


/*
 * Allocate bitmaps according to the phys_install list.
 */
static int
i_cpr_bitmap_setup(void)
{
	struct memlist *pmem;
	cbd_t *dp, *tail;
	void *space;
	size_t size;

	/*
	 * The number of bitmap descriptors will be the count of
	 * phys_install ranges plus 1 for a trailing NULL struct.
	 */
	cpr_nbitmaps = 1;
	for (pmem = phys_install; pmem; pmem = pmem->next)
		cpr_nbitmaps++;

	if (cpr_nbitmaps > (CPR_MAX_BMDESC - 1)) {
		cpr_err(CE_WARN, "too many physical memory ranges %d, max %d",
		    cpr_nbitmaps, CPR_MAX_BMDESC - 1);
		return (EFBIG);
	}

	/* Alloc an array of bitmap descriptors. */
	dp = kmem_zalloc(cpr_nbitmaps * sizeof (*dp), KM_NOSLEEP);
	if (dp == NULL) {
		cpr_nbitmaps = 0;
		return (ENOMEM);
	}
	tail = dp + cpr_nbitmaps;

	CPR->c_bmda = dp;
	for (pmem = phys_install; pmem; pmem = pmem->next) {
		size = BITMAP_BYTES(pmem->size);
		space = kmem_zalloc(size * 2, KM_NOSLEEP);
		if (space == NULL)
			return (ENOMEM);
		ASSERT(dp < tail);
		dp->cbd_magic = CPR_BITMAP_MAGIC;
		dp->cbd_spfn = mmu_btop(pmem->address);
		dp->cbd_epfn = mmu_btop(pmem->address + pmem->size) - 1;
		dp->cbd_size = size;
		dp->cbd_reg_bitmap = (cpr_ptr)space;
		dp->cbd_vlt_bitmap = (cpr_ptr)((caddr_t)space + size);
		dp++;
	}

	/* set magic for the last descriptor */
	ASSERT(dp == (tail - 1));
	dp->cbd_magic = CPR_BITMAP_MAGIC;

	return (0);
}


void
i_cpr_bitmap_cleanup(void)
{
	cbd_t *dp;

	if (CPR->c_bmda == NULL)
		return;
	for (dp = CPR->c_bmda; dp->cbd_size; dp++)
		kmem_free((void *)dp->cbd_reg_bitmap, dp->cbd_size * 2);
	kmem_free(CPR->c_bmda, cpr_nbitmaps * sizeof (*CPR->c_bmda));
	CPR->c_bmda = NULL;
	cpr_nbitmaps = 0;
}


/*
 * A "regular" and "volatile" bitmap are created for each range of
 * physical memory.  The volatile maps are used to count and track pages
 * susceptible to heap corruption - caused by drivers that allocate mem
 * during VOP_DUMP(); the regular maps are used for all the other non-
 * susceptible pages.  Before writing the bitmaps to the statefile,
 * each bitmap pair gets merged to simplify handling within cprboot.
 */
int
i_cpr_alloc_bitmaps(void)
{
	int err;

	memlist_read_lock();
	err = i_cpr_bitmap_setup();
	memlist_read_unlock();
	if (err)
		i_cpr_bitmap_cleanup();
	return (err);
}