/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * Platform specific implementation code
 */

#define SUNDDI_IMPL

#include <sys/types.h>
#include <sys/promif.h>
#include <sys/prom_isa.h>
#include <sys/prom_plat.h>
#include <sys/mmu.h>
#include <vm/hat_sfmmu.h>
#include <sys/iommu.h>
#include <sys/scb.h>
#include <sys/cpuvar.h>
#include <sys/intreg.h>
#include <sys/pte.h>
#include <vm/hat.h>
#include <vm/page.h>
#include <vm/as.h>
#include <sys/cpr.h>
#include <sys/kmem.h>
#include <sys/clock.h>
#include <sys/kmem.h>
#include <sys/panic.h>
#include <vm/seg_kmem.h>
#include <sys/cpu_module.h>
#include <sys/callb.h>
#include <sys/machsystm.h>
#include <sys/vmsystm.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/stack.h>
#include <sys/fs/ufs_fs.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>
#include <sys/thread.h>
#include <vm/vm_dep.h>

/* routines defined outside this file */
extern void cpr_clear_bitmaps(void);
extern void dtlb_wr_entry(uint_t, tte_t *, uint64_t *);
extern void itlb_wr_entry(uint_t, tte_t *, uint64_t *);

/* forward declarations for file-local helpers */
static int i_cpr_storage_desc_alloc(csd_t **, pgcnt_t *, csd_t **, int);
static void i_cpr_storage_desc_init(csd_t *, pgcnt_t, csd_t *);
static caddr_t i_cpr_storage_data_alloc(pgcnt_t, pgcnt_t *, int);
static int cpr_dump_sensitive(vnode_t *, csd_t *);
static void i_cpr_clear_entries(uint64_t, uint64_t);
static void i_cpr_xcall(xcfunc_t);

void i_cpr_storage_free(void);

extern void *i_cpr_data_page;
extern int cpr_test_mode;
extern int cpr_nbitmaps;
extern char cpr_default_path[];
extern caddr_t textva, datava;

/* prom-retained regions (eg. panicbuf) that get tagged in the bitmaps */
static struct cpr_map_info cpr_prom_retain[CPR_PROM_RETAIN_CNT];
/* base of the tmp mapping window; set by i_cpr_map_setup() */
caddr_t cpr_vaddr = NULL;

/* running totals for the current save-to-storage pass */
static uint_t sensitive_pages_saved;
static uint_t sensitive_size_saved;

caddr_t i_cpr_storage_data_base;
caddr_t i_cpr_storage_data_end;
csd_t *i_cpr_storage_desc_base;
csd_t *i_cpr_storage_desc_end;		/* one past the last allocated descp */
csd_t *i_cpr_storage_desc_last_used;	/* last used descriptor */
caddr_t sensitive_write_ptr;		/* position for next storage write */

size_t i_cpr_sensitive_bytes_dumped;
pgcnt_t i_cpr_sensitive_pgs_dumped;
pgcnt_t i_cpr_storage_data_sz;		/* in pages */
pgcnt_t i_cpr_storage_desc_pgcnt;	/* in pages */

ushort_t cpr_mach_type = CPR_MACHTYPE_4U;
/* machine dependent info written to the statefile by i_cpr_write_machdep() */
static csu_md_t m_info;


#define MAX_STORAGE_RETRY 3
#define MAX_STORAGE_ALLOC_RETRY 3
#define INITIAL_ALLOC_PCNT 40	/* starting allocation percentage */
#define INTEGRAL 100		/* to get 1% precision */

#define EXTRA_RATE 2		/* add EXTRA_RATE% extra space */
#define EXTRA_DESCS 10

#define CPR_NO_STORAGE_DESC 1
#define CPR_NO_STORAGE_DATA 2

/* actions accepted by i_cpr_cif_setup() */
#define CIF_SPLICE 0
#define CIF_UNLINK 1


/*
 * CPR miscellaneous support routines
 */
#define cpr_open(path, mode, vpp) (vn_open(path, UIO_SYSSPACE, \
	mode, 0600, vpp, CRCREAT, 0))
#define cpr_rdwr(rw, vp, basep, cnt) (vn_rdwr(rw, vp, (caddr_t)(basep), \
	cnt, 0LL, UIO_SYSSPACE, 0, (rlim64_t)MAXOFF_T, CRED(), \
	(ssize_t *)NULL))

/*
 * definitions for saving/restoring prom pages
 */
static void *ppage_buf;
static pgcnt_t ppage_count;
static pfn_t *pphys_list;
static size_t pphys_list_size;

/* tlb entry reader/writer: (tlb index, tte, va tag) */
typedef void (*tlb_rw_t)(uint_t, tte_t *, uint64_t *);
/* tlb scan filter: (tlb index, tte, va tag, private cti_t pointer) */
typedef void (*tlb_filter_t)(int, tte_t *, uint64_t, void *);

/*
 * private struct for tlb handling
 */
struct cpr_trans_info {
	sutlb_t *dst;		/* next free slot for recorded tlb info */
	sutlb_t *tail;		/* one past the last usable dst slot */
	tlb_rw_t reader;	/* reads one tlb entry during a scan */
	tlb_rw_t writer;	/* writes one tlb entry (used to clear) */
	tlb_filter_t filter;	/* called for each valid scanned entry */
	int index;		/* scan start index / next target index */
	uint64_t skip;		/* assumes TLB <= 64 locked entries */
};
typedef struct cpr_trans_info cti_t;


/*
 * special handling for tlb info
 */
#define WITHIN_OFW(va) \
	(((va) > (uint64_t)OFW_START_ADDR) && ((va) < (uint64_t)OFW_END_ADDR))

#define WITHIN_NUCLEUS(va, base) \
	(((va) >= (base)) && \
	(((va) + MMU_PAGESIZE) <= ((base) + MMU_PAGESIZE4M)))

#define IS_BIGKTSB(va) \
	(enable_bigktsb && \
	((va) >= (uint64_t)ktsb_base) && \
	((va) < (uint64_t)(ktsb_base + ktsb_sz)))


/*
 * WARNING:
 * the text from this file is linked to follow cpr_resume_setup.o;
 * only add text between here and i_cpr_end_jumpback when it needs
 * to be called during resume before we switch back to the kernel
 * trap table.  all the text in this range must fit within a page.
 */


/*
 * each time a machine is reset, the prom uses an inconsistent set of phys
 * pages and the cif cookie may differ as well.  so prior to restoring the
 * original prom, we have to use the new/tmp prom's translations
 * when requesting prom services.
 *
 * cif_handler starts out as the original prom cookie, and that gets used
 * by client_handler() to jump into the prom.  here we splice-in a wrapper
 * routine by writing cif_handler; client_handler() will now jump to the
 * wrapper which switches the %tba to the new/tmp prom's trap table then
 * jumps to the new cookie.
 *
 * action is CIF_SPLICE to install the wrapper or CIF_UNLINK to put the
 * saved cookie back; any other value leaves cif_handler untouched.
 */
void
i_cpr_cif_setup(int action)
{
	extern void *i_cpr_orig_cif, *cif_handler;
	extern int i_cpr_cif_wrapper(void *);

	/*
	 * save the original cookie and change the current cookie to the
	 * wrapper routine.  later we just restore the original cookie.
	 */
	if (action == CIF_SPLICE) {
		i_cpr_orig_cif = cif_handler;
		cif_handler = (void *)i_cpr_cif_wrapper;
	} else if (action == CIF_UNLINK)
		cif_handler = i_cpr_orig_cif;
}


/*
 * launch slave cpus into kernel text, pause them,
 * and restore the original prom pages
 *
 * the statement order below matters: the cif wrapper is spliced in
 * before any prom service is requested and unlinked again before the
 * prom pages are restored.
 */
void
i_cpr_mp_setup(void)
{
	extern void restart_other_cpu(int);
	ihandle_t tmpout = 0;
	char *str;
	cpu_t *cp;

	/* remember the current value so it can be put back on the way out */
	uint64_t kctx = kcontextreg;

	/*
	 * Do not allow setting page size codes in MMU primary context
	 * register while using cif wrapper. This is needed to work
	 * around OBP incorrect handling of this MMU register.
	 */
	kcontextreg = 0;

	/*
	 * reset cpu_ready_set so x_calls work properly
	 */
	CPUSET_ZERO(cpu_ready_set);
	CPUSET_ADD(cpu_ready_set, getprocessorid());

	/*
	 * setup cif to use the cookie from the new/tmp prom
	 * and setup tmp handling for calling prom services.
	 */
	i_cpr_cif_setup(CIF_SPLICE);

	/*
	 * at this point, only the nucleus and a few cpr pages are
	 * mapped in.  once we switch to the kernel trap table,
	 * we can access the rest of kernel space.
	 */
	prom_set_traptable(&trap_table);

	if (ncpus > 1) {
		sfmmu_init_tsbs();

		/* tmpout is only fetched, and only used, when LEVEL1 is set */
		if (cpr_debug & LEVEL1) {
			prom_interpret("stdout @ swap l!", (uintptr_t)&tmpout,
			    0, 0, 0, 0);
			str = "MP startup...\r\n";
			(void) prom_write(tmpout, str, strlen(str), 0, 0);
		}

		mutex_enter(&cpu_lock);
		/*
		 * All of the slave cpus are not ready at this time,
		 * yet the cpu structures have various cpu_flags set;
		 * clear cpu_flags and mutex_ready.
		 * Since we are coming up from a CPU suspend, the slave cpus
		 * are frozen.
		 */
		for (cp = CPU->cpu_next; cp != CPU; cp = cp->cpu_next) {
			cp->cpu_flags = CPU_FROZEN;
			cp->cpu_m.mutex_ready = 0;
		}

		/* restart every cpu other than the one running this code */
		for (cp = CPU->cpu_next; cp != CPU; cp = cp->cpu_next)
			restart_other_cpu(cp->cpu_id);

		pause_cpus(NULL);
		mutex_exit(&cpu_lock);

		if (cpr_debug & LEVEL1) {
			str = "MP paused...\r\n";
			(void) prom_write(tmpout, str, strlen(str), 0, 0);
		}

		/* run the tlb cleanup on the cpus in cpu_ready_set */
		i_cpr_xcall(i_cpr_clear_entries);
	} else
		i_cpr_clear_entries(0, 0);

	/*
	 * now unlink the cif wrapper;  WARNING: do not call any
	 * prom_xxx() routines until after prom pages are restored.
	 */
	i_cpr_cif_setup(CIF_UNLINK);

	(void) i_cpr_prom_pages(CPR_PROM_RESTORE);

	/* allow setting page size codes in MMU primary context register */
	kcontextreg = kctx;
}


/*
 * end marker for jumpback page;
 * this symbol is used to check the size of i_cpr_resume_setup()
 * and the above text.  For simplicity, the Makefile needs to
 * link i_cpr_resume_setup.o and cpr_impl.o consecutively.
310 */ 311 void 312 i_cpr_end_jumpback(void) 313 { 314 } 315 316 317 /* 318 * scan tlb entries with reader; when valid entries are found, 319 * the filter routine will selectively save/clear them 320 */ 321 static void 322 i_cpr_scan_tlb(cti_t *ctip) 323 { 324 uint64_t va_tag; 325 int tlb_index; 326 tte_t tte; 327 328 for (tlb_index = ctip->index; tlb_index >= 0; tlb_index--) { 329 (*ctip->reader)((uint_t)tlb_index, &tte, &va_tag); 330 if (va_tag && TTE_IS_VALID(&tte)) 331 (*ctip->filter)(tlb_index, &tte, va_tag, ctip); 332 } 333 } 334 335 336 /* 337 * filter for locked tlb entries that reference the text/data nucleus 338 * and any bigktsb's; these will be reinstalled by cprboot on all cpus 339 */ 340 /* ARGSUSED */ 341 static void 342 i_cpr_lnb(int index, tte_t *ttep, uint64_t va_tag, void *ctrans) 343 { 344 cti_t *ctip; 345 346 /* 347 * record tlb data at ctip->dst; the target tlb index starts 348 * at the highest tlb offset and moves towards 0. the prom 349 * reserves both dtlb and itlb index 0. 
any selected entry 350 * also gets marked to prevent being flushed during resume 351 */ 352 if (TTE_IS_LOCKED(ttep) && (va_tag == (uint64_t)textva || 353 va_tag == (uint64_t)datava || IS_BIGKTSB(va_tag))) { 354 ctip = ctrans; 355 while ((1 << ctip->index) & ctip->skip) 356 ctip->index--; 357 ASSERT(ctip->index > 0); 358 ASSERT(ctip->dst < ctip->tail); 359 ctip->dst->tte.ll = ttep->ll; 360 ctip->dst->va_tag = va_tag; 361 ctip->dst->index = ctip->index--; 362 ctip->dst->tmp = 0; 363 ctip->dst++; 364 } 365 } 366 367 368 /* 369 * some tlb entries are stale, filter for unlocked entries 370 * within the prom virt range and clear them 371 */ 372 static void 373 i_cpr_ufw(int index, tte_t *ttep, uint64_t va_tag, void *ctrans) 374 { 375 sutlb_t clr; 376 cti_t *ctip; 377 378 if (!TTE_IS_LOCKED(ttep) && WITHIN_OFW(va_tag)) { 379 ctip = ctrans; 380 bzero(&clr, sizeof (clr)); 381 (*ctip->writer)((uint_t)index, &clr.tte, &clr.va_tag); 382 } 383 } 384 385 386 /* 387 * some of the entries installed by cprboot are needed only on a 388 * short-term basis and need to be flushed to avoid clogging the tlbs. 389 * scan the dtte/itte arrays for items marked as temporary and clear 390 * dtlb/itlb entries using wrfunc. 391 */ 392 static void 393 i_cpr_clear_tmp(sutlb_t *listp, int max, tlb_rw_t wrfunc) 394 { 395 sutlb_t clr, *tail; 396 397 bzero(&clr, sizeof (clr)); 398 for (tail = listp + max; listp < tail && listp->va_tag; listp++) { 399 if (listp->tmp) 400 (*wrfunc)((uint_t)listp->index, &clr.tte, &clr.va_tag); 401 } 402 } 403 404 405 /* ARGSUSED */ 406 static void 407 i_cpr_clear_entries(uint64_t arg1, uint64_t arg2) 408 { 409 extern void demap_all(void); 410 cti_t cti; 411 412 i_cpr_clear_tmp(m_info.dtte, CPR_MAX_TLB, dtlb_wr_entry); 413 i_cpr_clear_tmp(m_info.itte, CPR_MAX_TLB, itlb_wr_entry); 414 415 /* 416 * for newer cpus that implement DEMAP_ALL_TYPE, demap_all is 417 * a second label for vtag_flushall. 
the call is made using 418 * vtag_flushall() instead of demap_all() due to runtime and 419 * krtld results with both older and newer cpu modules. 420 */ 421 if (&demap_all != 0) { 422 vtag_flushall(); 423 return; 424 } 425 426 /* 427 * for older V9 cpus, scan tlbs and clear stale entries 428 */ 429 bzero(&cti, sizeof (cti)); 430 cti.filter = i_cpr_ufw; 431 432 cti.index = cpunodes[CPU->cpu_id].dtlb_size - 1; 433 cti.reader = dtlb_rd_entry; 434 cti.writer = dtlb_wr_entry; 435 i_cpr_scan_tlb(&cti); 436 437 cti.index = cpunodes[CPU->cpu_id].itlb_size - 1; 438 cti.reader = itlb_rd_entry; 439 cti.writer = itlb_wr_entry; 440 i_cpr_scan_tlb(&cti); 441 } 442 443 444 /* 445 * craft tlb info for tmp use during resume; this data gets used by 446 * cprboot to install tlb entries. we also mark each struct as tmp 447 * so those tlb entries will get flushed after switching to the kernel 448 * trap table. no data needs to be recorded for vaddr when it falls 449 * within the nucleus since we've already recorded nucleus ttes and 450 * a 8K tte would conflict with a 4MB tte. eg: the cpr module 451 * text/data may have been loaded into the text/data nucleus. 452 */ 453 static void 454 i_cpr_make_tte(cti_t *ctip, void *vaddr, caddr_t nbase) 455 { 456 pfn_t ppn; 457 uint_t rw; 458 459 if (WITHIN_NUCLEUS((caddr_t)vaddr, nbase)) 460 return; 461 462 while ((1 << ctip->index) & ctip->skip) 463 ctip->index--; 464 ASSERT(ctip->index > 0); 465 ASSERT(ctip->dst < ctip->tail); 466 467 /* 468 * without any global service available to lookup 469 * a tte by vaddr, we craft our own here: 470 */ 471 ppn = va_to_pfn(vaddr); 472 rw = (nbase == datava) ? 
TTE_HWWR_INT : 0; 473 ctip->dst->tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(ppn); 474 ctip->dst->tte.tte_intlo = TTE_PFN_INTLO(ppn) | TTE_LCK_INT | 475 TTE_CP_INT | TTE_PRIV_INT | rw; 476 ctip->dst->va_tag = ((uintptr_t)vaddr & MMU_PAGEMASK); 477 ctip->dst->index = ctip->index--; 478 ctip->dst->tmp = 1; 479 ctip->dst++; 480 } 481 482 483 static void 484 i_cpr_xcall(xcfunc_t func) 485 { 486 uint_t pil, reset_pil; 487 488 pil = getpil(); 489 if (pil < XCALL_PIL) 490 reset_pil = 0; 491 else { 492 reset_pil = 1; 493 setpil(XCALL_PIL - 1); 494 } 495 xc_some(cpu_ready_set, func, 0, 0); 496 if (reset_pil) 497 setpil(pil); 498 } 499 500 501 /* 502 * restart paused slave cpus 503 */ 504 void 505 i_cpr_machdep_setup(void) 506 { 507 if (ncpus > 1) { 508 DEBUG1(errp("MP restarted...\n")); 509 mutex_enter(&cpu_lock); 510 start_cpus(); 511 mutex_exit(&cpu_lock); 512 } 513 } 514 515 516 /* 517 * Stop all interrupt activities in the system 518 */ 519 void 520 i_cpr_stop_intr(void) 521 { 522 (void) spl7(); 523 } 524 525 /* 526 * Set machine up to take interrupts 527 */ 528 void 529 i_cpr_enable_intr(void) 530 { 531 (void) spl0(); 532 } 533 534 535 /* 536 * record cpu nodes and ids 537 */ 538 static void 539 i_cpr_save_cpu_info(void) 540 { 541 struct sun4u_cpu_info *scip; 542 cpu_t *cp; 543 544 scip = m_info.sci; 545 cp = CPU; 546 do { 547 ASSERT(scip < &m_info.sci[NCPU]); 548 scip->cpu_id = cp->cpu_id; 549 scip->node = cpunodes[cp->cpu_id].nodeid; 550 scip++; 551 } while ((cp = cp->cpu_next) != CPU); 552 } 553 554 555 /* 556 * Write necessary machine dependent information to cpr state file, 557 * eg. sun4u mmu ctx secondary for the current running process (cpr) ... 
558 */ 559 int 560 i_cpr_write_machdep(vnode_t *vp) 561 { 562 extern uint_t getpstate(), getwstate(); 563 extern uint_t i_cpr_tstack_size; 564 const char ustr[] = ": unix-tte 2drop false ;"; 565 uintptr_t tinfo; 566 label_t *ltp; 567 cmd_t cmach; 568 char *fmt; 569 int rc; 570 571 /* 572 * ustr[] is used as temporary forth words during 573 * slave startup sequence, see sfmmu_mp_startup() 574 */ 575 576 cmach.md_magic = (uint_t)CPR_MACHDEP_MAGIC; 577 cmach.md_size = sizeof (m_info) + sizeof (ustr); 578 579 if (rc = cpr_write(vp, (caddr_t)&cmach, sizeof (cmach))) { 580 cpr_err(CE_WARN, "Failed to write descriptor."); 581 return (rc); 582 } 583 584 /* 585 * m_info is now cleared in i_cpr_dump_setup() 586 */ 587 m_info.ksb = (uint32_t)STACK_BIAS; 588 m_info.kpstate = (uint16_t)getpstate(); 589 m_info.kwstate = (uint16_t)getwstate(); 590 DEBUG1(errp("stack bias 0x%x, pstate 0x%x, wstate 0x%x\n", 591 m_info.ksb, m_info.kpstate, m_info.kwstate)); 592 593 ltp = &ttolwp(curthread)->lwp_qsav; 594 m_info.qsav_pc = (cpr_ext)ltp->val[0]; 595 m_info.qsav_sp = (cpr_ext)ltp->val[1]; 596 597 /* 598 * Set secondary context to INVALID_CONTEXT to force the HAT 599 * to re-setup the MMU registers and locked TTEs it needs for 600 * TLB miss handling. 601 */ 602 m_info.mmu_ctx_sec = INVALID_CONTEXT; 603 m_info.mmu_ctx_pri = KCONTEXT; 604 605 tinfo = (uintptr_t)curthread; 606 m_info.thrp = (cpr_ptr)tinfo; 607 608 tinfo = (uintptr_t)i_cpr_resume_setup; 609 m_info.func = (cpr_ptr)tinfo; 610 611 /* 612 * i_cpr_data_page is comprised of a 4K stack area and a few 613 * trailing data symbols; the page is shared by the prom and 614 * kernel during resume. 
the stack size is recorded here 615 * and used by cprboot to set %sp 616 */ 617 tinfo = (uintptr_t)&i_cpr_data_page; 618 m_info.tmp_stack = (cpr_ptr)tinfo; 619 m_info.tmp_stacksize = i_cpr_tstack_size; 620 621 m_info.test_mode = cpr_test_mode; 622 623 i_cpr_save_cpu_info(); 624 625 if (rc = cpr_write(vp, (caddr_t)&m_info, sizeof (m_info))) { 626 cpr_err(CE_WARN, "Failed to write machdep info."); 627 return (rc); 628 } 629 630 fmt = "error writing %s forth info"; 631 if (rc = cpr_write(vp, (caddr_t)ustr, sizeof (ustr))) 632 cpr_err(CE_WARN, fmt, "unix-tte"); 633 634 return (rc); 635 } 636 637 638 /* 639 * Save miscellaneous information which needs to be written to the 640 * state file. This information is required to re-initialize 641 * kernel/prom handshaking. 642 */ 643 void 644 i_cpr_save_machdep_info(void) 645 { 646 DEBUG5(errp("jumpback size = 0x%lx\n", 647 (uintptr_t)&i_cpr_end_jumpback - 648 (uintptr_t)i_cpr_resume_setup)); 649 650 /* 651 * Verify the jumpback code all falls in one page. 652 */ 653 if (((uintptr_t)&i_cpr_end_jumpback & MMU_PAGEMASK) != 654 ((uintptr_t)i_cpr_resume_setup & MMU_PAGEMASK)) 655 cpr_err(CE_PANIC, "jumpback code exceeds one page."); 656 } 657 658 659 void 660 i_cpr_set_tbr(void) 661 { 662 } 663 664 665 /* 666 * cpu0 should contain bootcpu info 667 */ 668 cpu_t * 669 i_cpr_bootcpu(void) 670 { 671 return (&cpu0); 672 } 673 674 675 /* 676 * Return the virtual address of the mapping area 677 */ 678 caddr_t 679 i_cpr_map_setup(void) 680 { 681 /* 682 * Allocate a virtual memory range spanned by an hmeblk. 683 * This would be 8 hments or 64k bytes. Starting VA 684 * must be 64k (8-page) aligned. 
685 */ 686 cpr_vaddr = vmem_xalloc(heap_arena, 687 mmu_ptob(NHMENTS), mmu_ptob(NHMENTS), 688 0, 0, NULL, NULL, VM_NOSLEEP); 689 return (cpr_vaddr); 690 } 691 692 /* 693 * create tmp locked tlb entries for a group of phys pages; 694 * 695 * i_cpr_mapin/i_cpr_mapout should always be called in pairs, 696 * otherwise would fill up a tlb with locked entries 697 */ 698 void 699 i_cpr_mapin(caddr_t vaddr, uint_t pages, pfn_t ppn) 700 { 701 tte_t tte; 702 extern pfn_t curthreadpfn; 703 extern int curthreadremapped; 704 705 curthreadremapped = (ppn <= curthreadpfn && curthreadpfn < ppn + pages); 706 707 for (; pages--; ppn++, vaddr += MMU_PAGESIZE) { 708 tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(ppn); 709 tte.tte_intlo = TTE_PFN_INTLO(ppn) | TTE_LCK_INT | 710 TTE_CP_INT | TTE_PRIV_INT | TTE_HWWR_INT; 711 sfmmu_dtlb_ld(vaddr, KCONTEXT, &tte); 712 } 713 } 714 715 void 716 i_cpr_mapout(caddr_t vaddr, uint_t pages) 717 { 718 extern int curthreadremapped; 719 720 if (curthreadremapped && vaddr <= (caddr_t)curthread && 721 (caddr_t)curthread < vaddr + pages * MMU_PAGESIZE) 722 curthreadremapped = 0; 723 724 for (; pages--; vaddr += MMU_PAGESIZE) 725 vtag_flushpage(vaddr, KCONTEXT); 726 } 727 728 /* 729 * We're done using the mapping area; release virtual space 730 */ 731 void 732 i_cpr_map_destroy(void) 733 { 734 vmem_free(heap_arena, cpr_vaddr, mmu_ptob(NHMENTS)); 735 cpr_vaddr = NULL; 736 } 737 738 /* ARGSUSED */ 739 void 740 i_cpr_handle_xc(int flag) 741 { 742 } 743 744 745 /* 746 * This function takes care of pages which are not in kas or need to be 747 * taken care of in a special way. For example, panicbuf pages are not 748 * in kas and their pages are allocated via prom_retain(). 
749 */ 750 pgcnt_t 751 i_cpr_count_special_kpages(int mapflag, bitfunc_t bitfunc) 752 { 753 struct cpr_map_info *pri, *tail; 754 pgcnt_t pages, total = 0; 755 pfn_t pfn; 756 757 /* 758 * Save information about prom retained panicbuf pages 759 */ 760 if (bitfunc == cpr_setbit) { 761 pri = &cpr_prom_retain[CPR_PANICBUF]; 762 pri->virt = (cpr_ptr)panicbuf; 763 pri->phys = va_to_pa(panicbuf); 764 pri->size = sizeof (panicbuf); 765 } 766 767 /* 768 * Go through the prom_retain array to tag those pages. 769 */ 770 tail = &cpr_prom_retain[CPR_PROM_RETAIN_CNT]; 771 for (pri = cpr_prom_retain; pri < tail; pri++) { 772 pages = mmu_btopr(pri->size); 773 for (pfn = ADDR_TO_PN(pri->phys); pages--; pfn++) { 774 if (pf_is_memory(pfn)) { 775 if (bitfunc == cpr_setbit) { 776 if ((*bitfunc)(pfn, mapflag) == 0) 777 total++; 778 } else 779 total++; 780 } 781 } 782 } 783 784 return (total); 785 } 786 787 788 /* 789 * Free up memory-related resources here. We start by freeing buffers 790 * allocated during suspend initialization. Also, free up the mapping 791 * resources allocated in cpr_init(). 792 */ 793 void 794 i_cpr_free_memory_resources(void) 795 { 796 (void) i_cpr_prom_pages(CPR_PROM_FREE); 797 i_cpr_map_destroy(); 798 i_cpr_storage_free(); 799 } 800 801 802 /* 803 * Derived from cpr_write_statefile(). 804 * Save the sensitive pages to the storage area and do bookkeeping 805 * using the sensitive descriptors. Each descriptor will contain no more 806 * than CPR_MAXCONTIG amount of contiguous pages to match the max amount 807 * of pages that statefile gets written to disk at each write. 808 * XXX The CPR_MAXCONTIG can be changed to the size of the compression 809 * scratch area. 
810 */ 811 static int 812 i_cpr_save_to_storage(void) 813 { 814 sensitive_size_saved = 0; 815 sensitive_pages_saved = 0; 816 sensitive_write_ptr = i_cpr_storage_data_base; 817 return (cpr_contig_pages(NULL, SAVE_TO_STORAGE)); 818 } 819 820 821 /* 822 * This routine allocates space to save the sensitive kernel pages, 823 * i.e. kernel data nucleus, kvalloc and kvseg segments. 824 * It's assumed that those segments are the only areas that can be 825 * contaminated by memory allocations during statefile dumping. 826 * The space allocated here contains: 827 * A list of descriptors describing the saved sensitive pages. 828 * The storage area for saving the compressed sensitive kernel pages. 829 * Since storage pages are allocated from segkmem, they need to be 830 * excluded when saving. 831 */ 832 int 833 i_cpr_save_sensitive_kpages(void) 834 { 835 static const char pages_fmt[] = "\n%s %s allocs\n" 836 " spages %ld, vpages %ld, diff %ld\n"; 837 int retry_cnt; 838 int error = 0; 839 pgcnt_t pages, spages, vpages; 840 caddr_t addr; 841 char *str; 842 843 /* 844 * Tag sensitive kpages. Allocate space for storage descriptors 845 * and storage data area based on the resulting bitmaps. 846 * Note: The storage space will be part of the sensitive 847 * segment, so we need to tag kpages here before the storage 848 * is actually allocated just so their space won't be accounted 849 * for. They will not be part of the statefile although those 850 * pages will be claimed by cprboot. 
851 */ 852 cpr_clear_bitmaps(); 853 854 spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_setbit); 855 vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit); 856 pages = spages - vpages; 857 858 str = "i_cpr_save_sensitive_kpages:"; 859 DEBUG7(errp(pages_fmt, "before", str, spages, vpages, pages)); 860 861 /* 862 * Allocate space to save the clean sensitive kpages 863 */ 864 for (retry_cnt = 0; retry_cnt < MAX_STORAGE_ALLOC_RETRY; retry_cnt++) { 865 /* 866 * Alloc on first pass or realloc if we are retrying because 867 * of insufficient storage for sensitive pages 868 */ 869 if (retry_cnt == 0 || error == ENOMEM) { 870 if (i_cpr_storage_data_base) { 871 kmem_free(i_cpr_storage_data_base, 872 mmu_ptob(i_cpr_storage_data_sz)); 873 i_cpr_storage_data_base = NULL; 874 i_cpr_storage_data_sz = 0; 875 } 876 addr = i_cpr_storage_data_alloc(pages, 877 &i_cpr_storage_data_sz, retry_cnt); 878 if (addr == NULL) { 879 DEBUG7(errp( 880 "\n%s can't allocate data storage space!\n", 881 str)); 882 return (ENOMEM); 883 } 884 i_cpr_storage_data_base = addr; 885 i_cpr_storage_data_end = 886 addr + mmu_ptob(i_cpr_storage_data_sz); 887 } 888 889 /* 890 * Allocate on first pass, only realloc if retry is because of 891 * insufficient descriptors, but reset contents on each pass 892 * (desc_alloc resets contents as well) 893 */ 894 if (retry_cnt == 0 || error == -1) { 895 error = i_cpr_storage_desc_alloc( 896 &i_cpr_storage_desc_base, &i_cpr_storage_desc_pgcnt, 897 &i_cpr_storage_desc_end, retry_cnt); 898 if (error != 0) 899 return (error); 900 } else { 901 i_cpr_storage_desc_init(i_cpr_storage_desc_base, 902 i_cpr_storage_desc_pgcnt, i_cpr_storage_desc_end); 903 } 904 905 /* 906 * We are ready to save the sensitive kpages to storage. 907 * We cannot trust what's tagged in the bitmaps anymore 908 * after storage allocations. Clear up the bitmaps and 909 * retag the sensitive kpages again. The storage pages 910 * should be untagged. 
911 */ 912 cpr_clear_bitmaps(); 913 914 spages = 915 i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_setbit); 916 vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit); 917 918 DEBUG7(errp(pages_fmt, "after ", str, 919 spages, vpages, spages - vpages)); 920 921 /* 922 * Returns 0 on success, -1 if too few descriptors, and 923 * ENOMEM if not enough space to save sensitive pages 924 */ 925 DEBUG1(errp("compressing pages to storage...\n")); 926 error = i_cpr_save_to_storage(); 927 if (error == 0) { 928 /* Saving to storage succeeded */ 929 DEBUG1(errp("compressed %d pages\n", 930 sensitive_pages_saved)); 931 break; 932 } else if (error == -1) 933 DEBUG1(errp("%s too few descriptors\n", str)); 934 } 935 if (error == -1) 936 error = ENOMEM; 937 return (error); 938 } 939 940 941 /* 942 * Estimate how much memory we will need to save 943 * the sensitive pages with compression. 944 */ 945 static caddr_t 946 i_cpr_storage_data_alloc(pgcnt_t pages, pgcnt_t *alloc_pages, int retry_cnt) 947 { 948 pgcnt_t alloc_pcnt, last_pcnt; 949 caddr_t addr; 950 char *str; 951 952 str = "i_cpr_storage_data_alloc:"; 953 if (retry_cnt == 0) { 954 /* 955 * common compression ratio is about 3:1 956 * initial storage allocation is estimated at 40% 957 * to cover the majority of cases 958 */ 959 alloc_pcnt = INITIAL_ALLOC_PCNT; 960 *alloc_pages = (pages * alloc_pcnt) / INTEGRAL; 961 DEBUG7(errp("%s sensitive pages: %ld\n", str, pages)); 962 DEBUG7(errp("%s initial est pages: %ld, alloc %ld%%\n", 963 str, *alloc_pages, alloc_pcnt)); 964 } else { 965 /* 966 * calculate the prior compression percentage (x100) 967 * from the last attempt to save sensitive pages 968 */ 969 ASSERT(sensitive_pages_saved != 0); 970 last_pcnt = (mmu_btopr(sensitive_size_saved) * INTEGRAL) / 971 sensitive_pages_saved; 972 DEBUG7(errp("%s last ratio %ld%%\n", str, last_pcnt)); 973 974 /* 975 * new estimated storage size is based on 976 * the larger ratio + 5% for each retry: 977 * pages * (last + [5%, 10%]) 978 */ 
979 alloc_pcnt = MAX(last_pcnt, INITIAL_ALLOC_PCNT) + 980 (retry_cnt * 5); 981 *alloc_pages = (pages * alloc_pcnt) / INTEGRAL; 982 DEBUG7(errp("%s Retry est pages: %ld, alloc %ld%%\n", 983 str, *alloc_pages, alloc_pcnt)); 984 } 985 986 addr = kmem_alloc(mmu_ptob(*alloc_pages), KM_NOSLEEP); 987 DEBUG7(errp("%s alloc %ld pages\n", str, *alloc_pages)); 988 return (addr); 989 } 990 991 992 void 993 i_cpr_storage_free(void) 994 { 995 /* Free descriptors */ 996 if (i_cpr_storage_desc_base) { 997 kmem_free(i_cpr_storage_desc_base, 998 mmu_ptob(i_cpr_storage_desc_pgcnt)); 999 i_cpr_storage_desc_base = NULL; 1000 i_cpr_storage_desc_pgcnt = 0; 1001 } 1002 1003 1004 /* Data storage */ 1005 if (i_cpr_storage_data_base) { 1006 kmem_free(i_cpr_storage_data_base, 1007 mmu_ptob(i_cpr_storage_data_sz)); 1008 i_cpr_storage_data_base = NULL; 1009 i_cpr_storage_data_sz = 0; 1010 } 1011 } 1012 1013 1014 /* 1015 * This routine is derived from cpr_compress_and_write(). 1016 * 1. Do bookkeeping in the descriptor for the contiguous sensitive chunk. 1017 * 2. Compress and save the clean sensitive pages into the storage area. 
1018 */ 1019 int 1020 i_cpr_compress_and_save(int chunks, pfn_t spfn, pgcnt_t pages) 1021 { 1022 extern char *cpr_compress_pages(cpd_t *, pgcnt_t, int); 1023 extern caddr_t i_cpr_storage_data_end; 1024 uint_t remaining, datalen; 1025 uint32_t test_usum; 1026 char *datap; 1027 csd_t *descp; 1028 cpd_t cpd; 1029 int error; 1030 1031 /* 1032 * Fill next empty storage descriptor 1033 */ 1034 descp = i_cpr_storage_desc_base + chunks - 1; 1035 if (descp >= i_cpr_storage_desc_end) { 1036 DEBUG1(errp("ran out of descriptors, base 0x%p, chunks %d, " 1037 "end 0x%p, descp 0x%p\n", i_cpr_storage_desc_base, chunks, 1038 i_cpr_storage_desc_end, descp)); 1039 return (-1); 1040 } 1041 ASSERT(descp->csd_dirty_spfn == (uint_t)-1); 1042 i_cpr_storage_desc_last_used = descp; 1043 1044 descp->csd_dirty_spfn = spfn; 1045 descp->csd_dirty_npages = pages; 1046 1047 i_cpr_mapin(CPR->c_mapping_area, pages, spfn); 1048 1049 /* 1050 * try compressing pages and copy cpd fields 1051 * pfn is copied for debug use 1052 */ 1053 cpd.cpd_pfn = spfn; 1054 datap = cpr_compress_pages(&cpd, pages, C_COMPRESSING); 1055 datalen = cpd.cpd_length; 1056 descp->csd_clean_compressed = (cpd.cpd_flag & CPD_COMPRESS); 1057 #ifdef DEBUG 1058 descp->csd_usum = cpd.cpd_usum; 1059 descp->csd_csum = cpd.cpd_csum; 1060 #endif 1061 1062 error = 0; 1063 1064 /* 1065 * Save the raw or compressed data to the storage area pointed to by 1066 * sensitive_write_ptr. Make sure the storage space is big enough to 1067 * hold the result. Otherwise roll back to increase the storage space. 
1068 */ 1069 descp->csd_clean_sva = (cpr_ptr)sensitive_write_ptr; 1070 descp->csd_clean_sz = datalen; 1071 if ((sensitive_write_ptr + datalen) < i_cpr_storage_data_end) { 1072 extern void cprbcopy(void *, void *, size_t); 1073 1074 cprbcopy(datap, sensitive_write_ptr, datalen); 1075 sensitive_size_saved += datalen; 1076 sensitive_pages_saved += descp->csd_dirty_npages; 1077 sensitive_write_ptr += datalen; 1078 } else { 1079 remaining = (i_cpr_storage_data_end - sensitive_write_ptr); 1080 DEBUG1(errp("i_cpr_compress_and_save: The storage " 1081 "space is too small!\ngot %d, want %d\n\n", 1082 remaining, (remaining + datalen))); 1083 #ifdef DEBUG 1084 /* 1085 * Check to see if the content of the sensitive pages that we 1086 * just copied have changed during this small time window. 1087 */ 1088 test_usum = checksum32(CPR->c_mapping_area, mmu_ptob(pages)); 1089 descp->csd_usum = cpd.cpd_usum; 1090 if (test_usum != descp->csd_usum) { 1091 DEBUG1(errp("\nWARNING: i_cpr_compress_and_save: " 1092 "Data in the range of pfn 0x%x to pfn " 1093 "0x%x has changed after they are saved " 1094 "into storage.", spfn, (spfn + pages - 1))); 1095 } 1096 #endif 1097 error = ENOMEM; 1098 } 1099 1100 i_cpr_mapout(CPR->c_mapping_area, pages); 1101 return (error); 1102 } 1103 1104 1105 /* 1106 * This routine is derived from cpr_count_kpages(). 1107 * It goes through kernel data nucleus and segkmem segments to select 1108 * pages in use and mark them in the corresponding bitmap. 
1109 */ 1110 pgcnt_t 1111 i_cpr_count_sensitive_kpages(int mapflag, bitfunc_t bitfunc) 1112 { 1113 pgcnt_t kdata_cnt = 0, segkmem_cnt = 0; 1114 extern caddr_t e_moddata; 1115 extern struct seg kvalloc; 1116 extern struct seg kmem64; 1117 size_t size; 1118 1119 /* 1120 * Kernel data nucleus pages 1121 */ 1122 size = e_moddata - s_data; 1123 kdata_cnt += cpr_count_pages(s_data, size, 1124 mapflag, bitfunc, DBG_SHOWRANGE); 1125 1126 /* 1127 * kvseg and kvalloc pages 1128 */ 1129 segkmem_cnt += cpr_scan_kvseg(mapflag, bitfunc, &kvseg); 1130 segkmem_cnt += cpr_count_pages(kvalloc.s_base, kvalloc.s_size, 1131 mapflag, bitfunc, DBG_SHOWRANGE); 1132 1133 /* segment to support kernel memory usage above 32-bit space (4GB) */ 1134 if (kmem64.s_base) 1135 segkmem_cnt += cpr_count_pages(kmem64.s_base, kmem64.s_size, 1136 mapflag, bitfunc, DBG_SHOWRANGE); 1137 1138 DEBUG7(errp("\ni_cpr_count_sensitive_kpages:\n" 1139 "\tkdata_cnt %ld + segkmem_cnt %ld = %ld pages\n", 1140 kdata_cnt, segkmem_cnt, kdata_cnt + segkmem_cnt)); 1141 1142 return (kdata_cnt + segkmem_cnt); 1143 } 1144 1145 1146 pgcnt_t 1147 i_cpr_count_storage_pages(int mapflag, bitfunc_t bitfunc) 1148 { 1149 pgcnt_t count = 0; 1150 1151 if (i_cpr_storage_desc_base) { 1152 count += cpr_count_pages((caddr_t)i_cpr_storage_desc_base, 1153 (size_t)mmu_ptob(i_cpr_storage_desc_pgcnt), 1154 mapflag, bitfunc, DBG_SHOWRANGE); 1155 } 1156 if (i_cpr_storage_data_base) { 1157 count += cpr_count_pages(i_cpr_storage_data_base, 1158 (size_t)mmu_ptob(i_cpr_storage_data_sz), 1159 mapflag, bitfunc, DBG_SHOWRANGE); 1160 } 1161 return (count); 1162 } 1163 1164 1165 /* 1166 * Derived from cpr_write_statefile(). 1167 * Allocate (or reallocate after exhausting the supply) descriptors for each 1168 * chunk of contiguous sensitive kpages. 
1169 */ 1170 static int 1171 i_cpr_storage_desc_alloc(csd_t **basepp, pgcnt_t *pgsp, csd_t **endpp, 1172 int retry) 1173 { 1174 pgcnt_t npages; 1175 int chunks; 1176 csd_t *descp, *end; 1177 size_t len; 1178 char *str = "i_cpr_storage_desc_alloc:"; 1179 1180 /* 1181 * On initial allocation, add some extra to cover overhead caused 1182 * by the allocation for the storage area later. 1183 */ 1184 if (retry == 0) { 1185 chunks = cpr_contig_pages(NULL, STORAGE_DESC_ALLOC) + 1186 EXTRA_DESCS; 1187 npages = mmu_btopr(sizeof (**basepp) * (pgcnt_t)chunks); 1188 DEBUG7(errp("%s chunks %d, ", str, chunks)); 1189 } else { 1190 DEBUG7(errp("%s retry %d: ", str, retry)); 1191 npages = *pgsp + 1; 1192 } 1193 /* Free old descriptors, if any */ 1194 if (*basepp) 1195 kmem_free((caddr_t)*basepp, mmu_ptob(*pgsp)); 1196 1197 descp = *basepp = kmem_alloc(mmu_ptob(npages), KM_NOSLEEP); 1198 if (descp == NULL) { 1199 DEBUG7(errp("%s no space for descriptors!\n", str)); 1200 return (ENOMEM); 1201 } 1202 1203 *pgsp = npages; 1204 len = mmu_ptob(npages); 1205 end = *endpp = descp + (len / (sizeof (**basepp))); 1206 DEBUG7(errp("npages 0x%x, len 0x%x, items 0x%x\n\t*basepp " 1207 "%p, *endpp %p\n", npages, len, (len / (sizeof (**basepp))), 1208 *basepp, *endpp)); 1209 i_cpr_storage_desc_init(descp, npages, end); 1210 return (0); 1211 } 1212 1213 static void 1214 i_cpr_storage_desc_init(csd_t *descp, pgcnt_t npages, csd_t *end) 1215 { 1216 size_t len = mmu_ptob(npages); 1217 1218 /* Initialize the descriptors to something impossible. */ 1219 bzero(descp, len); 1220 #ifdef DEBUG 1221 /* 1222 * This condition is tested by an ASSERT 1223 */ 1224 for (; descp < end; descp++) 1225 descp->csd_dirty_spfn = (uint_t)-1; 1226 #endif 1227 } 1228 1229 int 1230 i_cpr_dump_sensitive_kpages(vnode_t *vp) 1231 { 1232 int error = 0; 1233 uint_t spin_cnt = 0; 1234 csd_t *descp; 1235 1236 /* 1237 * These following two variables need to be reinitialized 1238 * for each cpr cycle. 
1239 */ 1240 i_cpr_sensitive_bytes_dumped = 0; 1241 i_cpr_sensitive_pgs_dumped = 0; 1242 1243 if (i_cpr_storage_desc_base) { 1244 for (descp = i_cpr_storage_desc_base; 1245 descp <= i_cpr_storage_desc_last_used; descp++) { 1246 if (error = cpr_dump_sensitive(vp, descp)) 1247 return (error); 1248 spin_cnt++; 1249 if ((spin_cnt & 0x5F) == 1) 1250 cpr_spinning_bar(); 1251 } 1252 prom_printf(" \b"); 1253 } 1254 1255 DEBUG7(errp("\ni_cpr_dump_sensitive_kpages: dumped %d\n", 1256 i_cpr_sensitive_pgs_dumped)); 1257 return (0); 1258 } 1259 1260 1261 /* 1262 * 1. Fill the cpr page descriptor with the info of the dirty pages 1263 * and 1264 * write the descriptor out. It will be used at resume. 1265 * 2. Write the clean data in stead of the dirty data out. 1266 * Note: to save space, the clean data is already compressed. 1267 */ 1268 static int 1269 cpr_dump_sensitive(vnode_t *vp, csd_t *descp) 1270 { 1271 int error = 0; 1272 caddr_t datap; 1273 cpd_t cpd; /* cpr page descriptor */ 1274 pfn_t dirty_spfn; 1275 pgcnt_t dirty_npages; 1276 size_t clean_sz; 1277 caddr_t clean_sva; 1278 int clean_compressed; 1279 extern uchar_t cpr_pagecopy[]; 1280 1281 dirty_spfn = descp->csd_dirty_spfn; 1282 dirty_npages = descp->csd_dirty_npages; 1283 clean_sva = (caddr_t)descp->csd_clean_sva; 1284 clean_sz = descp->csd_clean_sz; 1285 clean_compressed = descp->csd_clean_compressed; 1286 1287 /* Fill cpr page descriptor. */ 1288 cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC; 1289 cpd.cpd_pfn = dirty_spfn; 1290 cpd.cpd_flag = 0; /* must init to zero */ 1291 cpd.cpd_pages = dirty_npages; 1292 1293 #ifdef DEBUG 1294 if ((cpd.cpd_usum = descp->csd_usum) != 0) 1295 cpd.cpd_flag |= CPD_USUM; 1296 if ((cpd.cpd_csum = descp->csd_csum) != 0) 1297 cpd.cpd_flag |= CPD_CSUM; 1298 #endif 1299 1300 STAT->cs_dumped_statefsz += mmu_ptob(dirty_npages); 1301 1302 /* 1303 * The sensitive kpages are usually saved with compression 1304 * unless compression could not reduce the size of the data. 
1305 * If user choose not to have the statefile compressed, 1306 * we need to decompress the data back before dumping it to disk. 1307 */ 1308 if (CPR->c_flags & C_COMPRESSING) { 1309 cpd.cpd_length = clean_sz; 1310 datap = clean_sva; 1311 if (clean_compressed) 1312 cpd.cpd_flag |= CPD_COMPRESS; 1313 } else { 1314 if (clean_compressed) { 1315 cpd.cpd_length = decompress(clean_sva, cpr_pagecopy, 1316 clean_sz, mmu_ptob(dirty_npages)); 1317 datap = (caddr_t)cpr_pagecopy; 1318 ASSERT(cpd.cpd_length == mmu_ptob(dirty_npages)); 1319 } else { 1320 cpd.cpd_length = clean_sz; 1321 datap = clean_sva; 1322 } 1323 cpd.cpd_csum = 0; 1324 } 1325 1326 /* Write cpr page descriptor */ 1327 error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd)); 1328 if (error) { 1329 DEBUG7(errp("descp: %x\n", descp)); 1330 #ifdef DEBUG 1331 debug_enter("cpr_dump_sensitive: cpr_write() page " 1332 "descriptor failed!\n"); 1333 #endif 1334 return (error); 1335 } 1336 1337 i_cpr_sensitive_bytes_dumped += sizeof (cpd_t); 1338 1339 /* Write page data */ 1340 error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length); 1341 if (error) { 1342 DEBUG7(errp("error: %x\n", error)); 1343 DEBUG7(errp("descp: %x\n", descp)); 1344 DEBUG7(errp("cpr_write(%x, %x , %x)\n", vp, datap, 1345 cpd.cpd_length)); 1346 #ifdef DEBUG 1347 debug_enter("cpr_dump_sensitive: cpr_write() data failed!\n"); 1348 #endif 1349 return (error); 1350 } 1351 1352 i_cpr_sensitive_bytes_dumped += cpd.cpd_length; 1353 i_cpr_sensitive_pgs_dumped += dirty_npages; 1354 1355 return (error); 1356 } 1357 1358 1359 /* 1360 * Sanity check to make sure that we have dumped right amount 1361 * of pages from different sources to statefile. 
1362 */ 1363 int 1364 i_cpr_check_pgs_dumped(uint_t pgs_expected, uint_t regular_pgs_dumped) 1365 { 1366 uint_t total_pgs_dumped; 1367 1368 total_pgs_dumped = regular_pgs_dumped + i_cpr_sensitive_pgs_dumped; 1369 1370 DEBUG7(errp("\ncheck_pgs: reg %d + sens %d = %d, expect %d\n\n", 1371 regular_pgs_dumped, i_cpr_sensitive_pgs_dumped, 1372 total_pgs_dumped, pgs_expected)); 1373 1374 if (pgs_expected == total_pgs_dumped) 1375 return (0); 1376 1377 return (EINVAL); 1378 } 1379 1380 1381 int 1382 i_cpr_reusefini(void) 1383 { 1384 struct vnode *vp; 1385 cdef_t *cdef; 1386 size_t size; 1387 char *bufp; 1388 int rc; 1389 1390 if (cpr_reusable_mode) 1391 cpr_reusable_mode = 0; 1392 1393 if (rc = cpr_open_deffile(FREAD|FWRITE, &vp)) { 1394 if (rc == EROFS) { 1395 cpr_err(CE_CONT, "uadmin A_FREEZE AD_REUSEFINI " 1396 "(uadmin %d %d)\nmust be done with / mounted " 1397 "writeable.\n", A_FREEZE, AD_REUSEFINI); 1398 } 1399 return (rc); 1400 } 1401 1402 cdef = kmem_alloc(sizeof (*cdef), KM_SLEEP); 1403 rc = cpr_rdwr(UIO_READ, vp, cdef, sizeof (*cdef)); 1404 1405 if (rc) { 1406 cpr_err(CE_WARN, "Failed reading %s, errno = %d", 1407 cpr_default_path, rc); 1408 } else if (cdef->mini.magic != CPR_DEFAULT_MAGIC) { 1409 cpr_err(CE_WARN, "bad magic number in %s, cannot restore " 1410 "prom values for %s", cpr_default_path, 1411 cpr_enumerate_promprops(&bufp, &size)); 1412 kmem_free(bufp, size); 1413 rc = EINVAL; 1414 } else { 1415 /* 1416 * clean up prom properties 1417 */ 1418 rc = cpr_update_nvram(cdef->props); 1419 if (rc == 0) { 1420 /* 1421 * invalidate the disk copy and turn off reusable 1422 */ 1423 cdef->mini.magic = 0; 1424 cdef->mini.reusable = 0; 1425 if (rc = cpr_rdwr(UIO_WRITE, vp, 1426 &cdef->mini, sizeof (cdef->mini))) { 1427 cpr_err(CE_WARN, "Failed writing %s, errno %d", 1428 cpr_default_path, rc); 1429 } 1430 } 1431 } 1432 1433 (void) VOP_CLOSE(vp, FREAD|FWRITE, 1, (offset_t)0, CRED()); 1434 VN_RELE(vp); 1435 kmem_free(cdef, sizeof (*cdef)); 1436 1437 return (rc); 
1438 } 1439 1440 1441 int 1442 i_cpr_reuseinit(void) 1443 { 1444 int rc = 0; 1445 1446 if (rc = cpr_default_setup(1)) 1447 return (rc); 1448 1449 /* 1450 * We need to validate default file 1451 */ 1452 rc = cpr_validate_definfo(1); 1453 if (rc == 0) 1454 cpr_reusable_mode = 1; 1455 else if (rc == EROFS) { 1456 cpr_err(CE_NOTE, "reuseinit must be performed " 1457 "while / is mounted writeable"); 1458 } 1459 1460 (void) cpr_default_setup(0); 1461 1462 return (rc); 1463 } 1464 1465 1466 int 1467 i_cpr_check_cprinfo(void) 1468 { 1469 struct vnode *vp; 1470 cmini_t mini; 1471 int rc = 0; 1472 1473 if (rc = cpr_open_deffile(FREAD, &vp)) { 1474 if (rc == ENOENT) 1475 cpr_err(CE_NOTE, "cprinfo file does not " 1476 "exist. You must run 'uadmin %d %d' " 1477 "command while / is mounted writeable,\n" 1478 "then reboot and run 'uadmin %d %d' " 1479 "to create a reusable statefile", 1480 A_FREEZE, AD_REUSEINIT, A_FREEZE, AD_REUSABLE); 1481 return (rc); 1482 } 1483 1484 rc = cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini)); 1485 (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED()); 1486 VN_RELE(vp); 1487 1488 if (rc) { 1489 cpr_err(CE_WARN, "Failed reading %s, errno = %d", 1490 cpr_default_path, rc); 1491 } else if (mini.magic != CPR_DEFAULT_MAGIC) { 1492 cpr_err(CE_CONT, "bad magic number in cprinfo file.\n" 1493 "You must run 'uadmin %d %d' while / is mounted " 1494 "writeable, then reboot and run 'uadmin %d %d' " 1495 "to create a reusable statefile\n", 1496 A_FREEZE, AD_REUSEINIT, A_FREEZE, AD_REUSABLE); 1497 rc = EINVAL; 1498 } 1499 1500 return (rc); 1501 } 1502 1503 1504 int 1505 i_cpr_reusable_supported(void) 1506 { 1507 return (1); 1508 } 1509 1510 1511 /* 1512 * find prom phys pages and alloc space for a tmp copy 1513 */ 1514 static int 1515 i_cpr_find_ppages(void) 1516 { 1517 extern struct vnode prom_ppages; 1518 struct page *pp; 1519 struct memlist *pmem; 1520 pgcnt_t npages, pcnt, scnt, vcnt; 1521 pfn_t ppn, plast, *dst; 1522 int mapflag; 1523 1524 cpr_clear_bitmaps(); 
1525 mapflag = REGULAR_BITMAP; 1526 1527 /* 1528 * there should be a page_t for each phys page used by the kernel; 1529 * set a bit for each phys page not tracked by a page_t 1530 */ 1531 pcnt = 0; 1532 memlist_read_lock(); 1533 for (pmem = phys_install; pmem; pmem = pmem->next) { 1534 npages = mmu_btop(pmem->size); 1535 ppn = mmu_btop(pmem->address); 1536 for (plast = ppn + npages; ppn < plast; ppn++) { 1537 if (page_numtopp_nolock(ppn)) 1538 continue; 1539 (void) cpr_setbit(ppn, mapflag); 1540 pcnt++; 1541 } 1542 } 1543 memlist_read_unlock(); 1544 1545 /* 1546 * clear bits for phys pages in each segment 1547 */ 1548 scnt = cpr_count_seg_pages(mapflag, cpr_clrbit); 1549 1550 /* 1551 * set bits for phys pages referenced by the prom_ppages vnode; 1552 * these pages are mostly comprised of forthdebug words 1553 */ 1554 vcnt = 0; 1555 for (pp = prom_ppages.v_pages; pp; ) { 1556 if (cpr_setbit(pp->p_offset, mapflag) == 0) 1557 vcnt++; 1558 pp = pp->p_vpnext; 1559 if (pp == prom_ppages.v_pages) 1560 break; 1561 } 1562 1563 /* 1564 * total number of prom pages are: 1565 * (non-page_t pages - seg pages + vnode pages) 1566 */ 1567 ppage_count = pcnt - scnt + vcnt; 1568 DEBUG1(errp("find_ppages: pcnt %ld - scnt %ld + vcnt %ld = %ld\n", 1569 pcnt, scnt, vcnt, ppage_count)); 1570 1571 /* 1572 * alloc array of pfn_t to store phys page list 1573 */ 1574 pphys_list_size = ppage_count * sizeof (pfn_t); 1575 pphys_list = kmem_alloc(pphys_list_size, KM_NOSLEEP); 1576 if (pphys_list == NULL) { 1577 cpr_err(CE_WARN, "cannot alloc pphys_list"); 1578 return (ENOMEM); 1579 } 1580 1581 /* 1582 * phys pages referenced in the bitmap should be 1583 * those used by the prom; scan bitmap and save 1584 * a list of prom phys page numbers 1585 */ 1586 dst = pphys_list; 1587 memlist_read_lock(); 1588 for (pmem = phys_install; pmem; pmem = pmem->next) { 1589 npages = mmu_btop(pmem->size); 1590 ppn = mmu_btop(pmem->address); 1591 for (plast = ppn + npages; ppn < plast; ppn++) { 1592 if 
(cpr_isset(ppn, mapflag)) { 1593 ASSERT(dst < (pphys_list + ppage_count)); 1594 *dst++ = ppn; 1595 } 1596 } 1597 } 1598 memlist_read_unlock(); 1599 1600 /* 1601 * allocate space to store prom pages 1602 */ 1603 ppage_buf = kmem_alloc(mmu_ptob(ppage_count), KM_NOSLEEP); 1604 if (ppage_buf == NULL) { 1605 kmem_free(pphys_list, pphys_list_size); 1606 pphys_list = NULL; 1607 cpr_err(CE_WARN, "cannot alloc ppage_buf"); 1608 return (ENOMEM); 1609 } 1610 1611 return (0); 1612 } 1613 1614 1615 /* 1616 * save prom pages to kmem pages 1617 */ 1618 static void 1619 i_cpr_save_ppages(void) 1620 { 1621 pfn_t *pphys, *plast; 1622 caddr_t dst; 1623 1624 /* 1625 * map in each prom page and copy to a kmem page 1626 */ 1627 dst = ppage_buf; 1628 plast = pphys_list + ppage_count; 1629 for (pphys = pphys_list; pphys < plast; pphys++) { 1630 i_cpr_mapin(cpr_vaddr, 1, *pphys); 1631 bcopy(cpr_vaddr, dst, MMU_PAGESIZE); 1632 i_cpr_mapout(cpr_vaddr, 1); 1633 dst += MMU_PAGESIZE; 1634 } 1635 1636 DEBUG1(errp("saved %d prom pages\n", ppage_count)); 1637 } 1638 1639 1640 /* 1641 * restore prom pages from kmem pages 1642 */ 1643 static void 1644 i_cpr_restore_ppages(void) 1645 { 1646 pfn_t *pphys, *plast; 1647 caddr_t src; 1648 1649 dcache_flushall(); 1650 1651 /* 1652 * map in each prom page and copy from a kmem page 1653 */ 1654 src = ppage_buf; 1655 plast = pphys_list + ppage_count; 1656 for (pphys = pphys_list; pphys < plast; pphys++) { 1657 i_cpr_mapin(cpr_vaddr, 1, *pphys); 1658 bcopy(src, cpr_vaddr, MMU_PAGESIZE); 1659 i_cpr_mapout(cpr_vaddr, 1); 1660 src += MMU_PAGESIZE; 1661 } 1662 1663 dcache_flushall(); 1664 1665 DEBUG1(errp("restored %d prom pages\n", ppage_count)); 1666 } 1667 1668 1669 /* 1670 * save/restore prom pages or free related allocs 1671 */ 1672 int 1673 i_cpr_prom_pages(int action) 1674 { 1675 int error; 1676 1677 if (action == CPR_PROM_SAVE) { 1678 if (ppage_buf == NULL) { 1679 ASSERT(pphys_list == NULL); 1680 if (error = i_cpr_find_ppages()) 1681 return (error); 1682 
i_cpr_save_ppages(); 1683 } 1684 } else if (action == CPR_PROM_RESTORE) { 1685 i_cpr_restore_ppages(); 1686 } else if (action == CPR_PROM_FREE) { 1687 if (pphys_list) { 1688 ASSERT(pphys_list_size); 1689 kmem_free(pphys_list, pphys_list_size); 1690 pphys_list = NULL; 1691 pphys_list_size = 0; 1692 } 1693 if (ppage_buf) { 1694 ASSERT(ppage_count); 1695 kmem_free(ppage_buf, mmu_ptob(ppage_count)); 1696 DEBUG1(errp("freed %d prom pages\n", ppage_count)); 1697 ppage_buf = NULL; 1698 ppage_count = 0; 1699 } 1700 } 1701 return (0); 1702 } 1703 1704 1705 /* 1706 * record tlb data for the nucleus, bigktsb's, and the cpr module; 1707 * this data is later used by cprboot to install dtlb/itlb entries. 1708 * when we jump into the cpr module during the resume phase, those 1709 * mappings are needed until switching to the kernel trap table. 1710 * to make the dtte/itte info available during resume, we need 1711 * the info recorded prior to saving sensitive pages, otherwise 1712 * all the data would appear as NULLs. 1713 */ 1714 static void 1715 i_cpr_save_tlbinfo(void) 1716 { 1717 cti_t cti; 1718 1719 /* 1720 * during resume - shortly after jumping into the cpr module, 1721 * sfmmu_load_mmustate() will overwrite any dtlb entry at any 1722 * index used for TSBs; skip is set so that any saved tte will 1723 * target other tlb offsets and prevent being lost during 1724 * resume. now scan the dtlb and save locked entries, 1725 * then add entries for the tmp stack / data page and the 1726 * cpr thread structure. 
1727 */ 1728 cti.dst = m_info.dtte; 1729 cti.tail = cti.dst + CPR_MAX_TLB; 1730 cti.reader = dtlb_rd_entry; 1731 cti.writer = NULL; 1732 cti.filter = i_cpr_lnb; 1733 cti.index = cpunodes[CPU->cpu_id].dtlb_size - 1; 1734 cti.skip = (1 << utsb_dtlb_ttenum); 1735 cti.skip |= (1 << utsb4m_dtlb_ttenum); 1736 i_cpr_scan_tlb(&cti); 1737 i_cpr_make_tte(&cti, &i_cpr_data_page, datava); 1738 i_cpr_make_tte(&cti, curthread, datava); 1739 1740 /* 1741 * scan itlb and save locked entries; add an entry for 1742 * the first text page of the cpr module; cprboot will 1743 * jump to that page after restoring kernel pages. 1744 */ 1745 cti.dst = m_info.itte; 1746 cti.tail = cti.dst + CPR_MAX_TLB; 1747 cti.reader = itlb_rd_entry; 1748 cti.index = cpunodes[CPU->cpu_id].itlb_size - 1; 1749 cti.skip = 0; 1750 i_cpr_scan_tlb(&cti); 1751 i_cpr_make_tte(&cti, (void *)i_cpr_resume_setup, textva); 1752 } 1753 1754 1755 /* ARGSUSED */ 1756 int 1757 i_cpr_dump_setup(vnode_t *vp) 1758 { 1759 /* 1760 * zero out m_info and add info to dtte/itte arrays 1761 */ 1762 bzero(&m_info, sizeof (m_info)); 1763 i_cpr_save_tlbinfo(); 1764 return (0); 1765 } 1766 1767 1768 int 1769 i_cpr_is_supported(void) 1770 { 1771 char es_prop[] = "energystar-v2"; 1772 dnode_t node; 1773 int last; 1774 extern int cpr_supported_override; 1775 extern int cpr_platform_enable; 1776 1777 /* 1778 * The next statement tests if a specific platform has turned off 1779 * cpr support. 
1780 */ 1781 if (cpr_supported_override) 1782 return (0); 1783 1784 /* 1785 * Do not inspect energystar-v* property if a platform has 1786 * specifically turned on cpr support 1787 */ 1788 if (cpr_platform_enable) 1789 return (1); 1790 1791 node = prom_rootnode(); 1792 if (prom_getproplen(node, es_prop) != -1) 1793 return (1); 1794 last = strlen(es_prop) - 1; 1795 es_prop[last] = '3'; 1796 return (prom_getproplen(node, es_prop) != -1); 1797 } 1798 1799 1800 /* 1801 * the actual size of the statefile data isn't known until after all the 1802 * compressed pages are written; even the inode size doesn't reflect the 1803 * data size since there are usually many extra fs blocks. for recording 1804 * the actual data size, the first sector of the statefile is copied to 1805 * a tmp buf, and the copy is later updated and flushed to disk. 1806 */ 1807 int 1808 i_cpr_blockzero(char *base, char **bufpp, int *blkno, vnode_t *vp) 1809 { 1810 extern int cpr_flush_write(vnode_t *); 1811 static char cpr_sector[DEV_BSIZE]; 1812 cpr_ext bytes, *dst; 1813 1814 /* 1815 * this routine is called after cdd_t and csu_md_t are copied 1816 * to cpr_buf; mini-hack alert: the save/update method creates 1817 * a dependency on the combined struct size being >= one sector 1818 * or DEV_BSIZE; since introduction in Sol2.7, csu_md_t size is 1819 * over 1K bytes and will probably grow with any changes. 
1820 * 1821 * copy when vp is NULL, flush when non-NULL 1822 */ 1823 if (vp == NULL) { 1824 ASSERT((*bufpp - base) >= DEV_BSIZE); 1825 bcopy(base, cpr_sector, sizeof (cpr_sector)); 1826 return (0); 1827 } else { 1828 bytes = dbtob(*blkno); 1829 dst = &((cdd_t *)cpr_sector)->cdd_filesize; 1830 bcopy(&bytes, dst, sizeof (bytes)); 1831 bcopy(cpr_sector, base, sizeof (cpr_sector)); 1832 *bufpp = base + sizeof (cpr_sector); 1833 *blkno = cpr_statefile_offset(); 1834 DEBUG1(errp("statefile data size: %lld\n\n", bytes)); 1835 return (cpr_flush_write(vp)); 1836 } 1837 } 1838 1839 1840 /* 1841 * Allocate bitmaps according to the phys_install list. 1842 */ 1843 static int 1844 i_cpr_bitmap_setup(void) 1845 { 1846 struct memlist *pmem; 1847 cbd_t *dp, *tail; 1848 void *space; 1849 size_t size; 1850 1851 /* 1852 * The number of bitmap descriptors will be the count of 1853 * phys_install ranges plus 1 for a trailing NULL struct. 1854 */ 1855 cpr_nbitmaps = 1; 1856 for (pmem = phys_install; pmem; pmem = pmem->next) 1857 cpr_nbitmaps++; 1858 1859 if (cpr_nbitmaps > (CPR_MAX_BMDESC - 1)) { 1860 cpr_err(CE_WARN, "too many physical memory ranges %d, max %d", 1861 cpr_nbitmaps, CPR_MAX_BMDESC - 1); 1862 return (EFBIG); 1863 } 1864 1865 /* Alloc an array of bitmap descriptors. 
*/ 1866 dp = kmem_zalloc(cpr_nbitmaps * sizeof (*dp), KM_NOSLEEP); 1867 if (dp == NULL) { 1868 cpr_nbitmaps = 0; 1869 return (ENOMEM); 1870 } 1871 tail = dp + cpr_nbitmaps; 1872 1873 CPR->c_bmda = dp; 1874 for (pmem = phys_install; pmem; pmem = pmem->next) { 1875 size = BITMAP_BYTES(pmem->size); 1876 space = kmem_zalloc(size * 2, KM_NOSLEEP); 1877 if (space == NULL) 1878 return (ENOMEM); 1879 ASSERT(dp < tail); 1880 dp->cbd_magic = CPR_BITMAP_MAGIC; 1881 dp->cbd_spfn = mmu_btop(pmem->address); 1882 dp->cbd_epfn = mmu_btop(pmem->address + pmem->size) - 1; 1883 dp->cbd_size = size; 1884 dp->cbd_reg_bitmap = (cpr_ptr)space; 1885 dp->cbd_vlt_bitmap = (cpr_ptr)((caddr_t)space + size); 1886 dp++; 1887 } 1888 1889 /* set magic for the last descriptor */ 1890 ASSERT(dp == (tail - 1)); 1891 dp->cbd_magic = CPR_BITMAP_MAGIC; 1892 1893 return (0); 1894 } 1895 1896 1897 void 1898 i_cpr_bitmap_cleanup(void) 1899 { 1900 cbd_t *dp; 1901 1902 if (CPR->c_bmda == NULL) 1903 return; 1904 for (dp = CPR->c_bmda; dp->cbd_size; dp++) 1905 kmem_free((void *)dp->cbd_reg_bitmap, dp->cbd_size * 2); 1906 kmem_free(CPR->c_bmda, cpr_nbitmaps * sizeof (*CPR->c_bmda)); 1907 CPR->c_bmda = NULL; 1908 cpr_nbitmaps = 0; 1909 } 1910 1911 1912 /* 1913 * A "regular" and "volatile" bitmap are created for each range of 1914 * physical memory. The volatile maps are used to count and track pages 1915 * susceptible to heap corruption - caused by drivers that allocate mem 1916 * during VOP_DUMP(); the regular maps are used for all the other non- 1917 * susceptible pages. Before writing the bitmaps to the statefile, 1918 * each bitmap pair gets merged to simplify handling within cprboot. 1919 */ 1920 int 1921 i_cpr_alloc_bitmaps(void) 1922 { 1923 int err; 1924 1925 memlist_read_lock(); 1926 err = i_cpr_bitmap_setup(); 1927 memlist_read_unlock(); 1928 if (err) 1929 i_cpr_bitmap_cleanup(); 1930 return (err); 1931 } 1932