/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Platform specific implementation code
 */

#define	SUNDDI_IMPL

#include <sys/types.h>
#include <sys/promif.h>
#include <sys/prom_isa.h>
#include <sys/prom_plat.h>
#include <sys/mmu.h>
#include <vm/hat_sfmmu.h>
#include <sys/iommu.h>
#include <sys/scb.h>
#include <sys/cpuvar.h>
#include <sys/intreg.h>
#include <sys/pte.h>
#include <vm/hat.h>
#include <vm/page.h>
#include <vm/as.h>
#include <sys/cpr.h>
#include <sys/kmem.h>
#include <sys/clock.h>
#include <sys/kmem.h>
#include <sys/panic.h>
#include <vm/seg_kmem.h>
#include <sys/cpu_module.h>
#include <sys/callb.h>
#include <sys/machsystm.h>
#include <sys/vmsystm.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/stack.h>
#include <sys/fs/ufs_fs.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>
#include <sys/thread.h>

extern void cpr_clear_bitmaps(void);
extern void dtlb_wr_entry(uint_t, tte_t *, uint64_t *);
extern void itlb_wr_entry(uint_t, tte_t *, uint64_t *);

static int i_cpr_storage_desc_alloc(csd_t **, pgcnt_t *, csd_t **, int);
static void i_cpr_storage_desc_init(csd_t *, pgcnt_t, csd_t *);
static caddr_t i_cpr_storage_data_alloc(pgcnt_t, pgcnt_t *, int);
static int cpr_dump_sensitive(vnode_t *, csd_t *);
static void i_cpr_clear_entries(uint64_t, uint64_t);
static void i_cpr_xcall(xcfunc_t);

void i_cpr_storage_free(void);

extern void *i_cpr_data_page;
extern int cpr_test_mode;
extern int cpr_nbitmaps;
extern char cpr_default_path[];
extern caddr_t textva, datava;

static struct cpr_map_info cpr_prom_retain[CPR_PROM_RETAIN_CNT];
caddr_t cpr_vaddr = NULL;

static uint_t sensitive_pages_saved;
static uint_t sensitive_size_saved;

caddr_t	i_cpr_storage_data_base;
caddr_t	i_cpr_storage_data_end;
csd_t *i_cpr_storage_desc_base;
csd_t *i_cpr_storage_desc_end;		/* one byte beyond last used descp */
csd_t *i_cpr_storage_desc_last_used;	/* last used descriptor */
caddr_t sensitive_write_ptr;		/* position for next storage write */

size_t	i_cpr_sensitive_bytes_dumped;
pgcnt_t	i_cpr_sensitive_pgs_dumped;
pgcnt_t	i_cpr_storage_data_sz;		/* in pages */
pgcnt_t	i_cpr_storage_desc_pgcnt;	/* in pages */

ushort_t cpr_mach_type = CPR_MACHTYPE_4U;
static csu_md_t m_info;


#define	MAX_STORAGE_RETRY	3
#define	MAX_STORAGE_ALLOC_RETRY	3
#define	INITIAL_ALLOC_PCNT	40	/* starting allocation percentage */
#define	INTEGRAL		100	/* to get 1% precision */

#define	EXTRA_RATE		2	/* add EXTRA_RATE% extra space */
#define	EXTRA_DESCS		10

#define	CPR_NO_STORAGE_DESC	1
#define	CPR_NO_STORAGE_DATA	2

#define	CIF_SPLICE		0
#define	CIF_UNLINK		1


/*
 * CPR miscellaneous support routines
 */
#define	cpr_open(path, mode, vpp)	(vn_open(path, UIO_SYSSPACE, \
		mode, 0600, vpp, CRCREAT, 0))
#define	cpr_rdwr(rw, vp, basep, cnt)	(vn_rdwr(rw, vp, (caddr_t)(basep), \
		cnt, 0LL, UIO_SYSSPACE, 0, (rlim64_t)MAXOFF_T, CRED(), \
		(ssize_t *)NULL))

/*
 * definitions for saving/restoring prom pages
 */
static void	*ppage_buf;
static pgcnt_t	ppage_count;
static pfn_t	*pphys_list;
static size_t	pphys_list_size;

typedef void (*tlb_rw_t)(uint_t, tte_t *, uint64_t *);
typedef void (*tlb_filter_t)(int, tte_t *, uint64_t, void *);

/*
 * private struct for tlb handling
 */
struct cpr_trans_info {
	sutlb_t		*dst;
	sutlb_t		*tail;
	tlb_rw_t	reader;
	tlb_rw_t	writer;
	tlb_filter_t	filter;
	int		index;
	uint64_t	skip;		/* assumes TLB <= 64 locked entries */
};
typedef struct cpr_trans_info cti_t;


/*
 * special handling for tlb info
 */
#define	WITHIN_OFW(va) \
	(((va) > (uint64_t)OFW_START_ADDR) && ((va) < (uint64_t)OFW_END_ADDR))

#define	WITHIN_NUCLEUS(va, base) \
	(((va) >= (base)) && \
	(((va) + MMU_PAGESIZE) <= ((base) + MMU_PAGESIZE4M)))

#define	IS_BIGKTSB(va) \
	(enable_bigktsb && \
	((va) >= (uint64_t)ktsb_base) && \
	((va) < (uint64_t)(ktsb_base + ktsb_sz)))


/*
 * WARNING:
 * the text from this file is linked to follow cpr_resume_setup.o;
 * only add text between here and i_cpr_end_jumpback when it needs
 * to be called during resume before we switch back to the kernel
 * trap table.  all the text in this range must fit within a page.
 */


/*
 * each time a machine is reset, the prom uses an inconsistent set of phys
 * pages and the cif cookie may differ as well.  so prior to restoring the
 * original prom, we have to use the new/tmp prom's translations
 * when requesting prom services.
 *
 * cif_handler starts out as the original prom cookie, and that gets used
 * by client_handler() to jump into the prom.  here we splice-in a wrapper
 * routine by writing cif_handler; client_handler() will now jump to the
 * wrapper which switches the %tba to the new/tmp prom's trap table then
 * jumps to the new cookie.
 */
void
i_cpr_cif_setup(int action)
{
	extern void *i_cpr_orig_cif, *cif_handler;
	extern int i_cpr_cif_wrapper(void *);

	/*
	 * save the original cookie and change the current cookie to the
	 * wrapper routine.  later we just restore the original cookie.
	 */
	if (action == CIF_SPLICE) {
		i_cpr_orig_cif = cif_handler;
		cif_handler = (void *)i_cpr_cif_wrapper;
	} else if (action == CIF_UNLINK)
		cif_handler = i_cpr_orig_cif;
}


/*
 * launch slave cpus into kernel text, pause them,
 * and restore the original prom pages
 */
void
i_cpr_mp_setup(void)
{
	extern void restart_other_cpu(int);
	ihandle_t tmpout = 0;
	char *str;
	cpu_t *cp;

	/*
	 * reset cpu_ready_set so x_calls work properly
	 */
	CPUSET_ZERO(cpu_ready_set);
	CPUSET_ADD(cpu_ready_set, getprocessorid());

	/*
	 * setup cif to use the cookie from the new/tmp prom
	 * and setup tmp handling for calling prom services.
	 */
	i_cpr_cif_setup(CIF_SPLICE);

	/*
	 * at this point, only the nucleus and a few cpr pages are
	 * mapped in.  once we switch to the kernel trap table,
	 * we can access the rest of kernel space.
	 */
	prom_set_traptable(&trap_table);

	if (ncpus > 1) {
		sfmmu_init_tsbs();

		if (cpr_debug & LEVEL1) {
			prom_interpret("stdout @ swap l!", (uintptr_t)&tmpout,
			    0, 0, 0, 0);
			str = "MP startup...\r\n";
			(void) prom_write(tmpout, str, strlen(str), 0, 0);
		}

		mutex_enter(&cpu_lock);
		/*
		 * The slave cpus are not ready at this time, yet their
		 * cpu structures have various cpu_flags set;
		 * clear cpu_flags and mutex_ready.
		 * Since we are coming up from a CPU suspend, the slave cpus
		 * are frozen.
		 */
		for (cp = CPU->cpu_next; cp != CPU; cp = cp->cpu_next) {
			cp->cpu_flags = CPU_FROZEN;
			cp->cpu_m.mutex_ready = 0;
		}

		for (cp = CPU->cpu_next; cp != CPU; cp = cp->cpu_next)
			restart_other_cpu(cp->cpu_id);

		pause_cpus(NULL);
		mutex_exit(&cpu_lock);

		if (cpr_debug & LEVEL1) {
			str = "MP paused...\r\n";
			(void) prom_write(tmpout, str, strlen(str), 0, 0);
		}

		i_cpr_xcall(i_cpr_clear_entries);
	} else
		i_cpr_clear_entries(0, 0);

	/*
	 * now unlink the cif wrapper;  WARNING: do not call any
	 * prom_xxx() routines until after prom pages are restored.
	 */
	i_cpr_cif_setup(CIF_UNLINK);

	(void) i_cpr_prom_pages(CPR_PROM_RESTORE);
}


/*
 * end marker for jumpback page;
 * this symbol is used to check the size of i_cpr_resume_setup()
 * and the above text.  For simplicity, the Makefile needs to
 * link i_cpr_resume_setup.o and cpr_impl.o consecutively.
 */
void
i_cpr_end_jumpback(void)
{
}


/*
 * scan tlb entries with reader; when valid entries are found,
 * the filter routine will selectively save/clear them
 */
static void
i_cpr_scan_tlb(cti_t *ctip)
{
	uint64_t va_tag;
	int tlb_index;
	tte_t tte;

	for (tlb_index = ctip->index; tlb_index >= 0; tlb_index--) {
		(*ctip->reader)((uint_t)tlb_index, &tte, &va_tag);
		if (va_tag && TTE_IS_VALID(&tte))
			(*ctip->filter)(tlb_index, &tte, va_tag, ctip);
	}
}


/*
 * filter for locked tlb entries that reference the text/data nucleus
 * and any bigktsb's; these will be reinstalled by cprboot on all cpus
 */
/* ARGSUSED */
static void
i_cpr_lnb(int index, tte_t *ttep, uint64_t va_tag, void *ctrans)
{
	cti_t *ctip;

	/*
	 * record tlb data at ctip->dst; the target tlb index starts
	 * at the highest tlb offset and moves towards 0.  the prom
	 * reserves both dtlb and itlb index 0.
	 * any selected entry also gets marked to prevent being
	 * flushed during resume
	 */
	if (TTE_IS_LOCKED(ttep) && (va_tag == (uint64_t)textva ||
	    va_tag == (uint64_t)datava || IS_BIGKTSB(va_tag))) {
		ctip = ctrans;
		while ((1 << ctip->index) & ctip->skip)
			ctip->index--;
		ASSERT(ctip->index > 0);
		ASSERT(ctip->dst < ctip->tail);
		ctip->dst->tte.ll = ttep->ll;
		ctip->dst->va_tag = va_tag;
		ctip->dst->index = ctip->index--;
		ctip->dst->tmp = 0;
		ctip->dst++;
	}
}


/*
 * some tlb entries are stale, filter for unlocked entries
 * within the prom virt range and clear them
 */
static void
i_cpr_ufw(int index, tte_t *ttep, uint64_t va_tag, void *ctrans)
{
	sutlb_t clr;
	cti_t *ctip;

	if (!TTE_IS_LOCKED(ttep) && WITHIN_OFW(va_tag)) {
		ctip = ctrans;
		bzero(&clr, sizeof (clr));
		(*ctip->writer)((uint_t)index, &clr.tte, &clr.va_tag);
	}
}


/*
 * some of the entries installed by cprboot are needed only on a
 * short-term basis and need to be flushed to avoid clogging the tlbs.
 * scan the dtte/itte arrays for items marked as temporary and clear
 * dtlb/itlb entries using wrfunc.
 */
static void
i_cpr_clear_tmp(sutlb_t *listp, int max, tlb_rw_t wrfunc)
{
	sutlb_t clr, *tail;

	bzero(&clr, sizeof (clr));
	for (tail = listp + max; listp < tail && listp->va_tag; listp++) {
		if (listp->tmp)
			(*wrfunc)((uint_t)listp->index, &clr.tte, &clr.va_tag);
	}
}


/* ARGSUSED */
static void
i_cpr_clear_entries(uint64_t arg1, uint64_t arg2)
{
	extern void demap_all(void);
	cti_t cti;

	i_cpr_clear_tmp(m_info.dtte, CPR_MAX_TLB, dtlb_wr_entry);
	i_cpr_clear_tmp(m_info.itte, CPR_MAX_TLB, itlb_wr_entry);

	/*
	 * for newer cpus that implement DEMAP_ALL_TYPE, demap_all is
	 * a second label for vtag_flushall.  the call is made using
	 * vtag_flushall() instead of demap_all() due to runtime and
	 * krtld results with both older and newer cpu modules.
	 */
	if (&demap_all != 0) {
		vtag_flushall();
		return;
	}

	/*
	 * for older V9 cpus, scan tlbs and clear stale entries
	 */
	bzero(&cti, sizeof (cti));
	cti.filter = i_cpr_ufw;

	cti.index = cpunodes[CPU->cpu_id].dtlb_size - 1;
	cti.reader = dtlb_rd_entry;
	cti.writer = dtlb_wr_entry;
	i_cpr_scan_tlb(&cti);

	cti.index = cpunodes[CPU->cpu_id].itlb_size - 1;
	cti.reader = itlb_rd_entry;
	cti.writer = itlb_wr_entry;
	i_cpr_scan_tlb(&cti);
}


/*
 * craft tlb info for tmp use during resume; this data gets used by
 * cprboot to install tlb entries.  we also mark each struct as tmp
 * so those tlb entries will get flushed after switching to the kernel
 * trap table.  no data needs to be recorded for vaddr when it falls
 * within the nucleus since we've already recorded nucleus ttes and
 * an 8K tte would conflict with a 4MB tte.  eg: the cpr module
 * text/data may have been loaded into the text/data nucleus.
 */
static void
i_cpr_make_tte(cti_t *ctip, void *vaddr, caddr_t nbase)
{
	pfn_t ppn;
	uint_t rw;

	if (WITHIN_NUCLEUS((caddr_t)vaddr, nbase))
		return;

	while ((1 << ctip->index) & ctip->skip)
		ctip->index--;
	ASSERT(ctip->index > 0);
	ASSERT(ctip->dst < ctip->tail);

	/*
	 * without any global service available to look up
	 * a tte by vaddr, we craft our own here:
	 */
	ppn = va_to_pfn(vaddr);
	rw = (nbase == datava) ? TTE_HWWR_INT : 0;
	ctip->dst->tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(ppn);
	ctip->dst->tte.tte_intlo = TTE_PFN_INTLO(ppn) | TTE_LCK_INT |
	    TTE_CP_INT | TTE_PRIV_INT | rw;
	ctip->dst->va_tag = ((uintptr_t)vaddr & MMU_PAGEMASK);
	ctip->dst->index = ctip->index--;
	ctip->dst->tmp = 1;
	ctip->dst++;
}


/*
 * send a cross-call to the cpus in cpu_ready_set; if the current PIL
 * is at or above XCALL_PIL, drop it temporarily so the cross-call can
 * be issued, then restore it afterwards.
 */
static void
i_cpr_xcall(xcfunc_t func)
{
	uint_t pil, reset_pil;

	pil = getpil();
	if (pil < XCALL_PIL)
		reset_pil = 0;
	else {
		reset_pil = 1;
		setpil(XCALL_PIL - 1);
	}
	xc_some(cpu_ready_set, func, 0, 0);
	if (reset_pil)
		setpil(pil);
}


/*
 * restart paused slave cpus
 */
void
i_cpr_machdep_setup(void)
{
	if (ncpus > 1) {
		DEBUG1(errp("MP restarted...\n"));
		mutex_enter(&cpu_lock);
		start_cpus();
		mutex_exit(&cpu_lock);
	}
}


/*
 * Stop all interrupt activities in the system
 */
void
i_cpr_stop_intr(void)
{
	(void) spl7();
}

/*
 * Set machine up to take interrupts
 */
void
i_cpr_enable_intr(void)
{
	(void) spl0();
}


/*
 * record cpu nodes and ids
 */
static void
i_cpr_save_cpu_info(void)
{
	struct sun4u_cpu_info *scip;
	cpu_t *cp;

	scip = m_info.sci;
	cp = CPU;
	do {
		ASSERT(scip < &m_info.sci[NCPU]);
		scip->cpu_id = cp->cpu_id;
		scip->node = cpunodes[cp->cpu_id].nodeid;
		scip++;
	} while ((cp = cp->cpu_next) != CPU);
}


/*
 * Write necessary machine dependent information to cpr state file,
 * eg. sun4u mmu ctx secondary for the currently running process (cpr) ...
 */
int
i_cpr_write_machdep(vnode_t *vp)
{
	extern uint_t getpstate(), getwstate();
	extern uint_t i_cpr_tstack_size;
	const char ustr[] = ": unix-tte 2drop false ;";
	uintptr_t tinfo;
	label_t *ltp;
	cmd_t cmach;
	char *fmt;
	int rc;

	/*
	 * ustr[] is used as temporary forth words during
	 * slave startup sequence, see sfmmu_mp_startup()
	 */

	cmach.md_magic = (uint_t)CPR_MACHDEP_MAGIC;
	cmach.md_size = sizeof (m_info) + sizeof (ustr);

	if (rc = cpr_write(vp, (caddr_t)&cmach, sizeof (cmach))) {
		cpr_err(CE_WARN, "Failed to write descriptor.");
		return (rc);
	}

	/*
	 * m_info is now cleared in i_cpr_dump_setup()
	 */
	m_info.ksb = (uint32_t)STACK_BIAS;
	m_info.kpstate = (uint16_t)getpstate();
	m_info.kwstate = (uint16_t)getwstate();
	DEBUG1(errp("stack bias 0x%x, pstate 0x%x, wstate 0x%x\n",
	    m_info.ksb, m_info.kpstate, m_info.kwstate));

	ltp = &ttolwp(curthread)->lwp_qsav;
	m_info.qsav_pc = (cpr_ext)ltp->val[0];
	m_info.qsav_sp = (cpr_ext)ltp->val[1];

	/*
	 * Set secondary context to INVALID_CONTEXT to force the HAT
	 * to re-setup the MMU registers and locked TTEs it needs for
	 * TLB miss handling.
	 */
	m_info.mmu_ctx_sec = INVALID_CONTEXT;
	m_info.mmu_ctx_pri = sfmmu_getctx_pri();

	tinfo = (uintptr_t)curthread;
	m_info.thrp = (cpr_ptr)tinfo;

	tinfo = (uintptr_t)i_cpr_resume_setup;
	m_info.func = (cpr_ptr)tinfo;

	/*
	 * i_cpr_data_page is comprised of a 4K stack area and a few
	 * trailing data symbols; the page is shared by the prom and
	 * kernel during resume.  the stack size is recorded here
	 * and used by cprboot to set %sp
	 */
	tinfo = (uintptr_t)&i_cpr_data_page;
	m_info.tmp_stack = (cpr_ptr)tinfo;
	m_info.tmp_stacksize = i_cpr_tstack_size;

	m_info.test_mode = cpr_test_mode;

	i_cpr_save_cpu_info();

	if (rc = cpr_write(vp, (caddr_t)&m_info, sizeof (m_info))) {
		cpr_err(CE_WARN, "Failed to write machdep info.");
		return (rc);
	}

	fmt = "error writing %s forth info";
	if (rc = cpr_write(vp, (caddr_t)ustr, sizeof (ustr)))
		cpr_err(CE_WARN, fmt, "unix-tte");

	return (rc);
}


/*
 * Save miscellaneous information which needs to be written to the
 * state file.  This information is required to re-initialize
 * kernel/prom handshaking.
 */
void
i_cpr_save_machdep_info(void)
{
	DEBUG5(errp("jumpback size = 0x%lx\n",
	    (uintptr_t)&i_cpr_end_jumpback -
	    (uintptr_t)i_cpr_resume_setup));

	/*
	 * Verify the jumpback code all falls in one page.
	 */
	if (((uintptr_t)&i_cpr_end_jumpback & MMU_PAGEMASK) !=
	    ((uintptr_t)i_cpr_resume_setup & MMU_PAGEMASK))
		cpr_err(CE_PANIC, "jumpback code exceeds one page.");
}


void
i_cpr_set_tbr(void)
{
}


/*
 * cpu0 should contain bootcpu info
 */
cpu_t *
i_cpr_bootcpu(void)
{
	return (&cpu0);
}


/*
 * Return the virtual address of the mapping area
 */
caddr_t
i_cpr_map_setup(void)
{
	/*
	 * Allocate a virtual memory range spanned by an hmeblk.
	 * This would be 8 hments or 64k bytes.  Starting VA
	 * must be 64k (8-page) aligned.
	 */
	cpr_vaddr = vmem_xalloc(heap_arena,
	    mmu_ptob(NHMENTS), mmu_ptob(NHMENTS),
	    0, 0, NULL, NULL, VM_NOSLEEP);
	return (cpr_vaddr);
}

/*
 * create tmp locked tlb entries for a group of phys pages;
 *
 * i_cpr_mapin/i_cpr_mapout should always be called in pairs,
 * otherwise they would fill up the tlb with locked entries
 */
void
i_cpr_mapin(caddr_t vaddr, uint_t pages, pfn_t ppn)
{
	tte_t tte;
	extern pfn_t curthreadpfn;
	extern int curthreadremapped;

	curthreadremapped = (ppn <= curthreadpfn && curthreadpfn < ppn + pages);

	for (; pages--; ppn++, vaddr += MMU_PAGESIZE) {
		tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(ppn);
		tte.tte_intlo = TTE_PFN_INTLO(ppn) | TTE_LCK_INT |
		    TTE_CP_INT | TTE_PRIV_INT | TTE_HWWR_INT;
		sfmmu_dtlb_ld(vaddr, KCONTEXT, &tte);
	}
}

void
i_cpr_mapout(caddr_t vaddr, uint_t pages)
{
	extern int curthreadremapped;

	if (curthreadremapped && vaddr <= (caddr_t)curthread &&
	    (caddr_t)curthread < vaddr + pages * MMU_PAGESIZE)
		curthreadremapped = 0;

	for (; pages--; vaddr += MMU_PAGESIZE)
		vtag_flushpage(vaddr, KCONTEXT);
}

/*
 * We're done using the mapping area; release virtual space
 */
void
i_cpr_map_destroy(void)
{
	vmem_free(heap_arena, cpr_vaddr, mmu_ptob(NHMENTS));
	cpr_vaddr = NULL;
}

/* ARGSUSED */
void
i_cpr_handle_xc(int flag)
{
}


/*
 * This function takes care of pages which are not in kas or need to be
 * taken care of in a special way.  For example, panicbuf pages are not
 * in kas and their pages are allocated via prom_retain().
 */
pgcnt_t
i_cpr_count_special_kpages(int mapflag, bitfunc_t bitfunc)
{
	struct cpr_map_info *pri, *tail;
	pgcnt_t pages, total = 0;
	pfn_t pfn;

	/*
	 * Save information about prom retained panicbuf pages
	 */
	if (bitfunc == cpr_setbit) {
		pri = &cpr_prom_retain[CPR_PANICBUF];
		pri->virt = (cpr_ptr)panicbuf;
		pri->phys = va_to_pa(panicbuf);
		pri->size = sizeof (panicbuf);
	}

	/*
	 * Go through the prom_retain array to tag those pages.
	 */
	tail = &cpr_prom_retain[CPR_PROM_RETAIN_CNT];
	for (pri = cpr_prom_retain; pri < tail; pri++) {
		pages = mmu_btopr(pri->size);
		for (pfn = ADDR_TO_PN(pri->phys); pages--; pfn++) {
			if (pf_is_memory(pfn)) {
				if (bitfunc == cpr_setbit) {
					if ((*bitfunc)(pfn, mapflag) == 0)
						total++;
				} else
					total++;
			}
		}
	}

	return (total);
}


/*
 * Free up memory-related resources here.  We start by freeing buffers
 * allocated during suspend initialization.  Also, free up the mapping
 * resources allocated in cpr_init().
 */
void
i_cpr_free_memory_resources(void)
{
	(void) i_cpr_prom_pages(CPR_PROM_FREE);
	i_cpr_map_destroy();
	i_cpr_storage_free();
}


/*
 * Derived from cpr_write_statefile().
 * Save the sensitive pages to the storage area and do bookkeeping
 * using the sensitive descriptors.  Each descriptor will contain no more
 * than CPR_MAXCONTIG amount of contiguous pages to match the max amount
 * of pages that statefile gets written to disk at each write.
 * XXX The CPR_MAXCONTIG can be changed to the size of the compression
 * scratch area.
 */
static int
i_cpr_save_to_storage(void)
{
	sensitive_size_saved = 0;
	sensitive_pages_saved = 0;
	sensitive_write_ptr = i_cpr_storage_data_base;
	return (cpr_contig_pages(NULL, SAVE_TO_STORAGE));
}


/*
 * This routine allocates space to save the sensitive kernel pages,
 * i.e. kernel data nucleus, kvalloc and kvseg segments.
 * It's assumed that those segments are the only areas that can be
 * contaminated by memory allocations during statefile dumping.
 * The space allocated here contains:
 *	A list of descriptors describing the saved sensitive pages.
 *	The storage area for saving the compressed sensitive kernel pages.
 * Since storage pages are allocated from segkmem, they need to be
 * excluded when saving.
 */
int
i_cpr_save_sensitive_kpages(void)
{
	static const char pages_fmt[] = "\n%s %s allocs\n"
	    "	 spages %ld, vpages %ld, diff %ld\n";
	int retry_cnt;
	int error = 0;
	pgcnt_t pages, spages, vpages;
	caddr_t	addr;
	char *str;

	/*
	 * Tag sensitive kpages.  Allocate space for storage descriptors
	 * and storage data area based on the resulting bitmaps.
	 * Note: The storage space will be part of the sensitive
	 * segment, so we need to tag kpages here before the storage
	 * is actually allocated just so their space won't be accounted
	 * for.  They will not be part of the statefile although those
	 * pages will be claimed by cprboot.
	 */
	cpr_clear_bitmaps();

	spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_setbit);
	vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);
	pages = spages - vpages;

	str = "i_cpr_save_sensitive_kpages:";
	DEBUG7(errp(pages_fmt, "before", str, spages, vpages, pages));

	/*
	 * Allocate space to save the clean sensitive kpages
	 */
	for (retry_cnt = 0; retry_cnt < MAX_STORAGE_ALLOC_RETRY; retry_cnt++) {
		/*
		 * Alloc on first pass or realloc if we are retrying because
		 * of insufficient storage for sensitive pages
		 */
		if (retry_cnt == 0 || error == ENOMEM) {
			if (i_cpr_storage_data_base) {
				kmem_free(i_cpr_storage_data_base,
				    mmu_ptob(i_cpr_storage_data_sz));
				i_cpr_storage_data_base = NULL;
				i_cpr_storage_data_sz = 0;
			}
			addr = i_cpr_storage_data_alloc(pages,
			    &i_cpr_storage_data_sz, retry_cnt);
			if (addr == NULL) {
				DEBUG7(errp(
				    "\n%s can't allocate data storage space!\n",
				    str));
				return (ENOMEM);
			}
			i_cpr_storage_data_base = addr;
			i_cpr_storage_data_end =
			    addr + mmu_ptob(i_cpr_storage_data_sz);
		}

		/*
		 * Allocate on first pass, only realloc if retry is because of
		 * insufficient descriptors, but reset contents on each pass
		 * (desc_alloc resets contents as well)
		 */
		if (retry_cnt == 0 || error == -1) {
			error = i_cpr_storage_desc_alloc(
			    &i_cpr_storage_desc_base,
			    &i_cpr_storage_desc_pgcnt,
			    &i_cpr_storage_desc_end, retry_cnt);
			if (error != 0)
				return (error);
		} else {
			i_cpr_storage_desc_init(i_cpr_storage_desc_base,
			    i_cpr_storage_desc_pgcnt, i_cpr_storage_desc_end);
		}

		/*
		 * We are ready to save the sensitive kpages to storage.
		 * We cannot trust what's tagged in the bitmaps anymore
		 * after storage allocations.  Clear up the bitmaps and
		 * retag the sensitive kpages again.  The storage pages
		 * should be untagged.
		 */
		cpr_clear_bitmaps();

		spages =
		    i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_setbit);
		vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);

		DEBUG7(errp(pages_fmt, "after ", str,
		    spages, vpages, spages - vpages));

		/*
		 * Returns 0 on success, -1 if too few descriptors, and
		 * ENOMEM if not enough space to save sensitive pages
		 */
		DEBUG1(errp("compressing pages to storage...\n"));
		error = i_cpr_save_to_storage();
		if (error == 0) {
			/* Saving to storage succeeded */
			DEBUG1(errp("compressed %d pages\n",
			    sensitive_pages_saved));
			break;
		} else if (error == -1)
			DEBUG1(errp("%s too few descriptors\n", str));
	}
	if (error == -1)
		error = ENOMEM;
	return (error);
}


/*
 * Estimate how much memory we will need to save
 * the sensitive pages with compression.
 */
static caddr_t
i_cpr_storage_data_alloc(pgcnt_t pages, pgcnt_t *alloc_pages, int retry_cnt)
{
	pgcnt_t alloc_pcnt, last_pcnt;
	caddr_t addr;
	char *str;

	str = "i_cpr_storage_data_alloc:";
	if (retry_cnt == 0) {
		/*
		 * common compression ratio is about 3:1
		 * initial storage allocation is estimated at 40%
		 * to cover the majority of cases
		 */
		alloc_pcnt = INITIAL_ALLOC_PCNT;
		*alloc_pages = (pages * alloc_pcnt) / INTEGRAL;
		DEBUG7(errp("%s sensitive pages: %ld\n", str, pages));
		DEBUG7(errp("%s initial est pages: %ld, alloc %ld%%\n",
		    str, *alloc_pages, alloc_pcnt));
	} else {
		/*
		 * calculate the prior compression percentage (x100)
		 * from the last attempt to save sensitive pages
		 */
		ASSERT(sensitive_pages_saved != 0);
		last_pcnt = (mmu_btopr(sensitive_size_saved) * INTEGRAL) /
		    sensitive_pages_saved;
		DEBUG7(errp("%s last ratio %ld%%\n", str, last_pcnt));

		/*
		 * new estimated storage size is based on
		 * the larger ratio + 5% for each retry:
		 * pages * (last + [5%, 10%])
		 */
		alloc_pcnt = MAX(last_pcnt, INITIAL_ALLOC_PCNT) +
		    (retry_cnt * 5);
		*alloc_pages = (pages * alloc_pcnt) / INTEGRAL;
		DEBUG7(errp("%s Retry est pages: %ld, alloc %ld%%\n",
		    str, *alloc_pages, alloc_pcnt));
	}

	addr = kmem_alloc(mmu_ptob(*alloc_pages), KM_NOSLEEP);
	DEBUG7(errp("%s alloc %ld pages\n", str, *alloc_pages));
	return (addr);
}


void
i_cpr_storage_free(void)
{
	/* Free descriptors */
	if (i_cpr_storage_desc_base) {
		kmem_free(i_cpr_storage_desc_base,
		    mmu_ptob(i_cpr_storage_desc_pgcnt));
		i_cpr_storage_desc_base = NULL;
		i_cpr_storage_desc_pgcnt = 0;
	}


	/* Data storage */
	if (i_cpr_storage_data_base) {
		kmem_free(i_cpr_storage_data_base,
		    mmu_ptob(i_cpr_storage_data_sz));
		i_cpr_storage_data_base = NULL;
		i_cpr_storage_data_sz = 0;
	}
}


/*
 * This routine is derived from cpr_compress_and_write().
 *	1. Do bookkeeping in the descriptor for the contiguous sensitive chunk.
 *	2. Compress and save the clean sensitive pages into the storage area.
 */
int
i_cpr_compress_and_save(int chunks, pfn_t spfn, pgcnt_t pages)
{
	extern char *cpr_compress_pages(cpd_t *, pgcnt_t, int);
	extern caddr_t i_cpr_storage_data_end;
	uint_t remaining, datalen;
	uint32_t test_usum;
	char *datap;
	csd_t *descp;
	cpd_t cpd;
	int error;

	/*
	 * Fill next empty storage descriptor
	 */
	descp = i_cpr_storage_desc_base + chunks - 1;
	if (descp >= i_cpr_storage_desc_end) {
		DEBUG1(errp("ran out of descriptors, base 0x%p, chunks %d, "
		    "end 0x%p, descp 0x%p\n", i_cpr_storage_desc_base, chunks,
		    i_cpr_storage_desc_end, descp));
		return (-1);
	}
	ASSERT(descp->csd_dirty_spfn == (uint_t)-1);
	i_cpr_storage_desc_last_used = descp;

	descp->csd_dirty_spfn = spfn;
	descp->csd_dirty_npages = pages;

	i_cpr_mapin(CPR->c_mapping_area, pages, spfn);

	/*
	 * try compressing pages and copy cpd fields
	 * pfn is copied for debug use
	 */
	cpd.cpd_pfn = spfn;
	datap = cpr_compress_pages(&cpd, pages, C_COMPRESSING);
	datalen = cpd.cpd_length;
	descp->csd_clean_compressed = (cpd.cpd_flag & CPD_COMPRESS);
#ifdef DEBUG
	descp->csd_usum = cpd.cpd_usum;
	descp->csd_csum = cpd.cpd_csum;
#endif

	error = 0;

	/*
	 * Save the raw or compressed data to the storage area pointed to by
	 * sensitive_write_ptr.  Make sure the storage space is big enough to
	 * hold the result.  Otherwise roll back to increase the storage space.
	 */
	descp->csd_clean_sva = (cpr_ptr)sensitive_write_ptr;
	descp->csd_clean_sz = datalen;
	if ((sensitive_write_ptr + datalen) < i_cpr_storage_data_end) {
		extern void cprbcopy(void *, void *, size_t);

		cprbcopy(datap, sensitive_write_ptr, datalen);
		sensitive_size_saved += datalen;
		sensitive_pages_saved += descp->csd_dirty_npages;
		sensitive_write_ptr += datalen;
	} else {
		remaining = (i_cpr_storage_data_end - sensitive_write_ptr);
		DEBUG1(errp("i_cpr_compress_and_save: The storage "
		    "space is too small!\ngot %d, want %d\n\n",
		    remaining, (remaining + datalen)));
#ifdef DEBUG
		/*
		 * Check to see if the content of the sensitive pages that we
		 * just copied has changed during this small time window.
		 */
		test_usum = checksum32(CPR->c_mapping_area, mmu_ptob(pages));
		descp->csd_usum = cpd.cpd_usum;
		if (test_usum != descp->csd_usum) {
			DEBUG1(errp("\nWARNING: i_cpr_compress_and_save: "
			    "Data in the range of pfn 0x%x to pfn "
			    "0x%x has changed after they are saved "
			    "into storage.", spfn, (spfn + pages - 1)));
		}
#endif
		error = ENOMEM;
	}

	i_cpr_mapout(CPR->c_mapping_area, pages);
	return (error);
}


/*
 * This routine is derived from cpr_count_kpages().
 * It goes through kernel data nucleus and segkmem segments to select
 * pages in use and mark them in the corresponding bitmap.
 */
pgcnt_t
i_cpr_count_sensitive_kpages(int mapflag, bitfunc_t bitfunc)
{
	pgcnt_t kdata_cnt = 0, segkmem_cnt = 0;
	extern caddr_t e_moddata;
	extern struct seg kvalloc;
	extern struct seg kmem64;
	size_t size;

	/*
	 * Kernel data nucleus pages
	 */
	size = e_moddata - s_data;
	kdata_cnt += cpr_count_pages(s_data, size,
	    mapflag, bitfunc, DBG_SHOWRANGE);

	/*
	 * kvseg and kvalloc pages
	 */
	segkmem_cnt += cpr_scan_kvseg(mapflag, bitfunc, &kvseg);
	segkmem_cnt += cpr_count_pages(kvalloc.s_base, kvalloc.s_size,
	    mapflag, bitfunc, DBG_SHOWRANGE);

	/* segment to support kernel memory usage above 32-bit space (4GB) */
	if (kmem64.s_base)
		segkmem_cnt += cpr_count_pages(kmem64.s_base, kmem64.s_size,
		    mapflag, bitfunc, DBG_SHOWRANGE);

	DEBUG7(errp("\ni_cpr_count_sensitive_kpages:\n"
	    "\tkdata_cnt %ld + segkmem_cnt %ld = %ld pages\n",
	    kdata_cnt, segkmem_cnt, kdata_cnt + segkmem_cnt));

	return (kdata_cnt + segkmem_cnt);
}


pgcnt_t
i_cpr_count_storage_pages(int mapflag, bitfunc_t bitfunc)
{
	pgcnt_t count = 0;

	if (i_cpr_storage_desc_base) {
		count += cpr_count_pages((caddr_t)i_cpr_storage_desc_base,
		    (size_t)mmu_ptob(i_cpr_storage_desc_pgcnt),
		    mapflag, bitfunc, DBG_SHOWRANGE);
	}
	if (i_cpr_storage_data_base) {
		count += cpr_count_pages(i_cpr_storage_data_base,
		    (size_t)mmu_ptob(i_cpr_storage_data_sz),
		    mapflag, bitfunc, DBG_SHOWRANGE);
	}
	return (count);
}


/*
 * Derived from cpr_write_statefile().
 * Allocate (or reallocate after exhausting the supply) descriptors for each
 * chunk of contiguous sensitive kpages.
 */
static int
i_cpr_storage_desc_alloc(csd_t **basepp, pgcnt_t *pgsp, csd_t **endpp,
    int retry)
{
	pgcnt_t npages;
	int chunks;
	csd_t	*descp, *end;
	size_t	len;
	char *str = "i_cpr_storage_desc_alloc:";

	/*
	 * On initial allocation, add some extra to cover overhead caused
	 * by the allocation for the storage area later.
	 */
	if (retry == 0) {
		chunks = cpr_contig_pages(NULL, STORAGE_DESC_ALLOC) +
		    EXTRA_DESCS;
		npages = mmu_btopr(sizeof (**basepp) * (pgcnt_t)chunks);
		DEBUG7(errp("%s chunks %d, ", str, chunks));
	} else {
		DEBUG7(errp("%s retry %d: ", str, retry));
		npages = *pgsp + 1;
	}
	/* Free old descriptors, if any */
	if (*basepp)
		kmem_free((caddr_t)*basepp, mmu_ptob(*pgsp));

	descp = *basepp = kmem_alloc(mmu_ptob(npages), KM_NOSLEEP);
	if (descp == NULL) {
		DEBUG7(errp("%s no space for descriptors!\n", str));
		return (ENOMEM);
	}

	*pgsp = npages;
	len = mmu_ptob(npages);
	end = *endpp = descp + (len / (sizeof (**basepp)));
	DEBUG7(errp("npages 0x%x, len 0x%x, items 0x%x\n\t*basepp "
	    "%p, *endpp %p\n", npages, len, (len / (sizeof (**basepp))),
	    *basepp, *endpp));
	i_cpr_storage_desc_init(descp, npages, end);
	return (0);
}

static void
i_cpr_storage_desc_init(csd_t *descp, pgcnt_t npages, csd_t *end)
{
	size_t	len = mmu_ptob(npages);

	/* Initialize the descriptors to something impossible. */
	bzero(descp, len);
#ifdef	DEBUG
	/*
	 * This condition is tested by an ASSERT
	 */
	for (; descp < end; descp++)
		descp->csd_dirty_spfn = (uint_t)-1;
#endif
}

int
i_cpr_dump_sensitive_kpages(vnode_t *vp)
{
	int	error = 0;
	uint_t	spin_cnt = 0;
	csd_t	*descp;

	/*
	 * These following two variables need to be reinitialized
	 * for each cpr cycle.
	 */
	i_cpr_sensitive_bytes_dumped = 0;
	i_cpr_sensitive_pgs_dumped = 0;

	if (i_cpr_storage_desc_base) {
		for (descp = i_cpr_storage_desc_base;
		    descp <= i_cpr_storage_desc_last_used; descp++) {
			if (error = cpr_dump_sensitive(vp, descp))
				return (error);
			spin_cnt++;
			if ((spin_cnt & 0x5F) == 1)
				cpr_spinning_bar();
		}
		prom_printf(" \b");
	}

	DEBUG7(errp("\ni_cpr_dump_sensitive_kpages: dumped %d\n",
	    i_cpr_sensitive_pgs_dumped));
	return (0);
}


/*
 * 1. Fill the cpr page descriptor with the info of the dirty pages
 *    and write the descriptor out.  It will be used at resume.
 * 2. Write out the clean data instead of the dirty data.
 * Note: to save space, the clean data is already compressed.
 */
static int
cpr_dump_sensitive(vnode_t *vp, csd_t *descp)
{
	int error = 0;
	caddr_t datap;
	cpd_t cpd;	/* cpr page descriptor */
	pfn_t	dirty_spfn;
	pgcnt_t dirty_npages;
	size_t clean_sz;
	caddr_t	clean_sva;
	int	clean_compressed;
	extern uchar_t cpr_pagecopy[];

	dirty_spfn = descp->csd_dirty_spfn;
	dirty_npages = descp->csd_dirty_npages;
	clean_sva = (caddr_t)descp->csd_clean_sva;
	clean_sz = descp->csd_clean_sz;
	clean_compressed = descp->csd_clean_compressed;

	/* Fill cpr page descriptor. */
	cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC;
	cpd.cpd_pfn = dirty_spfn;
	cpd.cpd_flag = 0;	/* must init to zero */
	cpd.cpd_pages = dirty_npages;

#ifdef	DEBUG
	if ((cpd.cpd_usum = descp->csd_usum) != 0)
		cpd.cpd_flag |= CPD_USUM;
	if ((cpd.cpd_csum = descp->csd_csum) != 0)
		cpd.cpd_flag |= CPD_CSUM;
#endif

	STAT->cs_dumped_statefsz += mmu_ptob(dirty_npages);

	/*
	 * The sensitive kpages are usually saved with compression
	 * unless compression could not reduce the size of the data.
	 * If the user chooses not to have the statefile compressed,
	 * we need to decompress the data back before dumping it to disk.
	 */
	if (CPR->c_flags & C_COMPRESSING) {
		cpd.cpd_length = clean_sz;
		datap = clean_sva;
		if (clean_compressed)
			cpd.cpd_flag |= CPD_COMPRESS;
	} else {
		if (clean_compressed) {
			cpd.cpd_length = decompress(clean_sva, cpr_pagecopy,
			    clean_sz, mmu_ptob(dirty_npages));
			datap = (caddr_t)cpr_pagecopy;
			ASSERT(cpd.cpd_length == mmu_ptob(dirty_npages));
		} else {
			cpd.cpd_length = clean_sz;
			datap = clean_sva;
		}
		cpd.cpd_csum = 0;
	}

	/* Write cpr page descriptor */
	error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd));
	if (error) {
		DEBUG7(errp("descp: %x\n", descp));
#ifdef DEBUG
		debug_enter("cpr_dump_sensitive: cpr_write() page "
		    "descriptor failed!\n");
#endif
		return (error);
	}

	i_cpr_sensitive_bytes_dumped += sizeof (cpd_t);

	/* Write page data */
	error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length);
	if (error) {
		DEBUG7(errp("error: %x\n", error));
		DEBUG7(errp("descp: %x\n", descp));
		DEBUG7(errp("cpr_write(%x, %x , %x)\n", vp, datap,
		    cpd.cpd_length));
#ifdef DEBUG
		debug_enter("cpr_dump_sensitive: cpr_write() data failed!\n");
#endif
		return (error);
	}

	i_cpr_sensitive_bytes_dumped += cpd.cpd_length;
	i_cpr_sensitive_pgs_dumped += dirty_npages;

	return (error);
}


/*
 * Sanity check to make sure that we have dumped the right amount
 * of pages from different sources to the statefile.
 */
int
i_cpr_check_pgs_dumped(uint_t pgs_expected, uint_t regular_pgs_dumped)
{
	uint_t total_pgs_dumped;

	total_pgs_dumped = regular_pgs_dumped + i_cpr_sensitive_pgs_dumped;

	DEBUG7(errp("\ncheck_pgs: reg %d + sens %d = %d, expect %d\n\n",
	    regular_pgs_dumped, i_cpr_sensitive_pgs_dumped,
	    total_pgs_dumped, pgs_expected));

	if (pgs_expected == total_pgs_dumped)
		return (0);

	return (EINVAL);
}


int
i_cpr_reusefini(void)
{
	struct vnode *vp;
	cdef_t *cdef;
	size_t size;
	char *bufp;
	int rc;

	if (cpr_reusable_mode)
		cpr_reusable_mode = 0;

	if (rc = cpr_open_deffile(FREAD|FWRITE, &vp)) {
		if (rc == EROFS) {
			cpr_err(CE_CONT, "uadmin A_FREEZE AD_REUSEFINI "
			    "(uadmin %d %d)\nmust be done with / mounted "
			    "writeable.\n", A_FREEZE, AD_REUSEFINI);
		}
		return (rc);
	}

	cdef = kmem_alloc(sizeof (*cdef), KM_SLEEP);
	rc = cpr_rdwr(UIO_READ, vp, cdef, sizeof (*cdef));

	if (rc) {
		cpr_err(CE_WARN, "Failed reading %s, errno = %d",
		    cpr_default_path, rc);
	} else if (cdef->mini.magic != CPR_DEFAULT_MAGIC) {
		cpr_err(CE_WARN, "bad magic number in %s, cannot restore "
		    "prom values for %s", cpr_default_path,
		    cpr_enumerate_promprops(&bufp, &size));
		kmem_free(bufp, size);
		rc = EINVAL;
	} else {
		/*
		 * clean up prom properties
		 */
		rc = cpr_update_nvram(cdef->props);
		if (rc == 0) {
			/*
			 * invalidate the disk copy and turn off reusable
			 */
			cdef->mini.magic = 0;
			cdef->mini.reusable = 0;
			if (rc = cpr_rdwr(UIO_WRITE, vp,
			    &cdef->mini, sizeof (cdef->mini))) {
				cpr_err(CE_WARN, "Failed writing %s, errno %d",
				    cpr_default_path, rc);
			}
		}
	}

	(void) VOP_CLOSE(vp, FREAD|FWRITE, 1, (offset_t)0, CRED());
	VN_RELE(vp);
	kmem_free(cdef, sizeof (*cdef));

	return (rc);
}

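
/*
 * enter reusable statefile mode: set up and validate the default
 * file, and flag reusable mode when validation succeeds.
 */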
int
i_cpr_reuseinit(void)
{
	int rc = 0;

	if (rc = cpr_default_setup(1))
		return (rc);

	/*
	 * We need to validate the default file
	 */
	rc = cpr_validate_definfo(1);
	if (rc == 0)
		cpr_reusable_mode = 1;
	else if (rc == EROFS) {
		cpr_err(CE_NOTE, "reuseinit must be performed "
		    "while / is mounted writeable");
	}

	(void) cpr_default_setup(0);

	return (rc);
}


int
i_cpr_check_cprinfo(void)
{
	struct vnode *vp;
	cmini_t mini;
	int rc = 0;

	if (rc = cpr_open_deffile(FREAD, &vp)) {
		if (rc == ENOENT)
			cpr_err(CE_NOTE, "cprinfo file does not "
			    "exist.  You must run 'uadmin %d %d' "
			    "command while / is mounted writeable,\n"
			    "then reboot and run 'uadmin %d %d' "
			    "to create a reusable statefile",
			    A_FREEZE, AD_REUSEINIT, A_FREEZE, AD_REUSABLE);
		return (rc);
	}

	rc = cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini));
	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
	VN_RELE(vp);

	if (rc) {
		cpr_err(CE_WARN, "Failed reading %s, errno = %d",
		    cpr_default_path, rc);
	} else if (mini.magic != CPR_DEFAULT_MAGIC) {
		cpr_err(CE_CONT, "bad magic number in cprinfo file.\n"
		    "You must run 'uadmin %d %d' while / is mounted "
		    "writeable, then reboot and run 'uadmin %d %d' "
		    "to create a reusable statefile\n",
		    A_FREEZE, AD_REUSEINIT, A_FREEZE, AD_REUSABLE);
		rc = EINVAL;
	}

	return (rc);
}


int
i_cpr_reusable_supported(void)
{
	return (1);
}


/*
 * find prom phys pages and alloc space for a tmp copy
 */
static int
i_cpr_find_ppages(void)
{
	extern struct vnode prom_ppages;
	struct page *pp;
	struct memlist *pmem;
	pgcnt_t npages, pcnt, scnt, vcnt;
	pfn_t ppn, plast, *dst;
	int mapflag;

	cpr_clear_bitmaps();
	mapflag = REGULAR_BITMAP;

	/*
	 * there should be a page_t for each phys page used by the kernel;
	 * set a bit for each phys page not tracked by a page_t
	 */
	pcnt = 0;
	memlist_read_lock();
	for (pmem = phys_install; pmem; pmem = pmem->next) {
		npages = mmu_btop(pmem->size);
		ppn = mmu_btop(pmem->address);
		for (plast = ppn + npages; ppn < plast; ppn++) {
			if (page_numtopp_nolock(ppn))
				continue;
			(void) cpr_setbit(ppn, mapflag);
			pcnt++;
		}
	}
	memlist_read_unlock();

	/*
	 * clear bits for phys pages in each segment
	 */
	scnt = cpr_count_seg_pages(mapflag, cpr_clrbit);

	/*
	 * set bits for phys pages referenced by the prom_ppages vnode;
	 * these pages are mostly comprised of forthdebug words
	 */
	vcnt = 0;
	for (pp = prom_ppages.v_pages; pp; ) {
		if (cpr_setbit(pp->p_offset, mapflag) == 0)
			vcnt++;
		pp = pp->p_vpnext;
		if (pp == prom_ppages.v_pages)
			break;
	}

	/*
	 * the total number of prom pages is:
	 * (non-page_t pages - seg pages + vnode pages)
	 */
	ppage_count = pcnt - scnt + vcnt;
	DEBUG1(errp("find_ppages: pcnt %ld - scnt %ld + vcnt %ld = %ld\n",
	    pcnt, scnt, vcnt, ppage_count));

	/*
	 * alloc array of pfn_t to store phys page list
	 */
	pphys_list_size = ppage_count * sizeof (pfn_t);
	pphys_list = kmem_alloc(pphys_list_size, KM_NOSLEEP);
	if (pphys_list == NULL) {
		cpr_err(CE_WARN, "cannot alloc pphys_list");
		return (ENOMEM);
	}

	/*
	 * phys pages referenced in the bitmap should be
	 * those used by the prom; scan bitmap and save
	 * a list of prom phys page numbers
	 */
	dst = pphys_list;
	memlist_read_lock();
	for (pmem = phys_install; pmem; pmem = pmem->next) {
		npages = mmu_btop(pmem->size);
		ppn = mmu_btop(pmem->address);
		for (plast = ppn + npages; ppn < plast; ppn++) {
			if (cpr_isset(ppn, mapflag)) {
				ASSERT(dst < (pphys_list + ppage_count));
				*dst++ = ppn;
			}
		}
	}
	memlist_read_unlock();

	/*
	 * allocate space to store prom pages
	 */
	ppage_buf = kmem_alloc(mmu_ptob(ppage_count), KM_NOSLEEP);
	if (ppage_buf == NULL) {
		kmem_free(pphys_list, pphys_list_size);
		pphys_list = NULL;
		cpr_err(CE_WARN, "cannot alloc ppage_buf");
		return (ENOMEM);
	}

	return (0);
}


/*
 * save prom pages to kmem pages
 */
static void
i_cpr_save_ppages(void)
{
	pfn_t *pphys, *plast;
	caddr_t dst;

	/*
	 * map in each prom page and copy to a kmem page
	 */
	dst = ppage_buf;
	plast = pphys_list + ppage_count;
	for (pphys = pphys_list; pphys < plast; pphys++) {
		i_cpr_mapin(cpr_vaddr, 1, *pphys);
		bcopy(cpr_vaddr, dst, MMU_PAGESIZE);
		i_cpr_mapout(cpr_vaddr, 1);
		dst += MMU_PAGESIZE;
	}

	DEBUG1(errp("saved %d prom pages\n", ppage_count));
}


/*
 * restore prom pages from kmem pages
 */
static void
i_cpr_restore_ppages(void)
{
	pfn_t *pphys, *plast;
	caddr_t src;

	dcache_flushall();

	/*
	 * map in each prom page and copy from a kmem page
	 */
	src = ppage_buf;
	plast = pphys_list + ppage_count;
	for (pphys = pphys_list; pphys < plast; pphys++) {
		i_cpr_mapin(cpr_vaddr, 1, *pphys);
		bcopy(src, cpr_vaddr, MMU_PAGESIZE);
		i_cpr_mapout(cpr_vaddr, 1);
		src += MMU_PAGESIZE;
	}

	dcache_flushall();

	DEBUG1(errp("restored %d prom pages\n", ppage_count));
}


/*
 * save/restore prom pages or free related allocs
 */
int
i_cpr_prom_pages(int action)
{
	int error;

	if (action == CPR_PROM_SAVE) {
		if (ppage_buf == NULL) {
			ASSERT(pphys_list == NULL);
			if (error = i_cpr_find_ppages())
				return (error);
			i_cpr_save_ppages();
		}
	} else if (action == CPR_PROM_RESTORE) {
		i_cpr_restore_ppages();
	} else if (action == CPR_PROM_FREE) {
		if (pphys_list) {
			ASSERT(pphys_list_size);
			kmem_free(pphys_list, pphys_list_size);
			pphys_list = NULL;
			pphys_list_size = 0;
		}
		if (ppage_buf) {
			ASSERT(ppage_count);
			kmem_free(ppage_buf, mmu_ptob(ppage_count));
			DEBUG1(errp("freed %d prom pages\n", ppage_count));
			ppage_buf = NULL;
			ppage_count = 0;
		}
	}
	return (0);
}


/*
 * record tlb data for the nucleus, bigktsb's, and the cpr module;
 * this data is later used by cprboot to install dtlb/itlb entries.
 * when we jump into the cpr module during the resume phase, those
 * mappings are needed until switching to the kernel trap table.
 * to make the dtte/itte info available during resume, we need
 * the info recorded prior to saving sensitive pages, otherwise
 * all the data would appear as NULLs.
 */
static void
i_cpr_save_tlbinfo(void)
{
	cti_t cti;

	/*
	 * during resume - shortly after jumping into the cpr module,
	 * sfmmu_load_mmustate() will overwrite any dtlb entry at any
	 * index used for TSBs; skip is set so that any saved tte will
	 * target other tlb offsets and prevent being lost during
	 * resume.  now scan the dtlb and save locked entries,
	 * then add entries for the tmp stack / data page and the
	 * cpr thread structure.
	 */
	cti.dst = m_info.dtte;
	cti.tail = cti.dst + CPR_MAX_TLB;
	cti.reader = dtlb_rd_entry;
	cti.writer = NULL;
	cti.filter = i_cpr_lnb;
	cti.index = cpunodes[CPU->cpu_id].dtlb_size - 1;
	cti.skip = (1 << utsb_dtlb_ttenum);
	cti.skip |= (1 << utsb4m_dtlb_ttenum);
	i_cpr_scan_tlb(&cti);
	i_cpr_make_tte(&cti, &i_cpr_data_page, datava);
	i_cpr_make_tte(&cti, curthread, datava);

	/*
	 * scan itlb and save locked entries; add an entry for
	 * the first text page of the cpr module; cprboot will
	 * jump to that page after restoring kernel pages.
	 */
	cti.dst = m_info.itte;
	cti.tail = cti.dst + CPR_MAX_TLB;
	cti.reader = itlb_rd_entry;
	cti.index = cpunodes[CPU->cpu_id].itlb_size - 1;
	cti.skip = 0;
	i_cpr_scan_tlb(&cti);
	i_cpr_make_tte(&cti, (void *)i_cpr_resume_setup, textva);
}


/* ARGSUSED */
int
i_cpr_dump_setup(vnode_t *vp)
{
	/*
	 * zero out m_info and add info to dtte/itte arrays
	 */
	bzero(&m_info, sizeof (m_info));
	i_cpr_save_tlbinfo();
	return (0);
}


int
i_cpr_is_supported(void)
{
	char es_prop[] = "energystar-v2";
	dnode_t node;
	int last;
	extern int cpr_supported_override;
	extern int cpr_platform_enable;

	/*
	 * The next statement tests if a specific platform has turned off
	 * cpr support.
	 */
	if (cpr_supported_override)
		return (0);

	/*
	 * Do not inspect energystar-v* property if a platform has
	 * specifically turned on cpr support
	 */
	if (cpr_platform_enable)
		return (1);

	node = prom_rootnode();
	if (prom_getproplen(node, es_prop) != -1)
		return (1);
	last = strlen(es_prop) - 1;
	es_prop[last] = '3';
	return (prom_getproplen(node, es_prop) != -1);
}


/*
 * the actual size of the statefile data isn't known until after all the
 * compressed pages are written; even the inode size doesn't reflect the
 * data size since there are usually many extra fs blocks.  for recording
 * the actual data size, the first sector of the statefile is copied to
 * a tmp buf, and the copy is later updated and flushed to disk.
 */
int
i_cpr_blockzero(char *base, char **bufpp, int *blkno, vnode_t *vp)
{
	extern int cpr_flush_write(vnode_t *);
	static char cpr_sector[DEV_BSIZE];
	cpr_ext bytes, *dst;

	/*
	 * this routine is called after cdd_t and csu_md_t are copied
	 * to cpr_buf; mini-hack alert: the save/update method creates
	 * a dependency on the combined struct size being >= one sector
	 * or DEV_BSIZE; since introduction in Sol2.7, csu_md_t size is
	 * over 1K bytes and will probably grow with any changes.
	 *
	 * copy when vp is NULL, flush when non-NULL
	 */
	if (vp == NULL) {
		ASSERT((*bufpp - base) >= DEV_BSIZE);
		bcopy(base, cpr_sector, sizeof (cpr_sector));
		return (0);
	} else {
		bytes = dbtob(*blkno);
		dst = &((cdd_t *)cpr_sector)->cdd_filesize;
		bcopy(&bytes, dst, sizeof (bytes));
		bcopy(cpr_sector, base, sizeof (cpr_sector));
		*bufpp = base + sizeof (cpr_sector);
		*blkno = cpr_statefile_offset();
		DEBUG1(errp("statefile data size: %lld\n\n", bytes));
		return (cpr_flush_write(vp));
	}
}


/*
 * Allocate bitmaps according to the phys_install list.
 */
static int
i_cpr_bitmap_setup(void)
{
	struct memlist *pmem;
	cbd_t *dp, *tail;
	void *space;
	size_t size;

	/*
	 * The number of bitmap descriptors will be the count of
	 * phys_install ranges plus 1 for a trailing NULL struct.
	 */
	cpr_nbitmaps = 1;
	for (pmem = phys_install; pmem; pmem = pmem->next)
		cpr_nbitmaps++;

	if (cpr_nbitmaps > (CPR_MAX_BMDESC - 1)) {
		cpr_err(CE_WARN, "too many physical memory ranges %d, max %d",
		    cpr_nbitmaps, CPR_MAX_BMDESC - 1);
		return (EFBIG);
	}

	/* Alloc an array of bitmap descriptors. */
	dp = kmem_zalloc(cpr_nbitmaps * sizeof (*dp), KM_NOSLEEP);
	if (dp == NULL) {
		cpr_nbitmaps = 0;
		return (ENOMEM);
	}
	tail = dp + cpr_nbitmaps;

	CPR->c_bmda = dp;
	for (pmem = phys_install; pmem; pmem = pmem->next) {
		size = BITMAP_BYTES(pmem->size);
		space = kmem_zalloc(size * 2, KM_NOSLEEP);
		if (space == NULL)
			return (ENOMEM);
		ASSERT(dp < tail);
		dp->cbd_magic = CPR_BITMAP_MAGIC;
		dp->cbd_spfn = mmu_btop(pmem->address);
		dp->cbd_epfn = mmu_btop(pmem->address + pmem->size) - 1;
		dp->cbd_size = size;
		dp->cbd_reg_bitmap = (cpr_ptr)space;
		dp->cbd_vlt_bitmap = (cpr_ptr)((caddr_t)space + size);
		dp++;
	}

	/* set magic for the last descriptor */
	ASSERT(dp == (tail - 1));
	dp->cbd_magic = CPR_BITMAP_MAGIC;

	return (0);
}


void
i_cpr_bitmap_cleanup(void)
{
	cbd_t *dp;

	if (CPR->c_bmda == NULL)
		return;
	for (dp = CPR->c_bmda; dp->cbd_size; dp++)
		kmem_free((void *)dp->cbd_reg_bitmap, dp->cbd_size * 2);
	kmem_free(CPR->c_bmda, cpr_nbitmaps * sizeof (*CPR->c_bmda));
	CPR->c_bmda = NULL;
	cpr_nbitmaps = 0;
}


/*
 * A "regular" and "volatile" bitmap are created for each range of
 * physical memory.  The volatile maps are used to count and track pages
 * susceptible to heap corruption - caused by drivers that allocate mem
 * during VOP_DUMP(); the regular maps are used for all the other non-
 * susceptible pages.  Before writing the bitmaps to the statefile,
 * each bitmap pair gets merged to simplify handling within cprboot.
 */
int
i_cpr_alloc_bitmaps(void)
{
	int err;

	memlist_read_lock();
	err = i_cpr_bitmap_setup();
	memlist_read_unlock();
	if (err)
		i_cpr_bitmap_cleanup();
	return (err);
}