1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/t_lock.h> 29 #include <sys/memlist.h> 30 #include <sys/cpuvar.h> 31 #include <sys/vmem.h> 32 #include <sys/mman.h> 33 #include <sys/vm.h> 34 #include <sys/kmem.h> 35 #include <sys/cmn_err.h> 36 #include <sys/debug.h> 37 #include <sys/vm_machparam.h> 38 #include <sys/tss.h> 39 #include <sys/vnode.h> 40 #include <vm/hat.h> 41 #include <vm/anon.h> 42 #include <vm/as.h> 43 #include <vm/page.h> 44 #include <vm/seg.h> 45 #include <vm/seg_kmem.h> 46 #include <vm/seg_map.h> 47 #include <vm/hat_i86.h> 48 #include <sys/promif.h> 49 #include <sys/x86_archext.h> 50 #include <sys/systm.h> 51 #include <sys/archsystm.h> 52 #include <sys/sunddi.h> 53 #include <sys/ddidmareq.h> 54 #include <sys/controlregs.h> 55 #include <sys/reboot.h> 56 #include <sys/kdi.h> 57 #include <sys/bootconf.h> 58 #include <sys/bootsvcs.h> 59 #include <sys/bootinfo.h> 60 #include <vm/kboot_mmu.h> 61 62 #ifdef __xpv 63 #include <sys/hypervisor.h> 64 #endif 65 66 caddr_t 67 i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot) 68 { 69 caddr_t addr; 70 caddr_t addr1; 71 page_t *pp; 72 73 addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP); 74 75 for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) { 76 pp = page_numtopp_nolock(pf); 77 if (pp == NULL) { 78 hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf, 79 prot | HAT_NOSYNC, HAT_LOAD_LOCK); 80 } else { 81 hat_memload(kas.a_hat, addr, pp, 82 prot | HAT_NOSYNC, HAT_LOAD_LOCK); 83 } 84 } 85 86 return (addr1); 87 } 88 89 /* 90 * This routine is like page_numtopp, but accepts only free pages, which 91 * it allocates (unfrees) and returns with the exclusive lock held. 92 * It is used by machdep.c/dma_init() to find contiguous free pages. 93 * 94 * XXX this and some others should probably be in vm_machdep.c 95 */ 96 page_t * 97 page_numtopp_alloc(pfn_t pfnum) 98 { 99 page_t *pp; 100 101 retry: 102 pp = page_numtopp_nolock(pfnum); 103 if (pp == NULL) { 104 return (NULL); 105 } 106 107 if (!page_trylock(pp, SE_EXCL)) { 108 return (NULL); 109 } 110 111 if (page_pptonum(pp) != pfnum) { 112 page_unlock(pp); 113 goto retry; 114 } 115 116 if (!PP_ISFREE(pp)) { 117 page_unlock(pp); 118 return (NULL); 119 } 120 if (pp->p_szc) { 121 page_demote_free_pages(pp); 122 page_unlock(pp); 123 goto retry; 124 } 125 126 /* If associated with a vnode, destroy mappings */ 127 128 if (pp->p_vnode) { 129 130 page_destroy_free(pp); 131 132 if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) { 133 return (NULL); 134 } 135 136 if (page_pptonum(pp) != pfnum) { 137 page_unlock(pp); 138 goto retry; 139 } 140 } 141 142 if (!PP_ISFREE(pp) || !page_reclaim(pp, (kmutex_t *)NULL)) { 143 page_unlock(pp); 144 return (NULL); 145 } 146 147 return (pp); 148 } 149 150 /* 151 * Flag is not set early in boot. Once it is set we are no longer 152 * using boot's page tables. 153 */ 154 uint_t khat_running = 0; 155 156 /* 157 * This procedure is callable only while the boot loader is in charge of the 158 * MMU. It assumes that PA == VA for page table pointers. It doesn't live in 159 * kboot_mmu.c since it's used from common code. 160 */ 161 pfn_t 162 va_to_pfn(void *vaddr) 163 { 164 uintptr_t des_va = ALIGN2PAGE(vaddr); 165 uintptr_t va = des_va; 166 size_t len; 167 uint_t prot; 168 pfn_t pfn; 169 170 if (khat_running) 171 panic("va_to_pfn(): called too late\n"); 172 173 if (kbm_probe(&va, &len, &pfn, &prot) == 0) 174 return (PFN_INVALID); 175 if (va > des_va) 176 return (PFN_INVALID); 177 if (va < des_va) 178 pfn += mmu_btop(des_va - va); 179 return (pfn); 180 } 181 182 /* 183 * Initialize a special area in the kernel that always holds some PTEs for 184 * faster performance. This always holds segmap's PTEs. 185 * In the 32 bit kernel this maps the kernel heap too. 186 */ 187 void 188 hat_kmap_init(uintptr_t base, size_t len) 189 { 190 uintptr_t map_addr; /* base rounded down to large page size */ 191 uintptr_t map_eaddr; /* base + len rounded up */ 192 size_t map_len; 193 caddr_t ptes; /* mapping area in kernel for kmap ptes */ 194 size_t window_size; /* size of mapping area for ptes */ 195 ulong_t htable_cnt; /* # of page tables to cover map_len */ 196 ulong_t i; 197 htable_t *ht; 198 uintptr_t va; 199 200 /* 201 * We have to map in an area that matches an entire page table. 202 * The PTEs are large page aligned to avoid spurious pagefaults 203 * on the hypervisor. 204 */ 205 map_addr = base & LEVEL_MASK(1); 206 map_eaddr = (base + len + LEVEL_SIZE(1) - 1) & LEVEL_MASK(1); 207 map_len = map_eaddr - map_addr; 208 window_size = mmu_btop(map_len) * mmu.pte_size; 209 window_size = (window_size + LEVEL_SIZE(1)) & LEVEL_MASK(1); 210 htable_cnt = map_len >> LEVEL_SHIFT(1); 211 212 /* 213 * allocate vmem for the kmap_ptes 214 */ 215 ptes = vmem_xalloc(heap_arena, window_size, LEVEL_SIZE(1), 0, 216 0, NULL, NULL, VM_SLEEP); 217 mmu.kmap_htables = 218 kmem_alloc(htable_cnt * sizeof (htable_t *), KM_SLEEP); 219 220 /* 221 * Map the page tables that cover kmap into the allocated range. 222 * Note we don't ever htable_release() the kmap page tables - they 223 * can't ever be stolen, freed, etc. 224 */ 225 for (va = map_addr, i = 0; i < htable_cnt; va += LEVEL_SIZE(1), ++i) { 226 ht = htable_create(kas.a_hat, va, 0, NULL); 227 if (ht == NULL) 228 panic("hat_kmap_init: ht == NULL"); 229 mmu.kmap_htables[i] = ht; 230 231 hat_devload(kas.a_hat, ptes + i * MMU_PAGESIZE, 232 MMU_PAGESIZE, ht->ht_pfn, 233 #ifdef __xpv 234 PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK, 235 #else 236 PROT_READ | PROT_WRITE | HAT_NOSYNC | HAT_UNORDERED_OK, 237 #endif 238 HAT_LOAD | HAT_LOAD_NOCONSIST); 239 } 240 241 /* 242 * set information in mmu to activate handling of kmap 243 */ 244 mmu.kmap_addr = map_addr; 245 mmu.kmap_eaddr = map_eaddr; 246 mmu.kmap_ptes = (x86pte_t *)ptes; 247 } 248 249 extern caddr_t kpm_vbase; 250 extern size_t kpm_size; 251 252 #ifdef __xpv 253 /* 254 * Create the initial segkpm mappings for the hypervisor. To avoid having 255 * to deal with page tables being read only, we make all mappings 256 * read only at first. 257 */ 258 static void 259 xen_kpm_create(paddr_t paddr, level_t lvl) 260 { 261 ulong_t pg_off; 262 263 for (pg_off = 0; pg_off < LEVEL_SIZE(lvl); pg_off += MMU_PAGESIZE) { 264 kbm_map((uintptr_t)kpm_vbase + paddr, (paddr_t)0, 0, 1); 265 kbm_read_only((uintptr_t)kpm_vbase + paddr + pg_off, 266 paddr + pg_off); 267 } 268 } 269 270 /* 271 * Try to make all kpm mappings writable. Failures are ok, as those 272 * are just pagetable, GDT, etc. pages. 273 */ 274 static void 275 xen_kpm_finish_init(void) 276 { 277 pfn_t gdtpfn = mmu_btop(CPU->cpu_m.mcpu_gdtpa); 278 pfn_t pfn; 279 page_t *pp; 280 281 for (pfn = 0; pfn < mfn_count; ++pfn) { 282 /* 283 * skip gdt 284 */ 285 if (pfn == gdtpfn) 286 continue; 287 288 /* 289 * p_index is a hint that this is a pagetable 290 */ 291 pp = page_numtopp_nolock(pfn); 292 if (pp && pp->p_index) { 293 pp->p_index = 0; 294 continue; 295 } 296 (void) xen_kpm_page(pfn, PT_VALID | PT_WRITABLE); 297 } 298 } 299 #endif 300 301 /* 302 * Routine to pre-allocate data structures for hat_kern_setup(). It computes 303 * how many pagetables it needs by walking the boot loader's page tables. 304 */ 305 /*ARGSUSED*/ 306 void 307 hat_kern_alloc( 308 caddr_t segmap_base, 309 size_t segmap_size, 310 caddr_t ekernelheap) 311 { 312 uintptr_t last_va = (uintptr_t)-1; /* catch 1st time */ 313 uintptr_t va = 0; 314 size_t size; 315 pfn_t pfn; 316 uint_t prot; 317 uint_t table_cnt = 1; 318 uint_t mapping_cnt; 319 level_t start_level; 320 level_t l; 321 struct memlist *pmem; 322 level_t lpagel = mmu.max_page_level; 323 uint64_t paddr; 324 int64_t psize; 325 int nwindows; 326 327 if (kpm_size > 0) { 328 /* 329 * Create the kpm page tables. When running on the 330 * hypervisor these are made read/only at first. 331 * Later we'll add write permission where possible. 332 */ 333 for (pmem = phys_install; pmem; pmem = pmem->next) { 334 paddr = pmem->address; 335 psize = pmem->size; 336 while (psize >= MMU_PAGESIZE) { 337 if ((paddr & LEVEL_OFFSET(lpagel)) == 0 && 338 psize > LEVEL_SIZE(lpagel)) 339 l = lpagel; 340 else 341 l = 0; 342 #if defined(__xpv) 343 /* 344 * Create read/only mappings to avoid 345 * conflicting with pagetable usage 346 */ 347 xen_kpm_create(paddr, l); 348 #else 349 kbm_map((uintptr_t)kpm_vbase + paddr, paddr, 350 l, 1); 351 #endif 352 paddr += LEVEL_SIZE(l); 353 psize -= LEVEL_SIZE(l); 354 } 355 } 356 } 357 358 /* 359 * If this machine doesn't have a kpm segment, we need to allocate 360 * a small number of 'windows' which can be used to map pagetables. 361 */ 362 nwindows = (kpm_size == 0) ? 2 * NCPU : 0; 363 364 #if defined(__xpv) 365 /* 366 * On a hypervisor, these windows are also used by the xpv_panic 367 * code, where we need one window for each level of the pagetable 368 * hierarchy. 369 */ 370 nwindows = MAX(nwindows, mmu.max_level); 371 #endif 372 373 if (nwindows != 0) { 374 /* 375 * Create the page windows and 1 page of VA in 376 * which we map the PTEs of those windows. 377 */ 378 mmu.pwin_base = vmem_xalloc(heap_arena, nwindows * MMU_PAGESIZE, 379 LEVEL_SIZE(1), 0, 0, NULL, NULL, VM_SLEEP); 380 ASSERT(nwindows <= MMU_PAGESIZE / mmu.pte_size); 381 mmu.pwin_pte_va = vmem_xalloc(heap_arena, MMU_PAGESIZE, 382 MMU_PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP); 383 384 /* 385 * Find/Create the page table window mappings. 386 */ 387 paddr = 0; 388 (void) find_pte((uintptr_t)mmu.pwin_base, &paddr, 0, 0); 389 ASSERT(paddr != 0); 390 ASSERT((paddr & MMU_PAGEOFFSET) == 0); 391 mmu.pwin_pte_pa = paddr; 392 #ifdef __xpv 393 (void) find_pte((uintptr_t)mmu.pwin_pte_va, NULL, 0, 0); 394 kbm_read_only((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa); 395 #else 396 kbm_map((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa, 0, 1); 397 #endif 398 } 399 400 /* 401 * Walk the boot loader's page tables and figure out 402 * how many tables and page mappings there will be. 403 */ 404 while (kbm_probe(&va, &size, &pfn, &prot) != 0) { 405 /* 406 * At each level, if the last_va falls into a new htable, 407 * increment table_cnt. We can stop at the 1st level where 408 * they are in the same htable. 409 */ 410 if (size == MMU_PAGESIZE) 411 start_level = 0; 412 else 413 start_level = 1; 414 415 for (l = start_level; l < mmu.max_level; ++l) { 416 if (va >> LEVEL_SHIFT(l + 1) == 417 last_va >> LEVEL_SHIFT(l + 1)) 418 break; 419 ++table_cnt; 420 } 421 last_va = va; 422 va = (va & LEVEL_MASK(1)) + LEVEL_SIZE(1); 423 } 424 425 /* 426 * Besides the boot loader mappings, we're going to fill in 427 * the entire top level page table for the kernel. Make sure there's 428 * enough reserve for that too. 429 */ 430 table_cnt += mmu.top_level_count - ((kernelbase >> 431 LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1)); 432 433 #if defined(__i386) 434 /* 435 * The 32 bit PAE hat allocates tables one level below the top when 436 * kernelbase isn't 1 Gig aligned. We'll just be sloppy and allocate 437 * a bunch more to the reserve. Any unused will be returned later. 438 * Note we've already counted these mappings, just not the extra 439 * pagetables. 440 */ 441 if (mmu.pae_hat != 0 && (kernelbase & LEVEL_OFFSET(mmu.max_level)) != 0) 442 table_cnt += mmu.ptes_per_table - 443 ((kernelbase & LEVEL_OFFSET(mmu.max_level)) >> 444 LEVEL_SHIFT(mmu.max_level - 1)); 445 #endif 446 447 /* 448 * Add 1/4 more into table_cnt for extra slop. The unused 449 * slop is freed back when we htable_adjust_reserve() later. 450 */ 451 table_cnt += table_cnt >> 2; 452 453 /* 454 * We only need mapping entries (hments) for shared pages. 455 * This should be far, far fewer than the total possible, 456 * We'll allocate enough for 1/16 of all possible PTEs. 457 */ 458 mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4; 459 460 /* 461 * Now create the initial htable/hment reserves 462 */ 463 htable_initial_reserve(table_cnt); 464 hment_reserve(mapping_cnt); 465 x86pte_cpu_init(CPU); 466 } 467 468 469 /* 470 * This routine handles the work of creating the kernel's initial mappings 471 * by deciphering the mappings in the page tables created by the boot program. 472 * 473 * We maintain large page mappings, but only to a level 1 pagesize. 474 * The boot loader can only add new mappings once this function starts. 475 * In particular it can not change the pagesize used for any existing 476 * mappings or this code breaks! 477 */ 478 479 void 480 hat_kern_setup(void) 481 { 482 /* 483 * Attach htables to the existing pagetables 484 */ 485 /* BEGIN CSTYLED */ 486 htable_attach(kas.a_hat, 0, mmu.max_level, NULL, 487 #ifdef __xpv 488 mmu_btop(xen_info->pt_base - ONE_GIG)); 489 #else 490 mmu_btop(getcr3())); 491 #endif 492 /* END CSTYLED */ 493 494 #if defined(__i386) && !defined(__xpv) 495 CPU->cpu_tss->tss_cr3 = dftss0.tss_cr3 = getcr3(); 496 #endif /* __i386 */ 497 498 #if defined(__xpv) && defined(__amd64) 499 /* 500 * Try to make the kpm mappings r/w. Failures here are OK, as 501 * it's probably just a pagetable 502 */ 503 xen_kpm_finish_init(); 504 #endif 505 506 /* 507 * The kernel HAT is now officially open for business. 508 */ 509 khat_running = 1; 510 511 CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id); 512 CPU->cpu_current_hat = kas.a_hat; 513 } 514