1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/t_lock.h> 29 #include <sys/memlist.h> 30 #include <sys/cpuvar.h> 31 #include <sys/vmem.h> 32 #include <sys/mman.h> 33 #include <sys/vm.h> 34 #include <sys/kmem.h> 35 #include <sys/cmn_err.h> 36 #include <sys/debug.h> 37 #include <sys/vm_machparam.h> 38 #include <sys/tss.h> 39 #include <sys/vnode.h> 40 #include <vm/hat.h> 41 #include <vm/anon.h> 42 #include <vm/as.h> 43 #include <vm/page.h> 44 #include <vm/seg.h> 45 #include <vm/seg_kmem.h> 46 #include <vm/seg_map.h> 47 #include <vm/hat_i86.h> 48 #include <sys/promif.h> 49 #include <sys/x86_archext.h> 50 #include <sys/systm.h> 51 #include <sys/archsystm.h> 52 #include <sys/sunddi.h> 53 #include <sys/ddidmareq.h> 54 #include <sys/controlregs.h> 55 #include <sys/reboot.h> 56 #include <sys/kdi.h> 57 #include <sys/bootconf.h> 58 #include <sys/bootsvcs.h> 59 #include <sys/bootinfo.h> 60 #include <vm/kboot_mmu.h> 61 62 #ifdef __xpv 63 #include <sys/hypervisor.h> 64 #endif 65 66 caddr_t 67 i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot) 68 { 69 caddr_t addr; 70 caddr_t addr1; 71 page_t *pp; 72 73 addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP); 74 75 for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) { 76 pp = page_numtopp_nolock(pf); 77 if (pp == NULL) { 78 hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf, 79 prot | HAT_NOSYNC, HAT_LOAD_LOCK); 80 } else { 81 hat_memload(kas.a_hat, addr, pp, 82 prot | HAT_NOSYNC, HAT_LOAD_LOCK); 83 } 84 } 85 86 return (addr1); 87 } 88 89 /* 90 * This routine is like page_numtopp, but accepts only free pages, which 91 * it allocates (unfrees) and returns with the exclusive lock held. 92 * It is used by machdep.c/dma_init() to find contiguous free pages. 93 * 94 * XXX this and some others should probably be in vm_machdep.c 95 */ 96 page_t * 97 page_numtopp_alloc(pfn_t pfnum) 98 { 99 page_t *pp; 100 101 retry: 102 pp = page_numtopp_nolock(pfnum); 103 if (pp == NULL) { 104 return (NULL); 105 } 106 107 if (!page_trylock(pp, SE_EXCL)) { 108 return (NULL); 109 } 110 111 if (page_pptonum(pp) != pfnum) { 112 page_unlock(pp); 113 goto retry; 114 } 115 116 if (!PP_ISFREE(pp)) { 117 page_unlock(pp); 118 return (NULL); 119 } 120 if (pp->p_szc) { 121 page_demote_free_pages(pp); 122 page_unlock(pp); 123 goto retry; 124 } 125 126 /* If associated with a vnode, destroy mappings */ 127 128 if (pp->p_vnode) { 129 130 page_destroy_free(pp); 131 132 if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) { 133 return (NULL); 134 } 135 136 if (page_pptonum(pp) != pfnum) { 137 page_unlock(pp); 138 goto retry; 139 } 140 } 141 142 if (!PP_ISFREE(pp)) { 143 page_unlock(pp); 144 return (NULL); 145 } 146 147 if (!page_reclaim(pp, (kmutex_t *)NULL)) 148 return (NULL); 149 150 return (pp); 151 } 152 153 /* 154 * Flag is not set early in boot. Once it is set we are no longer 155 * using boot's page tables. 156 */ 157 uint_t khat_running = 0; 158 159 /* 160 * This procedure is callable only while the boot loader is in charge of the 161 * MMU. It assumes that PA == VA for page table pointers. It doesn't live in 162 * kboot_mmu.c since it's used from common code. 163 */ 164 pfn_t 165 va_to_pfn(void *vaddr) 166 { 167 uintptr_t des_va = ALIGN2PAGE(vaddr); 168 uintptr_t va = des_va; 169 size_t len; 170 uint_t prot; 171 pfn_t pfn; 172 173 if (khat_running) 174 panic("va_to_pfn(): called too late\n"); 175 176 if (kbm_probe(&va, &len, &pfn, &prot) == 0) 177 return (PFN_INVALID); 178 if (va > des_va) 179 return (PFN_INVALID); 180 if (va < des_va) 181 pfn += mmu_btop(des_va - va); 182 return (pfn); 183 } 184 185 /* 186 * Initialize a special area in the kernel that always holds some PTEs for 187 * faster performance. This always holds segmap's PTEs. 188 * In the 32 bit kernel this maps the kernel heap too. 189 */ 190 void 191 hat_kmap_init(uintptr_t base, size_t len) 192 { 193 uintptr_t map_addr; /* base rounded down to large page size */ 194 uintptr_t map_eaddr; /* base + len rounded up */ 195 size_t map_len; 196 caddr_t ptes; /* mapping area in kernel for kmap ptes */ 197 size_t window_size; /* size of mapping area for ptes */ 198 ulong_t htable_cnt; /* # of page tables to cover map_len */ 199 ulong_t i; 200 htable_t *ht; 201 uintptr_t va; 202 203 /* 204 * We have to map in an area that matches an entire page table. 205 * The PTEs are large page aligned to avoid spurious pagefaults 206 * on the hypervisor. 207 */ 208 map_addr = base & LEVEL_MASK(1); 209 map_eaddr = (base + len + LEVEL_SIZE(1) - 1) & LEVEL_MASK(1); 210 map_len = map_eaddr - map_addr; 211 window_size = mmu_btop(map_len) * mmu.pte_size; 212 window_size = (window_size + LEVEL_SIZE(1)) & LEVEL_MASK(1); 213 htable_cnt = map_len >> LEVEL_SHIFT(1); 214 215 /* 216 * allocate vmem for the kmap_ptes 217 */ 218 ptes = vmem_xalloc(heap_arena, window_size, LEVEL_SIZE(1), 0, 219 0, NULL, NULL, VM_SLEEP); 220 mmu.kmap_htables = 221 kmem_alloc(htable_cnt * sizeof (htable_t *), KM_SLEEP); 222 223 /* 224 * Map the page tables that cover kmap into the allocated range. 225 * Note we don't ever htable_release() the kmap page tables - they 226 * can't ever be stolen, freed, etc. 227 */ 228 for (va = map_addr, i = 0; i < htable_cnt; va += LEVEL_SIZE(1), ++i) { 229 ht = htable_create(kas.a_hat, va, 0, NULL); 230 if (ht == NULL) 231 panic("hat_kmap_init: ht == NULL"); 232 mmu.kmap_htables[i] = ht; 233 234 hat_devload(kas.a_hat, ptes + i * MMU_PAGESIZE, 235 MMU_PAGESIZE, ht->ht_pfn, 236 #ifdef __xpv 237 PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK, 238 #else 239 PROT_READ | PROT_WRITE | HAT_NOSYNC | HAT_UNORDERED_OK, 240 #endif 241 HAT_LOAD | HAT_LOAD_NOCONSIST); 242 } 243 244 /* 245 * set information in mmu to activate handling of kmap 246 */ 247 mmu.kmap_addr = map_addr; 248 mmu.kmap_eaddr = map_eaddr; 249 mmu.kmap_ptes = (x86pte_t *)ptes; 250 } 251 252 extern caddr_t kpm_vbase; 253 extern size_t kpm_size; 254 255 #ifdef __xpv 256 /* 257 * Create the initial segkpm mappings for the hypervisor. To avoid having 258 * to deal with page tables being read only, we make all mappings 259 * read only at first. 260 */ 261 static void 262 xen_kpm_create(paddr_t paddr, level_t lvl) 263 { 264 ulong_t pg_off; 265 266 for (pg_off = 0; pg_off < LEVEL_SIZE(lvl); pg_off += MMU_PAGESIZE) { 267 kbm_map((uintptr_t)kpm_vbase + paddr, (paddr_t)0, 0, 1); 268 kbm_read_only((uintptr_t)kpm_vbase + paddr + pg_off, 269 paddr + pg_off); 270 } 271 } 272 273 /* 274 * Try to make all kpm mappings writable. Failures are ok, as those 275 * are just pagetable, GDT, etc. pages. 276 */ 277 static void 278 xen_kpm_finish_init(void) 279 { 280 pfn_t gdtpfn = mmu_btop(CPU->cpu_m.mcpu_gdtpa); 281 pfn_t pfn; 282 page_t *pp; 283 284 for (pfn = 0; pfn < mfn_count; ++pfn) { 285 /* 286 * skip gdt 287 */ 288 if (pfn == gdtpfn) 289 continue; 290 291 /* 292 * p_index is a hint that this is a pagetable 293 */ 294 pp = page_numtopp_nolock(pfn); 295 if (pp && pp->p_index) { 296 pp->p_index = 0; 297 continue; 298 } 299 (void) xen_kpm_page(pfn, PT_VALID | PT_WRITABLE); 300 } 301 } 302 #endif 303 304 /* 305 * Routine to pre-allocate data structures for hat_kern_setup(). It computes 306 * how many pagetables it needs by walking the boot loader's page tables. 307 */ 308 /*ARGSUSED*/ 309 void 310 hat_kern_alloc( 311 caddr_t segmap_base, 312 size_t segmap_size, 313 caddr_t ekernelheap) 314 { 315 uintptr_t last_va = (uintptr_t)-1; /* catch 1st time */ 316 uintptr_t va = 0; 317 size_t size; 318 pfn_t pfn; 319 uint_t prot; 320 uint_t table_cnt = 1; 321 uint_t mapping_cnt; 322 level_t start_level; 323 level_t l; 324 struct memlist *pmem; 325 level_t lpagel = mmu.max_page_level; 326 uint64_t paddr; 327 int64_t psize; 328 int nwindows; 329 330 if (kpm_size > 0) { 331 /* 332 * Create the kpm page tables. When running on the 333 * hypervisor these are made read/only at first. 334 * Later we'll add write permission where possible. 335 */ 336 for (pmem = phys_install; pmem; pmem = pmem->next) { 337 paddr = pmem->address; 338 psize = pmem->size; 339 while (psize >= MMU_PAGESIZE) { 340 /* find the largest page size */ 341 for (l = lpagel; l > 0; l--) { 342 if ((paddr & LEVEL_OFFSET(l)) == 0 && 343 psize > LEVEL_SIZE(l)) 344 break; 345 } 346 347 #if defined(__xpv) 348 /* 349 * Create read/only mappings to avoid 350 * conflicting with pagetable usage 351 */ 352 xen_kpm_create(paddr, l); 353 #else 354 kbm_map((uintptr_t)kpm_vbase + paddr, paddr, 355 l, 1); 356 #endif 357 paddr += LEVEL_SIZE(l); 358 psize -= LEVEL_SIZE(l); 359 } 360 } 361 } 362 363 /* 364 * If this machine doesn't have a kpm segment, we need to allocate 365 * a small number of 'windows' which can be used to map pagetables. 366 */ 367 nwindows = (kpm_size == 0) ? 2 * NCPU : 0; 368 369 #if defined(__xpv) 370 /* 371 * On a hypervisor, these windows are also used by the xpv_panic 372 * code, where we need one window for each level of the pagetable 373 * hierarchy. 374 */ 375 nwindows = MAX(nwindows, mmu.max_level); 376 #endif 377 378 if (nwindows != 0) { 379 /* 380 * Create the page windows and 1 page of VA in 381 * which we map the PTEs of those windows. 382 */ 383 mmu.pwin_base = vmem_xalloc(heap_arena, nwindows * MMU_PAGESIZE, 384 LEVEL_SIZE(1), 0, 0, NULL, NULL, VM_SLEEP); 385 ASSERT(nwindows <= MMU_PAGESIZE / mmu.pte_size); 386 mmu.pwin_pte_va = vmem_xalloc(heap_arena, MMU_PAGESIZE, 387 MMU_PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP); 388 389 /* 390 * Find/Create the page table window mappings. 391 */ 392 paddr = 0; 393 (void) find_pte((uintptr_t)mmu.pwin_base, &paddr, 0, 0); 394 ASSERT(paddr != 0); 395 ASSERT((paddr & MMU_PAGEOFFSET) == 0); 396 mmu.pwin_pte_pa = paddr; 397 #ifdef __xpv 398 (void) find_pte((uintptr_t)mmu.pwin_pte_va, NULL, 0, 0); 399 kbm_read_only((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa); 400 #else 401 kbm_map((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa, 0, 1); 402 #endif 403 } 404 405 /* 406 * Walk the boot loader's page tables and figure out 407 * how many tables and page mappings there will be. 408 */ 409 while (kbm_probe(&va, &size, &pfn, &prot) != 0) { 410 /* 411 * At each level, if the last_va falls into a new htable, 412 * increment table_cnt. We can stop at the 1st level where 413 * they are in the same htable. 414 */ 415 start_level = 0; 416 while (start_level <= mmu.max_page_level) { 417 if (size == LEVEL_SIZE(start_level)) 418 break; 419 start_level++; 420 } 421 422 for (l = start_level; l < mmu.max_level; ++l) { 423 if (va >> LEVEL_SHIFT(l + 1) == 424 last_va >> LEVEL_SHIFT(l + 1)) 425 break; 426 ++table_cnt; 427 } 428 last_va = va; 429 l = (start_level == 0) ? 1 : start_level; 430 va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l); 431 } 432 433 /* 434 * Besides the boot loader mappings, we're going to fill in 435 * the entire top level page table for the kernel. Make sure there's 436 * enough reserve for that too. 437 */ 438 table_cnt += mmu.top_level_count - ((kernelbase >> 439 LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1)); 440 441 #if defined(__i386) 442 /* 443 * The 32 bit PAE hat allocates tables one level below the top when 444 * kernelbase isn't 1 Gig aligned. We'll just be sloppy and allocate 445 * a bunch more to the reserve. Any unused will be returned later. 446 * Note we've already counted these mappings, just not the extra 447 * pagetables. 448 */ 449 if (mmu.pae_hat != 0 && (kernelbase & LEVEL_OFFSET(mmu.max_level)) != 0) 450 table_cnt += mmu.ptes_per_table - 451 ((kernelbase & LEVEL_OFFSET(mmu.max_level)) >> 452 LEVEL_SHIFT(mmu.max_level - 1)); 453 #endif 454 455 /* 456 * Add 1/4 more into table_cnt for extra slop. The unused 457 * slop is freed back when we htable_adjust_reserve() later. 458 */ 459 table_cnt += table_cnt >> 2; 460 461 /* 462 * We only need mapping entries (hments) for shared pages. 463 * This should be far, far fewer than the total possible, 464 * We'll allocate enough for 1/16 of all possible PTEs. 465 */ 466 mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4; 467 468 /* 469 * Now create the initial htable/hment reserves 470 */ 471 htable_initial_reserve(table_cnt); 472 hment_reserve(mapping_cnt); 473 x86pte_cpu_init(CPU); 474 } 475 476 477 /* 478 * This routine handles the work of creating the kernel's initial mappings 479 * by deciphering the mappings in the page tables created by the boot program. 480 * 481 * We maintain large page mappings, but only to a level 1 pagesize. 482 * The boot loader can only add new mappings once this function starts. 483 * In particular it can not change the pagesize used for any existing 484 * mappings or this code breaks! 485 */ 486 487 void 488 hat_kern_setup(void) 489 { 490 /* 491 * Attach htables to the existing pagetables 492 */ 493 /* BEGIN CSTYLED */ 494 htable_attach(kas.a_hat, 0, mmu.max_level, NULL, 495 #ifdef __xpv 496 mmu_btop(xen_info->pt_base - ONE_GIG)); 497 #else 498 mmu_btop(getcr3())); 499 #endif 500 /* END CSTYLED */ 501 502 #if defined(__i386) && !defined(__xpv) 503 CPU->cpu_tss->tss_cr3 = dftss0.tss_cr3 = getcr3(); 504 #endif /* __i386 */ 505 506 #if defined(__xpv) && defined(__amd64) 507 /* 508 * Try to make the kpm mappings r/w. Failures here are OK, as 509 * it's probably just a pagetable 510 */ 511 xen_kpm_finish_init(); 512 #endif 513 514 /* 515 * The kernel HAT is now officially open for business. 516 */ 517 khat_running = 1; 518 519 CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id); 520 CPU->cpu_current_hat = kas.a_hat; 521 } 522