/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/t_lock.h>
#include <sys/memlist.h>
#include <sys/cpuvar.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/vm.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vm_machparam.h>
#include <sys/tss.h>
#include <sys/vnode.h>
#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/hat_i86.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/sunddi.h>
#include <sys/ddidmareq.h>
#include <sys/controlregs.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/bootconf.h>
#include <sys/bootsvcs.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#endif

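/*
 * Map a run of physical pages, starting at pfn "pf", into freshly
 * allocated kernel VA and return the base address of the mapping.
 * Pages that have a page_t are loaded with hat_memload(); those
 * without one (typically device memory) use hat_devload(). The
 * mappings are created locked (HAT_LOAD_LOCK) and are not torn
 * down here; that is left to the caller.
 */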
caddr_t
i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
{
	caddr_t addr;
	caddr_t addr1;
	page_t *pp;

	addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);

	for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
		pp = page_numtopp_nolock(pf);
		if (pp == NULL) {
			hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
		} else {
			hat_memload(kas.a_hat, addr, pp,
			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
		}
	}

	return (addr1);
}

/*
 * This routine is like page_numtopp, but accepts only free pages, which
 * it allocates (unfrees) and returns with the exclusive lock held.
 * It is used by machdep.c/dma_init() to find contiguous free pages.
 *
 * XXX this and some others should probably be in vm_machdep.c
 */
page_t *
page_numtopp_alloc(pfn_t pfnum)
{
	page_t *pp;

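	/*
	 * Nothing is held across the lookup, so the page can change
	 * identity or state before we get the exclusive lock. Every
	 * check is therefore repeated once page_trylock()/page_lock()
	 * succeeds, and we retry from the top whenever the page no
	 * longer maps the requested pfn.
	 */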
retry:
	pp = page_numtopp_nolock(pfnum);
	if (pp == NULL) {
		return (NULL);
	}

	if (!page_trylock(pp, SE_EXCL)) {
		return (NULL);
	}

	if (page_pptonum(pp) != pfnum) {
		page_unlock(pp);
		goto retry;
	}

	if (!PP_ISFREE(pp)) {
		page_unlock(pp);
		return (NULL);
	}
	if (pp->p_szc) {
		page_demote_free_pages(pp);
		page_unlock(pp);
		goto retry;
	}

	/* If associated with a vnode, destroy mappings */

	if (pp->p_vnode) {

		page_destroy_free(pp);

		if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
			return (NULL);
		}

		if (page_pptonum(pp) != pfnum) {
			page_unlock(pp);
			goto retry;
		}
	}

	if (!PP_ISFREE(pp)) {
		page_unlock(pp);
		return (NULL);
	}

	if (!page_reclaim(pp, (kmutex_t *)NULL))
		return (NULL);

	return (pp);
}

/*
 * Flag is not set early in boot. Once it is set we are no longer
 * using boot's page tables.
 */
uint_t khat_running = 0;

/*
 * This procedure is callable only while the boot loader is in charge of the
 * MMU. It assumes that PA == VA for page table pointers. It doesn't live in
 * kboot_mmu.c since it's used from common code.
 */
pfn_t
va_to_pfn(void *vaddr)
{
	uintptr_t des_va = ALIGN2PAGE(vaddr);
	uintptr_t va = des_va;
	size_t len;
	uint_t prot;
	pfn_t pfn;

	if (khat_running)
		panic("va_to_pfn(): called too late\n");

	if (kbm_probe(&va, &len, &pfn, &prot) == 0)
		return (PFN_INVALID);
	if (va > des_va)
		return (PFN_INVALID);
	if (va < des_va)
		pfn += mmu_btop(des_va - va);
	return (pfn);
}

/*
 * Initialize a special area in the kernel that always holds some PTEs for
 * faster performance. This always holds segmap's PTEs.
 * In the 32 bit kernel this maps the kernel heap too.
 */
void
hat_kmap_init(uintptr_t base, size_t len)
{
	uintptr_t map_addr;	/* base rounded down to large page size */
	uintptr_t map_eaddr;	/* base + len rounded up */
	size_t map_len;
	caddr_t ptes;		/* mapping area in kernel for kmap ptes */
	size_t window_size;	/* size of mapping area for ptes */
	ulong_t htable_cnt;	/* # of page tables to cover map_len */
	ulong_t i;
	htable_t *ht;
	uintptr_t va;

	/*
	 * We have to map in an area that matches an entire page table.
	 * The PTEs are large page aligned to avoid spurious pagefaults
	 * on the hypervisor.
	 */
	map_addr = base & LEVEL_MASK(1);
	map_eaddr = (base + len + LEVEL_SIZE(1) - 1) & LEVEL_MASK(1);
	map_len = map_eaddr - map_addr;
	window_size = mmu_btop(map_len) * mmu.pte_size;
	window_size = (window_size + LEVEL_SIZE(1)) & LEVEL_MASK(1);
	htable_cnt = map_len >> LEVEL_SHIFT(1);

	/*
	 * allocate vmem for the kmap_ptes
	 */
	ptes = vmem_xalloc(heap_arena, window_size, LEVEL_SIZE(1), 0,
	    0, NULL, NULL, VM_SLEEP);
	mmu.kmap_htables =
	    kmem_alloc(htable_cnt * sizeof (htable_t *), KM_SLEEP);

	/*
	 * Map the page tables that cover kmap into the allocated range.
	 * Note we don't ever htable_release() the kmap page tables - they
	 * can't ever be stolen, freed, etc.
	 */
	for (va = map_addr, i = 0; i < htable_cnt; va += LEVEL_SIZE(1), ++i) {
		ht = htable_create(kas.a_hat, va, 0, NULL);
		if (ht == NULL)
			panic("hat_kmap_init: ht == NULL");
		mmu.kmap_htables[i] = ht;

		hat_devload(kas.a_hat, ptes + i * MMU_PAGESIZE,
		    MMU_PAGESIZE, ht->ht_pfn,
#ifdef __xpv
		    PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
#else
		    PROT_READ | PROT_WRITE | HAT_NOSYNC | HAT_UNORDERED_OK,
#endif
		    HAT_LOAD | HAT_LOAD_NOCONSIST);
	}

	/*
	 * set information in mmu to activate handling of kmap
	 */
	mmu.kmap_addr = map_addr;
	mmu.kmap_eaddr = map_eaddr;
	mmu.kmap_ptes = (x86pte_t *)ptes;
}

extern caddr_t kpm_vbase;
extern size_t kpm_size;

#ifdef __xpv
/*
 * Create the initial segkpm mappings for the hypervisor. To avoid having
 * to deal with page tables being read only, we make all mappings
 * read only at first.
 */
static void
xen_kpm_create(paddr_t paddr, level_t lvl)
{
	ulong_t pg_off;

	for (pg_off = 0; pg_off < LEVEL_SIZE(lvl); pg_off += MMU_PAGESIZE) {
		kbm_map((uintptr_t)kpm_vbase + paddr, (paddr_t)0, 0, 1);
		kbm_read_only((uintptr_t)kpm_vbase + paddr + pg_off,
		    paddr + pg_off);
	}
}

/*
 * Try to make all kpm mappings writable. Failures are ok, as those
 * are just pagetable, GDT, etc. pages.
 */
static void
xen_kpm_finish_init(void)
{
	pfn_t gdtpfn = mmu_btop(CPU->cpu_m.mcpu_gdtpa);
	pfn_t pfn;
	page_t *pp;

	for (pfn = 0; pfn < mfn_count; ++pfn) {
		/*
		 * skip gdt
		 */
		if (pfn == gdtpfn)
			continue;

		/*
		 * p_index is a hint that this is a pagetable
		 */
		pp = page_numtopp_nolock(pfn);
		if (pp && pp->p_index) {
			pp->p_index = 0;
			continue;
		}
		(void) xen_kpm_page(pfn, PT_VALID | PT_WRITABLE);
	}
}
#endif

/*
 * Routine to pre-allocate data structures for hat_kern_setup(). It computes
 * how many pagetables it needs by walking the boot loader's page tables.
 */
/*ARGSUSED*/
void
hat_kern_alloc(
	caddr_t segmap_base,
	size_t segmap_size,
	caddr_t ekernelheap)
{
	uintptr_t last_va = (uintptr_t)-1;	/* catch 1st time */
	uintptr_t va = 0;
	size_t size;
	pfn_t pfn;
	uint_t prot;
	uint_t table_cnt = 1;
	uint_t mapping_cnt;
	level_t start_level;
	level_t l;
	struct memlist *pmem;
	level_t lpagel = mmu.max_page_level;
	uint64_t paddr;
	int64_t psize;
	int nwindows;

	if (kpm_size > 0) {
		/*
		 * Create the kpm page tables. When running on the
		 * hypervisor these are made read/only at first.
		 * Later we'll add write permission where possible.
		 */
		for (pmem = phys_install; pmem; pmem = pmem->ml_next) {
			paddr = pmem->ml_address;
			psize = pmem->ml_size;
			while (psize >= MMU_PAGESIZE) {
				/* find the largest page size */
				for (l = lpagel; l > 0; l--) {
					if ((paddr & LEVEL_OFFSET(l)) == 0 &&
					    psize > LEVEL_SIZE(l))
						break;
				}

#if defined(__xpv)
				/*
				 * Create read/only mappings to avoid
				 * conflicting with pagetable usage
				 */
				xen_kpm_create(paddr, l);
#else
				kbm_map((uintptr_t)kpm_vbase + paddr, paddr,
				    l, 1);
#endif
				paddr += LEVEL_SIZE(l);
				psize -= LEVEL_SIZE(l);
			}
		}
	}

	/*
	 * If this machine doesn't have a kpm segment, we need to allocate
	 * a small number of 'windows' which can be used to map pagetables.
	 */
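	/*
	 * NOTE: the choice of two windows per CPU is presumably so that a
	 * CPU can have a pagetable and one additional page (e.g. a copy
	 * source) mapped at the same time; the windows themselves are
	 * consumed by the pagetable access code in hat_i86.c.
	 */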
	nwindows = (kpm_size == 0) ? 2 * NCPU : 0;

#if defined(__xpv)
	/*
	 * On a hypervisor, these windows are also used by the xpv_panic
	 * code, where we need one window for each level of the pagetable
	 * hierarchy.
	 */
	nwindows = MAX(nwindows, mmu.max_level);
#endif

	if (nwindows != 0) {
		/*
		 * Create the page windows and 1 page of VA in
		 * which we map the PTEs of those windows.
		 */
		mmu.pwin_base = vmem_xalloc(heap_arena, nwindows * MMU_PAGESIZE,
		    LEVEL_SIZE(1), 0, 0, NULL, NULL, VM_SLEEP);
		ASSERT(nwindows <= MMU_PAGESIZE / mmu.pte_size);
		mmu.pwin_pte_va = vmem_xalloc(heap_arena, MMU_PAGESIZE,
		    MMU_PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);

		/*
		 * Find/Create the page table window mappings.
		 */
		paddr = 0;
		(void) find_pte((uintptr_t)mmu.pwin_base, &paddr, 0, 0);
		ASSERT(paddr != 0);
		ASSERT((paddr & MMU_PAGEOFFSET) == 0);
		mmu.pwin_pte_pa = paddr;
#ifdef __xpv
		(void) find_pte((uintptr_t)mmu.pwin_pte_va, NULL, 0, 0);
		kbm_read_only((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa);
#else
		kbm_map((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa, 0, 1);
#endif
	}

	/*
	 * Walk the boot loader's page tables and figure out
	 * how many tables and page mappings there will be.
	 */
	while (kbm_probe(&va, &size, &pfn, &prot) != 0) {
		/*
		 * At each level, if the last_va falls into a new htable,
		 * increment table_cnt. We can stop at the 1st level where
		 * they are in the same htable.
		 */
		start_level = 0;
		while (start_level <= mmu.max_page_level) {
			if (size == LEVEL_SIZE(start_level))
				break;
			start_level++;
		}

		for (l = start_level; l < mmu.max_level; ++l) {
			if (va >> LEVEL_SHIFT(l + 1) ==
			    last_va >> LEVEL_SHIFT(l + 1))
				break;
			++table_cnt;
		}
		last_va = va;
		l = (start_level == 0) ? 1 : start_level;
		va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l);
	}

	/*
	 * Besides the boot loader mappings, we're going to fill in
	 * the entire top level page table for the kernel. Make sure there's
	 * enough reserve for that too.
	 */
	table_cnt += mmu.top_level_count - ((kernelbase >>
	    LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));

#if defined(__i386)
	/*
	 * The 32 bit PAE hat allocates tables one level below the top when
	 * kernelbase isn't 1 Gig aligned. We'll just be sloppy and allocate
	 * a bunch more to the reserve. Any unused will be returned later.
	 * Note we've already counted these mappings, just not the extra
	 * pagetables.
	 */
	if (mmu.pae_hat != 0 && (kernelbase & LEVEL_OFFSET(mmu.max_level)) != 0)
		table_cnt += mmu.ptes_per_table -
		    ((kernelbase & LEVEL_OFFSET(mmu.max_level)) >>
		    LEVEL_SHIFT(mmu.max_level - 1));
#endif

	/*
	 * Add 1/4 more into table_cnt for extra slop. The unused
	 * slop is freed back when we htable_adjust_reserve() later.
	 */
	table_cnt += table_cnt >> 2;

	/*
	 * We only need mapping entries (hments) for shared pages.
	 * This should be far, far fewer than the total possible;
	 * we'll allocate enough for 1/16 of all possible PTEs.
	 */
	mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4;

	/*
	 * Now create the initial htable/hment reserves
	 */
	htable_initial_reserve(table_cnt);
	hment_reserve(mapping_cnt);
	x86pte_cpu_init(CPU);
}


/*
 * This routine handles the work of creating the kernel's initial mappings
 * by deciphering the mappings in the page tables created by the boot program.
 *
 * We maintain large page mappings, but only to a level 1 pagesize.
 * The boot loader can only add new mappings once this function starts.
 * In particular it can not change the pagesize used for any existing
 * mappings or this code breaks!
 */

void
hat_kern_setup(void)
{
	/*
	 * Attach htables to the existing pagetables
	 */
	/* BEGIN CSTYLED */
	htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
#ifdef __xpv
	    mmu_btop(xen_info->pt_base - ONE_GIG));
#else
	    mmu_btop(getcr3()));
#endif
	/* END CSTYLED */

#if defined(__i386) && !defined(__xpv)
	CPU->cpu_tss->tss_cr3 = dftss0->tss_cr3 = getcr3();
#endif /* __i386 */

#if defined(__xpv) && defined(__amd64)
	/*
	 * Try to make the kpm mappings r/w. Failures here are OK, as
	 * it's probably just a pagetable
	 */
	xen_kpm_finish_init();
#endif

	/*
	 * The kernel HAT is now officially open for business.
	 */
	khat_running = 1;

	CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
	CPU->cpu_current_hat = kas.a_hat;
}