xref: /titanic_50/usr/src/uts/i86pc/vm/i86_mmu.c (revision ff3124eff995e6cd8ebd8c6543648e0670920034)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/t_lock.h>
29 #include <sys/memlist.h>
30 #include <sys/cpuvar.h>
31 #include <sys/vmem.h>
32 #include <sys/mman.h>
33 #include <sys/vm.h>
34 #include <sys/kmem.h>
35 #include <sys/cmn_err.h>
36 #include <sys/debug.h>
37 #include <sys/vm_machparam.h>
38 #include <sys/tss.h>
39 #include <sys/vnode.h>
40 #include <vm/hat.h>
41 #include <vm/anon.h>
42 #include <vm/as.h>
43 #include <vm/page.h>
44 #include <vm/seg.h>
45 #include <vm/seg_kmem.h>
46 #include <vm/seg_map.h>
47 #include <vm/hat_i86.h>
48 #include <sys/promif.h>
49 #include <sys/x86_archext.h>
50 #include <sys/systm.h>
51 #include <sys/archsystm.h>
52 #include <sys/sunddi.h>
53 #include <sys/ddidmareq.h>
54 #include <sys/controlregs.h>
55 #include <sys/reboot.h>
56 #include <sys/kdi.h>
57 #include <sys/bootconf.h>
58 #include <sys/bootsvcs.h>
59 #include <sys/bootinfo.h>
60 #include <vm/kboot_mmu.h>
61 
62 #ifdef __xpv
63 #include <sys/hypervisor.h>
64 #endif
65 
66 caddr_t
67 i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
68 {
69 	caddr_t addr;
70 	caddr_t addr1;
71 	page_t *pp;
72 
73 	addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);
74 
75 	for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
76 		pp = page_numtopp_nolock(pf);
77 		if (pp == NULL) {
78 			hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
79 			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
80 		} else {
81 			hat_memload(kas.a_hat, addr, pp,
82 			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
83 		}
84 	}
85 
86 	return (addr1);
87 }
88 
89 /*
90  * This routine is like page_numtopp, but accepts only free pages, which
91  * it allocates (unfrees) and returns with the exclusive lock held.
92  * It is used by machdep.c/dma_init() to find contiguous free pages.
93  *
94  * XXX this and some others should probably be in vm_machdep.c
95  */
96 page_t *
97 page_numtopp_alloc(pfn_t pfnum)
98 {
99 	page_t *pp;
100 
101 retry:
102 	pp = page_numtopp_nolock(pfnum);
103 	if (pp == NULL) {
104 		return (NULL);
105 	}
106 
107 	if (!page_trylock(pp, SE_EXCL)) {
108 		return (NULL);
109 	}
110 
111 	if (page_pptonum(pp) != pfnum) {
112 		page_unlock(pp);
113 		goto retry;
114 	}
115 
116 	if (!PP_ISFREE(pp)) {
117 		page_unlock(pp);
118 		return (NULL);
119 	}
120 	if (pp->p_szc) {
121 		page_demote_free_pages(pp);
122 		page_unlock(pp);
123 		goto retry;
124 	}
125 
126 	/* If associated with a vnode, destroy mappings */
127 
128 	if (pp->p_vnode) {
129 
130 		page_destroy_free(pp);
131 
132 		if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
133 			return (NULL);
134 		}
135 
136 		if (page_pptonum(pp) != pfnum) {
137 			page_unlock(pp);
138 			goto retry;
139 		}
140 	}
141 
142 	if (!PP_ISFREE(pp)) {
143 		page_unlock(pp);
144 		return (NULL);
145 	}
146 
147 	if (!page_reclaim(pp, (kmutex_t *)NULL))
148 		return (NULL);
149 
150 	return (pp);
151 }
152 
/*
 * Nonzero once the kernel HAT is running, i.e. once we are no longer
 * using boot's page tables.  The flag is not set early in boot;
 * hat_kern_setup() sets it to 1, and va_to_pfn() panics if it is called
 * after that point.
 */
uint_t khat_running = 0;
158 
159 /*
160  * This procedure is callable only while the boot loader is in charge of the
161  * MMU. It assumes that PA == VA for page table pointers.  It doesn't live in
162  * kboot_mmu.c since it's used from common code.
163  */
164 pfn_t
165 va_to_pfn(void *vaddr)
166 {
167 	uintptr_t	des_va = ALIGN2PAGE(vaddr);
168 	uintptr_t	va = des_va;
169 	size_t		len;
170 	uint_t		prot;
171 	pfn_t		pfn;
172 
173 	if (khat_running)
174 		panic("va_to_pfn(): called too late\n");
175 
176 	if (kbm_probe(&va, &len, &pfn, &prot) == 0)
177 		return (PFN_INVALID);
178 	if (va > des_va)
179 		return (PFN_INVALID);
180 	if (va < des_va)
181 		pfn += mmu_btop(des_va - va);
182 	return (pfn);
183 }
184 
185 /*
186  * Initialize a special area in the kernel that always holds some PTEs for
187  * faster performance. This always holds segmap's PTEs.
188  * In the 32 bit kernel this maps the kernel heap too.
189  */
190 void
191 hat_kmap_init(uintptr_t base, size_t len)
192 {
193 	uintptr_t map_addr;	/* base rounded down to large page size */
194 	uintptr_t map_eaddr;	/* base + len rounded up */
195 	size_t map_len;
196 	caddr_t ptes;		/* mapping area in kernel for kmap ptes */
197 	size_t window_size;	/* size of mapping area for ptes */
198 	ulong_t htable_cnt;	/* # of page tables to cover map_len */
199 	ulong_t i;
200 	htable_t *ht;
201 	uintptr_t va;
202 
203 	/*
204 	 * We have to map in an area that matches an entire page table.
205 	 * The PTEs are large page aligned to avoid spurious pagefaults
206 	 * on the hypervisor.
207 	 */
208 	map_addr = base & LEVEL_MASK(1);
209 	map_eaddr = (base + len + LEVEL_SIZE(1) - 1) & LEVEL_MASK(1);
210 	map_len = map_eaddr - map_addr;
211 	window_size = mmu_btop(map_len) * mmu.pte_size;
212 	window_size = (window_size + LEVEL_SIZE(1)) & LEVEL_MASK(1);
213 	htable_cnt = map_len >> LEVEL_SHIFT(1);
214 
215 	/*
216 	 * allocate vmem for the kmap_ptes
217 	 */
218 	ptes = vmem_xalloc(heap_arena, window_size, LEVEL_SIZE(1), 0,
219 	    0, NULL, NULL, VM_SLEEP);
220 	mmu.kmap_htables =
221 	    kmem_alloc(htable_cnt * sizeof (htable_t *), KM_SLEEP);
222 
223 	/*
224 	 * Map the page tables that cover kmap into the allocated range.
225 	 * Note we don't ever htable_release() the kmap page tables - they
226 	 * can't ever be stolen, freed, etc.
227 	 */
228 	for (va = map_addr, i = 0; i < htable_cnt; va += LEVEL_SIZE(1), ++i) {
229 		ht = htable_create(kas.a_hat, va, 0, NULL);
230 		if (ht == NULL)
231 			panic("hat_kmap_init: ht == NULL");
232 		mmu.kmap_htables[i] = ht;
233 
234 		hat_devload(kas.a_hat, ptes + i * MMU_PAGESIZE,
235 		    MMU_PAGESIZE, ht->ht_pfn,
236 #ifdef __xpv
237 		    PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
238 #else
239 		    PROT_READ | PROT_WRITE | HAT_NOSYNC | HAT_UNORDERED_OK,
240 #endif
241 		    HAT_LOAD | HAT_LOAD_NOCONSIST);
242 	}
243 
244 	/*
245 	 * set information in mmu to activate handling of kmap
246 	 */
247 	mmu.kmap_addr = map_addr;
248 	mmu.kmap_eaddr = map_eaddr;
249 	mmu.kmap_ptes = (x86pte_t *)ptes;
250 }
251 
/*
 * Base virtual address and size of the kpm segment; both are defined
 * elsewhere.  Physical address pa is mapped at kpm_vbase + pa.  A
 * kpm_size of zero means this machine has no kpm segment.
 */
extern caddr_t	kpm_vbase;
extern size_t	kpm_size;
254 
255 #ifdef __xpv
256 /*
257  * Create the initial segkpm mappings for the hypervisor. To avoid having
258  * to deal with page tables being read only, we make all mappings
259  * read only at first.
260  */
261 static void
262 xen_kpm_create(paddr_t paddr, level_t lvl)
263 {
264 	ulong_t pg_off;
265 
266 	for (pg_off = 0; pg_off < LEVEL_SIZE(lvl); pg_off += MMU_PAGESIZE) {
267 		kbm_map((uintptr_t)kpm_vbase + paddr, (paddr_t)0, 0, 1);
268 		kbm_read_only((uintptr_t)kpm_vbase + paddr + pg_off,
269 		    paddr + pg_off);
270 	}
271 }
272 
273 /*
274  * Try to make all kpm mappings writable. Failures are ok, as those
275  * are just pagetable, GDT, etc. pages.
276  */
277 static void
278 xen_kpm_finish_init(void)
279 {
280 	pfn_t gdtpfn = mmu_btop(CPU->cpu_m.mcpu_gdtpa);
281 	pfn_t pfn;
282 	page_t *pp;
283 
284 	for (pfn = 0; pfn < mfn_count; ++pfn) {
285 		/*
286 		 * skip gdt
287 		 */
288 		if (pfn == gdtpfn)
289 			continue;
290 
291 		/*
292 		 * p_index is a hint that this is a pagetable
293 		 */
294 		pp = page_numtopp_nolock(pfn);
295 		if (pp && pp->p_index) {
296 			pp->p_index = 0;
297 			continue;
298 		}
299 		(void) xen_kpm_page(pfn, PT_VALID | PT_WRITABLE);
300 	}
301 }
302 #endif
303 
304 /*
305  * Routine to pre-allocate data structures for hat_kern_setup(). It computes
306  * how many pagetables it needs by walking the boot loader's page tables.
307  */
308 /*ARGSUSED*/
309 void
310 hat_kern_alloc(
311 	caddr_t	segmap_base,
312 	size_t	segmap_size,
313 	caddr_t	ekernelheap)
314 {
315 	uintptr_t	last_va = (uintptr_t)-1;	/* catch 1st time */
316 	uintptr_t	va = 0;
317 	size_t		size;
318 	pfn_t		pfn;
319 	uint_t		prot;
320 	uint_t		table_cnt = 1;
321 	uint_t		mapping_cnt;
322 	level_t		start_level;
323 	level_t		l;
324 	struct memlist	*pmem;
325 	level_t		lpagel = mmu.max_page_level;
326 	uint64_t	paddr;
327 	int64_t		psize;
328 	int		nwindows;
329 
330 	if (kpm_size > 0) {
331 		/*
332 		 * Create the kpm page tables.  When running on the
333 		 * hypervisor these are made read/only at first.
334 		 * Later we'll add write permission where possible.
335 		 */
336 		for (pmem = phys_install; pmem; pmem = pmem->next) {
337 			paddr = pmem->address;
338 			psize = pmem->size;
339 			while (psize >= MMU_PAGESIZE) {
340 				/* find the largest page size */
341 				for (l = lpagel; l > 0; l--) {
342 					if ((paddr & LEVEL_OFFSET(l)) == 0 &&
343 					    psize > LEVEL_SIZE(l))
344 						break;
345 				}
346 
347 #if defined(__xpv)
348 				/*
349 				 * Create read/only mappings to avoid
350 				 * conflicting with pagetable usage
351 				 */
352 				xen_kpm_create(paddr, l);
353 #else
354 				kbm_map((uintptr_t)kpm_vbase + paddr, paddr,
355 				    l, 1);
356 #endif
357 				paddr += LEVEL_SIZE(l);
358 				psize -= LEVEL_SIZE(l);
359 			}
360 		}
361 	}
362 
363 	/*
364 	 * If this machine doesn't have a kpm segment, we need to allocate
365 	 * a small number of 'windows' which can be used to map pagetables.
366 	 */
367 	nwindows = (kpm_size == 0) ? 2 * NCPU : 0;
368 
369 #if defined(__xpv)
370 	/*
371 	 * On a hypervisor, these windows are also used by the xpv_panic
372 	 * code, where we need one window for each level of the pagetable
373 	 * hierarchy.
374 	 */
375 	nwindows = MAX(nwindows, mmu.max_level);
376 #endif
377 
378 	if (nwindows != 0) {
379 		/*
380 		 * Create the page windows and 1 page of VA in
381 		 * which we map the PTEs of those windows.
382 		 */
383 		mmu.pwin_base = vmem_xalloc(heap_arena, nwindows * MMU_PAGESIZE,
384 		    LEVEL_SIZE(1), 0, 0, NULL, NULL, VM_SLEEP);
385 		ASSERT(nwindows <= MMU_PAGESIZE / mmu.pte_size);
386 		mmu.pwin_pte_va = vmem_xalloc(heap_arena, MMU_PAGESIZE,
387 		    MMU_PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
388 
389 		/*
390 		 * Find/Create the page table window mappings.
391 		 */
392 		paddr = 0;
393 		(void) find_pte((uintptr_t)mmu.pwin_base, &paddr, 0, 0);
394 		ASSERT(paddr != 0);
395 		ASSERT((paddr & MMU_PAGEOFFSET) == 0);
396 		mmu.pwin_pte_pa = paddr;
397 #ifdef __xpv
398 		(void) find_pte((uintptr_t)mmu.pwin_pte_va, NULL, 0, 0);
399 		kbm_read_only((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa);
400 #else
401 		kbm_map((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa, 0, 1);
402 #endif
403 	}
404 
405 	/*
406 	 * Walk the boot loader's page tables and figure out
407 	 * how many tables and page mappings there will be.
408 	 */
409 	while (kbm_probe(&va, &size, &pfn, &prot) != 0) {
410 		/*
411 		 * At each level, if the last_va falls into a new htable,
412 		 * increment table_cnt. We can stop at the 1st level where
413 		 * they are in the same htable.
414 		 */
415 		start_level = 0;
416 		while (start_level <= mmu.max_page_level) {
417 			if (size == LEVEL_SIZE(start_level))
418 				break;
419 			start_level++;
420 		}
421 
422 		for (l = start_level; l < mmu.max_level; ++l) {
423 			if (va >> LEVEL_SHIFT(l + 1) ==
424 			    last_va >> LEVEL_SHIFT(l + 1))
425 				break;
426 			++table_cnt;
427 		}
428 		last_va = va;
429 		l = (start_level == 0) ? 1 : start_level;
430 		va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l);
431 	}
432 
433 	/*
434 	 * Besides the boot loader mappings, we're going to fill in
435 	 * the entire top level page table for the kernel. Make sure there's
436 	 * enough reserve for that too.
437 	 */
438 	table_cnt += mmu.top_level_count - ((kernelbase >>
439 	    LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));
440 
441 #if defined(__i386)
442 	/*
443 	 * The 32 bit PAE hat allocates tables one level below the top when
444 	 * kernelbase isn't 1 Gig aligned. We'll just be sloppy and allocate
445 	 * a bunch more to the reserve. Any unused will be returned later.
446 	 * Note we've already counted these mappings, just not the extra
447 	 * pagetables.
448 	 */
449 	if (mmu.pae_hat != 0 && (kernelbase & LEVEL_OFFSET(mmu.max_level)) != 0)
450 		table_cnt += mmu.ptes_per_table -
451 		    ((kernelbase & LEVEL_OFFSET(mmu.max_level)) >>
452 		    LEVEL_SHIFT(mmu.max_level - 1));
453 #endif
454 
455 	/*
456 	 * Add 1/4 more into table_cnt for extra slop.  The unused
457 	 * slop is freed back when we htable_adjust_reserve() later.
458 	 */
459 	table_cnt += table_cnt >> 2;
460 
461 	/*
462 	 * We only need mapping entries (hments) for shared pages.
463 	 * This should be far, far fewer than the total possible,
464 	 * We'll allocate enough for 1/16 of all possible PTEs.
465 	 */
466 	mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4;
467 
468 	/*
469 	 * Now create the initial htable/hment reserves
470 	 */
471 	htable_initial_reserve(table_cnt);
472 	hment_reserve(mapping_cnt);
473 	x86pte_cpu_init(CPU);
474 }
475 
476 
477 /*
478  * This routine handles the work of creating the kernel's initial mappings
479  * by deciphering the mappings in the page tables created by the boot program.
480  *
481  * We maintain large page mappings, but only to a level 1 pagesize.
482  * The boot loader can only add new mappings once this function starts.
483  * In particular it can not change the pagesize used for any existing
484  * mappings or this code breaks!
485  */
486 
487 void
488 hat_kern_setup(void)
489 {
490 	/*
491 	 * Attach htables to the existing pagetables
492 	 */
493 	/* BEGIN CSTYLED */
494 	htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
495 #ifdef __xpv
496 	    mmu_btop(xen_info->pt_base - ONE_GIG));
497 #else
498 	    mmu_btop(getcr3()));
499 #endif
500 	/* END CSTYLED */
501 
502 #if defined(__i386) && !defined(__xpv)
503 	CPU->cpu_tss->tss_cr3 = dftss0->tss_cr3 = getcr3();
504 #endif /* __i386 */
505 
506 #if defined(__xpv) && defined(__amd64)
507 	/*
508 	 * Try to make the kpm mappings r/w. Failures here are OK, as
509 	 * it's probably just a pagetable
510 	 */
511 	xen_kpm_finish_init();
512 #endif
513 
514 	/*
515 	 * The kernel HAT is now officially open for business.
516 	 */
517 	khat_running = 1;
518 
519 	CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
520 	CPU->cpu_current_hat = kas.a_hat;
521 }
522