xref: /illumos-gate/usr/src/uts/i86pc/vm/i86_mmu.c (revision 66582b606a8194f7f3ba5b3a3a6dca5b0d346361)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * Copyright 2018 Joyent, Inc.
26  */
27 
28 #include <sys/t_lock.h>
29 #include <sys/memlist.h>
30 #include <sys/cpuvar.h>
31 #include <sys/vmem.h>
32 #include <sys/mman.h>
33 #include <sys/vm.h>
34 #include <sys/kmem.h>
35 #include <sys/cmn_err.h>
36 #include <sys/debug.h>
37 #include <sys/vm_machparam.h>
38 #include <sys/tss.h>
39 #include <sys/vnode.h>
40 #include <vm/hat.h>
41 #include <vm/anon.h>
42 #include <vm/as.h>
43 #include <vm/page.h>
44 #include <vm/seg.h>
45 #include <vm/seg_kmem.h>
46 #include <vm/seg_map.h>
47 #include <vm/hat_i86.h>
48 #include <sys/promif.h>
49 #include <sys/x86_archext.h>
50 #include <sys/systm.h>
51 #include <sys/archsystm.h>
52 #include <sys/sunddi.h>
53 #include <sys/ddidmareq.h>
54 #include <sys/controlregs.h>
55 #include <sys/reboot.h>
56 #include <sys/kdi.h>
57 #include <sys/bootconf.h>
58 #include <sys/bootsvcs.h>
59 #include <sys/bootinfo.h>
60 #include <vm/kboot_mmu.h>
61 
62 #ifdef __xpv
63 #include <sys/hypervisor.h>
64 #endif
65 
/*
 * Evaluates to true when the given CPU's current HAT pointer is set and is
 * something other than the kernel's HAT (kas.a_hat).  Note that the cpu
 * argument is evaluated twice.
 */
#define	ON_USER_HAT(cpu) \
	((cpu)->cpu_m.mcpu_current_hat != NULL && \
	(cpu)->cpu_m.mcpu_current_hat != kas.a_hat)
69 
/*
 * Flag is not set early in boot. Once it is set (hat_kern_setup() does so)
 * we are no longer using boot's page tables, and boot-only helpers such as
 * va_to_pfn() panic if they are called.
 */
uint_t khat_running = 0;
75 
76 /*
77  * This procedure is callable only while the boot loader is in charge of the
78  * MMU. It assumes that PA == VA for page table pointers.  It doesn't live in
79  * kboot_mmu.c since it's used from common code.
80  */
81 pfn_t
82 va_to_pfn(void *vaddr)
83 {
84 	uintptr_t	des_va = ALIGN2PAGE(vaddr);
85 	uintptr_t	va = des_va;
86 	size_t		len;
87 	uint_t		prot;
88 	pfn_t		pfn;
89 
90 	if (khat_running)
91 		panic("va_to_pfn(): called too late\n");
92 
93 	if (kbm_probe(&va, &len, &pfn, &prot) == 0)
94 		return (PFN_INVALID);
95 	if (va > des_va)
96 		return (PFN_INVALID);
97 	if (va < des_va)
98 		pfn += mmu_btop(des_va - va);
99 	return (pfn);
100 }
101 
102 /*
103  * Initialize a special area in the kernel that always holds some PTEs for
104  * faster performance. This always holds segmap's PTEs.
105  * In the 32 bit kernel this maps the kernel heap too.
106  */
107 void
108 hat_kmap_init(uintptr_t base, size_t len)
109 {
110 	uintptr_t map_addr;	/* base rounded down to large page size */
111 	uintptr_t map_eaddr;	/* base + len rounded up */
112 	size_t map_len;
113 	caddr_t ptes;		/* mapping area in kernel for kmap ptes */
114 	size_t window_size;	/* size of mapping area for ptes */
115 	ulong_t htable_cnt;	/* # of page tables to cover map_len */
116 	ulong_t i;
117 	htable_t *ht;
118 	uintptr_t va;
119 
120 	/*
121 	 * We have to map in an area that matches an entire page table.
122 	 * The PTEs are large page aligned to avoid spurious pagefaults
123 	 * on the hypervisor.
124 	 */
125 	map_addr = base & LEVEL_MASK(1);
126 	map_eaddr = (base + len + LEVEL_SIZE(1) - 1) & LEVEL_MASK(1);
127 	map_len = map_eaddr - map_addr;
128 	window_size = mmu_btop(map_len) * mmu.pte_size;
129 	window_size = (window_size + LEVEL_SIZE(1)) & LEVEL_MASK(1);
130 	htable_cnt = map_len >> LEVEL_SHIFT(1);
131 
132 	/*
133 	 * allocate vmem for the kmap_ptes
134 	 */
135 	ptes = vmem_xalloc(heap_arena, window_size, LEVEL_SIZE(1), 0,
136 	    0, NULL, NULL, VM_SLEEP);
137 	mmu.kmap_htables =
138 	    kmem_alloc(htable_cnt * sizeof (htable_t *), KM_SLEEP);
139 
140 	/*
141 	 * Map the page tables that cover kmap into the allocated range.
142 	 * Note we don't ever htable_release() the kmap page tables - they
143 	 * can't ever be stolen, freed, etc.
144 	 */
145 	for (va = map_addr, i = 0; i < htable_cnt; va += LEVEL_SIZE(1), ++i) {
146 		ht = htable_create(kas.a_hat, va, 0, NULL);
147 		if (ht == NULL)
148 			panic("hat_kmap_init: ht == NULL");
149 		mmu.kmap_htables[i] = ht;
150 
151 		hat_devload(kas.a_hat, ptes + i * MMU_PAGESIZE,
152 		    MMU_PAGESIZE, ht->ht_pfn,
153 #ifdef __xpv
154 		    PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
155 #else
156 		    PROT_READ | PROT_WRITE | HAT_NOSYNC | HAT_UNORDERED_OK,
157 #endif
158 		    HAT_LOAD | HAT_LOAD_NOCONSIST);
159 	}
160 
161 	/*
162 	 * set information in mmu to activate handling of kmap
163 	 */
164 	mmu.kmap_addr = map_addr;
165 	mmu.kmap_eaddr = map_eaddr;
166 	mmu.kmap_ptes = (x86pte_t *)ptes;
167 }
168 
169 extern caddr_t	kpm_vbase;
170 extern size_t	kpm_size;
171 
172 #ifdef __xpv
/*
 * Create the initial segkpm mappings for the hypervisor. To avoid having
 * to deal with page tables being read only, we make all mappings
 * read only at first.
 *
 * paddr is the physical address of the region and lvl the page level whose
 * size (LEVEL_SIZE(lvl)) is to be covered.  The region is walked one
 * MMU_PAGESIZE page at a time and kbm_read_only() is called for each page at
 * kpm_vbase + paddr + offset.
 */
static void
xen_kpm_create(paddr_t paddr, level_t lvl)
{
	ulong_t pg_off;

	for (pg_off = 0; pg_off < LEVEL_SIZE(lvl); pg_off += MMU_PAGESIZE) {
		/*
		 * NOTE(review): this kbm_map() call does not involve pg_off,
		 * so it is issued with identical arguments on every pass.
		 * Presumably its purpose is to make sure a page table exists
		 * for kpm_vbase + paddr before kbm_read_only() installs the
		 * real mapping -- confirm against kbm_map() and
		 * kbm_read_only() before changing it.
		 */
		kbm_map((uintptr_t)kpm_vbase + paddr, (paddr_t)0, 0, 1);
		kbm_read_only((uintptr_t)kpm_vbase + paddr + pg_off,
		    paddr + pg_off);
	}
}
189 
190 /*
191  * Try to make all kpm mappings writable. Failures are ok, as those
192  * are just pagetable, GDT, etc. pages.
193  */
194 static void
195 xen_kpm_finish_init(void)
196 {
197 	pfn_t gdtpfn = mmu_btop(CPU->cpu_m.mcpu_gdtpa);
198 	pfn_t pfn;
199 	page_t *pp;
200 
201 	for (pfn = 0; pfn < mfn_count; ++pfn) {
202 		/*
203 		 * skip gdt
204 		 */
205 		if (pfn == gdtpfn)
206 			continue;
207 
208 		/*
209 		 * p_index is a hint that this is a pagetable
210 		 */
211 		pp = page_numtopp_nolock(pfn);
212 		if (pp && pp->p_index) {
213 			pp->p_index = 0;
214 			continue;
215 		}
216 		(void) xen_kpm_page(pfn, PT_VALID | PT_WRITABLE);
217 	}
218 }
219 #endif
220 
/*
 * Routine to pre-allocate data structures for hat_kern_setup(). It computes
 * how many pagetables it needs by walking the boot loader's page tables.
 *
 * The work is done in this order:
 *   1. if there is a kpm segment (kpm_size > 0), create kpm mappings for
 *      every range on the phys_install list;
 *   2. if page table windows are needed, allocate VA for them and for the
 *      page that maps their PTEs;
 *   3. walk the boot mappings with kbm_probe() to estimate the number of
 *      page tables;
 *   4. create the initial htable and hment reserves from that estimate.
 *
 * None of the three arguments is referenced by the body (hence the
 * ARGSUSED annotation below).
 */
/*ARGSUSED*/
void
hat_kern_alloc(
	caddr_t	segmap_base,
	size_t	segmap_size,
	caddr_t	ekernelheap)
{
	uintptr_t	last_va = (uintptr_t)-1;	/* catch 1st time */
	uintptr_t	va = 0;
	size_t		size;
	pfn_t		pfn;
	uint_t		prot;
	uint_t		table_cnt = 1;
	uint_t		mapping_cnt;
	level_t		start_level;
	level_t		l;
	struct memlist	*pmem;
	level_t		lpagel = mmu.max_page_level;
	uint64_t	paddr;
	int64_t		psize;
	int		nwindows;

	if (kpm_size > 0) {
		/*
		 * Create the kpm page tables.  When running on the
		 * hypervisor these are made read/only at first.
		 * Later we'll add write permission where possible.
		 */
		for (pmem = phys_install; pmem; pmem = pmem->ml_next) {
			paddr = pmem->ml_address;
			psize = pmem->ml_size;
			while (psize >= MMU_PAGESIZE) {
				/*
				 * find the largest page size: the address
				 * must be aligned to it and the remaining
				 * size must be strictly larger than it.  If
				 * no large level qualifies the loop ends
				 * with l == 0.
				 */
				for (l = lpagel; l > 0; l--) {
					if ((paddr & LEVEL_OFFSET(l)) == 0 &&
					    psize > LEVEL_SIZE(l))
						break;
				}

#if defined(__xpv)
				/*
				 * Create read/only mappings to avoid
				 * conflicting with pagetable usage
				 */
				xen_kpm_create(paddr, l);
#else
				kbm_map((uintptr_t)kpm_vbase + paddr, paddr,
				    l, 1);
#endif
				/* advance past the piece just mapped */
				paddr += LEVEL_SIZE(l);
				psize -= LEVEL_SIZE(l);
			}
		}
	}

	/*
	 * If this machine doesn't have a kpm segment, we need to allocate
	 * a small number of 'windows' which can be used to map pagetables.
	 */
	nwindows = (kpm_size == 0) ? 2 * NCPU : 0;

#if defined(__xpv)
	/*
	 * On a hypervisor, these windows are also used by the xpv_panic
	 * code, where we need one window for each level of the pagetable
	 * hierarchy.
	 */
	nwindows = MAX(nwindows, mmu.max_level);
#endif

	if (nwindows != 0) {
		/*
		 * Create the page windows and 1 page of VA in
		 * which we map the PTEs of those windows.
		 * The ASSERT checks that the PTEs for all of the windows
		 * fit within a single page.
		 */
		mmu.pwin_base = vmem_xalloc(heap_arena, nwindows * MMU_PAGESIZE,
		    LEVEL_SIZE(1), 0, 0, NULL, NULL, VM_SLEEP);
		ASSERT(nwindows <= MMU_PAGESIZE / mmu.pte_size);
		mmu.pwin_pte_va = vmem_xalloc(heap_arena, MMU_PAGESIZE,
		    MMU_PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);

		/*
		 * Find/Create the page table window mappings.
		 * find_pte() hands back, through paddr, the value recorded
		 * below as mmu.pwin_pte_pa; it is expected to be non-zero
		 * and page aligned.
		 */
		paddr = 0;
		(void) find_pte((uintptr_t)mmu.pwin_base, &paddr, 0, 0);
		ASSERT(paddr != 0);
		ASSERT((paddr & MMU_PAGEOFFSET) == 0);
		mmu.pwin_pte_pa = paddr;
#ifdef __xpv
		(void) find_pte((uintptr_t)mmu.pwin_pte_va, NULL, 0, 0);
		kbm_read_only((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa);
#else
		kbm_map((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa, 0, 1);
#endif
	}

	/*
	 * Walk the boot loader's page tables and figure out
	 * how many tables and page mappings there will be.
	 */
	while (kbm_probe(&va, &size, &pfn, &prot) != 0) {
		/*
		 * At each level, if the last_va falls into a new htable,
		 * increment table_cnt. We can stop at the 1st level where
		 * they are in the same htable.
		 *
		 * start_level is the page level whose size matches the
		 * mapping that was just probed.
		 */
		start_level = 0;
		while (start_level <= mmu.max_page_level) {
			if (size == LEVEL_SIZE(start_level))
				break;
			start_level++;
		}

		for (l = start_level; l < mmu.max_level; ++l) {
			if (va >> LEVEL_SHIFT(l + 1) ==
			    last_va >> LEVEL_SHIFT(l + 1))
				break;
			++table_cnt;
		}
		last_va = va;

		/*
		 * Resume probing at the next boundary of the mapping's own
		 * level, or of level 1 for a level 0 mapping (the remaining
		 * pages below that boundary share the same lowest level
		 * table and so add nothing to the count).
		 */
		l = (start_level == 0) ? 1 : start_level;
		va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l);
	}

	/*
	 * Besides the boot loader mappings, we're going to fill in
	 * the entire top level page table for the kernel. Make sure there's
	 * enough reserve for that too.
	 */
	table_cnt += mmu.top_level_count - ((kernelbase >>
	    LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));

	/*
	 * Add 1/4 more into table_cnt for extra slop.  The unused
	 * slop is freed back when we htable_adjust_reserve() later.
	 */
	table_cnt += table_cnt >> 2;

	/*
	 * We only need mapping entries (hments) for shared pages.
	 * This should be far, far fewer than the total possible,
	 * We'll allocate enough for 1/16 of all possible PTEs.
	 */
	mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4;

	/*
	 * Now create the initial htable/hment reserves
	 */
	htable_initial_reserve(table_cnt);
	hment_reserve(mapping_cnt);
	x86pte_cpu_init(CPU);
}
378 
379 
380 /*
381  * This routine handles the work of creating the kernel's initial mappings
382  * by deciphering the mappings in the page tables created by the boot program.
383  *
384  * We maintain large page mappings, but only to a level 1 pagesize.
385  * The boot loader can only add new mappings once this function starts.
386  * In particular it can not change the pagesize used for any existing
387  * mappings or this code breaks!
388  */
389 
390 void
391 hat_kern_setup(void)
392 {
393 	/*
394 	 * Attach htables to the existing pagetables
395 	 */
396 	/* BEGIN CSTYLED */
397 	htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
398 #ifdef __xpv
399 	    mmu_btop(xen_info->pt_base - ONE_GIG));
400 #else
401 	    mmu_btop(getcr3_pa()));
402 #endif
403 	/* END CSTYLED */
404 
405 #if defined(__xpv)
406 	/*
407 	 * Try to make the kpm mappings r/w. Failures here are OK, as
408 	 * it's probably just a pagetable
409 	 */
410 	xen_kpm_finish_init();
411 #endif
412 
413 	/*
414 	 * The kernel HAT is now officially open for business.
415 	 */
416 	khat_running = 1;
417 
418 	CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
419 	CPU->cpu_current_hat = kas.a_hat;
420 }
421 
422 #ifndef __xpv
423 
424 /*
425  * Note that the INVPCID_ALL* variants can be used even in the !PCIDE case, but
426  * INVPCID_ADDR isn't.
427  */
428 static void
429 invpcid(uint64_t type, uint64_t pcid, uintptr_t addr)
430 {
431 	ulong_t	flag;
432 	uint64_t cr4;
433 
434 	if (x86_use_invpcid == 1) {
435 		ASSERT(is_x86_feature(x86_featureset, X86FSET_INVPCID));
436 		invpcid_insn(type, pcid, addr);
437 		return;
438 	}
439 
440 	switch (type) {
441 	case INVPCID_ALL_GLOBAL:
442 		flag = intr_clear();
443 		cr4 = getcr4();
444 		setcr4(cr4 & ~(ulong_t)CR4_PGE);
445 		setcr4(cr4 | CR4_PGE);
446 		intr_restore(flag);
447 		break;
448 
449 	case INVPCID_ALL_NONGLOBAL:
450 		if (!(getcr4() & CR4_PCIDE)) {
451 			reload_cr3();
452 		} else {
453 			flag = intr_clear();
454 			cr4 = getcr4();
455 			setcr4(cr4 & ~(ulong_t)CR4_PGE);
456 			setcr4(cr4 | CR4_PGE);
457 			intr_restore(flag);
458 		}
459 		break;
460 
461 	case INVPCID_ADDR:
462 		if (pcid == PCID_USER) {
463 			flag = intr_clear();
464 			ASSERT(addr < kernelbase);
465 			ASSERT(ON_USER_HAT(CPU));
466 			ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
467 			tr_mmu_flush_user_range(addr, MMU_PAGESIZE,
468 			    MMU_PAGESIZE, CPU->cpu_m.mcpu_kpti.kf_user_cr3);
469 			intr_restore(flag);
470 		} else {
471 			mmu_invlpg((caddr_t)addr);
472 		}
473 		break;
474 
475 	default:
476 		panic("unsupported invpcid(%lu)", type);
477 		break;
478 	}
479 }
480 
481 /*
482  * Flush one kernel mapping.
483  *
484  * We want to assert on kernel space here mainly for reasoning about the PCIDE
485  * case: namely, this flush should never need to flush a non-current PCID
486  * mapping.  This presumes we never have reason to flush the kernel regions
487  * available to PCID_USER (the trampolines and so on).  It also relies on
488  * PCID_KERNEL == PCID_NONE.
489  */
490 void
491 mmu_flush_tlb_kpage(uintptr_t va)
492 {
493 	ASSERT(va >= kernelbase);
494 	ASSERT(getpcid() == PCID_KERNEL);
495 	mmu_invlpg((caddr_t)va);
496 }
497 
498 /*
499  * Flush one mapping: local CPU version of hat_tlb_inval().
500  *
501  * If this is a userspace address in the PCIDE case, we need two invalidations,
502  * one for any potentially stale PCID_USER mapping, as well as any established
503  * while in the kernel.
504  */
505 void
506 mmu_flush_tlb_page(uintptr_t va)
507 {
508 	ASSERT(getpcid() == PCID_KERNEL);
509 
510 	if (va >= kernelbase) {
511 		mmu_flush_tlb_kpage(va);
512 		return;
513 	}
514 
515 	if (!(getcr4() & CR4_PCIDE)) {
516 		mmu_invlpg((caddr_t)va);
517 		return;
518 	}
519 
520 	/*
521 	 * Yes, kas will need to flush below kernelspace, at least during boot.
522 	 * But there's no PCID_USER context.
523 	 */
524 	if (ON_USER_HAT(CPU))
525 		invpcid(INVPCID_ADDR, PCID_USER, va);
526 	invpcid(INVPCID_ADDR, PCID_KERNEL, va);
527 }
528 
529 static void
530 mmu_flush_tlb_range(uintptr_t addr, size_t len, size_t pgsz)
531 {
532 	EQUIV(addr < kernelbase, (addr + len - 1) < kernelbase);
533 	ASSERT(len > 0);
534 	ASSERT(pgsz != 0);
535 
536 	if (!(getcr4() & CR4_PCIDE) || x86_use_invpcid == 1) {
537 		for (uintptr_t va = addr; va < (addr + len); va += pgsz)
538 			mmu_flush_tlb_page(va);
539 		return;
540 	}
541 
542 	/*
543 	 * As an emulated invpcid() in the PCIDE case requires jumping
544 	 * cr3s, we batch the invalidations.  We should only need to flush the
545 	 * user range if we're on a user-space HAT.
546 	 */
547 	if (addr < kernelbase && ON_USER_HAT(CPU)) {
548 		ulong_t flag = intr_clear();
549 		ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
550 		tr_mmu_flush_user_range(addr, len, pgsz,
551 		    CPU->cpu_m.mcpu_kpti.kf_user_cr3);
552 		intr_restore(flag);
553 	}
554 
555 	for (uintptr_t va = addr; va < (addr + len); va += pgsz)
556 		mmu_invlpg((caddr_t)va);
557 }
558 
559 /*
560  * MMU TLB (and PT cache) flushing on this CPU.
561  *
562  * FLUSH_TLB_ALL: invalidate everything, all PCIDs, all PT_GLOBAL.
563  * FLUSH_TLB_NONGLOBAL: invalidate all PCIDs, excluding PT_GLOBAL
564  * FLUSH_TLB_RANGE: invalidate the given range, including PCID_USER
565  * mappings as appropriate.  If using invpcid, PT_GLOBAL mappings are not
566  * invalidated.
567  */
568 void
569 mmu_flush_tlb(flush_tlb_type_t type, tlb_range_t *range)
570 {
571 	ASSERT(getpcid() == PCID_KERNEL);
572 
573 	switch (type) {
574 	case FLUSH_TLB_ALL:
575 		ASSERT(range == NULL);
576 		invpcid(INVPCID_ALL_GLOBAL, 0, 0);
577 		break;
578 
579 	case FLUSH_TLB_NONGLOBAL:
580 		ASSERT(range == NULL);
581 		invpcid(INVPCID_ALL_NONGLOBAL, 0, 0);
582 		break;
583 
584 	case FLUSH_TLB_RANGE: {
585 		mmu_flush_tlb_range(range->tr_va, TLB_RANGE_LEN(range),
586 		    LEVEL_SIZE(range->tr_level));
587 		break;
588 	}
589 
590 	default:
591 		panic("invalid call mmu_flush_tlb(%d)", type);
592 		break;
593 	}
594 }
595 
596 #endif /* ! __xpv */
597