xref: /titanic_52/usr/src/uts/i86xpv/os/xen_mmu.c (revision 02b4e56ca3a4e4a4fe9e52fca9c2972101f0e57f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 
28 #include <sys/mach_mmu.h>
29 #include <sys/machsystm.h>
30 #include <sys/cmn_err.h>
31 #include <sys/promif.h>
32 #include <sys/hypervisor.h>
33 #include <sys/bootconf.h>
34 #include <sys/ontrap.h>
35 #include <sys/rwlock.h>
36 #include <sys/sysmacros.h>
37 #include <vm/seg_kmem.h>
38 #include <vm/kboot_mmu.h>
39 #include <vm/hat_pte.h>
40 #include <vm/hat.h>
41 #include <vm/htable.h>
42 #include <vm/hat_i86.h>
43 
/*
 * Global Xen state: the start-of-day information handed to us by the
 * hypervisor and the tables used to translate between pseudo-physical
 * frame numbers (PFNs) and machine frame numbers (MFNs).
 */

/* start_info structure; remapped by xen_relocate_start_info() */
start_info_t *xen_info;
/* number of entries in mfn_list[]; PFNs >= mfn_count are rejected */
ulong_t mfn_count;
/* PFN -> MFN table, indexed by PFN */
mfn_t *mfn_list;
mfn_t *mfn_list_pages;		/* pages that make a table of mfn's */
				/* that make up the pa_to_ma table */
mfn_t *mfn_list_pages_page;	/* page of mfn's for mfn_list_pages */
/*
 * Cached result of XENMEM_maximum_ram_page; filled in lazily by
 * mfn_to_pfn() whenever it is zero.
 */
mfn_t cached_max_mfn;
/* NOTE(review): not referenced in this part of the file; set elsewhere */
uintptr_t xen_virt_start;
/* MFN -> PFN table, indexed by MFN; sparse, so lookups may fault */
pfn_t *mfn_to_pfn_mapping;
caddr_t xb_addr;		/* virtual addr for the store_mfn page */
54 
55 
/*
 * We need to prevent migration or suspension of a domU while it's
 * manipulating MFN values, as the MFN values will spontaneously
 * change. The next 4 routines provide a mechanism for that.
 * The basic idea is to use reader/writer locks: readers are any threads
 * that are manipulating MFNs. Only the thread which is going to actually
 * call HYPERVISOR_suspend() will become a writer.
 *
 * Since various places need to manipulate MFNs and also call the HAT,
 * we track if a thread acquires reader status and allow it to recursively
 * do so again. This prevents deadlocks if a migration request
 * is started and waits for some reader, but then the previous reader needs
 * to call into the HAT.
 *
 * The recursion depth is kept in the per-thread counter t_xpvcntr; only
 * the transitions 0 -> 1 and 1 -> 0 actually touch a lock.
 *
 * Rather than a single lock there is an array of NUM_M2P_LOCKS locks.
 * A reader takes only the lock selected by XM2P_HASH for the current
 * thread, while the writer (xen_start_migrate()) takes every lock in
 * the array.
 */
#define	NUM_M2P_LOCKS 128
static struct {
	krwlock_t m2p_rwlock;
	/* pad each array entry out to 64 bytes */
	char m2p_pad[64 - sizeof (krwlock_t)];	/* 64 byte cache line size */
} m2p_lock[NUM_M2P_LOCKS];

/*
 * Index into m2p_lock[] for the current thread, derived from its t_tid.
 * The mask only covers the whole array if NUM_M2P_LOCKS is a power of two.
 */
#define	XM2P_HASH	((uintptr_t)curthread->t_tid & (NUM_M2P_LOCKS - 1))
77 
78 void
79 xen_block_migrate(void)
80 {
81 	if (!DOMAIN_IS_INITDOMAIN(xen_info) &&
82 	    ++curthread->t_xpvcntr == 1)
83 		rw_enter(&m2p_lock[XM2P_HASH].m2p_rwlock, RW_READER);
84 }
85 
86 void
87 xen_allow_migrate(void)
88 {
89 	if (!DOMAIN_IS_INITDOMAIN(xen_info) &&
90 	    --curthread->t_xpvcntr == 0)
91 		rw_exit(&m2p_lock[XM2P_HASH].m2p_rwlock);
92 }
93 
94 void
95 xen_start_migrate(void)
96 {
97 	int i;
98 
99 	ASSERT(curthread->t_xpvcntr == 0);
100 	++curthread->t_xpvcntr; /* this allows calls into HAT */
101 	for (i = 0; i < NUM_M2P_LOCKS; ++i)
102 		rw_enter(&m2p_lock[i].m2p_rwlock, RW_WRITER);
103 }
104 
105 void
106 xen_end_migrate(void)
107 {
108 	int i;
109 
110 	for (i = 0; i < NUM_M2P_LOCKS; ++i)
111 		rw_exit(&m2p_lock[i].m2p_rwlock);
112 	ASSERT(curthread->t_xpvcntr == 1);
113 	--curthread->t_xpvcntr;
114 }
115 
116 /*ARGSUSED*/
117 void
118 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
119 {
120 	mmu_update_t t;
121 	maddr_t mtable = pa_to_ma(table);
122 	int retcnt;
123 
124 	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
125 	t.val = pteval;
126 	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
127 		bop_panic("HYPERVISOR_mmu_update() failed");
128 }
129 
130 /*
131  * The start_info_t and mfn_list are initially mapped in low "boot" memory.
132  * Each has a page aligned address and size. We relocate them up into the
133  * kernel's normal address space at this point in time. We also create
134  * the arrays that let the hypervisor suspend/resume a domain.
135  */
136 void
137 xen_relocate_start_info(void)
138 {
139 	maddr_t mach_addr;
140 	size_t sz;
141 	size_t sz2;
142 	offset_t off;
143 	uintptr_t addr;
144 	uintptr_t old;
145 	int i, j;
146 
147 	/*
148 	 * In dom0, we have to account for the console_info structure
149 	 * which might immediately follow the start_info in memory.
150 	 */
151 	sz = sizeof (start_info_t);
152 	if (DOMAIN_IS_INITDOMAIN(xen_info) &&
153 	    xen_info->console.dom0.info_off >= sizeof (start_info_t)) {
154 		sz += xen_info->console.dom0.info_off - sizeof (start_info_t) +
155 		    xen_info->console.dom0.info_size;
156 	}
157 	sz = P2ROUNDUP(sz, MMU_PAGESIZE);
158 	addr = (uintptr_t)vmem_alloc(heap_arena, sz, VM_SLEEP);
159 	for (off = 0; off < sz; off += MMU_PAGESIZE) {
160 		mach_addr = pa_to_ma(pfn_to_pa(va_to_pfn(
161 		    (caddr_t)xen_info + off)));
162 		kbm_map_ma(mach_addr + off, addr + off, 0);
163 	}
164 	boot_mapin((caddr_t)addr, sz);
165 	old = (uintptr_t)xen_info;
166 	xen_info = (start_info_t *)addr;
167 	for (off = 0; off < sz; off += MMU_PAGESIZE)
168 		kbm_unmap(old + off);
169 
170 	/*
171 	 * Relocate the mfn_list, any number of pages.
172 	 */
173 	sz = P2ROUNDUP(mfn_count * sizeof (mfn_t), MMU_PAGESIZE);
174 	addr = (uintptr_t)vmem_xalloc(heap_arena, sz, MMU_PAGESIZE, 0,
175 	    0, 0, 0, VM_SLEEP);
176 	for (off = 0; off < sz; off += MMU_PAGESIZE) {
177 		mach_addr =
178 		    pa_to_ma(pfn_to_pa(va_to_pfn((caddr_t)mfn_list + off)));
179 		kbm_map_ma(mach_addr, addr + off, 0);
180 	}
181 	boot_mapin((caddr_t)addr, sz);
182 	old = (uintptr_t)mfn_list;
183 	mfn_list = (mfn_t *)addr;
184 	xen_info->mfn_list = (mfn_t)addr;
185 	for (off = 0; off < sz; off += MMU_PAGESIZE)
186 		kbm_unmap(old + off);
187 
188 	/*
189 	 * Create the lists of mfn_list pages needed by suspend/resume.
190 	 * Note we skip this for domain 0 as it can't suspend/resume.
191 	 */
192 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
193 		sz2 = P2ROUNDUP(mmu_btop(sz) * sizeof (mfn_t), MMU_PAGESIZE);
194 		mfn_list_pages = kmem_zalloc(sz2, VM_SLEEP);
195 		mfn_list_pages_page = kmem_zalloc(MMU_PAGESIZE, VM_SLEEP);
196 		i = 0;
197 		for (off = 0; off < sz; off += MMU_PAGESIZE) {
198 			j = mmu_btop(off);
199 			if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) {
200 				mfn_list_pages_page[i++] =
201 				    pfn_to_mfn(va_to_pfn(&mfn_list_pages[j]));
202 			}
203 			mfn_list_pages[j] =
204 			    pfn_to_mfn(va_to_pfn((caddr_t)mfn_list + off));
205 		}
206 		HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
207 		    pfn_to_mfn(va_to_pfn(mfn_list_pages_page));
208 		HYPERVISOR_shared_info->arch.max_pfn = xen_info->nr_pages;
209 	}
210 
211 	/*
212 	 * Remap the shared info (for I/O) into high memory, too.
213 	 */
214 	sz = MMU_PAGESIZE;
215 	addr = (uintptr_t)vmem_alloc(heap_arena, sz, VM_SLEEP);
216 	kbm_map_ma(xen_info->shared_info, addr, 0);
217 	/* shared info has no PFN so don't do: boot_mapin((caddr_t)addr, sz) */
218 	old = (uintptr_t)HYPERVISOR_shared_info;
219 	HYPERVISOR_shared_info = (void *)addr;
220 	kbm_unmap(old);
221 
222 	/*
223 	 * Remap the console info into high memory, too.
224 	 */
225 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
226 		sz = MMU_PAGESIZE;
227 		addr = (uintptr_t)vmem_alloc(heap_arena, sz, VM_SLEEP);
228 		kbm_map_ma(pfn_to_pa(xen_info->console.domU.mfn), addr, 0);
229 		boot_mapin((caddr_t)addr, sz);
230 		old = (uintptr_t)HYPERVISOR_console_page;
231 		HYPERVISOR_console_page = (void *)addr;
232 		kbm_unmap(old);
233 	} else {
234 		HYPERVISOR_console_page = NULL;
235 	}
236 
237 	/*
238 	 * On domUs we need to have the xenbus page (store_mfn) mapped into
239 	 * the kernel. This is referenced as xb_addr.
240 	 */
241 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
242 		xb_addr = vmem_alloc(heap_arena, MMU_PAGESIZE, VM_SLEEP);
243 		kbm_map_ma(mfn_to_ma(xen_info->store_mfn),
244 		    (uintptr_t)xb_addr, 0);
245 		boot_mapin(xb_addr, MMU_PAGESIZE);
246 	}
247 }
248 
249 /*
250  * Generate the pfn value to use for a foreign mfn.
251  */
252 pfn_t
253 xen_assign_pfn(mfn_t mfn)
254 {
255 	pfn_t pfn;
256 
257 #ifdef DEBUG
258 	/*
259 	 * make sure this MFN isn't in our list of MFNs
260 	 */
261 	on_trap_data_t otd;
262 	uint_t	on_trap_ready = (t0.t_stk != NULL);
263 
264 	if (on_trap_ready) {
265 		if (on_trap(&otd, OT_DATA_ACCESS) == 0) {
266 			pfn = mfn_to_pfn_mapping[mfn];
267 			if (pfn < mfn_count && mfn_list[pfn] == mfn)
268 				panic("xen_assign_pfn() mfn belongs to us");
269 		}
270 		no_trap();
271 	}
272 #endif /* DEBUG */
273 
274 	if (mfn == MFN_INVALID)
275 		panic("xen_assign_pfn(MFN_INVALID) not allowed");
276 	pfn = (pfn_t)mfn | PFN_IS_FOREIGN_MFN;
277 	if (pfn == mfn)
278 		panic("xen_assign_pfn(mfn) PFN_IS_FOREIGN_MFN bit already set");
279 	return (pfn);
280 }
281 
282 void
283 xen_release_pfn(pfn_t pfn)
284 {
285 	if (pfn == PFN_INVALID)
286 		panic("xen_release_pfn(PFN_INVALID) not allowed");
287 	if ((pfn & PFN_IS_FOREIGN_MFN) == 0)
288 		panic("mfn high bit not set");
289 }
290 
291 uint_t
292 pfn_is_foreign(pfn_t pfn)
293 {
294 	if (pfn == PFN_INVALID)
295 		return (0);
296 	return ((pfn & PFN_IS_FOREIGN_MFN) != 0);
297 }
298 
299 pfn_t
300 pte2pfn(x86pte_t pte, level_t l)
301 {
302 	mfn_t mfn = PTE2MFN(pte, l);
303 
304 	if ((pte & PT_SOFTWARE) >= PT_FOREIGN)
305 		return ((pfn_t)mfn | PFN_IS_FOREIGN_MFN);
306 	return (mfn_to_pfn(mfn));
307 }
308 
309 mfn_t
310 pfn_to_mfn(pfn_t pfn)
311 {
312 	if (pfn == PFN_INVALID)
313 		panic("pfn_to_mfn(PFN_INVALID) not allowed");
314 
315 	if (pfn & PFN_IS_FOREIGN_MFN)
316 		return (pfn & ~PFN_IS_FOREIGN_MFN);
317 
318 	if (pfn >= mfn_count)
319 		panic("pfn_to_mfn(): illegal PFN 0x%lx", pfn);
320 
321 	return (mfn_list[pfn]);
322 }
323 
/*
 * This routine translates an MFN back into the corresponding PFN value.
 * It has to be careful since a read of mfn_to_pfn_mapping[] might fault,
 * as that table is sparse. It also has to cope with reads that do not
 * fault but return a value that is out of range for mfn_list[] or that
 * does not map back to the same MFN.
 *
 * In every such case, and for any MFN above the cached maximum RAM page,
 * the MFN is returned as a foreign pfn (PFN_IS_FOREIGN_MFN set).
 */
pfn_t
mfn_to_pfn(mfn_t mfn)
{
	pfn_t pfn;
	on_trap_data_t otd;
	/* on_trap() protection is only used once t0 has a stack */
	uint_t	on_trap_ready = (t0.t_stk != NULL);

	/*
	 * Cleared at a suspend or migrate
	 */
	if (cached_max_mfn == 0)
		cached_max_mfn =
		    HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);

	/* beyond the reported maximum RAM page: cannot be one of ours */
	if (cached_max_mfn < mfn)
		return ((pfn_t)mfn | PFN_IS_FOREIGN_MFN);

	/*
	 * A non-zero return from on_trap() is treated as "the table read
	 * below faulted", in which case the MFN is considered foreign.
	 */
	if (on_trap_ready && on_trap(&otd, OT_DATA_ACCESS)) {
		pfn = (pfn_t)mfn | PFN_IS_FOREIGN_MFN;
	} else {
		pfn = mfn_to_pfn_mapping[mfn];

		/* only trust the entry if mfn_list[] agrees with it */
		if (pfn == PFN_INVALID || pfn >= mfn_count ||
		    pfn_to_mfn(pfn) != mfn)
			pfn = (pfn_t)mfn | PFN_IS_FOREIGN_MFN;
	}

	if (on_trap_ready)
		no_trap();

	/*
	 * If khat_running is set then we should be checking
	 * in domUs that migration is blocked while using the
	 * mfn_to_pfn_mapping[] table.
	 */
	ASSERT(!khat_running || DOMAIN_IS_INITDOMAIN(xen_info) ||
	    rw_read_held(&m2p_lock[XM2P_HASH].m2p_rwlock));

	return (pfn);
}
370 
371 /*
372  * From a pseudo-physical address, find the corresponding machine address.
373  */
374 maddr_t
375 pa_to_ma(paddr_t pa)
376 {
377 	mfn_t mfn = pfn_to_mfn(mmu_btop(pa));
378 
379 	if (mfn == MFN_INVALID)
380 		panic("pa_to_ma() got MFN_INVALID");
381 	return (mfn_to_ma(mfn) + (pa & MMU_PAGEOFFSET));
382 }
383 
384 /*
385  * From a machine address, find the corresponding pseudo-physical address.
386  */
387 paddr_t
388 ma_to_pa(maddr_t ma)
389 {
390 	pfn_t pfn = mfn_to_pfn(mmu_btop(ma));
391 
392 	if (pfn == PFN_INVALID)
393 		panic("ma_to_pa() got PFN_INVALID");
394 	return (pfn_to_pa(pfn) + (ma & MMU_PAGEOFFSET));
395 }
396 
/*
 * When calling reassign_pfn(), the page must be (at least) read locked
 * to make sure swrand does not try to grab it.
 *
 * On DEBUG kernels CHECK_PAGE_LOCK(pfn) panics if a page_t exists for
 * pfn and that page is not locked; on non-DEBUG kernels it expands to
 * nothing.
 *
 * The macro is wrapped in do { } while (0) so that it behaves as a single
 * statement, evaluates its argument exactly once, and uses prefixed local
 * names so that it cannot capture identifiers used in the argument.
 */
#ifdef DEBUG
#define	CHECK_PAGE_LOCK(pfn)	do {					\
	pfn_t cpl_pfn = (pfn);						\
	page_t *cpl_pp = page_numtopp_nolock(cpl_pfn);			\
	if ((cpl_pp != NULL) && (!PAGE_LOCKED(cpl_pp))) {		\
		panic("reassign_pfn() called with unlocked page (pfn 0x%lx)", \
		    cpl_pfn);						\
	}								\
	/*CONSTCOND*/							\
} while (0)
#else	/* DEBUG */
#define	CHECK_PAGE_LOCK(pfn)
#endif	/* DEBUG */
412 
413 /*
414  * Reassign a new machine page to back a physical address.
415  */
416 void
417 reassign_pfn(pfn_t pfn, mfn_t mfn)
418 {
419 	int mmu_update_return;
420 	mmu_update_t t;
421 	extern void update_contig_pfnlist(pfn_t, mfn_t, mfn_t);
422 
423 	ASSERT(pfn != PFN_INVALID);
424 	ASSERT(!pfn_is_foreign(pfn));
425 
426 	ASSERT(pfn < mfn_count);
427 	update_contig_pfnlist(pfn, mfn_list[pfn], mfn);
428 	if (mfn == MFN_INVALID) {
429 		CHECK_PAGE_LOCK(pfn);
430 		if (kpm_vbase != NULL && xen_kpm_page(pfn, 0) < 0)
431 			panic("reassign_pfn(): failed to remove kpm mapping");
432 		mfn_list[pfn] = mfn;
433 		return;
434 	}
435 
436 	/*
437 	 * Verify that previously given away pages are still page locked.
438 	 */
439 	if (mfn_list[pfn] == MFN_INVALID) {
440 		CHECK_PAGE_LOCK(pfn);
441 	}
442 	mfn_list[pfn] = mfn;
443 
444 	t.ptr = mfn_to_ma(mfn) | MMU_MACHPHYS_UPDATE;
445 	t.val = pfn;
446 
447 	if (HYPERVISOR_mmu_update(&t, 1, &mmu_update_return, DOMID_SELF))
448 		panic("HYPERVISOR_mmu_update() failed");
449 	ASSERT(mmu_update_return == 1);
450 
451 	if (kpm_vbase != NULL && xen_kpm_page(pfn, PT_VALID | PT_WRITABLE) < 0)
452 		panic("reassign_pfn(): failed to enable kpm mapping");
453 }
454