xref: /linux/arch/x86/xen/mmu.c (revision cc4589ebfae6f8dbb5cf880a0a67eedab3416492)
1 /*
2  * Xen mmu operations
3  *
4  * This file contains the various mmu fetch and update operations.
5  * The most important job they must perform is the mapping between the
6  * domain's pfns and the overall machine mfns.
7  *
8  * Xen allows guests to directly update the pagetable, in a controlled
9  * fashion.  In other words, the guest modifies the same pagetable
10  * that the CPU actually uses, which eliminates the overhead of having
11  * a separate shadow pagetable.
12  *
13  * In order to allow this, it falls on the guest domain to map its
14  * notion of a "physical" pfn - which is just a domain-local linear
15  * address - into a real "machine address" which the CPU's MMU can
16  * use.
17  *
18  * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19  * inserted directly into the pagetable.  When creating a new
20  * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
21  * when reading the content back with __(pgd|pmd|pte)_val, it converts
22  * the mfn back into a pfn.
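 * For example, a pte built with pfn_pte() ends up storing pfn_to_mfn(pfn)
 * (see xen_make_pte() below), and pte_val() on that pte hands back
 * mfn_to_pfn() of the stored mfn (see xen_pte_val()).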
23  *
24  * The other constraint is that all pages which make up a pagetable
25  * must be mapped read-only in the guest.  This prevents uncontrolled
26  * guest updates to the pagetable.  Xen strictly enforces this, and
27  * will disallow any pagetable update which will end up mapping a
28  * pagetable page RW, and will disallow using any writable page as a
29  * pagetable.
30  *
31  * Naively, when loading %cr3 with the base of a new pagetable, Xen
32  * would need to validate the whole pagetable before going on.
33  * Naturally, this is quite slow.  The solution is to "pin" a
34  * pagetable, which enforces all the constraints on the pagetable even
35  * when it is not actively in use.  This means that Xen can be assured
36  * that it is still valid when it is loaded into %cr3, and doesn't
37  * need to revalidate it.
38  *
39  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40  */
41 #include <linux/sched.h>
42 #include <linux/highmem.h>
43 #include <linux/debugfs.h>
44 #include <linux/bug.h>
45 #include <linux/module.h>
46 #include <linux/gfp.h>
47 
48 #include <asm/pgtable.h>
49 #include <asm/tlbflush.h>
50 #include <asm/fixmap.h>
51 #include <asm/mmu_context.h>
52 #include <asm/setup.h>
53 #include <asm/paravirt.h>
54 #include <asm/linkage.h>
55 
56 #include <asm/xen/hypercall.h>
57 #include <asm/xen/hypervisor.h>
58 
59 #include <xen/page.h>
60 #include <xen/interface/xen.h>
61 #include <xen/interface/hvm/hvm_op.h>
62 #include <xen/interface/version.h>
63 #include <xen/hvc-console.h>
64 
65 #include "multicalls.h"
66 #include "mmu.h"
67 #include "debugfs.h"
68 
69 #define MMU_UPDATE_HISTO	30
70 
71 #ifdef CONFIG_XEN_DEBUG_FS
72 
73 static struct {
74 	u32 pgd_update;
75 	u32 pgd_update_pinned;
76 	u32 pgd_update_batched;
77 
78 	u32 pud_update;
79 	u32 pud_update_pinned;
80 	u32 pud_update_batched;
81 
82 	u32 pmd_update;
83 	u32 pmd_update_pinned;
84 	u32 pmd_update_batched;
85 
86 	u32 pte_update;
87 	u32 pte_update_pinned;
88 	u32 pte_update_batched;
89 
90 	u32 mmu_update;
91 	u32 mmu_update_extended;
92 	u32 mmu_update_histo[MMU_UPDATE_HISTO];
93 
94 	u32 prot_commit;
95 	u32 prot_commit_batched;
96 
97 	u32 set_pte_at;
98 	u32 set_pte_at_batched;
99 	u32 set_pte_at_pinned;
100 	u32 set_pte_at_current;
101 	u32 set_pte_at_kernel;
102 } mmu_stats;
103 
104 static u8 zero_stats;
105 
106 static inline void check_zero(void)
107 {
108 	if (unlikely(zero_stats)) {
109 		memset(&mmu_stats, 0, sizeof(mmu_stats));
110 		zero_stats = 0;
111 	}
112 }
113 
114 #define ADD_STATS(elem, val)			\
115 	do { check_zero(); mmu_stats.elem += (val); } while(0)
116 
117 #else  /* !CONFIG_XEN_DEBUG_FS */
118 
119 #define ADD_STATS(elem, val)	do { (void)(val); } while(0)
120 
121 #endif /* CONFIG_XEN_DEBUG_FS */
122 
123 
124 /*
125  * Identity map, in addition to plain kernel map.  This needs to be
126  * large enough to provide the page table pages needed to map the rest.
127  * Each page can map 2MB.
128  */
129 static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
130 
131 #ifdef CONFIG_X86_64
132 /* l3 pud for userspace vsyscall mapping */
133 static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
134 #endif /* CONFIG_X86_64 */
135 
136 /*
137  * Note about cr3 (pagetable base) values:
138  *
139  * xen_cr3 contains the current logical cr3 value, i.e. the last cr3
140  * that was set.  This may not be the current effective cr3, because
141  * its update may still be pending (lazily deferred).  However, a vcpu
142  * looking at its own cr3 can use this value knowing that everything will
143  * be self-consistent.
144  *
145  * xen_current_cr3 contains the actual vcpu cr3; it is set once the
146  * hypercall to set the vcpu cr3 is complete (so it may be a little
147  * out of date, but it will never be set early).  If one vcpu is
148  * looking at another vcpu's cr3 value, it should use this variable.
149  */
150 DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
151 DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
152 
153 
154 /*
155  * Just beyond the highest usermode address.  STACK_TOP_MAX has a
156  * redzone above it, so round it up to a PGD boundary.
157  */
158 #define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
159 
160 
161 #define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
162 #define TOP_ENTRIES		(MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
163 
164 /* Placeholder for holes in the address space */
165 static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
166 		{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
167 
168 /* Array of pointers to pages containing p2m entries */
169 static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
170 		{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
171 
172 /* Arrays of p2m arrays expressed in mfns used for save/restore */
173 static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
174 
175 static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
176 	__page_aligned_bss;
177 
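/*
 * Index helpers for the two-level p2m table: a pfn selects a p2m_top
 * slot and an entry within that page.  As a worked example (assuming
 * 4K pages and 8-byte longs, so P2M_ENTRIES_PER_PAGE == 512), pfn
 * 0x12345 lands in p2m_top[0x91], entry 0x145.
 */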
178 static inline unsigned p2m_top_index(unsigned long pfn)
179 {
180 	BUG_ON(pfn >= MAX_DOMAIN_PAGES);
181 	return pfn / P2M_ENTRIES_PER_PAGE;
182 }
183 
184 static inline unsigned p2m_index(unsigned long pfn)
185 {
186 	return pfn % P2M_ENTRIES_PER_PAGE;
187 }
188 
189 /* Build the parallel p2m_top_mfn structures */
190 void xen_build_mfn_list_list(void)
191 {
192 	unsigned pfn, idx;
193 
194 	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
195 		unsigned topidx = p2m_top_index(pfn);
196 
197 		p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
198 	}
199 
200 	for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
201 		unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
202 		p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
203 	}
204 }
205 
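/*
 * Publish the location of p2m_top_mfn_list (and the domain's page count)
 * in shared_info, so the tools can find the p2m table at save/restore time.
 */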
206 void xen_setup_mfn_list_list(void)
207 {
208 	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
209 
210 	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
211 		virt_to_mfn(p2m_top_mfn_list);
212 	HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
213 }
214 
215 /* Set up p2m_top to point to the domain-builder provided p2m pages */
216 void __init xen_build_dynamic_phys_to_machine(void)
217 {
218 	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
219 	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
220 	unsigned pfn;
221 
222 	for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
223 		unsigned topidx = p2m_top_index(pfn);
224 
225 		p2m_top[topidx] = &mfn_list[pfn];
226 	}
227 
228 	xen_build_mfn_list_list();
229 }
230 
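/*
 * Look up the mfn for a pfn.  Holes and out-of-range pfns both read back
 * as invalid: p2m_missing is filled with ~0UL, and out-of-range lookups
 * return INVALID_P2M_ENTRY.
 */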
231 unsigned long get_phys_to_machine(unsigned long pfn)
232 {
233 	unsigned topidx, idx;
234 
235 	if (unlikely(pfn >= MAX_DOMAIN_PAGES))
236 		return INVALID_P2M_ENTRY;
237 
238 	topidx = p2m_top_index(pfn);
239 	idx = p2m_index(pfn);
240 	return p2m_top[topidx][idx];
241 }
242 EXPORT_SYMBOL_GPL(get_phys_to_machine);
243 
244 /* Install a new p2m_top page */
245 bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
246 {
247 	unsigned topidx = p2m_top_index(pfn);
248 	unsigned long **pfnp, *mfnp;
249 	unsigned i;
250 
251 	pfnp = &p2m_top[topidx];
252 	mfnp = &p2m_top_mfn[topidx];
253 
254 	for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
255 		p[i] = INVALID_P2M_ENTRY;
256 
257 	if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
258 		*mfnp = virt_to_mfn(p);
259 		return true;
260 	}
261 
262 	return false;
263 }
264 
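/*
 * Allocate and install a fresh p2m leaf page for pfn's slot; if another
 * cpu raced us and installed one first, just free ours again.
 */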
265 static void alloc_p2m(unsigned long pfn)
266 {
267 	unsigned long *p;
268 
269 	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
270 	BUG_ON(p == NULL);
271 
272 	if (!install_p2mtop_page(pfn, p))
273 		free_page((unsigned long)p);
274 }
275 
276 /* Try to install p2m mapping; fail if intermediate bits missing */
277 bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
278 {
279 	unsigned topidx, idx;
280 
281 	if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
282 		BUG_ON(mfn != INVALID_P2M_ENTRY);
283 		return true;
284 	}
285 
286 	topidx = p2m_top_index(pfn);
287 	if (p2m_top[topidx] == p2m_missing) {
288 		if (mfn == INVALID_P2M_ENTRY)
289 			return true;
290 		return false;
291 	}
292 
293 	idx = p2m_index(pfn);
294 	p2m_top[topidx][idx] = mfn;
295 
296 	return true;
297 }
298 
299 void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
300 {
301 	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
302 		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
303 		return;
304 	}
305 
306 	if (unlikely(!__set_phys_to_machine(pfn, mfn)))  {
307 		alloc_p2m(pfn);
308 
309 		if (!__set_phys_to_machine(pfn, mfn))
310 			BUG();
311 	}
312 }
313 
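/* Return the mfn of the frame backing an arbitrary kernel virtual address. */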
314 unsigned long arbitrary_virt_to_mfn(void *vaddr)
315 {
316 	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
317 
318 	return PFN_DOWN(maddr.maddr);
319 }
320 
321 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
322 {
323 	unsigned long address = (unsigned long)vaddr;
324 	unsigned int level;
325 	pte_t *pte;
326 	unsigned offset;
327 
328 	/*
329 	 * if the PFN is in the linear mapped vaddr range, we can just use
330 	 * the (quick) virt_to_machine() p2m lookup
331 	 */
332 	if (virt_addr_valid(vaddr))
333 		return virt_to_machine(vaddr);
334 
335 	/* otherwise we have to do a (slower) full page-table walk */
336 
337 	pte = lookup_address(address, &level);
338 	BUG_ON(pte == NULL);
339 	offset = address & ~PAGE_MASK;
340 	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
341 }
342 
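/*
 * make_lowmem_page_readonly()/readwrite() flip the protection of an
 * already-mapped lowmem page in place with a direct update_va_mapping
 * hypercall; used as pages become (or stop being) pagetable pages.
 */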
343 void make_lowmem_page_readonly(void *vaddr)
344 {
345 	pte_t *pte, ptev;
346 	unsigned long address = (unsigned long)vaddr;
347 	unsigned int level;
348 
349 	pte = lookup_address(address, &level);
350 	BUG_ON(pte == NULL);
351 
352 	ptev = pte_wrprotect(*pte);
353 
354 	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
355 		BUG();
356 }
357 
358 void make_lowmem_page_readwrite(void *vaddr)
359 {
360 	pte_t *pte, ptev;
361 	unsigned long address = (unsigned long)vaddr;
362 	unsigned int level;
363 
364 	pte = lookup_address(address, &level);
365 	BUG_ON(pte == NULL);
366 
367 	ptev = pte_mkwrite(*pte);
368 
369 	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
370 		BUG();
371 }
372 
373 
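/*
 * True if the page backing @ptr is part of a pinned pagetable and so may
 * only be updated via the hypervisor.
 */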
374 static bool xen_page_pinned(void *ptr)
375 {
376 	struct page *page = virt_to_page(ptr);
377 
378 	return PagePinned(page);
379 }
380 
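/*
 * Queue an mmu_update in the current multicall batch.  If the previous
 * entry in the batch is also an mmu_update, just extend its argument
 * list (keeping the batch-size histogram up to date); otherwise emit a
 * new MULTI_mmu_update call.
 */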
381 static void xen_extend_mmu_update(const struct mmu_update *update)
382 {
383 	struct multicall_space mcs;
384 	struct mmu_update *u;
385 
386 	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
387 
388 	if (mcs.mc != NULL) {
389 		ADD_STATS(mmu_update_extended, 1);
390 		ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
391 
392 		mcs.mc->args[1]++;
393 
394 		if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
395 			ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
396 		else
397 			ADD_STATS(mmu_update_histo[0], 1);
398 	} else {
399 		ADD_STATS(mmu_update, 1);
400 		mcs = __xen_mc_entry(sizeof(*u));
401 		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
402 		ADD_STATS(mmu_update_histo[1], 1);
403 	}
404 
405 	u = mcs.args;
406 	*u = *update;
407 }
408 
409 void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
410 {
411 	struct mmu_update u;
412 
413 	preempt_disable();
414 
415 	xen_mc_batch();
416 
417 	/* ptr may be ioremapped for 64-bit pagetable setup */
418 	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
419 	u.val = pmd_val_ma(val);
420 	xen_extend_mmu_update(&u);
421 
422 	ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
423 
424 	xen_mc_issue(PARAVIRT_LAZY_MMU);
425 
426 	preempt_enable();
427 }
428 
429 void xen_set_pmd(pmd_t *ptr, pmd_t val)
430 {
431 	ADD_STATS(pmd_update, 1);
432 
433 	/* If page is not pinned, we can just update the entry
434 	   directly */
435 	if (!xen_page_pinned(ptr)) {
436 		*ptr = val;
437 		return;
438 	}
439 
440 	ADD_STATS(pmd_update_pinned, 1);
441 
442 	xen_set_pmd_hyper(ptr, val);
443 }
444 
445 /*
446  * Associate a virtual page frame with a given physical page frame
447  * and protection flags for that frame.
448  */
449 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
450 {
451 	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
452 }
453 
454 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
455 		    pte_t *ptep, pte_t pteval)
456 {
457 	ADD_STATS(set_pte_at, 1);
458 //	ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
459 	ADD_STATS(set_pte_at_current, mm == current->mm);
460 	ADD_STATS(set_pte_at_kernel, mm == &init_mm);
461 
462 	if (mm == current->mm || mm == &init_mm) {
463 		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
464 			struct multicall_space mcs;
465 			mcs = xen_mc_entry(0);
466 
467 			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
468 			ADD_STATS(set_pte_at_batched, 1);
469 			xen_mc_issue(PARAVIRT_LAZY_MMU);
470 			goto out;
471 		} else
472 			if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
473 				goto out;
474 	}
475 	xen_set_pte(ptep, pteval);
476 
477 out:	return;
478 }
479 
480 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
481 				 unsigned long addr, pte_t *ptep)
482 {
483 	/* Just return the pte as-is.  We preserve the bits on commit */
484 	return *ptep;
485 }
486 
487 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
488 				 pte_t *ptep, pte_t pte)
489 {
490 	struct mmu_update u;
491 
492 	xen_mc_batch();
493 
494 	u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
495 	u.val = pte_val_ma(pte);
496 	xen_extend_mmu_update(&u);
497 
498 	ADD_STATS(prot_commit, 1);
499 	ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
500 
501 	xen_mc_issue(PARAVIRT_LAZY_MMU);
502 }
503 
504 /* Assume pteval_t is equivalent to all the other *val_t types. */
505 static pteval_t pte_mfn_to_pfn(pteval_t val)
506 {
507 	if (val & _PAGE_PRESENT) {
508 		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
509 		pteval_t flags = val & PTE_FLAGS_MASK;
510 		val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
511 	}
512 
513 	return val;
514 }
515 
516 static pteval_t pte_pfn_to_mfn(pteval_t val)
517 {
518 	if (val & _PAGE_PRESENT) {
519 		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
520 		pteval_t flags = val & PTE_FLAGS_MASK;
521 		val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
522 	}
523 
524 	return val;
525 }
526 
527 pteval_t xen_pte_val(pte_t pte)
528 {
529 	return pte_mfn_to_pfn(pte.pte);
530 }
531 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
532 
533 pgdval_t xen_pgd_val(pgd_t pgd)
534 {
535 	return pte_mfn_to_pfn(pgd.pgd);
536 }
537 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
538 
539 pte_t xen_make_pte(pteval_t pte)
540 {
541 	pte = pte_pfn_to_mfn(pte);
542 	return native_make_pte(pte);
543 }
544 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
545 
546 pgd_t xen_make_pgd(pgdval_t pgd)
547 {
548 	pgd = pte_pfn_to_mfn(pgd);
549 	return native_make_pgd(pgd);
550 }
551 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
552 
553 pmdval_t xen_pmd_val(pmd_t pmd)
554 {
555 	return pte_mfn_to_pfn(pmd.pmd);
556 }
557 PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
558 
559 void xen_set_pud_hyper(pud_t *ptr, pud_t val)
560 {
561 	struct mmu_update u;
562 
563 	preempt_disable();
564 
565 	xen_mc_batch();
566 
567 	/* ptr may be ioremapped for 64-bit pagetable setup */
568 	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
569 	u.val = pud_val_ma(val);
570 	xen_extend_mmu_update(&u);
571 
572 	ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
573 
574 	xen_mc_issue(PARAVIRT_LAZY_MMU);
575 
576 	preempt_enable();
577 }
578 
579 void xen_set_pud(pud_t *ptr, pud_t val)
580 {
581 	ADD_STATS(pud_update, 1);
582 
583 	/* If page is not pinned, we can just update the entry
584 	   directly */
585 	if (!xen_page_pinned(ptr)) {
586 		*ptr = val;
587 		return;
588 	}
589 
590 	ADD_STATS(pud_update_pinned, 1);
591 
592 	xen_set_pud_hyper(ptr, val);
593 }
594 
595 void xen_set_pte(pte_t *ptep, pte_t pte)
596 {
597 	ADD_STATS(pte_update, 1);
598 //	ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
599 	ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
600 
601 #ifdef CONFIG_X86_PAE
602 	ptep->pte_high = pte.pte_high;
603 	smp_wmb();
604 	ptep->pte_low = pte.pte_low;
605 #else
606 	*ptep = pte;
607 #endif
608 }
609 
610 #ifdef CONFIG_X86_PAE
611 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
612 {
613 	set_64bit((u64 *)ptep, native_pte_val(pte));
614 }
615 
616 void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
617 {
618 	ptep->pte_low = 0;
619 	smp_wmb();		/* make sure low gets written first */
620 	ptep->pte_high = 0;
621 }
622 
623 void xen_pmd_clear(pmd_t *pmdp)
624 {
625 	set_pmd(pmdp, __pmd(0));
626 }
627 #endif	/* CONFIG_X86_PAE */
628 
629 pmd_t xen_make_pmd(pmdval_t pmd)
630 {
631 	pmd = pte_pfn_to_mfn(pmd);
632 	return native_make_pmd(pmd);
633 }
634 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
635 
636 #if PAGETABLE_LEVELS == 4
637 pudval_t xen_pud_val(pud_t pud)
638 {
639 	return pte_mfn_to_pfn(pud.pud);
640 }
641 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
642 
643 pud_t xen_make_pud(pudval_t pud)
644 {
645 	pud = pte_pfn_to_mfn(pud);
646 
647 	return native_make_pud(pud);
648 }
649 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
650 
651 pgd_t *xen_get_user_pgd(pgd_t *pgd)
652 {
653 	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
654 	unsigned offset = pgd - pgd_page;
655 	pgd_t *user_ptr = NULL;
656 
657 	if (offset < pgd_index(USER_LIMIT)) {
658 		struct page *page = virt_to_page(pgd_page);
659 		user_ptr = (pgd_t *)page->private;
660 		if (user_ptr)
661 			user_ptr += offset;
662 	}
663 
664 	return user_ptr;
665 }
666 
667 static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
668 {
669 	struct mmu_update u;
670 
671 	u.ptr = virt_to_machine(ptr).maddr;
672 	u.val = pgd_val_ma(val);
673 	xen_extend_mmu_update(&u);
674 }
675 
676 /*
677  * Raw hypercall-based set_pgd, intended for use in early boot before
678  * there's a page structure.  This implies:
679  *  1. The only existing pagetable is the kernel's
680  *  2. It is always pinned
681  *  3. It has no user pagetable attached to it
682  */
683 void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
684 {
685 	preempt_disable();
686 
687 	xen_mc_batch();
688 
689 	__xen_set_pgd_hyper(ptr, val);
690 
691 	xen_mc_issue(PARAVIRT_LAZY_MMU);
692 
693 	preempt_enable();
694 }
695 
696 void xen_set_pgd(pgd_t *ptr, pgd_t val)
697 {
698 	pgd_t *user_ptr = xen_get_user_pgd(ptr);
699 
700 	ADD_STATS(pgd_update, 1);
701 
702 	/* If page is not pinned, we can just update the entry
703 	   directly */
704 	if (!xen_page_pinned(ptr)) {
705 		*ptr = val;
706 		if (user_ptr) {
707 			WARN_ON(xen_page_pinned(user_ptr));
708 			*user_ptr = val;
709 		}
710 		return;
711 	}
712 
713 	ADD_STATS(pgd_update_pinned, 1);
714 	ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
715 
716 	/* If it's pinned, then we can at least batch the kernel and
717 	   user updates together. */
718 	xen_mc_batch();
719 
720 	__xen_set_pgd_hyper(ptr, val);
721 	if (user_ptr)
722 		__xen_set_pgd_hyper(user_ptr, val);
723 
724 	xen_mc_issue(PARAVIRT_LAZY_MMU);
725 }
726 #endif	/* PAGETABLE_LEVELS == 4 */
727 
728 /*
729  * (Yet another) pagetable walker.  This one is intended for pinning a
730  * pagetable.  This means that it walks a pagetable and calls the
731  * callback function on each page it finds making up the page table,
732  * at every level.  It walks the entire pagetable, but it only bothers
733  * pinning pte pages which are below limit.  In the normal case this
734  * will be STACK_TOP_MAX, but at boot we need to pin up to
735  * FIXADDR_TOP.
736  *
737  * For 32-bit the important bit is that we don't pin beyond there,
738  * because then we start getting into Xen's ptes.
739  *
740  * For 64-bit, we must skip the Xen hole in the middle of the address
741  * space, just after the big x86-64 virtual hole.
742  */
743 static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
744 			  int (*func)(struct mm_struct *mm, struct page *,
745 				      enum pt_level),
746 			  unsigned long limit)
747 {
748 	int flush = 0;
749 	unsigned hole_low, hole_high;
750 	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
751 	unsigned pgdidx, pudidx, pmdidx;
752 
753 	/* The limit is the last byte to be touched */
754 	limit--;
755 	BUG_ON(limit >= FIXADDR_TOP);
756 
757 	if (xen_feature(XENFEAT_auto_translated_physmap))
758 		return 0;
759 
760 	/*
761 	 * 64-bit has a great big hole in the middle of the address
762 	 * space, which contains the Xen mappings.  On 32-bit these
763 	 * end up making a zero-sized hole, so this is a no-op.
764 	 */
765 	hole_low = pgd_index(USER_LIMIT);
766 	hole_high = pgd_index(PAGE_OFFSET);
767 
768 	pgdidx_limit = pgd_index(limit);
769 #if PTRS_PER_PUD > 1
770 	pudidx_limit = pud_index(limit);
771 #else
772 	pudidx_limit = 0;
773 #endif
774 #if PTRS_PER_PMD > 1
775 	pmdidx_limit = pmd_index(limit);
776 #else
777 	pmdidx_limit = 0;
778 #endif
779 
780 	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
781 		pud_t *pud;
782 
783 		if (pgdidx >= hole_low && pgdidx < hole_high)
784 			continue;
785 
786 		if (!pgd_val(pgd[pgdidx]))
787 			continue;
788 
789 		pud = pud_offset(&pgd[pgdidx], 0);
790 
791 		if (PTRS_PER_PUD > 1) /* not folded */
792 			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
793 
794 		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
795 			pmd_t *pmd;
796 
797 			if (pgdidx == pgdidx_limit &&
798 			    pudidx > pudidx_limit)
799 				goto out;
800 
801 			if (pud_none(pud[pudidx]))
802 				continue;
803 
804 			pmd = pmd_offset(&pud[pudidx], 0);
805 
806 			if (PTRS_PER_PMD > 1) /* not folded */
807 				flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
808 
809 			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
810 				struct page *pte;
811 
812 				if (pgdidx == pgdidx_limit &&
813 				    pudidx == pudidx_limit &&
814 				    pmdidx > pmdidx_limit)
815 					goto out;
816 
817 				if (pmd_none(pmd[pmdidx]))
818 					continue;
819 
820 				pte = pmd_page(pmd[pmdidx]);
821 				flush |= (*func)(mm, pte, PT_PTE);
822 			}
823 		}
824 	}
825 
826 out:
827 	/* Do the top level last, so that the callbacks can use it as
828 	   a cue to do final things like tlb flushes. */
829 	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
830 
831 	return flush;
832 }
833 
834 static int xen_pgd_walk(struct mm_struct *mm,
835 			int (*func)(struct mm_struct *mm, struct page *,
836 				    enum pt_level),
837 			unsigned long limit)
838 {
839 	return __xen_pgd_walk(mm, mm->pgd, func, limit);
840 }
841 
842 /* If we're using split pte locks, then take the page's lock and
843    return a pointer to it.  Otherwise return NULL. */
844 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
845 {
846 	spinlock_t *ptl = NULL;
847 
848 #if USE_SPLIT_PTLOCKS
849 	ptl = __pte_lockptr(page);
850 	spin_lock_nest_lock(ptl, &mm->page_table_lock);
851 #endif
852 
853 	return ptl;
854 }
855 
856 static void xen_pte_unlock(void *v)
857 {
858 	spinlock_t *ptl = v;
859 	spin_unlock(ptl);
860 }
861 
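/* Queue a single pin/unpin mmuext op for the given pagetable frame. */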
862 static void xen_do_pin(unsigned level, unsigned long pfn)
863 {
864 	struct mmuext_op *op;
865 	struct multicall_space mcs;
866 
867 	mcs = __xen_mc_entry(sizeof(*op));
868 	op = mcs.args;
869 	op->cmd = level;
870 	op->arg1.mfn = pfn_to_mfn(pfn);
871 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
872 }
873 
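/*
 * __xen_pgd_walk callback: make a pagetable page RO and, when split pte
 * locks are in use, also pin pte pages right away so their lock is only
 * held within one batch.  Returns nonzero if the caller must flush kmaps
 * (an unpinned highmem page was found).
 */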
874 static int xen_pin_page(struct mm_struct *mm, struct page *page,
875 			enum pt_level level)
876 {
877 	unsigned pgfl = TestSetPagePinned(page);
878 	int flush;
879 
880 	if (pgfl)
881 		flush = 0;		/* already pinned */
882 	else if (PageHighMem(page))
883 		/* kmaps need flushing if we found an unpinned
884 		   highpage */
885 		flush = 1;
886 	else {
887 		void *pt = lowmem_page_address(page);
888 		unsigned long pfn = page_to_pfn(page);
889 		struct multicall_space mcs = __xen_mc_entry(0);
890 		spinlock_t *ptl;
891 
892 		flush = 0;
893 
894 		/*
895 		 * We need to hold the pagetable lock between the time
896 		 * we make the pagetable RO and when we actually pin
897 		 * it.  If we don't, then other users may come in and
898 		 * attempt to update the pagetable by writing it,
899 		 * which will fail because the memory is RO but not
900 		 * pinned, so Xen won't do the trap'n'emulate.
901 		 *
902 		 * If we're using split pte locks, we can't hold the
903 		 * entire pagetable's worth of locks during the
904 		 * traverse, because we may wrap the preempt count (8
905 		 * bits).  The solution is to mark RO and pin each PTE
906 		 * page while holding the lock.  This means the number
907 		 * of locks we end up holding is never more than a
908 		 * batch size (~32 entries, at present).
909 		 *
910 		 * If we're not using split pte locks, we needn't pin
911 		 * the PTE pages independently, because we're
912 		 * protected by the overall pagetable lock.
913 		 */
914 		ptl = NULL;
915 		if (level == PT_PTE)
916 			ptl = xen_pte_lock(page, mm);
917 
918 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
919 					pfn_pte(pfn, PAGE_KERNEL_RO),
920 					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
921 
922 		if (ptl) {
923 			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
924 
925 			/* Queue a deferred unlock for when this batch
926 			   is completed. */
927 			xen_mc_callback(xen_pte_unlock, ptl);
928 		}
929 	}
930 
931 	return flush;
932 }
933 
934 /* This is called just after a mm has been created, but it has not
935    been used yet.  We need to make sure that its pagetable is all
936    read-only, and can be pinned. */
937 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
938 {
939 	vm_unmap_aliases();
940 
941 	xen_mc_batch();
942 
943 	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
944 		/* re-enable interrupts for flushing */
945 		xen_mc_issue(0);
946 
947 		kmap_flush_unused();
948 
949 		xen_mc_batch();
950 	}
951 
952 #ifdef CONFIG_X86_64
953 	{
954 		pgd_t *user_pgd = xen_get_user_pgd(pgd);
955 
956 		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
957 
958 		if (user_pgd) {
959 			xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
960 			xen_do_pin(MMUEXT_PIN_L4_TABLE,
961 				   PFN_DOWN(__pa(user_pgd)));
962 		}
963 	}
964 #else /* CONFIG_X86_32 */
965 #ifdef CONFIG_X86_PAE
966 	/* Need to make sure unshared kernel PMD is pinnable */
967 	xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
968 		     PT_PMD);
969 #endif
970 	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
971 #endif /* CONFIG_X86_64 */
972 	xen_mc_issue(0);
973 }
974 
975 static void xen_pgd_pin(struct mm_struct *mm)
976 {
977 	__xen_pgd_pin(mm, mm->pgd);
978 }
979 
980 /*
981  * On save, we need to pin all pagetables to make sure they get their
982  * mfns turned into pfns.  Search the list for any unpinned pgds and pin
983  * them (unpinned pgds are not currently in use, probably because the
984  * process is under construction or destruction).
985  *
986  * Expected to be called in stop_machine() ("equivalent to taking
987  * every spinlock in the system"), so the locking doesn't really
988  * matter all that much.
989  */
990 void xen_mm_pin_all(void)
991 {
992 	unsigned long flags;
993 	struct page *page;
994 
995 	spin_lock_irqsave(&pgd_lock, flags);
996 
997 	list_for_each_entry(page, &pgd_list, lru) {
998 		if (!PagePinned(page)) {
999 			__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
1000 			SetPageSavePinned(page);
1001 		}
1002 	}
1003 
1004 	spin_unlock_irqrestore(&pgd_lock, flags);
1005 }
1006 
1007 /*
1008  * The init_mm pagetable is really pinned as soon as it's created, but
1009  * that's before we have page structures to store the bits.  So do all
1010  * the book-keeping now.
1011  */
1012 static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
1013 				  enum pt_level level)
1014 {
1015 	SetPagePinned(page);
1016 	return 0;
1017 }
1018 
1019 static void __init xen_mark_init_mm_pinned(void)
1020 {
1021 	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
1022 }
1023 
1024 static int xen_unpin_page(struct mm_struct *mm, struct page *page,
1025 			  enum pt_level level)
1026 {
1027 	unsigned pgfl = TestClearPagePinned(page);
1028 
1029 	if (pgfl && !PageHighMem(page)) {
1030 		void *pt = lowmem_page_address(page);
1031 		unsigned long pfn = page_to_pfn(page);
1032 		spinlock_t *ptl = NULL;
1033 		struct multicall_space mcs;
1034 
1035 		/*
1036 		 * Do the converse to pin_page.  If we're using split
1037 		 * pte locks, we must be holding the lock while
1038 		 * the pte page is unpinned but still RO to prevent
1039 		 * concurrent updates from seeing it in this
1040 		 * partially-pinned state.
1041 		 */
1042 		if (level == PT_PTE) {
1043 			ptl = xen_pte_lock(page, mm);
1044 
1045 			if (ptl)
1046 				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
1047 		}
1048 
1049 		mcs = __xen_mc_entry(0);
1050 
1051 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1052 					pfn_pte(pfn, PAGE_KERNEL),
1053 					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1054 
1055 		if (ptl) {
1056 			/* unlock when batch completed */
1057 			xen_mc_callback(xen_pte_unlock, ptl);
1058 		}
1059 	}
1060 
1061 	return 0;		/* never need to flush on unpin */
1062 }
1063 
1064 /* Release a pagetable's pages back to normal RW */
1065 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
1066 {
1067 	xen_mc_batch();
1068 
1069 	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1070 
1071 #ifdef CONFIG_X86_64
1072 	{
1073 		pgd_t *user_pgd = xen_get_user_pgd(pgd);
1074 
1075 		if (user_pgd) {
1076 			xen_do_pin(MMUEXT_UNPIN_TABLE,
1077 				   PFN_DOWN(__pa(user_pgd)));
1078 			xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
1079 		}
1080 	}
1081 #endif
1082 
1083 #ifdef CONFIG_X86_PAE
1084 	/* Need to make sure unshared kernel PMD is unpinned */
1085 	xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
1086 		       PT_PMD);
1087 #endif
1088 
1089 	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
1090 
1091 	xen_mc_issue(0);
1092 }
1093 
1094 static void xen_pgd_unpin(struct mm_struct *mm)
1095 {
1096 	__xen_pgd_unpin(mm, mm->pgd);
1097 }
1098 
1099 /*
1100  * On resume, undo any pinning done at save, so that the rest of the
1101  * kernel doesn't see any unexpected pinned pagetables.
1102  */
1103 void xen_mm_unpin_all(void)
1104 {
1105 	unsigned long flags;
1106 	struct page *page;
1107 
1108 	spin_lock_irqsave(&pgd_lock, flags);
1109 
1110 	list_for_each_entry(page, &pgd_list, lru) {
1111 		if (PageSavePinned(page)) {
1112 			BUG_ON(!PagePinned(page));
1113 			__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
1114 			ClearPageSavePinned(page);
1115 		}
1116 	}
1117 
1118 	spin_unlock_irqrestore(&pgd_lock, flags);
1119 }
1120 
1121 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1122 {
1123 	spin_lock(&next->page_table_lock);
1124 	xen_pgd_pin(next);
1125 	spin_unlock(&next->page_table_lock);
1126 }
1127 
1128 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1129 {
1130 	spin_lock(&mm->page_table_lock);
1131 	xen_pgd_pin(mm);
1132 	spin_unlock(&mm->page_table_lock);
1133 }
1134 
1135 
1136 #ifdef CONFIG_SMP
1137 /* Another cpu may still have its %cr3 pointing at the pagetable, so
1138    we need to repoint it somewhere else before we can unpin it. */
1139 static void drop_other_mm_ref(void *info)
1140 {
1141 	struct mm_struct *mm = info;
1142 	struct mm_struct *active_mm;
1143 
1144 	active_mm = percpu_read(cpu_tlbstate.active_mm);
1145 
1146 	if (active_mm == mm)
1147 		leave_mm(smp_processor_id());
1148 
1149 	/* If this cpu still has a stale cr3 reference, then make sure
1150 	   it has been flushed. */
1151 	if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
1152 		load_cr3(swapper_pg_dir);
1153 }
1154 
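/*
 * Make sure no cpu still has mm's pagetable loaded in cr3 (including a
 * lazily-deferred cr3 update) before it is unpinned, kicking remote cpus
 * with an IPI where necessary.
 */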
1155 static void xen_drop_mm_ref(struct mm_struct *mm)
1156 {
1157 	cpumask_var_t mask;
1158 	unsigned cpu;
1159 
1160 	if (current->active_mm == mm) {
1161 		if (current->mm == mm)
1162 			load_cr3(swapper_pg_dir);
1163 		else
1164 			leave_mm(smp_processor_id());
1165 	}
1166 
1167 	/* Get the "official" set of cpus referring to our pagetable. */
1168 	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1169 		for_each_online_cpu(cpu) {
1170 			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1171 			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1172 				continue;
1173 			smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1174 		}
1175 		return;
1176 	}
1177 	cpumask_copy(mask, mm_cpumask(mm));
1178 
1179 	/* It's possible that a vcpu may have a stale reference to our
1180 	   cr3, because it's in lazy mode and hasn't yet flushed
1181 	   its set of pending hypercalls.  In this case, we can
1182 	   look at its actual current cr3 value, and force it to flush
1183 	   if needed. */
1184 	for_each_online_cpu(cpu) {
1185 		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1186 			cpumask_set_cpu(cpu, mask);
1187 	}
1188 
1189 	if (!cpumask_empty(mask))
1190 		smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1191 	free_cpumask_var(mask);
1192 }
1193 #else
1194 static void xen_drop_mm_ref(struct mm_struct *mm)
1195 {
1196 	if (current->active_mm == mm)
1197 		load_cr3(swapper_pg_dir);
1198 }
1199 #endif
1200 
1201 /*
1202  * While a process runs, Xen pins its pagetables, which means that the
1203  * hypervisor forces them to be read-only, and it controls all updates
1204  * to them.  This means that all pagetable updates have to go via the
1205  * hypervisor, which is moderately expensive.
1206  *
1207  * Since we're pulling the pagetable down, we switch to init_mm,
1208  * unpin the old process's pagetable and mark it all read-write, which
1209  * allows further operations on it to be simple memory accesses.
1210  *
1211  * The only subtle point is that another CPU may be still using the
1212  * pagetable because of lazy tlb flushing.  This means we need to
1213  * switch all CPUs off this pagetable before we can unpin it.
1214  */
1215 void xen_exit_mmap(struct mm_struct *mm)
1216 {
1217 	get_cpu();		/* make sure we don't move around */
1218 	xen_drop_mm_ref(mm);
1219 	put_cpu();
1220 
1221 	spin_lock(&mm->page_table_lock);
1222 
1223 	/* pgd may not be pinned in the error exit path of execve */
1224 	if (xen_page_pinned(mm->pgd))
1225 		xen_pgd_unpin(mm);
1226 
1227 	spin_unlock(&mm->page_table_lock);
1228 }
1229 
1230 static __init void xen_pagetable_setup_start(pgd_t *base)
1231 {
1232 }
1233 
1234 static void xen_post_allocator_init(void);
1235 
1236 static __init void xen_pagetable_setup_done(pgd_t *base)
1237 {
1238 	xen_setup_shared_info();
1239 	xen_post_allocator_init();
1240 }
1241 
1242 static void xen_write_cr2(unsigned long cr2)
1243 {
1244 	percpu_read(xen_vcpu)->arch.cr2 = cr2;
1245 }
1246 
1247 static unsigned long xen_read_cr2(void)
1248 {
1249 	return percpu_read(xen_vcpu)->arch.cr2;
1250 }
1251 
1252 unsigned long xen_read_cr2_direct(void)
1253 {
1254 	return percpu_read(xen_vcpu_info.arch.cr2);
1255 }
1256 
1257 static void xen_flush_tlb(void)
1258 {
1259 	struct mmuext_op *op;
1260 	struct multicall_space mcs;
1261 
1262 	preempt_disable();
1263 
1264 	mcs = xen_mc_entry(sizeof(*op));
1265 
1266 	op = mcs.args;
1267 	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1268 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1269 
1270 	xen_mc_issue(PARAVIRT_LAZY_MMU);
1271 
1272 	preempt_enable();
1273 }
1274 
1275 static void xen_flush_tlb_single(unsigned long addr)
1276 {
1277 	struct mmuext_op *op;
1278 	struct multicall_space mcs;
1279 
1280 	preempt_disable();
1281 
1282 	mcs = xen_mc_entry(sizeof(*op));
1283 	op = mcs.args;
1284 	op->cmd = MMUEXT_INVLPG_LOCAL;
1285 	op->arg1.linear_addr = addr & PAGE_MASK;
1286 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1287 
1288 	xen_mc_issue(PARAVIRT_LAZY_MMU);
1289 
1290 	preempt_enable();
1291 }
1292 
1293 static void xen_flush_tlb_others(const struct cpumask *cpus,
1294 				 struct mm_struct *mm, unsigned long va)
1295 {
1296 	struct {
1297 		struct mmuext_op op;
1298 		DECLARE_BITMAP(mask, NR_CPUS);
1299 	} *args;
1300 	struct multicall_space mcs;
1301 
1302 	if (cpumask_empty(cpus))
1303 		return;		/* nothing to do */
1304 
1305 	mcs = xen_mc_entry(sizeof(*args));
1306 	args = mcs.args;
1307 	args->op.arg2.vcpumask = to_cpumask(args->mask);
1308 
1309 	/* Remove us, and any offline CPUs. */
1310 	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1311 	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1312 
1313 	if (va == TLB_FLUSH_ALL) {
1314 		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1315 	} else {
1316 		args->op.cmd = MMUEXT_INVLPG_MULTI;
1317 		args->op.arg1.linear_addr = va;
1318 	}
1319 
1320 	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1321 
1322 	xen_mc_issue(PARAVIRT_LAZY_MMU);
1323 }
1324 
1325 static unsigned long xen_read_cr3(void)
1326 {
1327 	return percpu_read(xen_cr3);
1328 }
1329 
1330 static void set_current_cr3(void *v)
1331 {
1332 	percpu_write(xen_current_cr3, (unsigned long)v);
1333 }
1334 
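/*
 * Queue a new kernel or user baseptr for this vcpu.  A cr3 of 0 is only
 * meaningful for the user baseptr (it means there is no user pagetable).
 */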
1335 static void __xen_write_cr3(bool kernel, unsigned long cr3)
1336 {
1337 	struct mmuext_op *op;
1338 	struct multicall_space mcs;
1339 	unsigned long mfn;
1340 
1341 	if (cr3)
1342 		mfn = pfn_to_mfn(PFN_DOWN(cr3));
1343 	else
1344 		mfn = 0;
1345 
1346 	WARN_ON(mfn == 0 && kernel);
1347 
1348 	mcs = __xen_mc_entry(sizeof(*op));
1349 
1350 	op = mcs.args;
1351 	op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1352 	op->arg1.mfn = mfn;
1353 
1354 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1355 
1356 	if (kernel) {
1357 		percpu_write(xen_cr3, cr3);
1358 
1359 		/* Update xen_current_cr3 once the batch has actually
1360 		   been submitted. */
1361 		xen_mc_callback(set_current_cr3, (void *)cr3);
1362 	}
1363 }
1364 
1365 static void xen_write_cr3(unsigned long cr3)
1366 {
1367 	BUG_ON(preemptible());
1368 
1369 	xen_mc_batch();  /* disables interrupts */
1370 
1371 	/* Update while interrupts are disabled, so it's atomic with
1372 	   respect to ipis */
1373 	percpu_write(xen_cr3, cr3);
1374 
1375 	__xen_write_cr3(true, cr3);
1376 
1377 #ifdef CONFIG_X86_64
1378 	{
1379 		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1380 		if (user_pgd)
1381 			__xen_write_cr3(false, __pa(user_pgd));
1382 		else
1383 			__xen_write_cr3(false, 0);
1384 	}
1385 #endif
1386 
1387 	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1388 }
1389 
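/*
 * On 64-bit, each pgd also gets the separate user-mode pgd that Xen
 * requires, hung off the pgd page's ->private and seeded with the
 * vsyscall mapping.
 */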
1390 static int xen_pgd_alloc(struct mm_struct *mm)
1391 {
1392 	pgd_t *pgd = mm->pgd;
1393 	int ret = 0;
1394 
1395 	BUG_ON(PagePinned(virt_to_page(pgd)));
1396 
1397 #ifdef CONFIG_X86_64
1398 	{
1399 		struct page *page = virt_to_page(pgd);
1400 		pgd_t *user_pgd;
1401 
1402 		BUG_ON(page->private != 0);
1403 
1404 		ret = -ENOMEM;
1405 
1406 		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1407 		page->private = (unsigned long)user_pgd;
1408 
1409 		if (user_pgd != NULL) {
1410 			user_pgd[pgd_index(VSYSCALL_START)] =
1411 				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1412 			ret = 0;
1413 		}
1414 
1415 		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1416 	}
1417 #endif
1418 
1419 	return ret;
1420 }
1421 
1422 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1423 {
1424 #ifdef CONFIG_X86_64
1425 	pgd_t *user_pgd = xen_get_user_pgd(pgd);
1426 
1427 	if (user_pgd)
1428 		free_page((unsigned long)user_pgd);
1429 #endif
1430 }
1431 
1432 #ifdef CONFIG_X86_32
1433 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1434 {
1435 	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
1436 	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1437 		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1438 			       pte_val_ma(pte));
1439 
1440 	return pte;
1441 }
1442 
1443 /* Init-time set_pte while constructing initial pagetables, which
1444    doesn't allow RO pagetable pages to be remapped RW */
1445 static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1446 {
1447 	pte = mask_rw_pte(ptep, pte);
1448 
1449 	xen_set_pte(ptep, pte);
1450 }
1451 #endif
1452 
1453 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1454 {
1455 	struct mmuext_op op;
1456 	op.cmd = cmd;
1457 	op.arg1.mfn = pfn_to_mfn(pfn);
1458 	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1459 		BUG();
1460 }
1461 
1462 /* Early in boot, while setting up the initial pagetable, assume
1463    everything is pinned. */
1464 static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1465 {
1466 #ifdef CONFIG_FLATMEM
1467 	BUG_ON(mem_map);	/* should only be used early */
1468 #endif
1469 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1470 	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1471 }
1472 
1473 /* Used for pmd and pud */
1474 static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1475 {
1476 #ifdef CONFIG_FLATMEM
1477 	BUG_ON(mem_map);	/* should only be used early */
1478 #endif
1479 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1480 }
1481 
1482 /* Early release_pte assumes that all pts are pinned, since there's
1483    only init_mm and anything attached to that is pinned. */
1484 static __init void xen_release_pte_init(unsigned long pfn)
1485 {
1486 	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1487 	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1488 }
1489 
1490 static __init void xen_release_pmd_init(unsigned long pfn)
1491 {
1492 	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1493 }
1494 
1495 /* This needs to make sure the new pte page is pinned iff it's being
1496    attached to a pinned pagetable. */
1497 static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1498 {
1499 	struct page *page = pfn_to_page(pfn);
1500 
1501 	if (PagePinned(virt_to_page(mm->pgd))) {
1502 		SetPagePinned(page);
1503 
1504 		vm_unmap_aliases();
1505 		if (!PageHighMem(page)) {
1506 			make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1507 			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1508 				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1509 		} else {
1510 			/* make sure there are no stray mappings of
1511 			   this page */
1512 			kmap_flush_unused();
1513 		}
1514 	}
1515 }
1516 
1517 static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1518 {
1519 	xen_alloc_ptpage(mm, pfn, PT_PTE);
1520 }
1521 
1522 static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1523 {
1524 	xen_alloc_ptpage(mm, pfn, PT_PMD);
1525 }
1526 
1527 /* This should never happen until we're OK to use struct page */
1528 static void xen_release_ptpage(unsigned long pfn, unsigned level)
1529 {
1530 	struct page *page = pfn_to_page(pfn);
1531 
1532 	if (PagePinned(page)) {
1533 		if (!PageHighMem(page)) {
1534 			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1535 				pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1536 			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1537 		}
1538 		ClearPagePinned(page);
1539 	}
1540 }
1541 
1542 static void xen_release_pte(unsigned long pfn)
1543 {
1544 	xen_release_ptpage(pfn, PT_PTE);
1545 }
1546 
1547 static void xen_release_pmd(unsigned long pfn)
1548 {
1549 	xen_release_ptpage(pfn, PT_PMD);
1550 }
1551 
1552 #if PAGETABLE_LEVELS == 4
1553 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1554 {
1555 	xen_alloc_ptpage(mm, pfn, PT_PUD);
1556 }
1557 
1558 static void xen_release_pud(unsigned long pfn)
1559 {
1560 	xen_release_ptpage(pfn, PT_PUD);
1561 }
1562 #endif
1563 
1564 void __init xen_reserve_top(void)
1565 {
1566 #ifdef CONFIG_X86_32
1567 	unsigned long top = HYPERVISOR_VIRT_START;
1568 	struct xen_platform_parameters pp;
1569 
1570 	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1571 		top = pp.virt_start;
1572 
1573 	reserve_top_address(-top);
1574 #endif	/* CONFIG_X86_32 */
1575 }
1576 
1577 /*
1578  * Like __va(), but returns address in the kernel mapping (which is
1579  * all we have until the physical memory mapping has been set up).
1580  */
1581 static void *__ka(phys_addr_t paddr)
1582 {
1583 #ifdef CONFIG_X86_64
1584 	return (void *)(paddr + __START_KERNEL_map);
1585 #else
1586 	return __va(paddr);
1587 #endif
1588 }
1589 
1590 /* Convert a machine address to physical address */
1591 static unsigned long m2p(phys_addr_t maddr)
1592 {
1593 	phys_addr_t paddr;
1594 
1595 	maddr &= PTE_PFN_MASK;
1596 	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1597 
1598 	return paddr;
1599 }
1600 
1601 /* Convert a machine address to kernel virtual */
1602 static void *m2v(phys_addr_t maddr)
1603 {
1604 	return __ka(m2p(maddr));
1605 }
1606 
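/*
 * Change the protection of a single, already-mapped kernel page with a
 * direct update_va_mapping hypercall; used while building the initial
 * pagetables to make pagetable pages read-only.
 */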
1607 static void set_page_prot(void *addr, pgprot_t prot)
1608 {
1609 	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1610 	pte_t pte = pfn_pte(pfn, prot);
1611 
1612 	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1613 		BUG();
1614 }
1615 
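/*
 * Populate pmd with a 1:1 pfn mapping covering the first max_pfn pages,
 * drawing pte pages from level1_ident_pgt where needed, then mark those
 * pte pages (and the pmd itself) read-only.
 */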
1616 static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1617 {
1618 	unsigned pmdidx, pteidx;
1619 	unsigned ident_pte;
1620 	unsigned long pfn;
1621 
1622 	ident_pte = 0;
1623 	pfn = 0;
1624 	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1625 		pte_t *pte_page;
1626 
1627 		/* Reuse or allocate a page of ptes */
1628 		if (pmd_present(pmd[pmdidx]))
1629 			pte_page = m2v(pmd[pmdidx].pmd);
1630 		else {
1631 			/* Check for free pte pages */
1632 			if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1633 				break;
1634 
1635 			pte_page = &level1_ident_pgt[ident_pte];
1636 			ident_pte += PTRS_PER_PTE;
1637 
1638 			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1639 		}
1640 
1641 		/* Install mappings */
1642 		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1643 			pte_t pte;
1644 
1645 			if (pfn > max_pfn_mapped)
1646 				max_pfn_mapped = pfn;
1647 
1648 			if (!pte_none(pte_page[pteidx]))
1649 				continue;
1650 
1651 			pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1652 			pte_page[pteidx] = pte;
1653 		}
1654 	}
1655 
1656 	for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1657 		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1658 
1659 	set_page_prot(pmd, PAGE_KERNEL_RO);
1660 }
1661 
1662 #ifdef CONFIG_X86_64
1663 static void convert_pfn_mfn(void *v)
1664 {
1665 	pte_t *pte = v;
1666 	int i;
1667 
1668 	/* All levels are converted the same way, so just treat them
1669 	   as ptes. */
1670 	for (i = 0; i < PTRS_PER_PTE; i++)
1671 		pte[i] = xen_make_pte(pte[i].pte);
1672 }
1673 
1674 /*
1675  * Set up the initial kernel pagetable.
1676  *
1677  * We can construct this by grafting the Xen provided pagetable into
1678  * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
1679  * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
1680  * means that only the kernel has a physical mapping to start with -
1681  * but that's enough to get __va working.  We need to fill in the rest
1682  * of the physical mapping once some sort of allocator has been set
1683  * up.
1684  */
1685 __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1686 					 unsigned long max_pfn)
1687 {
1688 	pud_t *l3;
1689 	pmd_t *l2;
1690 
1691 	/* Zap identity mapping */
1692 	init_level4_pgt[0] = __pgd(0);
1693 
1694 	/* Pre-constructed entries are in pfn, so convert to mfn */
1695 	convert_pfn_mfn(init_level4_pgt);
1696 	convert_pfn_mfn(level3_ident_pgt);
1697 	convert_pfn_mfn(level3_kernel_pgt);
1698 
1699 	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1700 	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1701 
1702 	memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1703 	memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1704 
1705 	l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1706 	l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1707 	memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1708 
1709 	/* Set up identity map */
1710 	xen_map_identity_early(level2_ident_pgt, max_pfn);
1711 
1712 	/* Make pagetable pieces RO */
1713 	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1714 	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1715 	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1716 	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1717 	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1718 	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1719 
1720 	/* Pin down new L4 */
1721 	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1722 			  PFN_DOWN(__pa_symbol(init_level4_pgt)));
1723 
1724 	/* Unpin Xen-provided one */
1725 	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1726 
1727 	/* Switch over */
1728 	pgd = init_level4_pgt;
1729 
1730 	/*
1731 	 * At this stage there can be no user pgd, and no page
1732 	 * structure to attach it to, so make sure we just set kernel
1733 	 * pgd.
1734 	 */
1735 	xen_mc_batch();
1736 	__xen_write_cr3(true, __pa(pgd));
1737 	xen_mc_issue(PARAVIRT_LAZY_CPU);
1738 
1739 	reserve_early(__pa(xen_start_info->pt_base),
1740 		      __pa(xen_start_info->pt_base +
1741 			   xen_start_info->nr_pt_frames * PAGE_SIZE),
1742 		      "XEN PAGETABLES");
1743 
1744 	return pgd;
1745 }
1746 #else	/* !CONFIG_X86_64 */
1747 static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1748 
1749 __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1750 					 unsigned long max_pfn)
1751 {
1752 	pmd_t *kernel_pmd;
1753 
1754 	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1755 				  xen_start_info->nr_pt_frames * PAGE_SIZE +
1756 				  512*1024);
1757 
1758 	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1759 	memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1760 
1761 	xen_map_identity_early(level2_kernel_pgt, max_pfn);
1762 
1763 	memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1764 	set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1765 			__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1766 
1767 	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1768 	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1769 	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1770 
1771 	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1772 
1773 	xen_write_cr3(__pa(swapper_pg_dir));
1774 
1775 	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1776 
1777 	reserve_early(__pa(xen_start_info->pt_base),
1778 		      __pa(xen_start_info->pt_base +
1779 			   xen_start_info->nr_pt_frames * PAGE_SIZE),
1780 		      "XEN PAGETABLES");
1781 
1782 	return swapper_pg_dir;
1783 }
1784 #endif	/* CONFIG_X86_64 */
1785 
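/*
 * Install a fixmap entry.  The slots listed below refer to local
 * pseudo-physical frames, so take a pfn_pte; everything else is treated
 * as a machine frame and gets an mfn_pte.
 */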
1786 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1787 {
1788 	pte_t pte;
1789 
1790 	phys >>= PAGE_SHIFT;
1791 
1792 	switch (idx) {
1793 	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1794 #ifdef CONFIG_X86_F00F_BUG
1795 	case FIX_F00F_IDT:
1796 #endif
1797 #ifdef CONFIG_X86_32
1798 	case FIX_WP_TEST:
1799 	case FIX_VDSO:
1800 # ifdef CONFIG_HIGHMEM
1801 	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1802 # endif
1803 #else
1804 	case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1805 #endif
1806 #ifdef CONFIG_X86_LOCAL_APIC
1807 	case FIX_APIC_BASE:	/* maps dummy local APIC */
1808 #endif
1809 	case FIX_TEXT_POKE0:
1810 	case FIX_TEXT_POKE1:
1811 		/* All local page mappings */
1812 		pte = pfn_pte(phys, prot);
1813 		break;
1814 
1815 	default:
1816 		pte = mfn_pte(phys, prot);
1817 		break;
1818 	}
1819 
1820 	__native_set_fixmap(idx, pte);
1821 
1822 #ifdef CONFIG_X86_64
1823 	/* Replicate changes to map the vsyscall page into the user
1824 	   pagetable vsyscall mapping. */
1825 	if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1826 		unsigned long vaddr = __fix_to_virt(idx);
1827 		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1828 	}
1829 #endif
1830 }
1831 
1832 static __init void xen_post_allocator_init(void)
1833 {
1834 	pv_mmu_ops.set_pte = xen_set_pte;
1835 	pv_mmu_ops.set_pmd = xen_set_pmd;
1836 	pv_mmu_ops.set_pud = xen_set_pud;
1837 #if PAGETABLE_LEVELS == 4
1838 	pv_mmu_ops.set_pgd = xen_set_pgd;
1839 #endif
1840 
1841 	/* This will work as long as patching hasn't happened yet
1842 	   (which it hasn't) */
1843 	pv_mmu_ops.alloc_pte = xen_alloc_pte;
1844 	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1845 	pv_mmu_ops.release_pte = xen_release_pte;
1846 	pv_mmu_ops.release_pmd = xen_release_pmd;
1847 #if PAGETABLE_LEVELS == 4
1848 	pv_mmu_ops.alloc_pud = xen_alloc_pud;
1849 	pv_mmu_ops.release_pud = xen_release_pud;
1850 #endif
1851 
1852 #ifdef CONFIG_X86_64
1853 	SetPagePinned(virt_to_page(level3_user_vsyscall));
1854 #endif
1855 	xen_mark_init_mm_pinned();
1856 }
1857 
1858 static void xen_leave_lazy_mmu(void)
1859 {
1860 	preempt_disable();
1861 	xen_mc_flush();
1862 	paravirt_leave_lazy_mmu();
1863 	preempt_enable();
1864 }
1865 
1866 static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1867 	.read_cr2 = xen_read_cr2,
1868 	.write_cr2 = xen_write_cr2,
1869 
1870 	.read_cr3 = xen_read_cr3,
1871 	.write_cr3 = xen_write_cr3,
1872 
1873 	.flush_tlb_user = xen_flush_tlb,
1874 	.flush_tlb_kernel = xen_flush_tlb,
1875 	.flush_tlb_single = xen_flush_tlb_single,
1876 	.flush_tlb_others = xen_flush_tlb_others,
1877 
1878 	.pte_update = paravirt_nop,
1879 	.pte_update_defer = paravirt_nop,
1880 
1881 	.pgd_alloc = xen_pgd_alloc,
1882 	.pgd_free = xen_pgd_free,
1883 
1884 	.alloc_pte = xen_alloc_pte_init,
1885 	.release_pte = xen_release_pte_init,
1886 	.alloc_pmd = xen_alloc_pmd_init,
1887 	.alloc_pmd_clone = paravirt_nop,
1888 	.release_pmd = xen_release_pmd_init,
1889 
1890 #ifdef CONFIG_X86_64
1891 	.set_pte = xen_set_pte,
1892 #else
1893 	.set_pte = xen_set_pte_init,
1894 #endif
1895 	.set_pte_at = xen_set_pte_at,
1896 	.set_pmd = xen_set_pmd_hyper,
1897 
1898 	.ptep_modify_prot_start = __ptep_modify_prot_start,
1899 	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
1900 
1901 	.pte_val = PV_CALLEE_SAVE(xen_pte_val),
1902 	.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
1903 
1904 	.make_pte = PV_CALLEE_SAVE(xen_make_pte),
1905 	.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
1906 
1907 #ifdef CONFIG_X86_PAE
1908 	.set_pte_atomic = xen_set_pte_atomic,
1909 	.pte_clear = xen_pte_clear,
1910 	.pmd_clear = xen_pmd_clear,
1911 #endif	/* CONFIG_X86_PAE */
1912 	.set_pud = xen_set_pud_hyper,
1913 
1914 	.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
1915 	.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
1916 
1917 #if PAGETABLE_LEVELS == 4
1918 	.pud_val = PV_CALLEE_SAVE(xen_pud_val),
1919 	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
1920 	.set_pgd = xen_set_pgd_hyper,
1921 
1922 	.alloc_pud = xen_alloc_pmd_init,
1923 	.release_pud = xen_release_pmd_init,
1924 #endif	/* PAGETABLE_LEVELS == 4 */
1925 
1926 	.activate_mm = xen_activate_mm,
1927 	.dup_mmap = xen_dup_mmap,
1928 	.exit_mmap = xen_exit_mmap,
1929 
1930 	.lazy_mode = {
1931 		.enter = paravirt_enter_lazy_mmu,
1932 		.leave = xen_leave_lazy_mmu,
1933 	},
1934 
1935 	.set_fixmap = xen_set_fixmap,
1936 };
1937 
1938 void __init xen_init_mmu_ops(void)
1939 {
1940 	x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
1941 	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
1942 	pv_mmu_ops = xen_mmu_ops;
1943 }
1944 
1945 #ifdef CONFIG_XEN_PVHVM
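/*
 * Tell the hypervisor that this pagetable is about to go away, presumably
 * so it can drop any cached state it keeps for it (HVMOP_pagetable_dying).
 */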
1946 static void xen_hvm_exit_mmap(struct mm_struct *mm)
1947 {
1948 	struct xen_hvm_pagetable_dying a;
1949 	int rc;
1950 
1951 	a.domid = DOMID_SELF;
1952 	a.gpa = __pa(mm->pgd);
1953 	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
1954 	WARN_ON_ONCE(rc < 0);
1955 }
1956 
1957 static int is_pagetable_dying_supported(void)
1958 {
1959 	struct xen_hvm_pagetable_dying a;
1960 	int rc = 0;
1961 
1962 	a.domid = DOMID_SELF;
1963 	a.gpa = 0x00;
1964 	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
1965 	if (rc < 0) {
1966 		printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
1967 		return 0;
1968 	}
1969 	return 1;
1970 }
1971 
1972 void __init xen_hvm_init_mmu_ops(void)
1973 {
1974 	if (is_pagetable_dying_supported())
1975 		pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
1976 }
1977 #endif
1978 
1979 #ifdef CONFIG_XEN_DEBUG_FS
1980 
1981 static struct dentry *d_mmu_debug;
1982 
1983 static int __init xen_mmu_debugfs(void)
1984 {
1985 	struct dentry *d_xen = xen_init_debugfs();
1986 
1987 	if (d_xen == NULL)
1988 		return -ENOMEM;
1989 
1990 	d_mmu_debug = debugfs_create_dir("mmu", d_xen);
1991 
1992 	debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
1993 
1994 	debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
1995 	debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
1996 			   &mmu_stats.pgd_update_pinned);
1997 	debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
1998 			   &mmu_stats.pgd_update_batched);
1999 
2000 	debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2001 	debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2002 			   &mmu_stats.pud_update_pinned);
2003 	debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2004 			   &mmu_stats.pud_update_batched);
2005 
2006 	debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2007 	debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2008 			   &mmu_stats.pmd_update_pinned);
2009 	debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2010 			   &mmu_stats.pmd_update_batched);
2011 
2012 	debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2013 //	debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2014 //			   &mmu_stats.pte_update_pinned);
2015 	debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2016 			   &mmu_stats.pte_update_batched);
2017 
2018 	debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2019 	debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2020 			   &mmu_stats.mmu_update_extended);
2021 	xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2022 				     mmu_stats.mmu_update_histo, 20);
2023 
2024 	debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
2025 	debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
2026 			   &mmu_stats.set_pte_at_batched);
2027 	debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
2028 			   &mmu_stats.set_pte_at_current);
2029 	debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
2030 			   &mmu_stats.set_pte_at_kernel);
2031 
2032 	debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2033 	debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2034 			   &mmu_stats.prot_commit_batched);
2035 
2036 	return 0;
2037 }
2038 fs_initcall(xen_mmu_debugfs);
2039 
2040 #endif	/* CONFIG_XEN_DEBUG_FS */
2041