xref: /linux/arch/x86/mm/pat/set_memory.c (revision 6093a688a07da07808f0122f9aa2a3eed250d853)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright 2002 Andi Kleen, SuSE Labs.
4  * Thanks to Ben LaHaise for precious feedback.
5  */
6 #include <linux/highmem.h>
7 #include <linux/memblock.h>
8 #include <linux/sched.h>
9 #include <linux/mm.h>
10 #include <linux/interrupt.h>
11 #include <linux/seq_file.h>
12 #include <linux/proc_fs.h>
13 #include <linux/debugfs.h>
14 #include <linux/pfn.h>
15 #include <linux/percpu.h>
16 #include <linux/gfp.h>
17 #include <linux/pci.h>
18 #include <linux/vmalloc.h>
19 #include <linux/libnvdimm.h>
20 #include <linux/vmstat.h>
21 #include <linux/kernel.h>
22 #include <linux/cc_platform.h>
23 #include <linux/set_memory.h>
24 #include <linux/memregion.h>
25 
26 #include <asm/e820/api.h>
27 #include <asm/processor.h>
28 #include <asm/tlbflush.h>
29 #include <asm/sections.h>
30 #include <asm/setup.h>
31 #include <linux/uaccess.h>
32 #include <asm/pgalloc.h>
33 #include <asm/proto.h>
34 #include <asm/memtype.h>
35 
36 #include "../mm_internal.h"
37 
38 /*
39  * The current flushing context - we pass it instead of 5 arguments:
40  */
41 struct cpa_data {
42 	unsigned long	*vaddr;
43 	pgd_t		*pgd;
44 	pgprot_t	mask_set;
45 	pgprot_t	mask_clr;
46 	unsigned long	numpages;
47 	unsigned long	curpage;
48 	unsigned long	pfn;
49 	unsigned int	flags;
50 	unsigned int	force_split		: 1,
51 			force_static_prot	: 1,
52 			force_flush_all		: 1;
53 	struct page	**pages;
54 };
55 
56 enum cpa_warn {
57 	CPA_CONFLICT,
58 	CPA_PROTECT,
59 	CPA_DETECT,
60 };
61 
62 static const int cpa_warn_level = CPA_PROTECT;
63 
64 /*
65  * Serialize cpa() (for !DEBUG_PAGEALLOC, which uses large identity mappings)
66  * with cpa_lock, so that no other CPU with stale large TLB entries can
67  * change a page attribute in parallel while another CPU is splitting a
68  * large page entry and changing its attributes.
69  */
70 static DEFINE_SPINLOCK(cpa_lock);
71 
72 #define CPA_FLUSHTLB 1 /* TLB flush required after the change */
73 #define CPA_ARRAY 2 /* operate on an array of addresses */
74 #define CPA_PAGES_ARRAY 4 /* operate on an array of struct page */
75 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
76 #define CPA_COLLAPSE 16 /* try to collapse large pages */
77 
78 static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
79 {
80 	return __pgprot(cachemode2protval(pcm));
81 }
82 
83 #ifdef CONFIG_PROC_FS
84 static unsigned long direct_pages_count[PG_LEVEL_NUM];
85 
86 void update_page_count(int level, unsigned long pages)
87 {
88 	/* Protect against CPA */
89 	spin_lock(&pgd_lock);
90 	direct_pages_count[level] += pages;
91 	spin_unlock(&pgd_lock);
92 }
93 
94 static void split_page_count(int level)
95 {
96 	if (direct_pages_count[level] == 0)
97 		return;
98 
99 	direct_pages_count[level]--;
100 	if (system_state == SYSTEM_RUNNING) {
101 		if (level == PG_LEVEL_2M)
102 			count_vm_event(DIRECT_MAP_LEVEL2_SPLIT);
103 		else if (level == PG_LEVEL_1G)
104 			count_vm_event(DIRECT_MAP_LEVEL3_SPLIT);
105 	}
106 	direct_pages_count[level - 1] += PTRS_PER_PTE;
107 }
108 
109 static void collapse_page_count(int level)
110 {
111 	direct_pages_count[level]++;
112 	if (system_state == SYSTEM_RUNNING) {
113 		if (level == PG_LEVEL_2M)
114 			count_vm_event(DIRECT_MAP_LEVEL2_COLLAPSE);
115 		else if (level == PG_LEVEL_1G)
116 			count_vm_event(DIRECT_MAP_LEVEL3_COLLAPSE);
117 	}
118 	direct_pages_count[level - 1] -= PTRS_PER_PTE;
119 }
120 
121 void arch_report_meminfo(struct seq_file *m)
122 {
123 	seq_printf(m, "DirectMap4k:    %8lu kB\n",
124 			direct_pages_count[PG_LEVEL_4K] << 2);
125 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
126 	seq_printf(m, "DirectMap2M:    %8lu kB\n",
127 			direct_pages_count[PG_LEVEL_2M] << 11);
128 #else
129 	seq_printf(m, "DirectMap4M:    %8lu kB\n",
130 			direct_pages_count[PG_LEVEL_2M] << 12);
131 #endif
132 	if (direct_gbpages)
133 		seq_printf(m, "DirectMap1G:    %8lu kB\n",
134 			direct_pages_count[PG_LEVEL_1G] << 20);
135 }
136 #else
137 static inline void split_page_count(int level) { }
138 static inline void collapse_page_count(int level) { }
139 #endif
140 
141 #ifdef CONFIG_X86_CPA_STATISTICS
142 
143 static unsigned long cpa_1g_checked;
144 static unsigned long cpa_1g_sameprot;
145 static unsigned long cpa_1g_preserved;
146 static unsigned long cpa_2m_checked;
147 static unsigned long cpa_2m_sameprot;
148 static unsigned long cpa_2m_preserved;
149 static unsigned long cpa_4k_install;
150 
151 static inline void cpa_inc_1g_checked(void)
152 {
153 	cpa_1g_checked++;
154 }
155 
156 static inline void cpa_inc_2m_checked(void)
157 {
158 	cpa_2m_checked++;
159 }
160 
161 static inline void cpa_inc_4k_install(void)
162 {
163 	data_race(cpa_4k_install++);
164 }
165 
166 static inline void cpa_inc_lp_sameprot(int level)
167 {
168 	if (level == PG_LEVEL_1G)
169 		cpa_1g_sameprot++;
170 	else
171 		cpa_2m_sameprot++;
172 }
173 
174 static inline void cpa_inc_lp_preserved(int level)
175 {
176 	if (level == PG_LEVEL_1G)
177 		cpa_1g_preserved++;
178 	else
179 		cpa_2m_preserved++;
180 }
181 
182 static int cpastats_show(struct seq_file *m, void *p)
183 {
184 	seq_printf(m, "1G pages checked:     %16lu\n", cpa_1g_checked);
185 	seq_printf(m, "1G pages sameprot:    %16lu\n", cpa_1g_sameprot);
186 	seq_printf(m, "1G pages preserved:   %16lu\n", cpa_1g_preserved);
187 	seq_printf(m, "2M pages checked:     %16lu\n", cpa_2m_checked);
188 	seq_printf(m, "2M pages sameprot:    %16lu\n", cpa_2m_sameprot);
189 	seq_printf(m, "2M pages preserved:   %16lu\n", cpa_2m_preserved);
190 	seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install);
191 	return 0;
192 }
193 
194 static int cpastats_open(struct inode *inode, struct file *file)
195 {
196 	return single_open(file, cpastats_show, NULL);
197 }
198 
199 static const struct file_operations cpastats_fops = {
200 	.open		= cpastats_open,
201 	.read		= seq_read,
202 	.llseek		= seq_lseek,
203 	.release	= single_release,
204 };
205 
206 static int __init cpa_stats_init(void)
207 {
208 	debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL,
209 			    &cpastats_fops);
210 	return 0;
211 }
212 late_initcall(cpa_stats_init);
213 #else
214 static inline void cpa_inc_1g_checked(void) { }
215 static inline void cpa_inc_2m_checked(void) { }
216 static inline void cpa_inc_4k_install(void) { }
217 static inline void cpa_inc_lp_sameprot(int level) { }
218 static inline void cpa_inc_lp_preserved(int level) { }
219 #endif
220 
221 
222 static inline int
223 within(unsigned long addr, unsigned long start, unsigned long end)
224 {
225 	return addr >= start && addr < end;
226 }
227 
228 #ifdef CONFIG_X86_64
229 
230 static inline int
231 within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
232 {
233 	return addr >= start && addr <= end;
234 }
235 
236 /*
237  * The kernel image is mapped into two places in the virtual address space
238  * (addresses without KASLR, of course):
239  *
240  * 1. The kernel direct map (0xffff880000000000)
241  * 2. The "high kernel map" (0xffffffff81000000)
242  *
243  * We actually execute out of #2. If we get the address of a kernel symbol, it
244  * points to #2, but almost all physical-to-virtual translations point to #1.
245  *
246  * This is so that we can have both a directmap of all physical memory *and*
247  * take full advantage of the limited (s32) immediate addressing range (2G)
248  * of x86_64.
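 *
 * For example, a symbol at physical address 0x1000000, say, is visible both
 * at 0xffff880001000000 via the direct map and at 0xffffffff81000000 via
 * the high kernel map; cpa_process_alias() below makes attribute changes
 * follow both of these aliases.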
249  *
250  * See Documentation/arch/x86/x86_64/mm.rst for more detail.
251  */
252 
253 static inline unsigned long highmap_start_pfn(void)
254 {
255 	return __pa_symbol(_text) >> PAGE_SHIFT;
256 }
257 
258 static inline unsigned long highmap_end_pfn(void)
259 {
260 	/* Do not reference physical address outside the kernel. */
261 	return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
262 }
263 
264 static bool __cpa_pfn_in_highmap(unsigned long pfn)
265 {
266 	/*
267 	 * Kernel text has an alias mapping at a high address, known
268 	 * here as "highmap".
269 	 */
270 	return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn());
271 }
272 
273 #else
274 
275 static bool __cpa_pfn_in_highmap(unsigned long pfn)
276 {
277 	/* There is no highmap on 32-bit */
278 	return false;
279 }
280 
281 #endif
282 
283 /*
284  * See set_mce_nospec().
285  *
286  * Machine check recovery code needs to change cache mode of poisoned pages to
287  * UC to avoid speculative access logging another error. But passing the
288  * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a
289  * speculative access. So we cheat and flip the top bit of the address. This
290  * works fine for the code that updates the page tables. But at the end of the
291  * process we need to flush the TLB and cache and the non-canonical address
292  * causes a #GP fault when used by the INVLPG and CLFLUSH instructions.
293  *
294  * But in the common case we already have a canonical address. This code
295  * will fix the top bit if needed and is a no-op otherwise.
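 *
 * For example, flipping the top bit of the direct-map address
 * 0xffff88000abcd000 gives 0x7fff88000abcd000; shifting that left by one
 * yields 0xffff10001579a000 and the arithmetic shift right sign-extends it
 * back to 0xffff88000abcd000. A canonical address has bits 63 and 62 equal,
 * so the two shifts leave it unchanged.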
296  */
297 static inline unsigned long fix_addr(unsigned long addr)
298 {
299 #ifdef CONFIG_X86_64
300 	return (long)(addr << 1) >> 1;
301 #else
302 	return addr;
303 #endif
304 }
305 
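/*
 * Return the target virtual address for index @idx of the request: it is
 * taken from the pages array (CPA_PAGES_ARRAY), from the vaddr array
 * (CPA_ARRAY), or computed as an offset from the single base address in
 * *cpa->vaddr.
 */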
306 static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx)
307 {
308 	if (cpa->flags & CPA_PAGES_ARRAY) {
309 		struct page *page = cpa->pages[idx];
310 
311 		if (unlikely(PageHighMem(page)))
312 			return 0;
313 
314 		return (unsigned long)page_address(page);
315 	}
316 
317 	if (cpa->flags & CPA_ARRAY)
318 		return cpa->vaddr[idx];
319 
320 	return *cpa->vaddr + idx * PAGE_SIZE;
321 }
322 
323 /*
324  * Flushing functions
325  */
326 
327 static void clflush_cache_range_opt(void *vaddr, unsigned int size)
328 {
329 	const unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
330 	void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
331 	void *vend = vaddr + size;
332 
333 	if (p >= vend)
334 		return;
335 
336 	for (; p < vend; p += clflush_size)
337 		clflushopt(p);
338 }
339 
340 /**
341  * clflush_cache_range - flush a cache range with clflush
342  * @vaddr:	virtual start address
343  * @size:	number of bytes to flush
344  *
345  * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or
346  * SFENCE to avoid ordering issues.
347  */
348 void clflush_cache_range(void *vaddr, unsigned int size)
349 {
350 	mb();
351 	clflush_cache_range_opt(vaddr, size);
352 	mb();
353 }
354 EXPORT_SYMBOL_GPL(clflush_cache_range);
355 
356 #ifdef CONFIG_ARCH_HAS_PMEM_API
357 void arch_invalidate_pmem(void *addr, size_t size)
358 {
359 	clflush_cache_range(addr, size);
360 }
361 EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
362 #endif
363 
364 #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
365 bool cpu_cache_has_invalidate_memregion(void)
366 {
367 	return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR);
368 }
369 EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM");
370 
371 int cpu_cache_invalidate_memregion(int res_desc)
372 {
373 	if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion()))
374 		return -ENXIO;
375 	wbinvd_on_all_cpus();
376 	return 0;
377 }
378 EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, "DEVMEM");
379 #endif
380 
381 static void __cpa_flush_all(void *arg)
382 {
383 	unsigned long cache = (unsigned long)arg;
384 
385 	/*
386 	 * Flush everything to work around errata in early Athlons regarding
387 	 * large page flushing.
388 	 */
389 	__flush_tlb_all();
390 
391 	if (cache && boot_cpu_data.x86 >= 4)
392 		wbinvd();
393 }
394 
395 static void cpa_flush_all(unsigned long cache)
396 {
397 	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
398 
399 	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
400 }
401 
402 static int collapse_large_pages(unsigned long addr, struct list_head *pgtables);
403 
404 static void cpa_collapse_large_pages(struct cpa_data *cpa)
405 {
406 	unsigned long start, addr, end;
407 	struct ptdesc *ptdesc, *tmp;
408 	LIST_HEAD(pgtables);
409 	int collapsed = 0;
410 	int i;
411 
412 	if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
413 		for (i = 0; i < cpa->numpages; i++)
414 			collapsed += collapse_large_pages(__cpa_addr(cpa, i),
415 							  &pgtables);
416 	} else {
417 		addr = __cpa_addr(cpa, 0);
418 		start = addr & PMD_MASK;
419 		end = addr + PAGE_SIZE * cpa->numpages;
420 
421 		for (addr = start; within(addr, start, end); addr += PMD_SIZE)
422 			collapsed += collapse_large_pages(addr, &pgtables);
423 	}
424 
425 	if (!collapsed)
426 		return;
427 
428 	flush_tlb_all();
429 
430 	list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) {
431 		list_del(&ptdesc->pt_list);
432 		__free_page(ptdesc_page(ptdesc));
433 	}
434 }
435 
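/*
 * Flush the TLB for the range covered by @cpa and, if @cache is set, flush
 * the affected cache lines as well (falling back to WBINVD on CPUs without
 * CLFLUSH). Finally, attempt to collapse large pages when CPA_COLLAPSE was
 * requested.
 */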
436 static void cpa_flush(struct cpa_data *cpa, int cache)
437 {
438 	unsigned long start, end;
439 	unsigned int i;
440 
441 	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
442 
443 	if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
444 		cpa_flush_all(cache);
445 		goto collapse_large_pages;
446 	}
447 
448 	start = fix_addr(__cpa_addr(cpa, 0));
449 	end =   fix_addr(__cpa_addr(cpa, cpa->numpages));
450 	if (cpa->force_flush_all)
451 		end = TLB_FLUSH_ALL;
452 
453 	flush_tlb_kernel_range(start, end);
454 
455 	if (!cache)
456 		goto collapse_large_pages;
457 
458 	mb();
459 	for (i = 0; i < cpa->numpages; i++) {
460 		unsigned long addr = __cpa_addr(cpa, i);
461 		unsigned int level;
462 
463 		pte_t *pte = lookup_address(addr, &level);
464 
465 		/*
466 		 * Only flush present addresses:
467 		 */
468 		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
469 			clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE);
470 	}
471 	mb();
472 
473 collapse_large_pages:
474 	if (cpa->flags & CPA_COLLAPSE)
475 		cpa_collapse_large_pages(cpa);
476 }
477 
478 static bool overlaps(unsigned long r1_start, unsigned long r1_end,
479 		     unsigned long r2_start, unsigned long r2_end)
480 {
481 	return (r1_start <= r2_end && r1_end >= r2_start) ||
482 		(r2_start <= r1_end && r2_end >= r1_start);
483 }
484 
485 #ifdef CONFIG_PCI_BIOS
486 /*
487  * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS
488  * based config access (CONFIG_PCI_GOBIOS) support.
489  */
490 #define BIOS_PFN	PFN_DOWN(BIOS_BEGIN)
491 #define BIOS_PFN_END	PFN_DOWN(BIOS_END - 1)
492 
493 static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
494 {
495 	if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END))
496 		return _PAGE_NX;
497 	return 0;
498 }
499 #else
500 static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
501 {
502 	return 0;
503 }
504 #endif
505 
506 /*
507  * The .rodata section needs to be read-only. Using the pfn catches all
508  * aliases.  This also includes __ro_after_init, so do not enforce until
509  * kernel_set_to_readonly is true.
510  */
511 static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn)
512 {
513 	unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata));
514 
515 	/*
516 	 * Note: __end_rodata is page aligned and not inclusive, so
517 	 * subtract 1 to get the last enforced PFN in the rodata area.
518 	 */
519 	epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1;
520 
521 	if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro))
522 		return _PAGE_RW;
523 	return 0;
524 }
525 
526 /*
527  * Protect kernel text against becoming non executable by forbidding
528  * _PAGE_NX.  This protects only the high kernel mapping (_text -> _etext)
529  * out of which the kernel actually executes.  Do not protect the low
530  * mapping.
531  *
532  * This does not cover __inittext since that is gone after boot.
533  */
534 static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end)
535 {
536 	unsigned long t_end = (unsigned long)_etext - 1;
537 	unsigned long t_start = (unsigned long)_text;
538 
539 	if (overlaps(start, end, t_start, t_end))
540 		return _PAGE_NX;
541 	return 0;
542 }
543 
544 #if defined(CONFIG_X86_64)
545 /*
546  * Once the kernel maps its text as RO (kernel_set_to_readonly is set),
547  * the kernel text mappings for the large-page-aligned text and rodata
548  * sections are always read-only. The kernel identity mappings covering
549  * the holes caused by this alignment can be whatever the user asks for.
550  *
551  * This will preserve the large page mappings for kernel text/data at no
552  * extra cost.
553  */
554 static pgprotval_t protect_kernel_text_ro(unsigned long start,
555 					  unsigned long end)
556 {
557 	unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1;
558 	unsigned long t_start = (unsigned long)_text;
559 	unsigned int level;
560 
561 	if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end))
562 		return 0;
563 	/*
564 	 * Don't enforce the !RW mapping for the kernel text mapping, if
565 	 * the current mapping is already using small page mapping.  No
566 	 * need to work hard to preserve large page mappings in this case.
567 	 *
568 	 * This also fixes the Linux Xen paravirt guest boot failure caused
569 	 * by unexpected read-only mappings for kernel identity
570 	 * mappings. In this paravirt guest case, the kernel text mapping
571 	 * and the kernel identity mapping share the same page-table pages,
572 	 * so the protections for kernel text and identity mappings have to
573 	 * be the same.
574 	 */
575 	if (lookup_address(start, &level) && (level != PG_LEVEL_4K))
576 		return _PAGE_RW;
577 	return 0;
578 }
579 #else
580 static pgprotval_t protect_kernel_text_ro(unsigned long start,
581 					  unsigned long end)
582 {
583 	return 0;
584 }
585 #endif
586 
587 static inline bool conflicts(pgprot_t prot, pgprotval_t val)
588 {
589 	return (pgprot_val(prot) & ~val) != pgprot_val(prot);
590 }
591 
592 static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val,
593 				  unsigned long start, unsigned long end,
594 				  unsigned long pfn, const char *txt)
595 {
596 	static const char *lvltxt[] = {
597 		[CPA_CONFLICT]	= "conflict",
598 		[CPA_PROTECT]	= "protect",
599 		[CPA_DETECT]	= "detect",
600 	};
601 
602 	if (warnlvl > cpa_warn_level || !conflicts(prot, val))
603 		return;
604 
605 	pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n",
606 		lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot),
607 		(unsigned long long)val);
608 }
609 
610 /*
611  * Certain areas of memory on x86 require very specific protection flags,
612  * for example the BIOS area or kernel text. Callers don't always get this
613  * right (again, ioremap() on BIOS memory is not uncommon) so this function
614  * checks and fixes these known static required protection bits.
615  */
616 static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
617 					  unsigned long pfn, unsigned long npg,
618 					  unsigned long lpsize, int warnlvl)
619 {
620 	pgprotval_t forbidden, res;
621 	unsigned long end;
622 
623 	/*
624 	 * There is no point in checking RW/NX conflicts when the requested
625 	 * mapping is setting the page !PRESENT.
626 	 */
627 	if (!(pgprot_val(prot) & _PAGE_PRESENT))
628 		return prot;
629 
630 	/* Operate on the virtual address */
631 	end = start + npg * PAGE_SIZE - 1;
632 
633 	res = protect_kernel_text(start, end);
634 	check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX");
635 	forbidden = res;
636 
637 	/*
638 	 * Special case to preserve a large page. If the change spans the
639 	 * full large page mapping then there is no point in splitting it
640 	 * up. This happens with ftrace and is going to be removed once
641 	 * ftrace has switched to text_poke().
642 	 */
643 	if (lpsize != (npg * PAGE_SIZE) || (start & (lpsize - 1))) {
644 		res = protect_kernel_text_ro(start, end);
645 		check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO");
646 		forbidden |= res;
647 	}
648 
649 	/* Check the PFN directly */
650 	res = protect_pci_bios(pfn, pfn + npg - 1);
651 	check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX");
652 	forbidden |= res;
653 
654 	res = protect_rodata(pfn, pfn + npg - 1);
655 	check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO");
656 	forbidden |= res;
657 
658 	return __pgprot(pgprot_val(prot) & ~forbidden);
659 }
660 
661 /*
662  * Validate strict W^X semantics.
663  */
664 static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start,
665 				  unsigned long pfn, unsigned long npg,
666 				  bool nx, bool rw)
667 {
668 	unsigned long end;
669 
670 	/*
671 	 * 32-bit has some unfixable W+X issues, like EFI code
672 	 * and writeable data being in the same page.  Disable
673 	 * detection and enforcement there.
674 	 */
675 	if (IS_ENABLED(CONFIG_X86_32))
676 		return new;
677 
678 	/* Only verify when NX is supported: */
679 	if (!(__supported_pte_mask & _PAGE_NX))
680 		return new;
681 
682 	if (!((pgprot_val(old) ^ pgprot_val(new)) & (_PAGE_RW | _PAGE_NX)))
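	/* The RW/NX bits are not changing, so no new W^X violation can appear: */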
683 		return new;
684 
685 	if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW)
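	/* Only a mapping that ends up both writable (RW) and executable (!NX) matters: */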
686 		return new;
687 
688 	/* Non-leaf translation entries can disable writing or execution. */
689 	if (!rw || nx)
690 		return new;
691 
692 	end = start + npg * PAGE_SIZE - 1;
693 	WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n",
694 		  (unsigned long long)pgprot_val(old),
695 		  (unsigned long long)pgprot_val(new),
696 		  start, end, pfn);
697 
698 	/*
699 	 * For now, allow all permission change attempts by returning the
700 	 * attempted permissions.  This can 'return old' to actively
701 	 * refuse the permission change at a later time.
702 	 */
703 	return new;
704 }
705 
706 /*
707  * Lookup the page table entry for a virtual address in a specific pgd.
708  * Return a pointer to the entry (or NULL if the entry does not exist),
709  * the level of the entry, and the effective NX and RW bits of all
710  * page table levels.
711  */
712 pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
713 				  unsigned int *level, bool *nx, bool *rw)
714 {
715 	p4d_t *p4d;
716 	pud_t *pud;
717 	pmd_t *pmd;
718 
719 	*level = PG_LEVEL_256T;
720 	*nx = false;
721 	*rw = true;
722 
723 	if (pgd_none(*pgd))
724 		return NULL;
725 
726 	*level = PG_LEVEL_512G;
727 	*nx |= pgd_flags(*pgd) & _PAGE_NX;
728 	*rw &= pgd_flags(*pgd) & _PAGE_RW;
729 
730 	p4d = p4d_offset(pgd, address);
731 	if (p4d_none(*p4d))
732 		return NULL;
733 
734 	if (p4d_leaf(*p4d) || !p4d_present(*p4d))
735 		return (pte_t *)p4d;
736 
737 	*level = PG_LEVEL_1G;
738 	*nx |= p4d_flags(*p4d) & _PAGE_NX;
739 	*rw &= p4d_flags(*p4d) & _PAGE_RW;
740 
741 	pud = pud_offset(p4d, address);
742 	if (pud_none(*pud))
743 		return NULL;
744 
745 	if (pud_leaf(*pud) || !pud_present(*pud))
746 		return (pte_t *)pud;
747 
748 	*level = PG_LEVEL_2M;
749 	*nx |= pud_flags(*pud) & _PAGE_NX;
750 	*rw &= pud_flags(*pud) & _PAGE_RW;
751 
752 	pmd = pmd_offset(pud, address);
753 	if (pmd_none(*pmd))
754 		return NULL;
755 
756 	if (pmd_leaf(*pmd) || !pmd_present(*pmd))
757 		return (pte_t *)pmd;
758 
759 	*level = PG_LEVEL_4K;
760 	*nx |= pmd_flags(*pmd) & _PAGE_NX;
761 	*rw &= pmd_flags(*pmd) & _PAGE_RW;
762 
763 	return pte_offset_kernel(pmd, address);
764 }
765 
766 /*
767  * Lookup the page table entry for a virtual address in a specific pgd.
768  * Return a pointer to the entry and the level of the mapping.
769  */
770 pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
771 			     unsigned int *level)
772 {
773 	bool nx, rw;
774 
775 	return lookup_address_in_pgd_attr(pgd, address, level, &nx, &rw);
776 }
777 
778 /*
779  * Lookup the page table entry for a virtual address. Return a pointer
780  * to the entry and the level of the mapping.
781  *
782  * Note: the function returns p4d, pud or pmd either when the entry is marked
783  * large or when the present bit is not set. Otherwise it returns NULL.
784  */
785 pte_t *lookup_address(unsigned long address, unsigned int *level)
786 {
787 	return lookup_address_in_pgd(pgd_offset_k(address), address, level);
788 }
789 EXPORT_SYMBOL_GPL(lookup_address);
790 
791 static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
792 				  unsigned int *level, bool *nx, bool *rw)
793 {
794 	pgd_t *pgd;
795 
796 	if (!cpa->pgd)
797 		pgd = pgd_offset_k(address);
798 	else
799 		pgd = cpa->pgd + pgd_index(address);
800 
801 	return lookup_address_in_pgd_attr(pgd, address, level, nx, rw);
802 }
803 
804 /*
805  * Lookup the PMD entry for a virtual address. Return a pointer to the entry
806  * or NULL if not present.
807  */
808 pmd_t *lookup_pmd_address(unsigned long address)
809 {
810 	pgd_t *pgd;
811 	p4d_t *p4d;
812 	pud_t *pud;
813 
814 	pgd = pgd_offset_k(address);
815 	if (pgd_none(*pgd))
816 		return NULL;
817 
818 	p4d = p4d_offset(pgd, address);
819 	if (p4d_none(*p4d) || p4d_leaf(*p4d) || !p4d_present(*p4d))
820 		return NULL;
821 
822 	pud = pud_offset(p4d, address);
823 	if (pud_none(*pud) || pud_leaf(*pud) || !pud_present(*pud))
824 		return NULL;
825 
826 	return pmd_offset(pud, address);
827 }
828 
829 /*
830  * This is necessary because __pa() does not work on some
831  * kinds of memory, like vmalloc() or the alloc_remap()
832  * areas on 32-bit NUMA systems.  The percpu areas can
833  * end up in this kind of memory, for instance.
834  *
835  * Note that as long as the PTEs are well-formed with correct PFNs, this
836  * works without checking the PRESENT bit in the leaf PTE.  This is unlike
837  * the similar vmalloc_to_page() and derivatives.  Callers may depend on
838  * this behavior.
839  *
840  * This could be optimized, but it is only used in paths that are not perf
841  * sensitive, and keeping it unoptimized should increase the testing coverage
842  * for the more obscure platforms.
843  */
844 phys_addr_t slow_virt_to_phys(void *__virt_addr)
845 {
846 	unsigned long virt_addr = (unsigned long)__virt_addr;
847 	phys_addr_t phys_addr;
848 	unsigned long offset;
849 	enum pg_level level;
850 	pte_t *pte;
851 
852 	pte = lookup_address(virt_addr, &level);
853 	BUG_ON(!pte);
854 
855 	/*
856 	 * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t
857 	 * before being left-shifted PAGE_SHIFT bits -- this trick is to
858 	 * make 32-PAE kernel work correctly.
859 	 */
860 	switch (level) {
861 	case PG_LEVEL_1G:
862 		phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
863 		offset = virt_addr & ~PUD_MASK;
864 		break;
865 	case PG_LEVEL_2M:
866 		phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
867 		offset = virt_addr & ~PMD_MASK;
868 		break;
869 	default:
870 		phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
871 		offset = virt_addr & ~PAGE_MASK;
872 	}
873 
874 	return (phys_addr_t)(phys_addr | offset);
875 }
876 EXPORT_SYMBOL_GPL(slow_virt_to_phys);
877 
878 /*
879  * Set the new pmd in all the pgds we know about:
880  */
881 static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
882 {
883 	/* change init_mm */
884 	set_pte_atomic(kpte, pte);
885 #ifdef CONFIG_X86_32
886 	{
887 		struct page *page;
888 
889 		list_for_each_entry(page, &pgd_list, lru) {
890 			pgd_t *pgd;
891 			p4d_t *p4d;
892 			pud_t *pud;
893 			pmd_t *pmd;
894 
895 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
896 			p4d = p4d_offset(pgd, address);
897 			pud = pud_offset(p4d, address);
898 			pmd = pmd_offset(pud, address);
899 			set_pte_atomic((pte_t *)pmd, pte);
900 		}
901 	}
902 #endif
903 }
904 
905 static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
906 {
907 	/*
908 	 * _PAGE_GLOBAL means "global page" for present PTEs.
909 	 * But, it is also used to indicate _PAGE_PROTNONE
910 	 * for non-present PTEs.
911 	 *
912 	 * This ensures that a _PAGE_GLOBAL PTE going from
913 	 * present to non-present is not confused as
914 	 * _PAGE_PROTNONE.
915 	 */
916 	if (!(pgprot_val(prot) & _PAGE_PRESENT))
917 		pgprot_val(prot) &= ~_PAGE_GLOBAL;
918 
919 	return prot;
920 }
921 
922 static int __should_split_large_page(pte_t *kpte, unsigned long address,
923 				     struct cpa_data *cpa)
924 {
925 	unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
926 	pgprot_t old_prot, new_prot, req_prot, chk_prot;
927 	pte_t new_pte, *tmp;
928 	enum pg_level level;
929 	bool nx, rw;
930 
931 	/*
932 	 * Check for races, another CPU might have split this page
933 	 * up already:
934 	 */
935 	tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
936 	if (tmp != kpte)
937 		return 1;
938 
939 	switch (level) {
940 	case PG_LEVEL_2M:
941 		old_prot = pmd_pgprot(*(pmd_t *)kpte);
942 		old_pfn = pmd_pfn(*(pmd_t *)kpte);
943 		cpa_inc_2m_checked();
944 		break;
945 	case PG_LEVEL_1G:
946 		old_prot = pud_pgprot(*(pud_t *)kpte);
947 		old_pfn = pud_pfn(*(pud_t *)kpte);
948 		cpa_inc_1g_checked();
949 		break;
950 	default:
951 		return -EINVAL;
952 	}
953 
954 	psize = page_level_size(level);
955 	pmask = page_level_mask(level);
956 
957 	/*
958 	 * Calculate the number of pages which fit into this large
959 	 * page starting at address:
960 	 */
961 	lpaddr = (address + psize) & pmask;
962 	numpages = (lpaddr - address) >> PAGE_SHIFT;
963 	if (numpages < cpa->numpages)
964 		cpa->numpages = numpages;
965 
966 	/*
967 	 * We are safe now. Check whether the new pgprot is the same:
968 	 * Convert protection attributes to 4k-format, as cpa->mask* are set
969 	 * up accordingly.
970 	 */
971 
972 	/* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */
973 	req_prot = pgprot_large_2_4k(old_prot);
974 
975 	pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
976 	pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
977 
978 	/*
979 	 * req_prot is in format of 4k pages. It must be converted to large
980 	 * page format: the caching mode includes the PAT bit located at
981 	 * different bit positions in the two formats.
982 	 */
983 	req_prot = pgprot_4k_2_large(req_prot);
984 	req_prot = pgprot_clear_protnone_bits(req_prot);
985 	if (pgprot_val(req_prot) & _PAGE_PRESENT)
986 		pgprot_val(req_prot) |= _PAGE_PSE;
987 
988 	/*
989 	 * old_pfn points to the large page base pfn. So we need to add the
990 	 * offset of the virtual address:
991 	 */
992 	pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
993 	cpa->pfn = pfn;
994 
995 	/*
996 	 * Calculate the large page base address and the number of 4K pages
997 	 * in the large page
998 	 */
999 	lpaddr = address & pmask;
1000 	numpages = psize >> PAGE_SHIFT;
1001 
1002 	/*
1003 	 * Sanity check that the existing mapping is correct versus the static
1004 	 * protections. static_protections() guards against !PRESENT, so no
1005 	 * extra conditional required here.
1006 	 */
1007 	chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
1008 				      psize, CPA_CONFLICT);
1009 
1010 	if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) {
1011 		/*
1012 		 * Split the large page and tell the split code to
1013 		 * enforce static protections.
1014 		 */
1015 		cpa->force_static_prot = 1;
1016 		return 1;
1017 	}
1018 
1019 	/*
1020 	 * Optimization: If the requested pgprot is the same as the current
1021 	 * pgprot, then the large page can be preserved and no updates are
1022 	 * required independent of alignment and length of the requested
1023 	 * range. The above already established that the current pgprot is
1024 	 * correct, which in consequence makes the requested pgprot correct
1025 	 * as well if it is the same. The static protection scan below will
1026 	 * not come to a different conclusion.
1027 	 */
1028 	if (pgprot_val(req_prot) == pgprot_val(old_prot)) {
1029 		cpa_inc_lp_sameprot(level);
1030 		return 0;
1031 	}
1032 
1033 	/*
1034 	 * If the requested range does not cover the full page, split it up
1035 	 */
1036 	if (address != lpaddr || cpa->numpages != numpages)
1037 		return 1;
1038 
1039 	/*
1040 	 * Check whether the requested pgprot is conflicting with a static
1041 	 * protection requirement in the large page.
1042 	 */
1043 	new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
1044 				      psize, CPA_DETECT);
1045 
1046 	new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages,
1047 			      nx, rw);
1048 
1049 	/*
1050 	 * If there is a conflict, split the large page.
1051 	 *
1052 	 * There used to be a 4k-wise evaluation trying really hard to
1053 	 * preserve the large pages, but experimentation has shown that this
1054 	 * does not help at all. There might be corner cases which would
1055 	 * preserve one large page occasionally, but it's really not worth the
1056 	 * extra code and cycles for the common case.
1057 	 */
1058 	if (pgprot_val(req_prot) != pgprot_val(new_prot))
1059 		return 1;
1060 
1061 	/* All checks passed. Update the large page mapping. */
1062 	new_pte = pfn_pte(old_pfn, new_prot);
1063 	__set_pmd_pte(kpte, address, new_pte);
1064 	cpa->flags |= CPA_FLUSHTLB;
1065 	cpa_inc_lp_preserved(level);
1066 	return 0;
1067 }
1068 
1069 static int should_split_large_page(pte_t *kpte, unsigned long address,
1070 				   struct cpa_data *cpa)
1071 {
1072 	int do_split;
1073 
1074 	if (cpa->force_split)
1075 		return 1;
1076 
1077 	spin_lock(&pgd_lock);
1078 	do_split = __should_split_large_page(kpte, address, cpa);
1079 	spin_unlock(&pgd_lock);
1080 
1081 	return do_split;
1082 }
1083 
1084 static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn,
1085 			  pgprot_t ref_prot, unsigned long address,
1086 			  unsigned long size)
1087 {
1088 	unsigned int npg = PFN_DOWN(size);
1089 	pgprot_t prot;
1090 
1091 	/*
1092 	 * If should_split_large_page() discovered an inconsistent mapping,
1093 	 * remove the invalid protection in the split mapping.
1094 	 */
1095 	if (!cpa->force_static_prot)
1096 		goto set;
1097 
1098 	/* Hand in lpsize = 0 to enforce the protection mechanism */
1099 	prot = static_protections(ref_prot, address, pfn, npg, 0, CPA_PROTECT);
1100 
1101 	if (pgprot_val(prot) == pgprot_val(ref_prot))
1102 		goto set;
1103 
1104 	/*
1105 	 * If this is splitting a PMD, fix it up. PUD splits cannot be
1106 	 * fixed trivially as that would require rescanning the newly
1107 	 * installed PMD mappings after returning from split_large_page()
1108 	 * so an eventual further split can allocate the necessary PTE
1109 	 * pages. Warn for now and revisit it in case this actually
1110 	 * happens.
1111 	 */
1112 	if (size == PAGE_SIZE)
1113 		ref_prot = prot;
1114 	else
1115 		pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
1116 set:
1117 	set_pte(pte, pfn_pte(pfn, ref_prot));
1118 }
1119 
1120 static int
1121 __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
1122 		   struct page *base)
1123 {
1124 	unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1;
1125 	pte_t *pbase = (pte_t *)page_address(base);
1126 	unsigned int i, level;
1127 	pgprot_t ref_prot;
1128 	bool nx, rw;
1129 	pte_t *tmp;
1130 
1131 	spin_lock(&pgd_lock);
1132 	/*
1133 	 * Check for races, another CPU might have split this page
1134 	 * up for us already:
1135 	 */
1136 	tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
1137 	if (tmp != kpte) {
1138 		spin_unlock(&pgd_lock);
1139 		return 1;
1140 	}
1141 
1142 	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
1143 
1144 	switch (level) {
1145 	case PG_LEVEL_2M:
1146 		ref_prot = pmd_pgprot(*(pmd_t *)kpte);
1147 		/*
1148 		 * Clear PSE (aka _PAGE_PAT) and move
1149 		 * PAT bit to correct position.
1150 		 */
1151 		ref_prot = pgprot_large_2_4k(ref_prot);
1152 		ref_pfn = pmd_pfn(*(pmd_t *)kpte);
1153 		lpaddr = address & PMD_MASK;
1154 		lpinc = PAGE_SIZE;
1155 		break;
1156 
1157 	case PG_LEVEL_1G:
1158 		ref_prot = pud_pgprot(*(pud_t *)kpte);
1159 		ref_pfn = pud_pfn(*(pud_t *)kpte);
1160 		pfninc = PMD_SIZE >> PAGE_SHIFT;
1161 		lpaddr = address & PUD_MASK;
1162 		lpinc = PMD_SIZE;
1163 		/*
1164 		 * Clear the PSE flag if the PRESENT flag is not set,
1165 		 * otherwise pmd_present() will return true even on a
1166 		 * non-present pmd.
1167 		 */
1168 		if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
1169 			pgprot_val(ref_prot) &= ~_PAGE_PSE;
1170 		break;
1171 
1172 	default:
1173 		spin_unlock(&pgd_lock);
1174 		return 1;
1175 	}
1176 
1177 	ref_prot = pgprot_clear_protnone_bits(ref_prot);
1178 
1179 	/*
1180 	 * Get the target pfn from the original entry:
1181 	 */
1182 	pfn = ref_pfn;
1183 	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
1184 		split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);
1185 
1186 	if (virt_addr_valid(address)) {
1187 		unsigned long pfn = PFN_DOWN(__pa(address));
1188 
1189 		if (pfn_range_is_mapped(pfn, pfn + 1))
1190 			split_page_count(level);
1191 	}
1192 
1193 	/*
1194 	 * Install the new, split up pagetable.
1195 	 *
1196 	 * We use the standard kernel pagetable protections for the new
1197 	 * pagetable protections, the actual ptes set above control the
1198 	 * primary protection behavior:
1199 	 */
1200 	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
1201 
1202 	/*
1203 	 * Do a global flush tlb after splitting the large page
1204 	 * and before we do the actual change page attribute in the PTE.
1205 	 *
1206 	 * Without this, we violate the TLB application note, that says:
1207 	 * "The TLBs may contain both ordinary and large-page
1208 	 *  translations for a 4-KByte range of linear addresses. This
1209 	 *  may occur if software modifies the paging structures so that
1210 	 *  the page size used for the address range changes. If the two
1211 	 *  translations differ with respect to page frame or attributes
1212 	 *  (e.g., permissions), processor behavior is undefined and may
1213 	 *  be implementation-specific."
1214 	 *
1215 	 * We do this global TLB flush inside the cpa_lock, so that no
1216 	 * other CPU with stale TLB entries can change the page attributes
1217 	 * in parallel for an address that falls into the just-split
1218 	 * large page entry.
1219 	 */
1220 	flush_tlb_all();
1221 	spin_unlock(&pgd_lock);
1222 
1223 	return 0;
1224 }
1225 
1226 static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
1227 			    unsigned long address)
1228 {
1229 	struct page *base;
1230 
1231 	if (!debug_pagealloc_enabled())
1232 		spin_unlock(&cpa_lock);
1233 	base = alloc_pages(GFP_KERNEL, 0);
1234 	if (!debug_pagealloc_enabled())
1235 		spin_lock(&cpa_lock);
1236 	if (!base)
1237 		return -ENOMEM;
1238 
1239 	if (__split_large_page(cpa, kpte, address, base))
1240 		__free_page(base);
1241 
1242 	return 0;
1243 }
1244 
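/*
 * Replace a fully populated PTE page with a single 2M leaf entry, provided
 * all of its PTEs are present, physically contiguous, suitably aligned and
 * carry identical flags. Returns 1 on success and queues the old page table
 * on @pgtables so that the caller can free it after the TLB flush.
 */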
1245 static int collapse_pmd_page(pmd_t *pmd, unsigned long addr,
1246 			     struct list_head *pgtables)
1247 {
1248 	pmd_t _pmd, old_pmd;
1249 	pte_t *pte, first;
1250 	unsigned long pfn;
1251 	pgprot_t pgprot;
1252 	int i = 0;
1253 
1254 	if (!cpu_feature_enabled(X86_FEATURE_PSE))
1255 		return 0;
1256 
1257 	addr &= PMD_MASK;
1258 	pte = pte_offset_kernel(pmd, addr);
1259 	first = *pte;
1260 	pfn = pte_pfn(first);
1261 
1262 	/* Make sure alignment is suitable */
1263 	if (PFN_PHYS(pfn) & ~PMD_MASK)
1264 		return 0;
1265 
1266 	/* The page is 4k intentionally */
1267 	if (pte_flags(first) & _PAGE_KERNEL_4K)
1268 		return 0;
1269 
1270 	/* Check that the rest of PTEs are compatible with the first one */
1271 	for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) {
1272 		pte_t entry = *pte;
1273 
1274 		if (!pte_present(entry))
1275 			return 0;
1276 		if (pte_flags(entry) != pte_flags(first))
1277 			return 0;
1278 		if (pte_pfn(entry) != pte_pfn(first) + i)
1279 			return 0;
1280 	}
1281 
1282 	old_pmd = *pmd;
1283 
1284 	/* Success: set up a large page */
1285 	pgprot = pgprot_4k_2_large(pte_pgprot(first));
1286 	pgprot_val(pgprot) |= _PAGE_PSE;
1287 	_pmd = pfn_pmd(pfn, pgprot);
1288 	set_pmd(pmd, _pmd);
1289 
1290 	/* Queue the page table to be freed after TLB flush */
1291 	list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables);
1292 
1293 	if (IS_ENABLED(CONFIG_X86_32)) {
1294 		struct page *page;
1295 
1296 		/* Update all PGD tables to use the same large page */
1297 		list_for_each_entry(page, &pgd_list, lru) {
1298 			pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr);
1299 			p4d_t *p4d = p4d_offset(pgd, addr);
1300 			pud_t *pud = pud_offset(p4d, addr);
1301 			pmd_t *pmd = pmd_offset(pud, addr);
1302 			/* Something is wrong if the entries don't match */
1303 			if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd)))
1304 				continue;
1305 			set_pmd(pmd, _pmd);
1306 		}
1307 	}
1308 
1309 	if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
1310 		collapse_page_count(PG_LEVEL_2M);
1311 
1312 	return 1;
1313 }
1314 
1315 static int collapse_pud_page(pud_t *pud, unsigned long addr,
1316 			     struct list_head *pgtables)
1317 {
1318 	unsigned long pfn;
1319 	pmd_t *pmd, first;
1320 	int i;
1321 
1322 	if (!direct_gbpages)
1323 		return 0;
1324 
1325 	addr &= PUD_MASK;
1326 	pmd = pmd_offset(pud, addr);
1327 	first = *pmd;
1328 
1329 	/*
1330 	 * To restore the PUD page, all PMD entries must be large and
1331 	 * have suitable alignment.
1332 	 */
1333 	pfn = pmd_pfn(first);
1334 	if (!pmd_leaf(first) || (PFN_PHYS(pfn) & ~PUD_MASK))
1335 		return 0;
1336 
1337 	/*
1338 	 * To restore the PUD page, all following PMDs must be compatible with the
1339 	 * first one.
1340 	 */
1341 	for (i = 1, pmd++; i < PTRS_PER_PMD; i++, pmd++) {
1342 		pmd_t entry = *pmd;
1343 
1344 		if (!pmd_present(entry) || !pmd_leaf(entry))
1345 			return 0;
1346 		if (pmd_flags(entry) != pmd_flags(first))
1347 			return 0;
1348 		if (pmd_pfn(entry) != pmd_pfn(first) + i * PTRS_PER_PTE)
1349 			return 0;
1350 	}
1351 
1352 	/* Restore PUD page and queue page table to be freed after TLB flush */
1353 	list_add(&page_ptdesc(pud_page(*pud))->pt_list, pgtables);
1354 	set_pud(pud, pfn_pud(pfn, pmd_pgprot(first)));
1355 
1356 	if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
1357 		collapse_page_count(PG_LEVEL_1G);
1358 
1359 	return 1;
1360 }
1361 
1362 /*
1363  * Collapse PMD and PUD pages in the kernel mapping around the address where
1364  * possible.
1365  *
1366  * Caller must flush TLB and free page tables queued on the list before
1367  * touching the new entries. CPU must not see TLB entries of different size
1368  * with different attributes.
1369  */
1370 static int collapse_large_pages(unsigned long addr, struct list_head *pgtables)
1371 {
1372 	int collapsed = 0;
1373 	pgd_t *pgd;
1374 	p4d_t *p4d;
1375 	pud_t *pud;
1376 	pmd_t *pmd;
1377 
1378 	addr &= PMD_MASK;
1379 
1380 	spin_lock(&pgd_lock);
1381 	pgd = pgd_offset_k(addr);
1382 	if (pgd_none(*pgd))
1383 		goto out;
1384 	p4d = p4d_offset(pgd, addr);
1385 	if (p4d_none(*p4d))
1386 		goto out;
1387 	pud = pud_offset(p4d, addr);
1388 	if (!pud_present(*pud) || pud_leaf(*pud))
1389 		goto out;
1390 	pmd = pmd_offset(pud, addr);
1391 	if (!pmd_present(*pmd) || pmd_leaf(*pmd))
1392 		goto out;
1393 
1394 	collapsed = collapse_pmd_page(pmd, addr, pgtables);
1395 	if (collapsed)
1396 		collapsed += collapse_pud_page(pud, addr, pgtables);
1397 
1398 out:
1399 	spin_unlock(&pgd_lock);
1400 	return collapsed;
1401 }
1402 
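/*
 * The two helpers below free a PTE or PMD page-table page once every entry
 * in it has been cleared.
 */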
1403 static bool try_to_free_pte_page(pte_t *pte)
1404 {
1405 	int i;
1406 
1407 	for (i = 0; i < PTRS_PER_PTE; i++)
1408 		if (!pte_none(pte[i]))
1409 			return false;
1410 
1411 	free_page((unsigned long)pte);
1412 	return true;
1413 }
1414 
1415 static bool try_to_free_pmd_page(pmd_t *pmd)
1416 {
1417 	int i;
1418 
1419 	for (i = 0; i < PTRS_PER_PMD; i++)
1420 		if (!pmd_none(pmd[i]))
1421 			return false;
1422 
1423 	free_page((unsigned long)pmd);
1424 	return true;
1425 }
1426 
1427 static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
1428 {
1429 	pte_t *pte = pte_offset_kernel(pmd, start);
1430 
1431 	while (start < end) {
1432 		set_pte(pte, __pte(0));
1433 
1434 		start += PAGE_SIZE;
1435 		pte++;
1436 	}
1437 
1438 	if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
1439 		pmd_clear(pmd);
1440 		return true;
1441 	}
1442 	return false;
1443 }
1444 
1445 static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
1446 			      unsigned long start, unsigned long end)
1447 {
1448 	if (unmap_pte_range(pmd, start, end))
1449 		if (try_to_free_pmd_page(pud_pgtable(*pud)))
1450 			pud_clear(pud);
1451 }
1452 
1453 static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
1454 {
1455 	pmd_t *pmd = pmd_offset(pud, start);
1456 
1457 	/*
1458 	 * Not on a 2MB page boundary?
1459 	 */
1460 	if (start & (PMD_SIZE - 1)) {
1461 		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
1462 		unsigned long pre_end = min_t(unsigned long, end, next_page);
1463 
1464 		__unmap_pmd_range(pud, pmd, start, pre_end);
1465 
1466 		start = pre_end;
1467 		pmd++;
1468 	}
1469 
1470 	/*
1471 	 * Try to unmap in 2M chunks.
1472 	 */
1473 	while (end - start >= PMD_SIZE) {
1474 		if (pmd_leaf(*pmd))
1475 			pmd_clear(pmd);
1476 		else
1477 			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
1478 
1479 		start += PMD_SIZE;
1480 		pmd++;
1481 	}
1482 
1483 	/*
1484 	 * 4K leftovers?
1485 	 */
1486 	if (start < end)
1487 		return __unmap_pmd_range(pud, pmd, start, end);
1488 
1489 	/*
1490 	 * Try again to free the PMD page if haven't succeeded above.
1491 	 */
1492 	if (!pud_none(*pud))
1493 		if (try_to_free_pmd_page(pud_pgtable(*pud)))
1494 			pud_clear(pud);
1495 }
1496 
1497 static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
1498 {
1499 	pud_t *pud = pud_offset(p4d, start);
1500 
1501 	/*
1502 	 * Not on a GB page boundary?
1503 	 */
1504 	if (start & (PUD_SIZE - 1)) {
1505 		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
1506 		unsigned long pre_end	= min_t(unsigned long, end, next_page);
1507 
1508 		unmap_pmd_range(pud, start, pre_end);
1509 
1510 		start = pre_end;
1511 		pud++;
1512 	}
1513 
1514 	/*
1515 	 * Try to unmap in 1G chunks.
1516 	 */
1517 	while (end - start >= PUD_SIZE) {
1518 
1519 		if (pud_leaf(*pud))
1520 			pud_clear(pud);
1521 		else
1522 			unmap_pmd_range(pud, start, start + PUD_SIZE);
1523 
1524 		start += PUD_SIZE;
1525 		pud++;
1526 	}
1527 
1528 	/*
1529 	 * 2M leftovers?
1530 	 */
1531 	if (start < end)
1532 		unmap_pmd_range(pud, start, end);
1533 
1534 	/*
1535 	 * No need to try to free the PUD page because we'll free it in
1536 	 * populate_pgd's error path
1537 	 */
1538 }
1539 
1540 static int alloc_pte_page(pmd_t *pmd)
1541 {
1542 	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
1543 	if (!pte)
1544 		return -1;
1545 
1546 	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
1547 	return 0;
1548 }
1549 
1550 static int alloc_pmd_page(pud_t *pud)
1551 {
1552 	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
1553 	if (!pmd)
1554 		return -1;
1555 
1556 	set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
1557 	return 0;
1558 }
1559 
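/*
 * The populate_{pte,pmd,pud}() helpers below fill in a mapping for the
 * requested range starting at cpa->pfn, using 1G and 2M leaf entries where
 * alignment and CPU features allow and falling back to 4K PTEs for the
 * unaligned head and tail of the range.
 */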
1560 static void populate_pte(struct cpa_data *cpa,
1561 			 unsigned long start, unsigned long end,
1562 			 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
1563 {
1564 	pte_t *pte;
1565 
1566 	pte = pte_offset_kernel(pmd, start);
1567 
1568 	pgprot = pgprot_clear_protnone_bits(pgprot);
1569 
1570 	while (num_pages-- && start < end) {
1571 		set_pte(pte, pfn_pte(cpa->pfn, pgprot));
1572 
1573 		start	 += PAGE_SIZE;
1574 		cpa->pfn++;
1575 		pte++;
1576 	}
1577 }
1578 
1579 static long populate_pmd(struct cpa_data *cpa,
1580 			 unsigned long start, unsigned long end,
1581 			 unsigned num_pages, pud_t *pud, pgprot_t pgprot)
1582 {
1583 	long cur_pages = 0;
1584 	pmd_t *pmd;
1585 	pgprot_t pmd_pgprot;
1586 
1587 	/*
1588 	 * Not on a 2M boundary?
1589 	 */
1590 	if (start & (PMD_SIZE - 1)) {
1591 		unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
1592 		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
1593 
1594 		pre_end   = min_t(unsigned long, pre_end, next_page);
1595 		cur_pages = (pre_end - start) >> PAGE_SHIFT;
1596 		cur_pages = min_t(unsigned int, num_pages, cur_pages);
1597 
1598 		/*
1599 		 * Need a PTE page?
1600 		 */
1601 		pmd = pmd_offset(pud, start);
1602 		if (pmd_none(*pmd))
1603 			if (alloc_pte_page(pmd))
1604 				return -1;
1605 
1606 		populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
1607 
1608 		start = pre_end;
1609 	}
1610 
1611 	/*
1612 	 * We mapped them all?
1613 	 */
1614 	if (num_pages == cur_pages)
1615 		return cur_pages;
1616 
1617 	pmd_pgprot = pgprot_4k_2_large(pgprot);
1618 
1619 	while (end - start >= PMD_SIZE) {
1620 
1621 		/*
1622 		 * We cannot use a 1G page so allocate a PMD page if needed.
1623 		 */
1624 		if (pud_none(*pud))
1625 			if (alloc_pmd_page(pud))
1626 				return -1;
1627 
1628 		pmd = pmd_offset(pud, start);
1629 
1630 		set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
1631 					canon_pgprot(pmd_pgprot))));
1632 
1633 		start	  += PMD_SIZE;
1634 		cpa->pfn  += PMD_SIZE >> PAGE_SHIFT;
1635 		cur_pages += PMD_SIZE >> PAGE_SHIFT;
1636 	}
1637 
1638 	/*
1639 	 * Map trailing 4K pages.
1640 	 */
1641 	if (start < end) {
1642 		pmd = pmd_offset(pud, start);
1643 		if (pmd_none(*pmd))
1644 			if (alloc_pte_page(pmd))
1645 				return -1;
1646 
1647 		populate_pte(cpa, start, end, num_pages - cur_pages,
1648 			     pmd, pgprot);
1649 	}
1650 	return num_pages;
1651 }
1652 
1653 static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
1654 			pgprot_t pgprot)
1655 {
1656 	pud_t *pud;
1657 	unsigned long end;
1658 	long cur_pages = 0;
1659 	pgprot_t pud_pgprot;
1660 
1661 	end = start + (cpa->numpages << PAGE_SHIFT);
1662 
1663 	/*
1664 	 * Not on a Gb page boundary? => map everything up to it with
1665 	 * smaller pages.
1666 	 */
1667 	if (start & (PUD_SIZE - 1)) {
1668 		unsigned long pre_end;
1669 		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
1670 
1671 		pre_end   = min_t(unsigned long, end, next_page);
1672 		cur_pages = (pre_end - start) >> PAGE_SHIFT;
1673 		cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
1674 
1675 		pud = pud_offset(p4d, start);
1676 
1677 		/*
1678 		 * Need a PMD page?
1679 		 */
1680 		if (pud_none(*pud))
1681 			if (alloc_pmd_page(pud))
1682 				return -1;
1683 
1684 		cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
1685 					 pud, pgprot);
1686 		if (cur_pages < 0)
1687 			return cur_pages;
1688 
1689 		start = pre_end;
1690 	}
1691 
1692 	/* We mapped them all? */
1693 	if (cpa->numpages == cur_pages)
1694 		return cur_pages;
1695 
1696 	pud = pud_offset(p4d, start);
1697 	pud_pgprot = pgprot_4k_2_large(pgprot);
1698 
1699 	/*
1700 	 * Map everything starting from the Gb boundary, possibly with 1G pages
1701 	 */
1702 	while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
1703 		set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
1704 				   canon_pgprot(pud_pgprot))));
1705 
1706 		start	  += PUD_SIZE;
1707 		cpa->pfn  += PUD_SIZE >> PAGE_SHIFT;
1708 		cur_pages += PUD_SIZE >> PAGE_SHIFT;
1709 		pud++;
1710 	}
1711 
1712 	/* Map trailing leftover */
1713 	if (start < end) {
1714 		long tmp;
1715 
1716 		pud = pud_offset(p4d, start);
1717 		if (pud_none(*pud))
1718 			if (alloc_pmd_page(pud))
1719 				return -1;
1720 
1721 		tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
1722 				   pud, pgprot);
1723 		if (tmp < 0)
1724 			return cur_pages;
1725 
1726 		cur_pages += tmp;
1727 	}
1728 	return cur_pages;
1729 }
1730 
1731 /*
1732  * Restrictions for kernel page table do not necessarily apply when mapping in
1733  * an alternate PGD.
1734  */
1735 static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
1736 {
1737 	pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
1738 	pud_t *pud = NULL;	/* shut up gcc */
1739 	p4d_t *p4d;
1740 	pgd_t *pgd_entry;
1741 	long ret;
1742 
1743 	pgd_entry = cpa->pgd + pgd_index(addr);
1744 
1745 	if (pgd_none(*pgd_entry)) {
1746 		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
1747 		if (!p4d)
1748 			return -1;
1749 
1750 		set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE));
1751 	}
1752 
1753 	/*
1754 	 * Allocate a PUD page and hand it down for mapping.
1755 	 */
1756 	p4d = p4d_offset(pgd_entry, addr);
1757 	if (p4d_none(*p4d)) {
1758 		pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
1759 		if (!pud)
1760 			return -1;
1761 
1762 		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
1763 	}
1764 
1765 	pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
1766 	pgprot_val(pgprot) |=  pgprot_val(cpa->mask_set);
1767 
1768 	ret = populate_pud(cpa, addr, p4d, pgprot);
1769 	if (ret < 0) {
1770 		/*
1771 		 * Leave the PUD page in place in case some other CPU or thread
1772 		 * already found it, but remove any useless entries we just
1773 		 * added to it.
1774 		 */
1775 		unmap_pud_range(p4d, addr,
1776 				addr + (cpa->numpages << PAGE_SHIFT));
1777 		return ret;
1778 	}
1779 
1780 	cpa->numpages = ret;
1781 	return 0;
1782 }
1783 
1784 static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
1785 			       int primary)
1786 {
1787 	if (cpa->pgd) {
1788 		/*
1789 		 * Right now, we only execute this code path when mapping
1790 		 * the EFI virtual memory map regions, no other users
1791 		 * provide a ->pgd value. This may change in the future.
1792 		 */
1793 		return populate_pgd(cpa, vaddr);
1794 	}
1795 
1796 	/*
1797 	 * Ignore all non primary paths.
1798 	 */
1799 	if (!primary) {
1800 		cpa->numpages = 1;
1801 		return 0;
1802 	}
1803 
1804 	/*
1805 	 * Ignore the NULL PTE for kernel identity mapping, as it is expected
1806 	 * to have holes.
1807 	 * Also set numpages to '1', indicating that we processed the cpa request for
1808 	 * one virtual address page and its pfn. TBD: numpages can be set based
1809 	 * on the initial value and the level returned by lookup_address().
1810 	 */
1811 	if (within(vaddr, PAGE_OFFSET,
1812 		   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
1813 		cpa->numpages = 1;
1814 		cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
1815 		return 0;
1816 
1817 	} else if (__cpa_pfn_in_highmap(cpa->pfn)) {
1818 		/* Faults in the highmap are OK, so do not warn: */
1819 		return -EFAULT;
1820 	} else {
1821 		WARN(1, KERN_WARNING "CPA: called for zero pte. "
1822 			"vaddr = %lx cpa->vaddr = %lx\n", vaddr,
1823 			*cpa->vaddr);
1824 
1825 		return -EFAULT;
1826 	}
1827 }
1828 
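/*
 * Apply cpa->mask_set/mask_clr to the address selected by cpa->curpage,
 * preserving a large page when possible and splitting it otherwise. On
 * success cpa->numpages is set to the number of pages covered by this step.
 */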
1829 static int __change_page_attr(struct cpa_data *cpa, int primary)
1830 {
1831 	unsigned long address;
1832 	int do_split, err;
1833 	unsigned int level;
1834 	pte_t *kpte, old_pte;
1835 	bool nx, rw;
1836 
1837 	address = __cpa_addr(cpa, cpa->curpage);
1838 repeat:
1839 	kpte = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
1840 	if (!kpte)
1841 		return __cpa_process_fault(cpa, address, primary);
1842 
1843 	old_pte = *kpte;
1844 	if (pte_none(old_pte))
1845 		return __cpa_process_fault(cpa, address, primary);
1846 
1847 	if (level == PG_LEVEL_4K) {
1848 		pte_t new_pte;
1849 		pgprot_t old_prot = pte_pgprot(old_pte);
1850 		pgprot_t new_prot = pte_pgprot(old_pte);
1851 		unsigned long pfn = pte_pfn(old_pte);
1852 
1853 		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
1854 		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
1855 
1856 		cpa_inc_4k_install();
1857 		/* Hand in lpsize = 0 to enforce the protection mechanism */
1858 		new_prot = static_protections(new_prot, address, pfn, 1, 0,
1859 					      CPA_PROTECT);
1860 
1861 		new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1,
1862 				      nx, rw);
1863 
1864 		new_prot = pgprot_clear_protnone_bits(new_prot);
1865 
1866 		/*
1867 		 * We need to keep the pfn from the existing PTE,
1868 		 * after all we're only going to change its attributes
1869 		 * not the memory it points to
1870 		 */
1871 		new_pte = pfn_pte(pfn, new_prot);
1872 		cpa->pfn = pfn;
1873 		/*
1874 		 * Do we really change anything ?
1875 		 */
1876 		if (pte_val(old_pte) != pte_val(new_pte)) {
1877 			set_pte_atomic(kpte, new_pte);
1878 			cpa->flags |= CPA_FLUSHTLB;
1879 		}
1880 		cpa->numpages = 1;
1881 		return 0;
1882 	}
1883 
1884 	/*
1885 	 * Check whether we can keep the large page intact
1886 	 * and just change the pte:
1887 	 */
1888 	do_split = should_split_large_page(kpte, address, cpa);
1889 	/*
1890 	 * When the range fits into the existing large page,
1891 	 * return. cpa->numpages and the CPA_FLUSHTLB flag have been
1892 	 * updated in __should_split_large_page():
1893 	 */
1894 	if (do_split <= 0)
1895 		return do_split;
1896 
1897 	/*
1898 	 * We have to split the large page:
1899 	 */
1900 	err = split_large_page(cpa, kpte, address);
1901 	if (!err)
1902 		goto repeat;
1903 
1904 	return err;
1905 }
1906 
1907 static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary);
1908 
1909 /*
1910  * Check the directmap and "high kernel map" 'aliases'.
1911  */
1912 static int cpa_process_alias(struct cpa_data *cpa)
1913 {
1914 	struct cpa_data alias_cpa;
1915 	unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
1916 	unsigned long vaddr;
1917 	int ret;
1918 
1919 	if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
1920 		return 0;
1921 
1922 	/*
1923 	 * No need to redo this when the primary call already touched the
1924 	 * direct mapping:
1925 	 */
1926 	vaddr = __cpa_addr(cpa, cpa->curpage);
1927 	if (!(within(vaddr, PAGE_OFFSET,
1928 		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
1929 
1930 		alias_cpa = *cpa;
1931 		alias_cpa.vaddr = &laddr;
1932 		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1933 		alias_cpa.curpage = 0;
1934 
1935 		/* Directmap always has NX set, do not modify. */
1936 		if (__supported_pte_mask & _PAGE_NX) {
1937 			alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
1938 			alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
1939 		}
1940 
1941 		cpa->force_flush_all = 1;
1942 
1943 		ret = __change_page_attr_set_clr(&alias_cpa, 0);
1944 		if (ret)
1945 			return ret;
1946 	}
1947 
1948 #ifdef CONFIG_X86_64
1949 	/*
1950 	 * If the primary call didn't touch the high mapping already
1951 	 * and the physical address is inside the kernel map, we need
1952 	 * to touch the high mapped kernel as well:
1953 	 */
1954 	if (!within(vaddr, (unsigned long)_text, _brk_end) &&
1955 	    __cpa_pfn_in_highmap(cpa->pfn)) {
1956 		unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
1957 					       __START_KERNEL_map - phys_base;
1958 		alias_cpa = *cpa;
1959 		alias_cpa.vaddr = &temp_cpa_vaddr;
1960 		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1961 		alias_cpa.curpage = 0;
1962 
1963 		/*
1964 		 * [_text, _brk_end) also covers data, do not modify NX except
1965 		 * in cases where the highmap is the primary target.
1966 		 */
1967 		if (__supported_pte_mask & _PAGE_NX) {
1968 			alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
1969 			alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
1970 		}
1971 
1972 		cpa->force_flush_all = 1;
1973 		/*
1974 		 * The high mapping range is imprecise, so ignore the
1975 		 * return value.
1976 		 */
1977 		__change_page_attr_set_clr(&alias_cpa, 0);
1978 	}
1979 #endif
1980 
1981 	return 0;
1982 }
1983 
1984 static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary)
1985 {
1986 	unsigned long numpages = cpa->numpages;
1987 	unsigned long rempages = numpages;
1988 	int ret = 0;
1989 
1990 	/*
1991 	 * No changes, easy!
1992 	 */
1993 	if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) &&
1994 	    !cpa->force_split)
1995 		return ret;
1996 
1997 	while (rempages) {
1998 		/*
1999 		 * Store the remaining nr of pages for the large page
2000 		 * preservation check.
2001 		 */
2002 		cpa->numpages = rempages;
2003 		/* For array changes, we can't use large pages */
2004 		if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
2005 			cpa->numpages = 1;
2006 
2007 		if (!debug_pagealloc_enabled())
2008 			spin_lock(&cpa_lock);
2009 		ret = __change_page_attr(cpa, primary);
2010 		if (!debug_pagealloc_enabled())
2011 			spin_unlock(&cpa_lock);
2012 		if (ret)
2013 			goto out;
2014 
2015 		if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) {
2016 			ret = cpa_process_alias(cpa);
2017 			if (ret)
2018 				goto out;
2019 		}
2020 
2021 		/*
2022 		 * Adjust the number of pages with the result of the
2023 		 * CPA operation. Either a large page has been
2024 		 * preserved or a single page update happened.
2025 		 */
2026 		BUG_ON(cpa->numpages > rempages || !cpa->numpages);
2027 		rempages -= cpa->numpages;
2028 		cpa->curpage += cpa->numpages;
2029 	}
2030 
2031 out:
2032 	/* Restore the original numpages */
2033 	cpa->numpages = numpages;
2034 	return ret;
2035 }
2036 
2037 static int change_page_attr_set_clr(unsigned long *addr, int numpages,
2038 				    pgprot_t mask_set, pgprot_t mask_clr,
2039 				    int force_split, int in_flag,
2040 				    struct page **pages)
2041 {
2042 	struct cpa_data cpa;
2043 	int ret, cache;
2044 
2045 	memset(&cpa, 0, sizeof(cpa));
2046 
2047 	/*
2048 	 * Check if we are requested to set an unsupported feature.
2049 	 * Clearing unsupported features is OK.
2050 	 */
2051 	mask_set = canon_pgprot(mask_set);
2052 
2053 	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
2054 		return 0;
2055 
2056 	/* Ensure we are PAGE_SIZE aligned */
2057 	if (in_flag & CPA_ARRAY) {
2058 		int i;
2059 		for (i = 0; i < numpages; i++) {
2060 			if (addr[i] & ~PAGE_MASK) {
2061 				addr[i] &= PAGE_MASK;
2062 				WARN_ON_ONCE(1);
2063 			}
2064 		}
2065 	} else if (!(in_flag & CPA_PAGES_ARRAY)) {
2066 		/*
2067 		 * An in_flag of CPA_PAGES_ARRAY implies the addresses are
2068 		 * page aligned. No need to check in that case.
2069 		 */
2070 		if (*addr & ~PAGE_MASK) {
2071 			*addr &= PAGE_MASK;
2072 			/*
2073 			 * People should not be passing in unaligned addresses:
2074 			 */
2075 			WARN_ON_ONCE(1);
2076 		}
2077 	}
2078 
2079 	/* Must avoid aliasing mappings in the highmem code */
2080 	kmap_flush_unused();
2081 
2082 	vm_unmap_aliases();
2083 
2084 	cpa.vaddr = addr;
2085 	cpa.pages = pages;
2086 	cpa.numpages = numpages;
2087 	cpa.mask_set = mask_set;
2088 	cpa.mask_clr = mask_clr;
2089 	cpa.flags = in_flag;
2090 	cpa.curpage = 0;
2091 	cpa.force_split = force_split;
2092 
2093 	ret = __change_page_attr_set_clr(&cpa, 1);
2094 
2095 	/*
2096 	 * Check whether we really changed something:
2097 	 */
2098 	if (!(cpa.flags & CPA_FLUSHTLB))
2099 		goto out;
2100 
2101 	/*
2102 	 * No need to flush when we did not set any of the caching
2103 	 * attributes:
2104 	 */
2105 	cache = !!pgprot2cachemode(mask_set);
2106 
2107 	/*
2108 	 * On error, flush everything to be sure.
2109 	 */
2110 	if (ret) {
2111 		cpa_flush_all(cache);
2112 		goto out;
2113 	}
2114 
2115 	cpa_flush(&cpa, cache);
2116 out:
2117 	return ret;
2118 }
2119 
2120 static inline int change_page_attr_set(unsigned long *addr, int numpages,
2121 				       pgprot_t mask, int array)
2122 {
2123 	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
2124 		(array ? CPA_ARRAY : 0), NULL);
2125 }
2126 
2127 static inline int change_page_attr_clear(unsigned long *addr, int numpages,
2128 					 pgprot_t mask, int array)
2129 {
2130 	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
2131 		(array ? CPA_ARRAY : 0), NULL);
2132 }
2133 
2134 static inline int cpa_set_pages_array(struct page **pages, int numpages,
2135 				       pgprot_t mask)
2136 {
2137 	return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
2138 		CPA_PAGES_ARRAY, pages);
2139 }
2140 
2141 static inline int cpa_clear_pages_array(struct page **pages, int numpages,
2142 					 pgprot_t mask)
2143 {
2144 	return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
2145 		CPA_PAGES_ARRAY, pages);
2146 }
2147 
2148 /*
2149  * __set_memory_prot() is an internal helper for callers that have been passed
2150  * a pgprot_t value from upper layers and for which a memtype reservation has
2151  * already been taken. If you want to set a specific page protection, use the
2152  * set_memory_xx() functions instead.
2153  */
2154 int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
2155 {
2156 	return change_page_attr_set_clr(&addr, numpages, prot,
2157 					__pgprot(~pgprot_val(prot)), 0, 0,
2158 					NULL);
2159 }
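
/*
 * Illustrative sketch, not an in-tree caller: a driver that was handed a
 * pgprot_t from an upper layer and has already taken the memtype reservation
 * could apply it like this ('vaddr', 'nrpages' and 'prot' are hypothetical):
 *
 *	err = __set_memory_prot(vaddr, nrpages, prot);
 *	if (err)
 *		return err;
 */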
2160 
2161 int _set_memory_uc(unsigned long addr, int numpages)
2162 {
2163 	/*
2164 	 * For now this is UC- (UC MINUS); see the comments in ioremap().
2165 	 * If you really need strong UC, use ioremap_uc(), but note
2166 	 * that you cannot override IO areas with set_memory_*() as
2167 	 * these helpers cannot work with IO memory.
2168 	 */
2169 	return change_page_attr_set(&addr, numpages,
2170 				    cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
2171 				    0);
2172 }
2173 
2174 int set_memory_uc(unsigned long addr, int numpages)
2175 {
2176 	int ret;
2177 
2178 	/*
2179 	 * For now this is UC- (UC MINUS); see the comments in ioremap().
2180 	 */
2181 	ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
2182 			      _PAGE_CACHE_MODE_UC_MINUS, NULL);
2183 	if (ret)
2184 		goto out_err;
2185 
2186 	ret = _set_memory_uc(addr, numpages);
2187 	if (ret)
2188 		goto out_free;
2189 
2190 	return 0;
2191 
2192 out_free:
2193 	memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
2194 out_err:
2195 	return ret;
2196 }
2197 EXPORT_SYMBOL(set_memory_uc);
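
/*
 * Typical usage sketch (illustrative only): map a buffer uncached for a
 * device and restore it to write-back before freeing it.  The calls must be
 * balanced so that the memtype reservation taken by set_memory_uc() is
 * released again by set_memory_wb().  'vaddr' and 'nrpages' are hypothetical.
 *
 *	if (set_memory_uc(vaddr, nrpages))
 *		goto fail;
 *	...
 *	set_memory_wb(vaddr, nrpages);
 */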
2198 
2199 int _set_memory_wc(unsigned long addr, int numpages)
2200 {
2201 	int ret;
2202 
2203 	ret = change_page_attr_set(&addr, numpages,
2204 				   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
2205 				   0);
2206 	if (!ret) {
2207 		ret = change_page_attr_set_clr(&addr, numpages,
2208 					       cachemode2pgprot(_PAGE_CACHE_MODE_WC),
2209 					       __pgprot(_PAGE_CACHE_MASK),
2210 					       0, 0, NULL);
2211 	}
2212 	return ret;
2213 }
2214 
2215 int set_memory_wc(unsigned long addr, int numpages)
2216 {
2217 	int ret;
2218 
2219 	ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
2220 		_PAGE_CACHE_MODE_WC, NULL);
2221 	if (ret)
2222 		return ret;
2223 
2224 	ret = _set_memory_wc(addr, numpages);
2225 	if (ret)
2226 		memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
2227 
2228 	return ret;
2229 }
2230 EXPORT_SYMBOL(set_memory_wc);
2231 
2232 int _set_memory_wt(unsigned long addr, int numpages)
2233 {
2234 	return change_page_attr_set(&addr, numpages,
2235 				    cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
2236 }
2237 
2238 int _set_memory_wb(unsigned long addr, int numpages)
2239 {
2240 	/* WB cache mode is hard wired to all cache attribute bits being 0 */
2241 	return change_page_attr_clear(&addr, numpages,
2242 				      __pgprot(_PAGE_CACHE_MASK), 0);
2243 }
2244 
2245 int set_memory_wb(unsigned long addr, int numpages)
2246 {
2247 	int ret;
2248 
2249 	ret = _set_memory_wb(addr, numpages);
2250 	if (ret)
2251 		return ret;
2252 
2253 	memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
2254 	return 0;
2255 }
2256 EXPORT_SYMBOL(set_memory_wb);
2257 
2258 /* Prevent speculative access to a page by marking it not-present */
2259 #ifdef CONFIG_X86_64
2260 int set_mce_nospec(unsigned long pfn)
2261 {
2262 	unsigned long decoy_addr;
2263 	int rc;
2264 
2265 	/* SGX pages are not in the 1:1 map */
2266 	if (arch_is_platform_page(pfn << PAGE_SHIFT))
2267 		return 0;
2268 	/*
2269 	 * We would like to just call:
2270 	 *      set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
2271 	 * but doing that would radically increase the odds of a
2272 	 * speculative access to the poison page because we'd have
2273 	 * the virtual address of the kernel 1:1 mapping sitting
2274 	 * around in registers.
2275 	 * Instead we get tricky.  We create a non-canonical address
2276 	 * that looks just like the one we want, but has bit 63 flipped.
2277 	 * This relies on set_memory_XX() properly sanitizing any __pa()
2278 	 * results with __PHYSICAL_MASK or PTE_PFN_MASK.
2279 	 */
2280 	decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
2281 
2282 	rc = set_memory_np(decoy_addr, 1);
2283 	if (rc)
2284 		pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
2285 	return rc;
2286 }
2287 EXPORT_SYMBOL_GPL(set_mce_nospec);
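
/*
 * Worked example of the decoy address, assuming the default 4-level
 * PAGE_OFFSET of 0xffff888000000000 and no KASLR.  For pfn 0x1234:
 *
 *	pfn << PAGE_SHIFT     = 0x0000000001234000
 *	PAGE_OFFSET ^ BIT(63) = 0x7fff888000000000
 *	decoy_addr            = 0x7fff888001234000
 *
 * The result is non-canonical, so it is useless for speculation, yet it still
 * resolves to the right pfn once __pa() masks it with __PHYSICAL_MASK.
 */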
2288 
2289 /* Restore full speculative operation to the pfn. */
2290 int clear_mce_nospec(unsigned long pfn)
2291 {
2292 	unsigned long addr = (unsigned long) pfn_to_kaddr(pfn);
2293 
2294 	return set_memory_p(addr, 1);
2295 }
2296 EXPORT_SYMBOL_GPL(clear_mce_nospec);
2297 #endif /* CONFIG_X86_64 */
2298 
2299 int set_memory_x(unsigned long addr, int numpages)
2300 {
2301 	if (!(__supported_pte_mask & _PAGE_NX))
2302 		return 0;
2303 
2304 	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
2305 }
2306 
2307 int set_memory_nx(unsigned long addr, int numpages)
2308 {
2309 	if (!(__supported_pte_mask & _PAGE_NX))
2310 		return 0;
2311 
2312 	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
2313 }
2314 
2315 int set_memory_ro(unsigned long addr, int numpages)
2316 {
2317 	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW | _PAGE_DIRTY), 0);
2318 }
2319 
2320 int set_memory_rox(unsigned long addr, int numpages)
2321 {
2322 	pgprot_t clr = __pgprot(_PAGE_RW | _PAGE_DIRTY);
2323 
2324 	if (__supported_pte_mask & _PAGE_NX)
2325 		clr.pgprot |= _PAGE_NX;
2326 
2327 	return change_page_attr_set_clr(&addr, numpages, __pgprot(0), clr, 0,
2328 					CPA_COLLAPSE, NULL);
2329 }
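
/*
 * Usage sketch (illustrative, assuming a writable mapping such as one
 * returned by vmalloc()): after generated instructions have been copied into
 * 'trampoline', seal the pages read-only and executable in one call.
 * 'trampoline', 'insns', 'len' and 'nrpages' are hypothetical.
 *
 *	memcpy(trampoline, insns, len);
 *	set_memory_rox((unsigned long)trampoline, nrpages);
 */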
2330 
2331 int set_memory_rw(unsigned long addr, int numpages)
2332 {
2333 	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
2334 }
2335 
2336 int set_memory_np(unsigned long addr, int numpages)
2337 {
2338 	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
2339 }
2340 
2341 int set_memory_np_noalias(unsigned long addr, int numpages)
2342 {
2343 	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
2344 					__pgprot(_PAGE_PRESENT), 0,
2345 					CPA_NO_CHECK_ALIAS, NULL);
2346 }
2347 
2348 int set_memory_p(unsigned long addr, int numpages)
2349 {
2350 	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
2351 }
2352 
2353 int set_memory_4k(unsigned long addr, int numpages)
2354 {
2355 	return change_page_attr_set_clr(&addr, numpages,
2356 					__pgprot(_PAGE_KERNEL_4K),
2357 					__pgprot(0), 1, 0, NULL);
2358 }
2359 
2360 int set_memory_nonglobal(unsigned long addr, int numpages)
2361 {
2362 	return change_page_attr_clear(&addr, numpages,
2363 				      __pgprot(_PAGE_GLOBAL), 0);
2364 }
2365 
2366 int set_memory_global(unsigned long addr, int numpages)
2367 {
2368 	return change_page_attr_set(&addr, numpages,
2369 				    __pgprot(_PAGE_GLOBAL), 0);
2370 }
2371 
2372 /*
2373  * __set_memory_enc_pgtable() is used for hypervisors that are informed
2374  * about the "encryption" status via page tables.
2375  */
2376 static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
2377 {
2378 	pgprot_t empty = __pgprot(0);
2379 	struct cpa_data cpa;
2380 	int ret;
2381 
2382 	/* Should not be working on unaligned addresses */
2383 	if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
2384 		addr &= PAGE_MASK;
2385 
2386 	memset(&cpa, 0, sizeof(cpa));
2387 	cpa.vaddr = &addr;
2388 	cpa.numpages = numpages;
2389 	cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty);
2390 	cpa.mask_clr = enc ? pgprot_decrypted(empty) : pgprot_encrypted(empty);
2391 	cpa.pgd = init_mm.pgd;
2392 
2393 	/* Must avoid aliasing mappings in the highmem code */
2394 	kmap_flush_unused();
2395 	vm_unmap_aliases();
2396 
2397 	/* Flush the caches as needed before changing the encryption attribute. */
2398 	if (x86_platform.guest.enc_tlb_flush_required(enc))
2399 		cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required());
2400 
2401 	/* Notify hypervisor that we are about to set/clr encryption attribute. */
2402 	ret = x86_platform.guest.enc_status_change_prepare(addr, numpages, enc);
2403 	if (ret)
2404 		goto vmm_fail;
2405 
2406 	ret = __change_page_attr_set_clr(&cpa, 1);
2407 
2408 	/*
2409 	 * After changing the encryption attribute, we need to flush TLBs again
2410 	 * in case any speculative TLB caching occurred (but no need to flush
2411 	 * caches again).  We could just use cpa_flush_all(), but in case TLB
2412 	 * flushing gets optimized in the cpa_flush() path, use the same logic
2413 	 * as above.
2414 	 */
2415 	cpa_flush(&cpa, 0);
2416 
2417 	if (ret)
2418 		return ret;
2419 
2420 	/* Notify hypervisor that we have successfully set/clr encryption attribute. */
2421 	ret = x86_platform.guest.enc_status_change_finish(addr, numpages, enc);
2422 	if (ret)
2423 		goto vmm_fail;
2424 
2425 	return 0;
2426 
2427 vmm_fail:
2428 	WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s: %d\n",
2429 		  (void *)addr, numpages, enc ? "private" : "shared", ret);
2430 
2431 	return ret;
2432 }
2433 
2434 /*
2435  * The lock serializes conversions between private and shared memory.
2436  *
2437  * It is taken for read on conversion. A write lock guarantees that no
2438  * concurrent conversions are in progress.
2439  */
2440 static DECLARE_RWSEM(mem_enc_lock);
2441 
2442 /*
2443  * Stop new private<->shared conversions.
2444  *
2445  * Taking the exclusive mem_enc_lock waits for in-flight conversions to complete.
2446  * The lock is not released to prevent new conversions from being started.
2447  */
2448 bool set_memory_enc_stop_conversion(void)
2449 {
2450 	/*
2451 	 * In a crash scenario, sleep is not allowed. Try to take the lock.
2452 	 * Failure indicates that there is a race with the conversion.
2453 	 */
2454 	if (oops_in_progress)
2455 		return down_write_trylock(&mem_enc_lock);
2456 
2457 	down_write(&mem_enc_lock);
2458 
2459 	return true;
2460 }
2461 
2462 static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
2463 {
2464 	int ret = 0;
2465 
2466 	if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
2467 		if (!down_read_trylock(&mem_enc_lock))
2468 			return -EBUSY;
2469 
2470 		ret = __set_memory_enc_pgtable(addr, numpages, enc);
2471 
2472 		up_read(&mem_enc_lock);
2473 	}
2474 
2475 	return ret;
2476 }
2477 
2478 int set_memory_encrypted(unsigned long addr, int numpages)
2479 {
2480 	return __set_memory_enc_dec(addr, numpages, true);
2481 }
2482 EXPORT_SYMBOL_GPL(set_memory_encrypted);
2483 
2484 int set_memory_decrypted(unsigned long addr, int numpages)
2485 {
2486 	return __set_memory_enc_dec(addr, numpages, false);
2487 }
2488 EXPORT_SYMBOL_GPL(set_memory_decrypted);
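
/*
 * Usage sketch (illustrative only): a confidential-computing guest driver
 * that shares a buffer with the hypervisor marks it decrypted (shared) first
 * and converts it back to encrypted (private) before reusing the memory.
 * 'vaddr' and 'nrpages' are hypothetical.
 *
 *	if (set_memory_decrypted(vaddr, nrpages))
 *		goto fail;
 *	...
 *	WARN_ON_ONCE(set_memory_encrypted(vaddr, nrpages));
 */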
2489 
2490 int set_pages_uc(struct page *page, int numpages)
2491 {
2492 	unsigned long addr = (unsigned long)page_address(page);
2493 
2494 	return set_memory_uc(addr, numpages);
2495 }
2496 EXPORT_SYMBOL(set_pages_uc);
2497 
2498 static int _set_pages_array(struct page **pages, int numpages,
2499 		enum page_cache_mode new_type)
2500 {
2501 	unsigned long start;
2502 	unsigned long end;
2503 	enum page_cache_mode set_type;
2504 	int i;
2505 	int free_idx;
2506 	int ret;
2507 
2508 	for (i = 0; i < numpages; i++) {
2509 		if (PageHighMem(pages[i]))
2510 			continue;
2511 		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
2512 		end = start + PAGE_SIZE;
2513 		if (memtype_reserve(start, end, new_type, NULL))
2514 			goto err_out;
2515 	}
2516 
2517 	/* If WC, set to UC- first and then WC */
2518 	set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
2519 				_PAGE_CACHE_MODE_UC_MINUS : new_type;
2520 
2521 	ret = cpa_set_pages_array(pages, numpages,
2522 				  cachemode2pgprot(set_type));
2523 	if (!ret && new_type == _PAGE_CACHE_MODE_WC)
2524 		ret = change_page_attr_set_clr(NULL, numpages,
2525 					       cachemode2pgprot(
2526 						_PAGE_CACHE_MODE_WC),
2527 					       __pgprot(_PAGE_CACHE_MASK),
2528 					       0, CPA_PAGES_ARRAY, pages);
2529 	if (ret)
2530 		goto err_out;
2531 	return 0; /* Success */
2532 err_out:
2533 	free_idx = i;
2534 	for (i = 0; i < free_idx; i++) {
2535 		if (PageHighMem(pages[i]))
2536 			continue;
2537 		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
2538 		end = start + PAGE_SIZE;
2539 		memtype_free(start, end);
2540 	}
2541 	return -EINVAL;
2542 }
2543 
2544 int set_pages_array_uc(struct page **pages, int numpages)
2545 {
2546 	return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS);
2547 }
2548 EXPORT_SYMBOL(set_pages_array_uc);
2549 
2550 int set_pages_array_wc(struct page **pages, int numpages)
2551 {
2552 	return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC);
2553 }
2554 EXPORT_SYMBOL(set_pages_array_wc);
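
/*
 * Usage sketch (illustrative only): switch an array of already-allocated
 * pages to write-combining, e.g. for a software-rendered framebuffer, and
 * back to write-back when done.  'pages' and 'npages' are hypothetical.
 *
 *	if (set_pages_array_wc(pages, npages))
 *		goto fail;
 *	...
 *	set_pages_array_wb(pages, npages);
 */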
2555 
2556 int set_pages_wb(struct page *page, int numpages)
2557 {
2558 	unsigned long addr = (unsigned long)page_address(page);
2559 
2560 	return set_memory_wb(addr, numpages);
2561 }
2562 EXPORT_SYMBOL(set_pages_wb);
2563 
2564 int set_pages_array_wb(struct page **pages, int numpages)
2565 {
2566 	int retval;
2567 	unsigned long start;
2568 	unsigned long end;
2569 	int i;
2570 
2571 	/* WB cache mode is hard wired to all cache attribute bits being 0 */
2572 	retval = cpa_clear_pages_array(pages, numpages,
2573 			__pgprot(_PAGE_CACHE_MASK));
2574 	if (retval)
2575 		return retval;
2576 
2577 	for (i = 0; i < numpages; i++) {
2578 		if (PageHighMem(pages[i]))
2579 			continue;
2580 		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
2581 		end = start + PAGE_SIZE;
2582 		memtype_free(start, end);
2583 	}
2584 
2585 	return 0;
2586 }
2587 EXPORT_SYMBOL(set_pages_array_wb);
2588 
2589 int set_pages_ro(struct page *page, int numpages)
2590 {
2591 	unsigned long addr = (unsigned long)page_address(page);
2592 
2593 	return set_memory_ro(addr, numpages);
2594 }
2595 
2596 int set_pages_rw(struct page *page, int numpages)
2597 {
2598 	unsigned long addr = (unsigned long)page_address(page);
2599 
2600 	return set_memory_rw(addr, numpages);
2601 }
2602 
2603 static int __set_pages_p(struct page *page, int numpages)
2604 {
2605 	unsigned long tempaddr = (unsigned long) page_address(page);
2606 	struct cpa_data cpa = { .vaddr = &tempaddr,
2607 				.pgd = NULL,
2608 				.numpages = numpages,
2609 				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
2610 				.mask_clr = __pgprot(0),
2611 				.flags = CPA_NO_CHECK_ALIAS };
2612 
2613 	/*
2614 	 * No alias checking needed for setting the present flag. Otherwise,
2615 	 * we may need to break large pages for 64-bit kernel text
2616 	 * mappings (this adds complexity, especially if we want to do this
2617 	 * from atomic context). Let's keep it simple!
2618 	 */
2619 	return __change_page_attr_set_clr(&cpa, 1);
2620 }
2621 
2622 static int __set_pages_np(struct page *page, int numpages)
2623 {
2624 	unsigned long tempaddr = (unsigned long) page_address(page);
2625 	struct cpa_data cpa = { .vaddr = &tempaddr,
2626 				.pgd = NULL,
2627 				.numpages = numpages,
2628 				.mask_set = __pgprot(0),
2629 				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY),
2630 				.flags = CPA_NO_CHECK_ALIAS };
2631 
2632 	/*
2633 	 * No alias checking needed for clearing the present flag. Otherwise,
2634 	 * we may need to break large pages for 64-bit kernel text
2635 	 * mappings (this adds complexity, especially if we want to do this
2636 	 * from atomic context). Let's keep it simple!
2637 	 */
2638 	return __change_page_attr_set_clr(&cpa, 1);
2639 }
2640 
2641 int set_direct_map_invalid_noflush(struct page *page)
2642 {
2643 	return __set_pages_np(page, 1);
2644 }
2645 
2646 int set_direct_map_default_noflush(struct page *page)
2647 {
2648 	return __set_pages_p(page, 1);
2649 }
2650 
2651 int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
2652 {
2653 	if (valid)
2654 		return __set_pages_p(page, nr);
2655 
2656 	return __set_pages_np(page, nr);
2657 }
2658 
2659 #ifdef CONFIG_DEBUG_PAGEALLOC
2660 void __kernel_map_pages(struct page *page, int numpages, int enable)
2661 {
2662 	if (PageHighMem(page))
2663 		return;
2664 	if (!enable) {
2665 		debug_check_no_locks_freed(page_address(page),
2666 					   numpages * PAGE_SIZE);
2667 	}
2668 
2669 	/*
2670 	 * The return value is ignored as the calls cannot fail.
2671 	 * Large pages for identity mappings are not used at boot time,
2672 	 * so no memory allocations are needed to split a large page.
2673 	 */
2674 	if (enable)
2675 		__set_pages_p(page, numpages);
2676 	else
2677 		__set_pages_np(page, numpages);
2678 
2679 	/*
2680 	 * We should send an IPI and flush all TLBs, but that can
2681 	 * deadlock here, so flush only the current CPU.
2682 	 * Preemption needs to be disabled around __flush_tlb_all() due to
2683 	 * the CR3 reload in __native_flush_tlb().
2684 	 */
2685 	preempt_disable();
2686 	__flush_tlb_all();
2687 	preempt_enable();
2688 
2689 	arch_flush_lazy_mmu_mode();
2690 }
2691 #endif /* CONFIG_DEBUG_PAGEALLOC */
2692 
2693 bool kernel_page_present(struct page *page)
2694 {
2695 	unsigned int level;
2696 	pte_t *pte;
2697 
2698 	if (PageHighMem(page))
2699 		return false;
2700 
2701 	pte = lookup_address((unsigned long)page_address(page), &level);
2702 	return (pte_val(*pte) & _PAGE_PRESENT);
2703 }
2704 
2705 int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
2706 				   unsigned numpages, unsigned long page_flags)
2707 {
2708 	int retval = -EINVAL;
2709 
2710 	struct cpa_data cpa = {
2711 		.vaddr = &address,
2712 		.pfn = pfn,
2713 		.pgd = pgd,
2714 		.numpages = numpages,
2715 		.mask_set = __pgprot(0),
2716 		.mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW|_PAGE_DIRTY)),
2717 		.flags = CPA_NO_CHECK_ALIAS,
2718 	};
2719 
2720 	WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");
2721 
2722 	if (!(__supported_pte_mask & _PAGE_NX))
2723 		goto out;
2724 
2725 	if (!(page_flags & _PAGE_ENC))
2726 		cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);
2727 
2728 	cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
2729 
2730 	retval = __change_page_attr_set_clr(&cpa, 1);
2731 	__flush_tlb_all();
2732 
2733 out:
2734 	return retval;
2735 }
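
/*
 * Boot-time usage sketch (illustrative; 'efi_pgd', 'pfn' and 'va' are
 * hypothetical here): map one page into a private page table before any
 * secondary CPU is online.
 *
 *	ret = kernel_map_pages_in_pgd(efi_pgd, pfn, va, 1, _PAGE_RW);
 */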
2736 
2737 /*
2738  * __flush_tlb_all() flushes mappings only on the current CPU, so this
2739  * function must not be used in an SMP environment. Presently, it is used only
2740  * during boot (well before smp_init()) by the EFI subsystem and hence is OK.
2741  */
2742 int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
2743 				     unsigned long numpages)
2744 {
2745 	int retval;
2746 
2747 	/*
2748 	 * The typical sequence for unmapping is to find a pte through
2749 	 * lookup_address_in_pgd() (ideally, it should never return NULL because
2750 	 * the address is already mapped) and change its protections. As pfn is
2751 	 * the *target* of a mapping, it's not useful while unmapping.
2752 	 */
2753 	struct cpa_data cpa = {
2754 		.vaddr		= &address,
2755 		.pfn		= 0,
2756 		.pgd		= pgd,
2757 		.numpages	= numpages,
2758 		.mask_set	= __pgprot(0),
2759 		.mask_clr	= __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY),
2760 		.flags		= CPA_NO_CHECK_ALIAS,
2761 	};
2762 
2763 	WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");
2764 
2765 	retval = __change_page_attr_set_clr(&cpa, 1);
2766 	__flush_tlb_all();
2767 
2768 	return retval;
2769 }
2770 
2771 /*
2772  * The testcases use internal knowledge of the implementation that shouldn't
2773  * be exposed to the rest of the kernel. Include these directly here.
2774  */
2775 #ifdef CONFIG_CPA_DEBUG
2776 #include "cpa-test.c"
2777 #endif
2778