1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright 2002 Andi Kleen, SuSE Labs.
4 * Thanks to Ben LaHaise for precious feedback.
5 */
6 #include <linux/highmem.h>
7 #include <linux/memblock.h>
8 #include <linux/sched.h>
9 #include <linux/mm.h>
10 #include <linux/interrupt.h>
11 #include <linux/seq_file.h>
12 #include <linux/proc_fs.h>
13 #include <linux/debugfs.h>
14 #include <linux/pfn.h>
15 #include <linux/percpu.h>
16 #include <linux/gfp.h>
17 #include <linux/pci.h>
18 #include <linux/vmalloc.h>
19 #include <linux/libnvdimm.h>
20 #include <linux/vmstat.h>
21 #include <linux/kernel.h>
22 #include <linux/cc_platform.h>
23 #include <linux/set_memory.h>
24 #include <linux/memregion.h>
25
26 #include <asm/e820/api.h>
27 #include <asm/processor.h>
28 #include <asm/tlbflush.h>
29 #include <asm/sections.h>
30 #include <asm/setup.h>
31 #include <linux/uaccess.h>
32 #include <asm/pgalloc.h>
33 #include <asm/proto.h>
34 #include <asm/memtype.h>
35
36 #include "../mm_internal.h"
37
38 /*
39 * The current flushing context - we pass it instead of 5 arguments:
40 */
41 struct cpa_data {
42 unsigned long *vaddr;
43 pgd_t *pgd;
44 pgprot_t mask_set;
45 pgprot_t mask_clr;
46 unsigned long numpages;
47 unsigned long curpage;
48 unsigned long pfn;
49 unsigned int flags;
50 unsigned int force_split : 1,
51 force_static_prot : 1,
52 force_flush_all : 1;
53 struct page **pages;
54 };
55
56 enum cpa_warn {
57 CPA_CONFLICT,
58 CPA_PROTECT,
59 CPA_DETECT,
60 };
61
62 static const int cpa_warn_level = CPA_PROTECT;
63
64 /*
65 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
66 * using cpa_lock, so that no other CPU with stale large TLB entries can
67 * change a page attribute in parallel with another CPU that is splitting
68 * a large page entry while changing its attributes.
69 */
70 static DEFINE_SPINLOCK(cpa_lock);
71
72 #define CPA_FLUSHTLB 1
73 #define CPA_ARRAY 2
74 #define CPA_PAGES_ARRAY 4
75 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
76 #define CPA_COLLAPSE 16 /* try to collapse large pages */
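/*
 * Illustrative sketch only (not taken from this file): a set_memory_*() style
 * wrapper typically fills a struct cpa_data roughly like this before handing
 * it to __change_page_attr_set_clr(), with a NULL ->pgd meaning "operate on
 * init_mm" and the concrete mask values here being hypothetical:
 *
 *	unsigned long addr = ...;
 *	struct cpa_data cpa = {
 *		.vaddr	  = &addr,
 *		.pgd	  = NULL,
 *		.numpages = numpages,
 *		.mask_set = __pgprot(_PAGE_RW),
 *		.mask_clr = __pgprot(0),
 *	};
 */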
77
78 static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
79 {
80 return __pgprot(cachemode2protval(pcm));
81 }
82
83 #ifdef CONFIG_PROC_FS
84 static unsigned long direct_pages_count[PG_LEVEL_NUM];
85
86 void update_page_count(int level, unsigned long pages)
87 {
88 /* Protect against CPA */
89 spin_lock(&pgd_lock);
90 direct_pages_count[level] += pages;
91 spin_unlock(&pgd_lock);
92 }
93
94 static void split_page_count(int level)
95 {
96 if (direct_pages_count[level] == 0)
97 return;
98
99 direct_pages_count[level]--;
100 if (system_state == SYSTEM_RUNNING) {
101 if (level == PG_LEVEL_2M)
102 count_vm_event(DIRECT_MAP_LEVEL2_SPLIT);
103 else if (level == PG_LEVEL_1G)
104 count_vm_event(DIRECT_MAP_LEVEL3_SPLIT);
105 }
106 direct_pages_count[level - 1] += PTRS_PER_PTE;
107 }
108
109 static void collapse_page_count(int level)
110 {
111 direct_pages_count[level]++;
112 if (system_state == SYSTEM_RUNNING) {
113 if (level == PG_LEVEL_2M)
114 count_vm_event(DIRECT_MAP_LEVEL2_COLLAPSE);
115 else if (level == PG_LEVEL_1G)
116 count_vm_event(DIRECT_MAP_LEVEL3_COLLAPSE);
117 }
118 direct_pages_count[level - 1] -= PTRS_PER_PTE;
119 }
120
121 void arch_report_meminfo(struct seq_file *m)
122 {
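/*
 * The shifts convert page counts to kB: a 4k page is 4 kB (<< 2), a 2M
 * page is 2048 kB (<< 11), a 4M page is 4096 kB (<< 12) and a 1G page
 * is 1048576 kB (<< 20).
 */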
123 seq_printf(m, "DirectMap4k: %8lu kB\n",
124 direct_pages_count[PG_LEVEL_4K] << 2);
125 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
126 seq_printf(m, "DirectMap2M: %8lu kB\n",
127 direct_pages_count[PG_LEVEL_2M] << 11);
128 #else
129 seq_printf(m, "DirectMap4M: %8lu kB\n",
130 direct_pages_count[PG_LEVEL_2M] << 12);
131 #endif
132 if (direct_gbpages)
133 seq_printf(m, "DirectMap1G: %8lu kB\n",
134 direct_pages_count[PG_LEVEL_1G] << 20);
135 }
136 #else
137 static inline void split_page_count(int level) { }
138 static inline void collapse_page_count(int level) { }
139 #endif
140
141 #ifdef CONFIG_X86_CPA_STATISTICS
142
143 static unsigned long cpa_1g_checked;
144 static unsigned long cpa_1g_sameprot;
145 static unsigned long cpa_1g_preserved;
146 static unsigned long cpa_2m_checked;
147 static unsigned long cpa_2m_sameprot;
148 static unsigned long cpa_2m_preserved;
149 static unsigned long cpa_4k_install;
150
151 static inline void cpa_inc_1g_checked(void)
152 {
153 cpa_1g_checked++;
154 }
155
156 static inline void cpa_inc_2m_checked(void)
157 {
158 cpa_2m_checked++;
159 }
160
161 static inline void cpa_inc_4k_install(void)
162 {
163 data_race(cpa_4k_install++);
164 }
165
166 static inline void cpa_inc_lp_sameprot(int level)
167 {
168 if (level == PG_LEVEL_1G)
169 cpa_1g_sameprot++;
170 else
171 cpa_2m_sameprot++;
172 }
173
174 static inline void cpa_inc_lp_preserved(int level)
175 {
176 if (level == PG_LEVEL_1G)
177 cpa_1g_preserved++;
178 else
179 cpa_2m_preserved++;
180 }
181
182 static int cpastats_show(struct seq_file *m, void *p)
183 {
184 seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked);
185 seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot);
186 seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved);
187 seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked);
188 seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot);
189 seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved);
190 seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install);
191 return 0;
192 }
193
194 static int cpastats_open(struct inode *inode, struct file *file)
195 {
196 return single_open(file, cpastats_show, NULL);
197 }
198
199 static const struct file_operations cpastats_fops = {
200 .open = cpastats_open,
201 .read = seq_read,
202 .llseek = seq_lseek,
203 .release = single_release,
204 };
205
206 static int __init cpa_stats_init(void)
207 {
208 debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL,
209 &cpastats_fops);
210 return 0;
211 }
212 late_initcall(cpa_stats_init);
213 #else
214 static inline void cpa_inc_1g_checked(void) { }
215 static inline void cpa_inc_2m_checked(void) { }
216 static inline void cpa_inc_4k_install(void) { }
217 static inline void cpa_inc_lp_sameprot(int level) { }
218 static inline void cpa_inc_lp_preserved(int level) { }
219 #endif
220
221
222 static inline int
223 within(unsigned long addr, unsigned long start, unsigned long end)
224 {
225 return addr >= start && addr < end;
226 }
227
228 #ifdef CONFIG_X86_64
229
230 static inline int
231 within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
232 {
233 return addr >= start && addr <= end;
234 }
235
236 /*
237 * The kernel image is mapped into two places in the virtual address space
238 * (addresses without KASLR, of course):
239 *
240 * 1. The kernel direct map (0xffff880000000000)
241 * 2. The "high kernel map" (0xffffffff81000000)
242 *
243 * We actually execute out of #2. If we get the address of a kernel symbol, it
244 * points to #2, but almost all physical-to-virtual translations point to #1.
245 *
246 * This is so that we can have both a directmap of all physical memory *and*
247 * take full advantage of the limited (s32) immediate addressing range (2G)
248 * of x86_64.
249 *
250 * See Documentation/arch/x86/x86_64/mm.rst for more detail.
251 */
252
253 static inline unsigned long highmap_start_pfn(void)
254 {
255 return __pa_symbol(_text) >> PAGE_SHIFT;
256 }
257
258 static inline unsigned long highmap_end_pfn(void)
259 {
260 /* Do not reference physical address outside the kernel. */
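/*
 * The high kernel mapping extends to _brk_end rounded up to a 2M boundary,
 * hence the roundup to PMD_SIZE; subtracting 1 keeps the address handed to
 * __pa_symbol() inside the kernel mapping.
 */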
261 return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
262 }
263
264 static bool __cpa_pfn_in_highmap(unsigned long pfn)
265 {
266 /*
267 * Kernel text has an alias mapping at a high address, known
268 * here as "highmap".
269 */
270 return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn());
271 }
272
273 #else
274
275 static bool __cpa_pfn_in_highmap(unsigned long pfn)
276 {
277 /* There is no highmap on 32-bit */
278 return false;
279 }
280
281 #endif
282
283 /*
284 * See set_mce_nospec().
285 *
286 * Machine check recovery code needs to change cache mode of poisoned pages to
287 * UC to avoid speculative access logging another error. But passing the
288 * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a
289 * speculative access. So we cheat and flip the top bit of the address. This
290 * works fine for the code that updates the page tables. But at the end of the
291 * process we need to flush the TLB and cache and the non-canonical address
292 * causes a #GP fault when used by the INVLPG and CLFLUSH instructions.
293 *
294 * But in the common case we already have a canonical address. This code
295 * will fix the top bit if needed and is a no-op otherwise.
296 */
297 static inline unsigned long fix_addr(unsigned long addr)
298 {
299 #ifdef CONFIG_X86_64
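/*
 * Shift bit 63 out and sign-extend from bit 62: this copies bit 62 into
 * bit 63, making the address canonical again and undoing the top-bit
 * flip described above.
 */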
300 return (long)(addr << 1) >> 1;
301 #else
302 return addr;
303 #endif
304 }
305
306 static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx)
307 {
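/*
 * Three addressing modes: an array of struct page pointers
 * (CPA_PAGES_ARRAY), an array of virtual addresses (CPA_ARRAY), or a
 * single virtually contiguous range starting at *cpa->vaddr.
 */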
308 if (cpa->flags & CPA_PAGES_ARRAY) {
309 struct page *page = cpa->pages[idx];
310
311 if (unlikely(PageHighMem(page)))
312 return 0;
313
314 return (unsigned long)page_address(page);
315 }
316
317 if (cpa->flags & CPA_ARRAY)
318 return cpa->vaddr[idx];
319
320 return *cpa->vaddr + idx * PAGE_SIZE;
321 }
322
323 /*
324 * Flushing functions
325 */
326
327 static void clflush_cache_range_opt(void *vaddr, unsigned int size)
328 {
329 const unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
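/*
 * Round the start down to a flush-line boundary so that every cache
 * line overlapping [vaddr, vaddr + size) is covered by the loop below.
 */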
330 void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
331 void *vend = vaddr + size;
332
333 if (p >= vend)
334 return;
335
336 for (; p < vend; p += clflush_size)
337 clflushopt(p);
338 }
339
340 /**
341 * clflush_cache_range - flush a cache range with clflush
342 * @vaddr: virtual start address
343 * @size: number of bytes to flush
344 *
345 * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or
346 * SFENCE to avoid ordering issues.
347 */
348 void clflush_cache_range(void *vaddr, unsigned int size)
349 {
350 mb();
351 clflush_cache_range_opt(vaddr, size);
352 mb();
353 }
354 EXPORT_SYMBOL_GPL(clflush_cache_range);
355
356 #ifdef CONFIG_ARCH_HAS_PMEM_API
357 void arch_invalidate_pmem(void *addr, size_t size)
358 {
359 clflush_cache_range(addr, size);
360 }
361 EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
362 #endif
363
364 #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
365 bool cpu_cache_has_invalidate_memregion(void)
366 {
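/*
 * WBINVD is generally not reliable inside a guest (it may be intercepted
 * or turned into a no-op), so only claim the capability on bare metal.
 */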
367 return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR);
368 }
369 EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM");
370
371 int cpu_cache_invalidate_memregion(int res_desc)
372 {
373 if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion()))
374 return -ENXIO;
375 wbinvd_on_all_cpus();
376 return 0;
377 }
378 EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, "DEVMEM");
379 #endif
380
381 static void __cpa_flush_all(void *arg)
382 {
383 unsigned long cache = (unsigned long)arg;
384
385 /*
386 * Flush all to work around errata in early Athlons regarding
387 * large page flushing.
388 */
389 __flush_tlb_all();
390
391 if (cache && boot_cpu_data.x86 >= 4)
392 wbinvd();
393 }
394
395 static void cpa_flush_all(unsigned long cache)
396 {
397 BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
398
399 on_each_cpu(__cpa_flush_all, (void *) cache, 1);
400 }
401
402 static int collapse_large_pages(unsigned long addr, struct list_head *pgtables);
403
404 static void cpa_collapse_large_pages(struct cpa_data *cpa)
405 {
406 unsigned long start, addr, end;
407 struct ptdesc *ptdesc, *tmp;
408 LIST_HEAD(pgtables);
409 int collapsed = 0;
410 int i;
411
412 if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
413 for (i = 0; i < cpa->numpages; i++)
414 collapsed += collapse_large_pages(__cpa_addr(cpa, i),
415 &pgtables);
416 } else {
417 addr = __cpa_addr(cpa, 0);
418 start = addr & PMD_MASK;
419 end = addr + PAGE_SIZE * cpa->numpages;
420
421 for (addr = start; within(addr, start, end); addr += PMD_SIZE)
422 collapsed += collapse_large_pages(addr, &pgtables);
423 }
424
425 if (!collapsed)
426 return;
427
428 flush_tlb_all();
429
430 list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) {
431 list_del(&ptdesc->pt_list);
432 __free_page(ptdesc_page(ptdesc));
433 }
434 }
435
436 static void cpa_flush(struct cpa_data *cpa, int cache)
437 {
438 unsigned long start, end;
439 unsigned int i;
440
441 BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
442
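/*
 * Without CLFLUSH the only way to flush the caches is WBINVD on every
 * CPU via cpa_flush_all(); otherwise flush the TLB range and then
 * CLFLUSH only the present 4k pages below.
 */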
443 if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
444 cpa_flush_all(cache);
445 goto collapse_large_pages;
446 }
447
448 start = fix_addr(__cpa_addr(cpa, 0));
449 end = start + cpa->numpages * PAGE_SIZE;
450 if (cpa->force_flush_all)
451 end = TLB_FLUSH_ALL;
452
453 flush_tlb_kernel_range(start, end);
454
455 if (!cache)
456 goto collapse_large_pages;
457
458 mb();
459 for (i = 0; i < cpa->numpages; i++) {
460 unsigned long addr = __cpa_addr(cpa, i);
461 unsigned int level;
462
463 pte_t *pte = lookup_address(addr, &level);
464
465 /*
466 * Only flush present addresses:
467 */
468 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
469 clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE);
470 }
471 mb();
472
473 collapse_large_pages:
474 if (cpa->flags & CPA_COLLAPSE)
475 cpa_collapse_large_pages(cpa);
476 }
477
478 static bool overlaps(unsigned long r1_start, unsigned long r1_end,
479 unsigned long r2_start, unsigned long r2_end)
480 {
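/* Both ranges are inclusive: [2, 5] and [5, 9] count as overlapping. */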
481 return (r1_start <= r2_end && r1_end >= r2_start) ||
482 (r2_start <= r1_end && r2_end >= r1_start);
483 }
484
485 #ifdef CONFIG_PCI_BIOS
486 /*
487 * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS
488 * based config access (CONFIG_PCI_GOBIOS) support.
489 */
490 #define BIOS_PFN PFN_DOWN(BIOS_BEGIN)
491 #define BIOS_PFN_END PFN_DOWN(BIOS_END - 1)
492
493 static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
494 {
495 if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END))
496 return _PAGE_NX;
497 return 0;
498 }
499 #else
500 static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
501 {
502 return 0;
503 }
504 #endif
505
506 /*
507 * The .rodata section needs to be read-only. Using the pfn catches all
508 * aliases. This also includes __ro_after_init, so do not enforce until
509 * kernel_set_to_readonly is true.
510 */
511 static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn)
512 {
513 unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata));
514
515 /*
516 * Note: __end_rodata is page aligned and not inclusive, so
517 * subtract 1 to get the last enforced PFN in the rodata area.
518 */
519 epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1;
520
521 if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro))
522 return _PAGE_RW;
523 return 0;
524 }
525
526 /*
527 * Protect kernel text against becoming non executable by forbidding
528 * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext)
529 * out of which the kernel actually executes. Do not protect the low
530 * mapping.
531 *
532 * This does not cover __inittext since that is gone after boot.
533 */
534 static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end)
535 {
536 unsigned long t_end = (unsigned long)_etext - 1;
537 unsigned long t_start = (unsigned long)_text;
538
539 if (overlaps(start, end, t_start, t_end))
540 return _PAGE_NX;
541 return 0;
542 }
543
544 #if defined(CONFIG_X86_64)
545 /*
546 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
547 * the kernel text mappings for the large-page-aligned text and rodata
548 * sections will always be read-only. The kernel identity mappings that
549 * cover the holes caused by this alignment can be anything the user asks for.
550 *
551 * This will preserve the large page mappings for kernel text/data at no
552 * extra cost.
553 */
554 static pgprotval_t protect_kernel_text_ro(unsigned long start,
555 unsigned long end)
556 {
557 unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1;
558 unsigned long t_start = (unsigned long)_text;
559 unsigned int level;
560
561 if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end))
562 return 0;
563 /*
564 * Don't enforce the !RW mapping for the kernel text mapping, if
565 * the current mapping is already using small page mapping. No
566 * need to work hard to preserve large page mappings in this case.
567 *
568 * This also fixes the Linux Xen paravirt guest boot failure caused
569 * by unexpected read-only mappings for kernel identity
570 * mappings. In this paravirt guest case, the kernel text mapping
571 * and the kernel identity mapping share the same page-table pages,
572 * so the protections for kernel text and identity mappings have to
573 * be the same.
574 */
575 if (lookup_address(start, &level) && (level != PG_LEVEL_4K))
576 return _PAGE_RW;
577 return 0;
578 }
579 #else
580 static pgprotval_t protect_kernel_text_ro(unsigned long start,
581 unsigned long end)
582 {
583 return 0;
584 }
585 #endif
586
587 static inline bool conflicts(pgprot_t prot, pgprotval_t val)
588 {
589 return (pgprot_val(prot) & ~val) != pgprot_val(prot);
590 }
591
592 static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val,
593 unsigned long start, unsigned long end,
594 unsigned long pfn, const char *txt)
595 {
596 static const char *lvltxt[] = {
597 [CPA_CONFLICT] = "conflict",
598 [CPA_PROTECT] = "protect",
599 [CPA_DETECT] = "detect",
600 };
601
602 if (warnlvl > cpa_warn_level || !conflicts(prot, val))
603 return;
604
605 pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n",
606 lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot),
607 (unsigned long long)val);
608 }
609
610 /*
611 * Certain areas of memory on x86 require very specific protection flags,
612 * for example the BIOS area or kernel text. Callers don't always get this
613 * right (again, ioremap() on BIOS memory is not uncommon) so this function
614 * checks and fixes these known static required protection bits.
615 */
616 static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
617 unsigned long pfn, unsigned long npg,
618 unsigned long lpsize, int warnlvl)
619 {
620 pgprotval_t forbidden, res;
621 unsigned long end;
622
623 /*
624 * There is no point in checking RW/NX conflicts when the requested
625 * mapping is setting the page !PRESENT.
626 */
627 if (!(pgprot_val(prot) & _PAGE_PRESENT))
628 return prot;
629
630 /* Operate on the virtual address */
631 end = start + npg * PAGE_SIZE - 1;
632
633 res = protect_kernel_text(start, end);
634 check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX");
635 forbidden = res;
636
637 /*
638 * Special case to preserve a large page. If the change spans the
639 * full large page mapping then there is no point in splitting it
640 * up. This happens with ftrace and is going to be removed once ftrace
641 * has switched to text_poke().
642 */
643 if (lpsize != (npg * PAGE_SIZE) || (start & (lpsize - 1))) {
644 res = protect_kernel_text_ro(start, end);
645 check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO");
646 forbidden |= res;
647 }
648
649 /* Check the PFN directly */
650 res = protect_pci_bios(pfn, pfn + npg - 1);
651 check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX");
652 forbidden |= res;
653
654 res = protect_rodata(pfn, pfn + npg - 1);
655 check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO");
656 forbidden |= res;
657
658 return __pgprot(pgprot_val(prot) & ~forbidden);
659 }
660
661 /*
662 * Validate strict W^X semantics.
663 */
664 static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start,
665 unsigned long pfn, unsigned long npg,
666 bool nx, bool rw)
667 {
668 unsigned long end;
669
670 /*
671 * 32-bit has some unfixable W+X issues, like EFI code
672 * and writeable data being in the same page. Disable
673 * detection and enforcement there.
674 */
675 if (IS_ENABLED(CONFIG_X86_32))
676 return new;
677
678 /* Only verify when NX is supported: */
679 if (!(__supported_pte_mask & _PAGE_NX))
680 return new;
681
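/*
 * The two checks below bail out early: first when neither RW nor NX is
 * being changed at all, and then when the new protection is not
 * simultaneously writable and executable (RW set, NX clear), which is
 * the only combination W^X forbids.
 */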
682 if (!((pgprot_val(old) ^ pgprot_val(new)) & (_PAGE_RW | _PAGE_NX)))
683 return new;
684
685 if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW)
686 return new;
687
688 /* Non-leaf translation entries can disable writing or execution. */
689 if (!rw || nx)
690 return new;
691
692 end = start + npg * PAGE_SIZE - 1;
693 WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n",
694 (unsigned long long)pgprot_val(old),
695 (unsigned long long)pgprot_val(new),
696 start, end, pfn);
697
698 /*
699 * For now, allow all permission change attempts by returning the
700 * attempted permissions. This can 'return old' to actively
701 * refuse the permission change at a later time.
702 */
703 return new;
704 }
705
706 /*
707 * Lookup the page table entry for a virtual address in a specific pgd.
708 * Return a pointer to the entry (or NULL if the entry does not exist),
709 * the level of the entry, and the effective NX and RW bits of all
710 * page table levels.
711 */
712 pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
713 unsigned int *level, bool *nx, bool *rw)
714 {
715 p4d_t *p4d;
716 pud_t *pud;
717 pmd_t *pmd;
718
719 *level = PG_LEVEL_256T;
720 *nx = false;
721 *rw = true;
722
723 if (pgd_none(*pgd))
724 return NULL;
725
726 *level = PG_LEVEL_512G;
727 *nx |= pgd_flags(*pgd) & _PAGE_NX;
728 *rw &= pgd_flags(*pgd) & _PAGE_RW;
729
730 p4d = p4d_offset(pgd, address);
731 if (p4d_none(*p4d))
732 return NULL;
733
734 if (p4d_leaf(*p4d) || !p4d_present(*p4d))
735 return (pte_t *)p4d;
736
737 *level = PG_LEVEL_1G;
738 *nx |= p4d_flags(*p4d) & _PAGE_NX;
739 *rw &= p4d_flags(*p4d) & _PAGE_RW;
740
741 pud = pud_offset(p4d, address);
742 if (pud_none(*pud))
743 return NULL;
744
745 if (pud_leaf(*pud) || !pud_present(*pud))
746 return (pte_t *)pud;
747
748 *level = PG_LEVEL_2M;
749 *nx |= pud_flags(*pud) & _PAGE_NX;
750 *rw &= pud_flags(*pud) & _PAGE_RW;
751
752 pmd = pmd_offset(pud, address);
753 if (pmd_none(*pmd))
754 return NULL;
755
756 if (pmd_leaf(*pmd) || !pmd_present(*pmd))
757 return (pte_t *)pmd;
758
759 *level = PG_LEVEL_4K;
760 *nx |= pmd_flags(*pmd) & _PAGE_NX;
761 *rw &= pmd_flags(*pmd) & _PAGE_RW;
762
763 return pte_offset_kernel(pmd, address);
764 }
765
766 /*
767 * Lookup the page table entry for a virtual address in a specific pgd.
768 * Return a pointer to the entry and the level of the mapping.
769 */
770 pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
771 unsigned int *level)
772 {
773 bool nx, rw;
774
775 return lookup_address_in_pgd_attr(pgd, address, level, &nx, &rw);
776 }
777
778 /*
779 * Lookup the page table entry for a virtual address. Return a pointer
780 * to the entry and the level of the mapping.
781 *
782 * Note: the function returns p4d, pud or pmd either when the entry is marked
783 * large or when the present bit is not set. Otherwise it returns NULL.
784 */
785 pte_t *lookup_address(unsigned long address, unsigned int *level)
786 {
787 return lookup_address_in_pgd(pgd_offset_k(address), address, level);
788 }
789 EXPORT_SYMBOL_GPL(lookup_address);
790
791 static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
792 unsigned int *level, bool *nx, bool *rw)
793 {
794 pgd_t *pgd;
795
796 if (!cpa->pgd)
797 pgd = pgd_offset_k(address);
798 else
799 pgd = cpa->pgd + pgd_index(address);
800
801 return lookup_address_in_pgd_attr(pgd, address, level, nx, rw);
802 }
803
804 /*
805 * Lookup the PMD entry for a virtual address. Return a pointer to the entry
806 * or NULL if not present.
807 */
808 pmd_t *lookup_pmd_address(unsigned long address)
809 {
810 pgd_t *pgd;
811 p4d_t *p4d;
812 pud_t *pud;
813
814 pgd = pgd_offset_k(address);
815 if (pgd_none(*pgd))
816 return NULL;
817
818 p4d = p4d_offset(pgd, address);
819 if (p4d_none(*p4d) || p4d_leaf(*p4d) || !p4d_present(*p4d))
820 return NULL;
821
822 pud = pud_offset(p4d, address);
823 if (pud_none(*pud) || pud_leaf(*pud) || !pud_present(*pud))
824 return NULL;
825
826 return pmd_offset(pud, address);
827 }
828
829 /*
830 * This is necessary because __pa() does not work on some
831 * kinds of memory, like vmalloc() or the alloc_remap()
832 * areas on 32-bit NUMA systems. The percpu areas can
833 * end up in this kind of memory, for instance.
834 *
835 * Note that as long as the PTEs are well-formed with correct PFNs, this
836 * works without checking the PRESENT bit in the leaf PTE. This is unlike
837 * the similar vmalloc_to_page() and derivatives. Callers may depend on
838 * this behavior.
839 *
840 * This could be optimized, but it is only used in paths that are not perf
841 * sensitive, and keeping it unoptimized should increase the testing coverage
842 * for the more obscure platforms.
843 */
844 phys_addr_t slow_virt_to_phys(void *__virt_addr)
845 {
846 unsigned long virt_addr = (unsigned long)__virt_addr;
847 phys_addr_t phys_addr;
848 unsigned long offset;
849 enum pg_level level;
850 pte_t *pte;
851
852 pte = lookup_address(virt_addr, &level);
853 BUG_ON(!pte);
854
855 /*
856 * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t
857 * before being left-shifted PAGE_SHIFT bits -- this trick is to
858 * make 32-PAE kernel work correctly.
859 */
860 switch (level) {
861 case PG_LEVEL_1G:
862 phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
863 offset = virt_addr & ~PUD_MASK;
864 break;
865 case PG_LEVEL_2M:
866 phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
867 offset = virt_addr & ~PMD_MASK;
868 break;
869 default:
870 phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
871 offset = virt_addr & ~PAGE_MASK;
872 }
873
874 return (phys_addr_t)(phys_addr | offset);
875 }
876 EXPORT_SYMBOL_GPL(slow_virt_to_phys);
877
878 /*
879 * Set the new pmd in all the pgds we know about:
880 */
881 static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
882 {
883 /* change init_mm */
884 set_pte_atomic(kpte, pte);
885 #ifdef CONFIG_X86_32
886 {
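/*
 * On 32-bit (notably PAE without shared kernel PMDs) every pgd carries
 * its own copy of the kernel mappings, so the update has to be
 * propagated to each pgd on pgd_list.
 */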
887 struct page *page;
888
889 list_for_each_entry(page, &pgd_list, lru) {
890 pgd_t *pgd;
891 p4d_t *p4d;
892 pud_t *pud;
893 pmd_t *pmd;
894
895 pgd = (pgd_t *)page_address(page) + pgd_index(address);
896 p4d = p4d_offset(pgd, address);
897 pud = pud_offset(p4d, address);
898 pmd = pmd_offset(pud, address);
899 set_pte_atomic((pte_t *)pmd, pte);
900 }
901 }
902 #endif
903 }
904
905 static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
906 {
907 /*
908 * _PAGE_GLOBAL means "global page" for present PTEs.
909 * But, it is also used to indicate _PAGE_PROTNONE
910 * for non-present PTEs.
911 *
912 * This ensures that a _PAGE_GLOBAL PTE going from
913 * present to non-present is not confused as
914 * _PAGE_PROTNONE.
915 */
916 if (!(pgprot_val(prot) & _PAGE_PRESENT))
917 pgprot_val(prot) &= ~_PAGE_GLOBAL;
918
919 return prot;
920 }
921
922 static int __should_split_large_page(pte_t *kpte, unsigned long address,
923 struct cpa_data *cpa)
924 {
925 unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
926 pgprot_t old_prot, new_prot, req_prot, chk_prot;
927 pte_t new_pte, *tmp;
928 enum pg_level level;
929 bool nx, rw;
930
931 /*
932 * Check for races, another CPU might have split this page
933 * up already:
934 */
935 tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
936 if (tmp != kpte)
937 return 1;
938
939 switch (level) {
940 case PG_LEVEL_2M:
941 old_prot = pmd_pgprot(*(pmd_t *)kpte);
942 old_pfn = pmd_pfn(*(pmd_t *)kpte);
943 cpa_inc_2m_checked();
944 break;
945 case PG_LEVEL_1G:
946 old_prot = pud_pgprot(*(pud_t *)kpte);
947 old_pfn = pud_pfn(*(pud_t *)kpte);
948 cpa_inc_1g_checked();
949 break;
950 default:
951 return -EINVAL;
952 }
953
954 psize = page_level_size(level);
955 pmask = page_level_mask(level);
956
957 /*
958 * Calculate the number of pages which fit into this large
959 * page starting at address:
960 */
961 lpaddr = (address + psize) & pmask;
962 numpages = (lpaddr - address) >> PAGE_SHIFT;
963 if (numpages < cpa->numpages)
964 cpa->numpages = numpages;
965
966 /*
967 * We are safe now. Check whether the new pgprot is the same:
968 * Convert protection attributes to 4k-format, as cpa->mask* are set
969 * up accordingly.
970 */
971
972 /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */
973 req_prot = pgprot_large_2_4k(old_prot);
974
975 pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
976 pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
977
978 /*
979 * req_prot is in format of 4k pages. It must be converted to large
980 * page format: the caching mode includes the PAT bit located at
981 * different bit positions in the two formats.
982 */
983 req_prot = pgprot_4k_2_large(req_prot);
984 req_prot = pgprot_clear_protnone_bits(req_prot);
985 if (pgprot_val(req_prot) & _PAGE_PRESENT)
986 pgprot_val(req_prot) |= _PAGE_PSE;
987
988 /*
989 * old_pfn points to the large page base pfn. So we need to add the
990 * offset of the virtual address:
991 */
992 pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
993 cpa->pfn = pfn;
994
995 /*
996 * Calculate the large page base address and the number of 4K pages
997 * in the large page
998 */
999 lpaddr = address & pmask;
1000 numpages = psize >> PAGE_SHIFT;
1001
1002 /*
1003 * Sanity check that the existing mapping is correct versus the static
1004 * protections. static_protections() guards against !PRESENT, so no
1005 * extra conditional required here.
1006 */
1007 chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
1008 psize, CPA_CONFLICT);
1009
1010 if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) {
1011 /*
1012 * Split the large page and tell the split code to
1013 * enforce static protections.
1014 */
1015 cpa->force_static_prot = 1;
1016 return 1;
1017 }
1018
1019 /*
1020 * Optimization: If the requested pgprot is the same as the current
1021 * pgprot, then the large page can be preserved and no updates are
1022 * required independent of alignment and length of the requested
1023 * range. The above already established that the current pgprot is
1024 * correct, which in consequence makes the requested pgprot correct
1025 * as well if it is the same. The static protection scan below will
1026 * not come to a different conclusion.
1027 */
1028 if (pgprot_val(req_prot) == pgprot_val(old_prot)) {
1029 cpa_inc_lp_sameprot(level);
1030 return 0;
1031 }
1032
1033 /*
1034 * If the requested range does not cover the full page, split it up
1035 */
1036 if (address != lpaddr || cpa->numpages != numpages)
1037 return 1;
1038
1039 /*
1040 * Check whether the requested pgprot is conflicting with a static
1041 * protection requirement in the large page.
1042 */
1043 new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
1044 psize, CPA_DETECT);
1045
1046 new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages,
1047 nx, rw);
1048
1049 /*
1050 * If there is a conflict, split the large page.
1051 *
1052 * There used to be a 4k-wise evaluation trying really hard to
1053 * preserve the large pages, but experimentation has shown that this
1054 * does not help at all. There might be corner cases which would
1055 * preserve one large page occasionally, but it's really not worth the
1056 * extra code and cycles for the common case.
1057 */
1058 if (pgprot_val(req_prot) != pgprot_val(new_prot))
1059 return 1;
1060
1061 /* All checks passed. Update the large page mapping. */
1062 new_pte = pfn_pte(old_pfn, new_prot);
1063 __set_pmd_pte(kpte, address, new_pte);
1064 cpa->flags |= CPA_FLUSHTLB;
1065 cpa_inc_lp_preserved(level);
1066 return 0;
1067 }
1068
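/*
 * Returns 1 when the caller should (re)try splitting the large page, 0 when
 * the existing large mapping can be kept (possibly updated in place), and a
 * negative error code otherwise.
 */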
1069 static int should_split_large_page(pte_t *kpte, unsigned long address,
1070 struct cpa_data *cpa)
1071 {
1072 int do_split;
1073
1074 if (cpa->force_split)
1075 return 1;
1076
1077 spin_lock(&pgd_lock);
1078 do_split = __should_split_large_page(kpte, address, cpa);
1079 spin_unlock(&pgd_lock);
1080
1081 return do_split;
1082 }
1083
1084 static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn,
1085 pgprot_t ref_prot, unsigned long address,
1086 unsigned long size)
1087 {
1088 unsigned int npg = PFN_DOWN(size);
1089 pgprot_t prot;
1090
1091 /*
1092 * If should_split_large_page() discovered an inconsistent mapping,
1093 * remove the invalid protection in the split mapping.
1094 */
1095 if (!cpa->force_static_prot)
1096 goto set;
1097
1098 /* Hand in lpsize = 0 to enforce the protection mechanism */
1099 prot = static_protections(ref_prot, address, pfn, npg, 0, CPA_PROTECT);
1100
1101 if (pgprot_val(prot) == pgprot_val(ref_prot))
1102 goto set;
1103
1104 /*
1105 * If this is splitting a PMD, fix it up. PUD splits cannot be
1106 * fixed trivially as that would require to rescan the newly
1107 * installed PMD mappings after returning from split_large_page()
1108 * so an eventual further split can allocate the necessary PTE
1109 * pages. Warn for now and revisit it in case this actually
1110 * happens.
1111 */
1112 if (size == PAGE_SIZE)
1113 ref_prot = prot;
1114 else
1115 pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
1116 set:
1117 set_pte(pte, pfn_pte(pfn, ref_prot));
1118 }
1119
1120 static int
1121 __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
1122 struct page *base)
1123 {
1124 unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1;
1125 pte_t *pbase = (pte_t *)page_address(base);
1126 unsigned int i, level;
1127 pgprot_t ref_prot;
1128 bool nx, rw;
1129 pte_t *tmp;
1130
1131 spin_lock(&pgd_lock);
1132 /*
1133 * Check for races, another CPU might have split this page
1134 * up for us already:
1135 */
1136 tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
1137 if (tmp != kpte) {
1138 spin_unlock(&pgd_lock);
1139 return 1;
1140 }
1141
1142 paravirt_alloc_pte(&init_mm, page_to_pfn(base));
1143
1144 switch (level) {
1145 case PG_LEVEL_2M:
1146 ref_prot = pmd_pgprot(*(pmd_t *)kpte);
1147 /*
1148 * Clear PSE (aka _PAGE_PAT) and move
1149 * PAT bit to correct position.
1150 */
1151 ref_prot = pgprot_large_2_4k(ref_prot);
1152 ref_pfn = pmd_pfn(*(pmd_t *)kpte);
1153 lpaddr = address & PMD_MASK;
1154 lpinc = PAGE_SIZE;
1155 break;
1156
1157 case PG_LEVEL_1G:
1158 ref_prot = pud_pgprot(*(pud_t *)kpte);
1159 ref_pfn = pud_pfn(*(pud_t *)kpte);
1160 pfninc = PMD_SIZE >> PAGE_SHIFT;
1161 lpaddr = address & PUD_MASK;
1162 lpinc = PMD_SIZE;
1163 /*
1164 * Clear the PSE flag if the PRESENT flag is not set,
1165 * otherwise pmd_present() will return true even on a
1166 * non-present pmd.
1167 */
1168 if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
1169 pgprot_val(ref_prot) &= ~_PAGE_PSE;
1170 break;
1171
1172 default:
1173 spin_unlock(&pgd_lock);
1174 return 1;
1175 }
1176
1177 ref_prot = pgprot_clear_protnone_bits(ref_prot);
1178
1179 /*
1180 * Get the target pfn from the original entry:
1181 */
1182 pfn = ref_pfn;
1183 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
1184 split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);
1185
1186 if (virt_addr_valid(address)) {
1187 unsigned long pfn = PFN_DOWN(__pa(address));
1188
1189 if (pfn_range_is_mapped(pfn, pfn + 1))
1190 split_page_count(level);
1191 }
1192
1193 /*
1194 * Install the new, split up pagetable.
1195 *
1196 * We use the standard kernel pagetable protections for the new
1197 * pagetable protections, the actual ptes set above control the
1198 * primary protection behavior:
1199 */
1200 __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
1201
1202 /*
1203 * Do a global flush tlb after splitting the large page
1204 * and before we do the actual change page attribute in the PTE.
1205 *
1206 * Without this, we violate the TLB application note, that says:
1207 * "The TLBs may contain both ordinary and large-page
1208 * translations for a 4-KByte range of linear addresses. This
1209 * may occur if software modifies the paging structures so that
1210 * the page size used for the address range changes. If the two
1211 * translations differ with respect to page frame or attributes
1212 * (e.g., permissions), processor behavior is undefined and may
1213 * be implementation-specific."
1214 *
1215 * We do this global tlb flush inside the cpa_lock, so that we
1216 * don't allow any other cpu, with stale tlb entries change the
1217 * page attribute in parallel, that also falls into the
1218 * just split large page entry.
1219 */
1220 flush_tlb_all();
1221 spin_unlock(&pgd_lock);
1222
1223 return 0;
1224 }
1225
1226 static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
1227 unsigned long address)
1228 {
1229 struct page *base;
1230
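/*
 * cpa_lock is only taken in the !debug_pagealloc case (see the comment
 * at its definition), so drop it across the allocation because
 * alloc_pages(GFP_KERNEL) may sleep.
 */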
1231 if (!debug_pagealloc_enabled())
1232 spin_unlock(&cpa_lock);
1233 base = alloc_pages(GFP_KERNEL, 0);
1234 if (!debug_pagealloc_enabled())
1235 spin_lock(&cpa_lock);
1236 if (!base)
1237 return -ENOMEM;
1238
1239 if (__split_large_page(cpa, kpte, address, base))
1240 __free_page(base);
1241
1242 return 0;
1243 }
1244
1245 static int collapse_pmd_page(pmd_t *pmd, unsigned long addr,
1246 struct list_head *pgtables)
1247 {
1248 pmd_t _pmd, old_pmd;
1249 pte_t *pte, first;
1250 unsigned long pfn;
1251 pgprot_t pgprot;
1252 int i = 0;
1253
1254 if (!cpu_feature_enabled(X86_FEATURE_PSE))
1255 return 0;
1256
1257 addr &= PMD_MASK;
1258 pte = pte_offset_kernel(pmd, addr);
1259 first = *pte;
1260 pfn = pte_pfn(first);
1261
1262 /* Make sure alignment is suitable */
1263 if (PFN_PHYS(pfn) & ~PMD_MASK)
1264 return 0;
1265
1266 /* The page is 4k intentionally */
1267 if (pte_flags(first) & _PAGE_KERNEL_4K)
1268 return 0;
1269
1270 /* Check that the rest of PTEs are compatible with the first one */
1271 for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) {
1272 pte_t entry = *pte;
1273
1274 if (!pte_present(entry))
1275 return 0;
1276 if (pte_flags(entry) != pte_flags(first))
1277 return 0;
1278 if (pte_pfn(entry) != pte_pfn(first) + i)
1279 return 0;
1280 }
1281
1282 old_pmd = *pmd;
1283
1284 /* Success: set up a large page */
1285 pgprot = pgprot_4k_2_large(pte_pgprot(first));
1286 pgprot_val(pgprot) |= _PAGE_PSE;
1287 _pmd = pfn_pmd(pfn, pgprot);
1288 set_pmd(pmd, _pmd);
1289
1290 /* Queue the page table to be freed after TLB flush */
1291 list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables);
1292
1293 if (IS_ENABLED(CONFIG_X86_32)) {
1294 struct page *page;
1295
1296 /* Update all PGD tables to use the same large page */
1297 list_for_each_entry(page, &pgd_list, lru) {
1298 pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr);
1299 p4d_t *p4d = p4d_offset(pgd, addr);
1300 pud_t *pud = pud_offset(p4d, addr);
1301 pmd_t *pmd = pmd_offset(pud, addr);
1302 /* Something is wrong if the entries don't match */
1303 if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd)))
1304 continue;
1305 set_pmd(pmd, _pmd);
1306 }
1307 }
1308
1309 if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
1310 collapse_page_count(PG_LEVEL_2M);
1311
1312 return 1;
1313 }
1314
1315 static int collapse_pud_page(pud_t *pud, unsigned long addr,
1316 struct list_head *pgtables)
1317 {
1318 unsigned long pfn;
1319 pmd_t *pmd, first;
1320 int i;
1321
1322 if (!direct_gbpages)
1323 return 0;
1324
1325 addr &= PUD_MASK;
1326 pmd = pmd_offset(pud, addr);
1327 first = *pmd;
1328
1329 /*
1330 * To restore PUD page all PMD entries must be large and
1331 * have suitable alignment
1332 */
1333 pfn = pmd_pfn(first);
1334 if (!pmd_leaf(first) || (PFN_PHYS(pfn) & ~PUD_MASK))
1335 return 0;
1336
1337 /*
1338 * To restore PUD page, all following PMDs must be compatible with the
1339 * first one.
1340 */
1341 for (i = 1, pmd++; i < PTRS_PER_PMD; i++, pmd++) {
1342 pmd_t entry = *pmd;
1343
1344 if (!pmd_present(entry) || !pmd_leaf(entry))
1345 return 0;
1346 if (pmd_flags(entry) != pmd_flags(first))
1347 return 0;
1348 if (pmd_pfn(entry) != pmd_pfn(first) + i * PTRS_PER_PTE)
1349 return 0;
1350 }
1351
1352 /* Restore PUD page and queue page table to be freed after TLB flush */
1353 list_add(&page_ptdesc(pud_page(*pud))->pt_list, pgtables);
1354 set_pud(pud, pfn_pud(pfn, pmd_pgprot(first)));
1355
1356 if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
1357 collapse_page_count(PG_LEVEL_1G);
1358
1359 return 1;
1360 }
1361
1362 /*
1363 * Collapse PMD and PUD pages in the kernel mapping around the address where
1364 * possible.
1365 *
1366 * Caller must flush TLB and free page tables queued on the list before
1367 * touching the new entries. CPU must not see TLB entries of different size
1368 * with different attributes.
1369 */
1370 static int collapse_large_pages(unsigned long addr, struct list_head *pgtables)
1371 {
1372 int collapsed = 0;
1373 pgd_t *pgd;
1374 p4d_t *p4d;
1375 pud_t *pud;
1376 pmd_t *pmd;
1377
1378 addr &= PMD_MASK;
1379
1380 spin_lock(&pgd_lock);
1381 pgd = pgd_offset_k(addr);
1382 if (pgd_none(*pgd))
1383 goto out;
1384 p4d = p4d_offset(pgd, addr);
1385 if (p4d_none(*p4d))
1386 goto out;
1387 pud = pud_offset(p4d, addr);
1388 if (!pud_present(*pud) || pud_leaf(*pud))
1389 goto out;
1390 pmd = pmd_offset(pud, addr);
1391 if (!pmd_present(*pmd) || pmd_leaf(*pmd))
1392 goto out;
1393
1394 collapsed = collapse_pmd_page(pmd, addr, pgtables);
1395 if (collapsed)
1396 collapsed += collapse_pud_page(pud, addr, pgtables);
1397
1398 out:
1399 spin_unlock(&pgd_lock);
1400 return collapsed;
1401 }
1402
1403 static bool try_to_free_pte_page(pte_t *pte)
1404 {
1405 int i;
1406
1407 for (i = 0; i < PTRS_PER_PTE; i++)
1408 if (!pte_none(pte[i]))
1409 return false;
1410
1411 free_page((unsigned long)pte);
1412 return true;
1413 }
1414
1415 static bool try_to_free_pmd_page(pmd_t *pmd)
1416 {
1417 int i;
1418
1419 for (i = 0; i < PTRS_PER_PMD; i++)
1420 if (!pmd_none(pmd[i]))
1421 return false;
1422
1423 free_page((unsigned long)pmd);
1424 return true;
1425 }
1426
1427 static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
1428 {
1429 pte_t *pte = pte_offset_kernel(pmd, start);
1430
1431 while (start < end) {
1432 set_pte(pte, __pte(0));
1433
1434 start += PAGE_SIZE;
1435 pte++;
1436 }
1437
1438 if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
1439 pmd_clear(pmd);
1440 return true;
1441 }
1442 return false;
1443 }
1444
1445 static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
1446 unsigned long start, unsigned long end)
1447 {
1448 if (unmap_pte_range(pmd, start, end))
1449 if (try_to_free_pmd_page(pud_pgtable(*pud)))
1450 pud_clear(pud);
1451 }
1452
1453 static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
1454 {
1455 pmd_t *pmd = pmd_offset(pud, start);
1456
1457 /*
1458 * Not on a 2MB page boundary?
1459 */
1460 if (start & (PMD_SIZE - 1)) {
1461 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
1462 unsigned long pre_end = min_t(unsigned long, end, next_page);
1463
1464 __unmap_pmd_range(pud, pmd, start, pre_end);
1465
1466 start = pre_end;
1467 pmd++;
1468 }
1469
1470 /*
1471 * Try to unmap in 2M chunks.
1472 */
1473 while (end - start >= PMD_SIZE) {
1474 if (pmd_leaf(*pmd))
1475 pmd_clear(pmd);
1476 else
1477 __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
1478
1479 start += PMD_SIZE;
1480 pmd++;
1481 }
1482
1483 /*
1484 * 4K leftovers?
1485 */
1486 if (start < end)
1487 return __unmap_pmd_range(pud, pmd, start, end);
1488
1489 /*
1490 * Try again to free the PMD page if haven't succeeded above.
1491 */
1492 if (!pud_none(*pud))
1493 if (try_to_free_pmd_page(pud_pgtable(*pud)))
1494 pud_clear(pud);
1495 }
1496
1497 static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
1498 {
1499 pud_t *pud = pud_offset(p4d, start);
1500
1501 /*
1502 * Not on a GB page boundary?
1503 */
1504 if (start & (PUD_SIZE - 1)) {
1505 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
1506 unsigned long pre_end = min_t(unsigned long, end, next_page);
1507
1508 unmap_pmd_range(pud, start, pre_end);
1509
1510 start = pre_end;
1511 pud++;
1512 }
1513
1514 /*
1515 * Try to unmap in 1G chunks.
1516 */
1517 while (end - start >= PUD_SIZE) {
1518
1519 if (pud_leaf(*pud))
1520 pud_clear(pud);
1521 else
1522 unmap_pmd_range(pud, start, start + PUD_SIZE);
1523
1524 start += PUD_SIZE;
1525 pud++;
1526 }
1527
1528 /*
1529 * 2M leftovers?
1530 */
1531 if (start < end)
1532 unmap_pmd_range(pud, start, end);
1533
1534 /*
1535 * No need to try to free the PUD page because we'll free it in
1536 * populate_pgd's error path
1537 */
1538 }
1539
1540 static int alloc_pte_page(pmd_t *pmd)
1541 {
1542 pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
1543 if (!pte)
1544 return -1;
1545
1546 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
1547 return 0;
1548 }
1549
1550 static int alloc_pmd_page(pud_t *pud)
1551 {
1552 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
1553 if (!pmd)
1554 return -1;
1555
1556 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
1557 return 0;
1558 }
1559
1560 static void populate_pte(struct cpa_data *cpa,
1561 unsigned long start, unsigned long end,
1562 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
1563 {
1564 pte_t *pte;
1565
1566 pte = pte_offset_kernel(pmd, start);
1567
1568 pgprot = pgprot_clear_protnone_bits(pgprot);
1569
1570 while (num_pages-- && start < end) {
1571 set_pte(pte, pfn_pte(cpa->pfn, pgprot));
1572
1573 start += PAGE_SIZE;
1574 cpa->pfn++;
1575 pte++;
1576 }
1577 }
1578
1579 static long populate_pmd(struct cpa_data *cpa,
1580 unsigned long start, unsigned long end,
1581 unsigned num_pages, pud_t *pud, pgprot_t pgprot)
1582 {
1583 long cur_pages = 0;
1584 pmd_t *pmd;
1585 pgprot_t pmd_pgprot;
1586
1587 /*
1588 * Not on a 2M boundary?
1589 */
1590 if (start & (PMD_SIZE - 1)) {
1591 unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
1592 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
1593
1594 pre_end = min_t(unsigned long, pre_end, next_page);
1595 cur_pages = (pre_end - start) >> PAGE_SHIFT;
1596 cur_pages = min_t(unsigned int, num_pages, cur_pages);
1597
1598 /*
1599 * Need a PTE page?
1600 */
1601 pmd = pmd_offset(pud, start);
1602 if (pmd_none(*pmd))
1603 if (alloc_pte_page(pmd))
1604 return -1;
1605
1606 populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
1607
1608 start = pre_end;
1609 }
1610
1611 /*
1612 * We mapped them all?
1613 */
1614 if (num_pages == cur_pages)
1615 return cur_pages;
1616
1617 pmd_pgprot = pgprot_4k_2_large(pgprot);
1618
1619 while (end - start >= PMD_SIZE) {
1620
1621 /*
1622 * We cannot use a 1G page so allocate a PMD page if needed.
1623 */
1624 if (pud_none(*pud))
1625 if (alloc_pmd_page(pud))
1626 return -1;
1627
1628 pmd = pmd_offset(pud, start);
1629
1630 set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
1631 canon_pgprot(pmd_pgprot))));
1632
1633 start += PMD_SIZE;
1634 cpa->pfn += PMD_SIZE >> PAGE_SHIFT;
1635 cur_pages += PMD_SIZE >> PAGE_SHIFT;
1636 }
1637
1638 /*
1639 * Map trailing 4K pages.
1640 */
1641 if (start < end) {
1642 pmd = pmd_offset(pud, start);
1643 if (pmd_none(*pmd))
1644 if (alloc_pte_page(pmd))
1645 return -1;
1646
1647 populate_pte(cpa, start, end, num_pages - cur_pages,
1648 pmd, pgprot);
1649 }
1650 return num_pages;
1651 }
1652
1653 static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
1654 pgprot_t pgprot)
1655 {
1656 pud_t *pud;
1657 unsigned long end;
1658 long cur_pages = 0;
1659 pgprot_t pud_pgprot;
1660
1661 end = start + (cpa->numpages << PAGE_SHIFT);
1662
1663 /*
1664 * Not on a Gb page boundary? => map everything up to it with
1665 * smaller pages.
1666 */
1667 if (start & (PUD_SIZE - 1)) {
1668 unsigned long pre_end;
1669 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
1670
1671 pre_end = min_t(unsigned long, end, next_page);
1672 cur_pages = (pre_end - start) >> PAGE_SHIFT;
1673 cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
1674
1675 pud = pud_offset(p4d, start);
1676
1677 /*
1678 * Need a PMD page?
1679 */
1680 if (pud_none(*pud))
1681 if (alloc_pmd_page(pud))
1682 return -1;
1683
1684 cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
1685 pud, pgprot);
1686 if (cur_pages < 0)
1687 return cur_pages;
1688
1689 start = pre_end;
1690 }
1691
1692 /* We mapped them all? */
1693 if (cpa->numpages == cur_pages)
1694 return cur_pages;
1695
1696 pud = pud_offset(p4d, start);
1697 pud_pgprot = pgprot_4k_2_large(pgprot);
1698
1699 /*
1700 * Map everything starting from the Gb boundary, possibly with 1G pages
1701 */
1702 while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
1703 set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
1704 canon_pgprot(pud_pgprot))));
1705
1706 start += PUD_SIZE;
1707 cpa->pfn += PUD_SIZE >> PAGE_SHIFT;
1708 cur_pages += PUD_SIZE >> PAGE_SHIFT;
1709 pud++;
1710 }
1711
1712 /* Map trailing leftover */
1713 if (start < end) {
1714 long tmp;
1715
1716 pud = pud_offset(p4d, start);
1717 if (pud_none(*pud))
1718 if (alloc_pmd_page(pud))
1719 return -1;
1720
1721 tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
1722 pud, pgprot);
1723 if (tmp < 0)
1724 return cur_pages;
1725
1726 cur_pages += tmp;
1727 }
1728 return cur_pages;
1729 }
1730
1731 /*
1732 * Restrictions for kernel page table do not necessarily apply when mapping in
1733 * an alternate PGD.
1734 */
1735 static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
1736 {
1737 pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
1738 pud_t *pud = NULL; /* shut up gcc */
1739 p4d_t *p4d;
1740 pgd_t *pgd_entry;
1741 long ret;
1742
1743 pgd_entry = cpa->pgd + pgd_index(addr);
1744
1745 if (pgd_none(*pgd_entry)) {
1746 p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
1747 if (!p4d)
1748 return -1;
1749
1750 set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE));
1751 }
1752
1753 /*
1754 * Allocate a PUD page and hand it down for mapping.
1755 */
1756 p4d = p4d_offset(pgd_entry, addr);
1757 if (p4d_none(*p4d)) {
1758 pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
1759 if (!pud)
1760 return -1;
1761
1762 set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
1763 }
1764
1765 pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
1766 pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);
1767
1768 ret = populate_pud(cpa, addr, p4d, pgprot);
1769 if (ret < 0) {
1770 /*
1771 * Leave the PUD page in place in case some other CPU or thread
1772 * already found it, but remove any useless entries we just
1773 * added to it.
1774 */
1775 unmap_pud_range(p4d, addr,
1776 addr + (cpa->numpages << PAGE_SHIFT));
1777 return ret;
1778 }
1779
1780 cpa->numpages = ret;
1781 return 0;
1782 }
1783
1784 static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
1785 int primary)
1786 {
1787 if (cpa->pgd) {
1788 /*
1789 * Right now, we only execute this code path when mapping
1790 * the EFI virtual memory map regions, no other users
1791 * provide a ->pgd value. This may change in the future.
1792 */
1793 return populate_pgd(cpa, vaddr);
1794 }
1795
1796 /*
1797 * Ignore all non primary paths.
1798 */
1799 if (!primary) {
1800 cpa->numpages = 1;
1801 return 0;
1802 }
1803
1804 /*
1805 * Ignore the NULL PTE for kernel identity mapping, as it is expected
1806 * to have holes.
1807 * Also set numpages to '1' indicating that we processed cpa req for
1808 * one virtual address page and its pfn. TBD: numpages can be set based
1809 * on the initial value and the level returned by lookup_address().
1810 */
1811 if (within(vaddr, PAGE_OFFSET,
1812 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
1813 cpa->numpages = 1;
1814 cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
1815 return 0;
1816
1817 } else if (__cpa_pfn_in_highmap(cpa->pfn)) {
1818 /* Faults in the highmap are OK, so do not warn: */
1819 return -EFAULT;
1820 } else {
1821 WARN(1, KERN_WARNING "CPA: called for zero pte. "
1822 "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
1823 *cpa->vaddr);
1824
1825 return -EFAULT;
1826 }
1827 }
1828
1829 static int __change_page_attr(struct cpa_data *cpa, int primary)
1830 {
1831 unsigned long address;
1832 int do_split, err;
1833 unsigned int level;
1834 pte_t *kpte, old_pte;
1835 bool nx, rw;
1836
1837 address = __cpa_addr(cpa, cpa->curpage);
1838 repeat:
1839 kpte = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
1840 if (!kpte)
1841 return __cpa_process_fault(cpa, address, primary);
1842
1843 old_pte = *kpte;
1844 if (pte_none(old_pte))
1845 return __cpa_process_fault(cpa, address, primary);
1846
1847 if (level == PG_LEVEL_4K) {
1848 pte_t new_pte;
1849 pgprot_t old_prot = pte_pgprot(old_pte);
1850 pgprot_t new_prot = pte_pgprot(old_pte);
1851 unsigned long pfn = pte_pfn(old_pte);
1852
1853 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
1854 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
1855
1856 cpa_inc_4k_install();
1857 /* Hand in lpsize = 0 to enforce the protection mechanism */
1858 new_prot = static_protections(new_prot, address, pfn, 1, 0,
1859 CPA_PROTECT);
1860
1861 new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1,
1862 nx, rw);
1863
1864 new_prot = pgprot_clear_protnone_bits(new_prot);
1865
1866 /*
1867 * We need to keep the pfn from the existing PTE,
1868 * after all we're only going to change its attributes
1869 * not the memory it points to
1870 */
1871 new_pte = pfn_pte(pfn, new_prot);
1872 cpa->pfn = pfn;
1873 /*
1874 * Do we really change anything?
1875 */
1876 if (pte_val(old_pte) != pte_val(new_pte)) {
1877 set_pte_atomic(kpte, new_pte);
1878 cpa->flags |= CPA_FLUSHTLB;
1879 }
1880 cpa->numpages = 1;
1881 return 0;
1882 }
1883
1884 /*
1885 * Check, whether we can keep the large page intact
1886 * and just change the pte:
1887 */
1888 do_split = should_split_large_page(kpte, address, cpa);
1889 /*
1890 * When the range fits into the existing large page,
1891 * return. cpa->numpages and the CPA_FLUSHTLB flag have been updated
1892 * in should_split_large_page():
1893 */
1894 if (do_split <= 0)
1895 return do_split;
1896
1897 /*
1898 * We have to split the large page:
1899 */
1900 err = split_large_page(cpa, kpte, address);
1901 if (!err)
1902 goto repeat;
1903
1904 return err;
1905 }
1906
1907 static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary);
1908
1909 /*
1910 * Check the directmap and "high kernel map" 'aliases'.
1911 */
1912 static int cpa_process_alias(struct cpa_data *cpa)
1913 {
1914 struct cpa_data alias_cpa;
1915 unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
1916 unsigned long vaddr;
1917 int ret;
1918
1919 if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
1920 return 0;
1921
1922 /*
1923 * No need to redo when the primary call already touched the
1924 * direct mapping:
1925 */
1926 vaddr = __cpa_addr(cpa, cpa->curpage);
1927 if (!(within(vaddr, PAGE_OFFSET,
1928 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
1929
1930 alias_cpa = *cpa;
1931 alias_cpa.vaddr = &laddr;
1932 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1933 alias_cpa.curpage = 0;
1934
1935 /* Directmap always has NX set, do not modify. */
1936 if (__supported_pte_mask & _PAGE_NX) {
1937 alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
1938 alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
1939 }
1940
1941 cpa->force_flush_all = 1;
1942
1943 ret = __change_page_attr_set_clr(&alias_cpa, 0);
1944 if (ret)
1945 return ret;
1946 }
1947
1948 #ifdef CONFIG_X86_64
1949 /*
1950 * If the primary call didn't touch the high mapping already
1951 * and the physical address is inside the kernel map, we need
1952 * to touch the high mapped kernel as well:
1953 */
1954 if (!within(vaddr, (unsigned long)_text, _brk_end) &&
1955 __cpa_pfn_in_highmap(cpa->pfn)) {
1956 unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
1957 __START_KERNEL_map - phys_base;
1958 alias_cpa = *cpa;
1959 alias_cpa.vaddr = &temp_cpa_vaddr;
1960 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1961 alias_cpa.curpage = 0;
1962
1963 /*
1964 * [_text, _brk_end) also covers data, do not modify NX except
1965 * in cases where the highmap is the primary target.
1966 */
1967 if (__supported_pte_mask & _PAGE_NX) {
1968 alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
1969 alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
1970 }
1971
1972 cpa->force_flush_all = 1;
1973 /*
1974 * The high mapping range is imprecise, so ignore the
1975 * return value.
1976 */
1977 __change_page_attr_set_clr(&alias_cpa, 0);
1978 }
1979 #endif
1980
1981 return 0;
1982 }
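
/*
 * Illustrative sketch (not part of the kernel sources): the same physical
 * page is usually reachable through more than one kernel virtual address,
 * e.g. a vmalloc mapping and its direct-map alias:
 *
 *    void *vaddr = vmalloc(PAGE_SIZE);
 *    struct page *pg = vmalloc_to_page(vaddr);
 *    unsigned long lin = (unsigned long)page_address(pg);   <- direct map
 *
 * If a primary CPA call changed the caching attributes of 'vaddr' only,
 * 'lin' would keep the old attributes and the CPU could observe conflicting
 * cache types for one physical page. cpa_process_alias() is what fixes up
 * those secondary mappings (and the high kernel mapping on 64-bit).
 */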
1983
1984 static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary)
1985 {
1986 unsigned long numpages = cpa->numpages;
1987 unsigned long rempages = numpages;
1988 int ret = 0;
1989
1990 /*
1991 * No changes, easy!
1992 */
1993 if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) &&
1994 !cpa->force_split)
1995 return ret;
1996
1997 while (rempages) {
1998 /*
1999 * Store the remaining nr of pages for the large page
2000 * preservation check.
2001 */
2002 cpa->numpages = rempages;
2003 /* for array changes, we can't use large page */
2004 if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
2005 cpa->numpages = 1;
2006
2007 if (!debug_pagealloc_enabled())
2008 spin_lock(&cpa_lock);
2009 ret = __change_page_attr(cpa, primary);
2010 if (!debug_pagealloc_enabled())
2011 spin_unlock(&cpa_lock);
2012 if (ret)
2013 goto out;
2014
2015 if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) {
2016 ret = cpa_process_alias(cpa);
2017 if (ret)
2018 goto out;
2019 }
2020
2021 /*
2022 * Adjust the number of pages with the result of the
2023 * CPA operation. Either a large page has been
2024 * preserved or a single page update happened.
2025 */
2026 BUG_ON(cpa->numpages > rempages || !cpa->numpages);
2027 rempages -= cpa->numpages;
2028 cpa->curpage += cpa->numpages;
2029 }
2030
2031 out:
2032 /* Restore the original numpages */
2033 cpa->numpages = numpages;
2034 return ret;
2035 }
2036
2037 static int change_page_attr_set_clr(unsigned long *addr, int numpages,
2038 pgprot_t mask_set, pgprot_t mask_clr,
2039 int force_split, int in_flag,
2040 struct page **pages)
2041 {
2042 struct cpa_data cpa;
2043 int ret, cache;
2044
2045 memset(&cpa, 0, sizeof(cpa));
2046
2047 /*
2048 * Check whether we are requested to set an unsupported
2049 * feature. Clearing unsupported features is OK.
2050 */
2051 mask_set = canon_pgprot(mask_set);
2052
2053 if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
2054 return 0;
2055
2056 /* Ensure we are PAGE_SIZE aligned */
2057 if (in_flag & CPA_ARRAY) {
2058 int i;
2059 for (i = 0; i < numpages; i++) {
2060 if (addr[i] & ~PAGE_MASK) {
2061 addr[i] &= PAGE_MASK;
2062 WARN_ON_ONCE(1);
2063 }
2064 }
2065 } else if (!(in_flag & CPA_PAGES_ARRAY)) {
2066 /*
2067 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
2068 * No need to check in that case
2069 */
2070 if (*addr & ~PAGE_MASK) {
2071 *addr &= PAGE_MASK;
2072 /*
2073 * People should not be passing in unaligned addresses:
2074 */
2075 WARN_ON_ONCE(1);
2076 }
2077 }
2078
2079 /* Must avoid aliasing mappings in the highmem code */
2080 kmap_flush_unused();
2081
2082 vm_unmap_aliases();
2083
2084 cpa.vaddr = addr;
2085 cpa.pages = pages;
2086 cpa.numpages = numpages;
2087 cpa.mask_set = mask_set;
2088 cpa.mask_clr = mask_clr;
2089 cpa.flags = in_flag;
2090 cpa.curpage = 0;
2091 cpa.force_split = force_split;
2092
2093 ret = __change_page_attr_set_clr(&cpa, 1);
2094
2095 /*
2096 * Check whether we really changed something:
2097 */
2098 if (!(cpa.flags & CPA_FLUSHTLB))
2099 goto out;
2100
2101 /*
2102 * No need to flush when we did not set any of the caching
2103 * attributes:
2104 */
2105 cache = !!pgprot2cachemode(mask_set);
2106
2107 /*
2108 * On error, flush everything to be sure.
2109 */
2110 if (ret) {
2111 cpa_flush_all(cache);
2112 goto out;
2113 }
2114
2115 cpa_flush(&cpa, cache);
2116 out:
2117 return ret;
2118 }
2119
2120 static inline int change_page_attr_set(unsigned long *addr, int numpages,
2121 pgprot_t mask, int array)
2122 {
2123 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
2124 (array ? CPA_ARRAY : 0), NULL);
2125 }
2126
2127 static inline int change_page_attr_clear(unsigned long *addr, int numpages,
2128 pgprot_t mask, int array)
2129 {
2130 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
2131 (array ? CPA_ARRAY : 0), NULL);
2132 }
2133
2134 static inline int cpa_set_pages_array(struct page **pages, int numpages,
2135 pgprot_t mask)
2136 {
2137 return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
2138 CPA_PAGES_ARRAY, pages);
2139 }
2140
2141 static inline int cpa_clear_pages_array(struct page **pages, int numpages,
2142 pgprot_t mask)
2143 {
2144 return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
2145 CPA_PAGES_ARRAY, pages);
2146 }
2147
2148 /*
2149 * __set_memory_prot is an internal helper for callers that have been passed
2150 * a pgprot_t value from upper layers and have already taken a memtype
2151 * reservation. If you want to set a specific page protection, use the
2152 * set_memory_xx() functions.
2153 */
2154 int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
2155 {
2156 return change_page_attr_set_clr(&addr, numpages, prot,
2157 __pgprot(~pgprot_val(prot)), 0, 0,
2158 NULL);
2159 }
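
/*
 * Minimal usage sketch (illustrative only; assumes the caller already holds
 * the required memtype reservation as described above):
 *
 *    pgprot_t prot = PAGE_KERNEL_RO;
 *    int err = __set_memory_prot(start, numpages, prot);
 *
 * Every protection bit *not* set in 'prot' ends up in mask_clr, so the
 * resulting PTEs carry exactly 'prot' instead of 'old_prot | prot'.
 */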
2160
2161 int _set_memory_uc(unsigned long addr, int numpages)
2162 {
2163 /*
2164 * For now use UC-; see the comments in ioremap().
2165 * If you really need strong UC, use ioremap_uc(), but note
2166 * that you cannot override IO areas with set_memory_*() as
2167 * these helpers cannot work with IO memory.
2168 */
2169 return change_page_attr_set(&addr, numpages,
2170 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
2171 0);
2172 }
2173
2174 int set_memory_uc(unsigned long addr, int numpages)
2175 {
2176 int ret;
2177
2178 /*
2179 * For now use UC-; see the comments in ioremap().
2180 */
2181 ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
2182 _PAGE_CACHE_MODE_UC_MINUS, NULL);
2183 if (ret)
2184 goto out_err;
2185
2186 ret = _set_memory_uc(addr, numpages);
2187 if (ret)
2188 goto out_free;
2189
2190 return 0;
2191
2192 out_free:
2193 memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
2194 out_err:
2195 return ret;
2196 }
2197 EXPORT_SYMBOL(set_memory_uc);
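
/*
 * Illustrative caller pattern (hypothetical driver code, not from this
 * file); callers include <linux/set_memory.h>:
 *
 *    unsigned long buf = __get_free_pages(GFP_KERNEL, order);
 *
 *    if (set_memory_uc(buf, 1 << order))
 *        goto err;
 *    ...use 'buf' uncached...
 *    set_memory_wb(buf, 1 << order);
 *    free_pages(buf, order);
 *
 * set_memory_wb() below both restores the caching mode and releases the
 * memtype reservation taken here.
 */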
2198
2199 int _set_memory_wc(unsigned long addr, int numpages)
2200 {
2201 int ret;
2202
2203 ret = change_page_attr_set(&addr, numpages,
2204 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
2205 0);
2206 if (!ret) {
2207 ret = change_page_attr_set_clr(&addr, numpages,
2208 cachemode2pgprot(_PAGE_CACHE_MODE_WC),
2209 __pgprot(_PAGE_CACHE_MASK),
2210 0, 0, NULL);
2211 }
2212 return ret;
2213 }
2214
2215 int set_memory_wc(unsigned long addr, int numpages)
2216 {
2217 int ret;
2218
2219 ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
2220 _PAGE_CACHE_MODE_WC, NULL);
2221 if (ret)
2222 return ret;
2223
2224 ret = _set_memory_wc(addr, numpages);
2225 if (ret)
2226 memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
2227
2228 return ret;
2229 }
2230 EXPORT_SYMBOL(set_memory_wc);
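
/*
 * Illustrative sketch: write-combining is typically used for large,
 * write-mostly buffers (hypothetical caller, not from this file):
 *
 *    int ret = set_memory_wc((unsigned long)buf, nr_pages);
 *    if (ret)
 *        return ret;
 *    ...fill the buffer with streaming writes...
 *    set_memory_wb((unsigned long)buf, nr_pages);
 *
 * As _set_memory_wc() above shows, the range is switched to UC- first and
 * only then to WC.
 */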
2231
2232 int _set_memory_wt(unsigned long addr, int numpages)
2233 {
2234 return change_page_attr_set(&addr, numpages,
2235 cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
2236 }
2237
2238 int _set_memory_wb(unsigned long addr, int numpages)
2239 {
2240 /* WB cache mode is hard wired to all cache attribute bits being 0 */
2241 return change_page_attr_clear(&addr, numpages,
2242 __pgprot(_PAGE_CACHE_MASK), 0);
2243 }
2244
2245 int set_memory_wb(unsigned long addr, int numpages)
2246 {
2247 int ret;
2248
2249 ret = _set_memory_wb(addr, numpages);
2250 if (ret)
2251 return ret;
2252
2253 memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
2254 return 0;
2255 }
2256 EXPORT_SYMBOL(set_memory_wb);
2257
2258 /* Prevent speculative access to a page by marking it not-present */
2259 #ifdef CONFIG_X86_64
2260 int set_mce_nospec(unsigned long pfn)
2261 {
2262 unsigned long decoy_addr;
2263 int rc;
2264
2265 /* SGX pages are not in the 1:1 map */
2266 if (arch_is_platform_page(pfn << PAGE_SHIFT))
2267 return 0;
2268 /*
2269 * We would like to just call:
2270 * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
2271 * but doing that would radically increase the odds of a
2272 * speculative access to the poison page because we'd have
2273 * the virtual address of the kernel 1:1 mapping sitting
2274 * around in registers.
2275 * Instead we get tricky. We create a non-canonical address
2276 * that looks just like the one we want, but has bit 63 flipped.
2277 * This relies on set_memory_XX() properly sanitizing any __pa()
2278 * results with __PHYSICAL_MASK or PTE_PFN_MASK.
2279 */
2280 decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
2281
2282 rc = set_memory_np(decoy_addr, 1);
2283 if (rc)
2284 pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
2285 return rc;
2286 }
2287 EXPORT_SYMBOL_GPL(set_mce_nospec);
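
/*
 * Worked example of the decoy address, assuming the default 4-level,
 * non-KASLR direct-map base PAGE_OFFSET == 0xffff888000000000:
 *
 *    pfn << PAGE_SHIFT     = 0x0000000012345000   (pfn = 0x12345)
 *    PAGE_OFFSET ^ BIT(63) = 0x7fff888000000000
 *    decoy_addr            = 0x7fff888012345000   (non-canonical)
 *
 * Masking the __pa() result with __PHYSICAL_MASK/PTE_PFN_MASK recovers the
 * correct pfn, so the right PTE is cleared without a canonical pointer to
 * the poisoned page ever being materialized in a register.
 */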
2288
2289 /* Restore full speculative operation to the pfn. */
2290 int clear_mce_nospec(unsigned long pfn)
2291 {
2292 unsigned long addr = (unsigned long) pfn_to_kaddr(pfn);
2293
2294 return set_memory_p(addr, 1);
2295 }
2296 EXPORT_SYMBOL_GPL(clear_mce_nospec);
2297 #endif /* CONFIG_X86_64 */
2298
2299 int set_memory_x(unsigned long addr, int numpages)
2300 {
2301 if (!(__supported_pte_mask & _PAGE_NX))
2302 return 0;
2303
2304 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
2305 }
2306
2307 int set_memory_nx(unsigned long addr, int numpages)
2308 {
2309 if (!(__supported_pte_mask & _PAGE_NX))
2310 return 0;
2311
2312 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
2313 }
2314
2315 int set_memory_ro(unsigned long addr, int numpages)
2316 {
2317 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW | _PAGE_DIRTY), 0);
2318 }
2319
2320 int set_memory_rox(unsigned long addr, int numpages)
2321 {
2322 pgprot_t clr = __pgprot(_PAGE_RW | _PAGE_DIRTY);
2323
2324 if (__supported_pte_mask & _PAGE_NX)
2325 clr.pgprot |= _PAGE_NX;
2326
2327 return change_page_attr_set_clr(&addr, numpages, __pgprot(0), clr, 0,
2328 CPA_COLLAPSE, NULL);
2329 }
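
/*
 * Illustrative sketch (hypothetical caller): executable text such as a
 * trampoline is usually written while the pages are still RW and then
 * sealed, where 'buf' came from an executable-capable allocator:
 *
 *    memcpy(buf, insns, size);
 *    set_memory_rox((unsigned long)buf, DIV_ROUND_UP(size, PAGE_SIZE));
 *
 * The CPA_COLLAPSE flag additionally allows CPA to merge the now uniform
 * 4k mappings back into a large page where possible.
 */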
2330
2331 int set_memory_rw(unsigned long addr, int numpages)
2332 {
2333 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
2334 }
2335
2336 int set_memory_np(unsigned long addr, int numpages)
2337 {
2338 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
2339 }
2340
2341 int set_memory_np_noalias(unsigned long addr, int numpages)
2342 {
2343 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
2344 __pgprot(_PAGE_PRESENT), 0,
2345 CPA_NO_CHECK_ALIAS, NULL);
2346 }
2347
2348 int set_memory_p(unsigned long addr, int numpages)
2349 {
2350 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
2351 }
2352
2353 int set_memory_4k(unsigned long addr, int numpages)
2354 {
2355 return change_page_attr_set_clr(&addr, numpages,
2356 __pgprot(_PAGE_KERNEL_4K),
2357 __pgprot(0), 1, 0, NULL);
2358 }
2359
2360 int set_memory_nonglobal(unsigned long addr, int numpages)
2361 {
2362 return change_page_attr_clear(&addr, numpages,
2363 __pgprot(_PAGE_GLOBAL), 0);
2364 }
2365
2366 int set_memory_global(unsigned long addr, int numpages)
2367 {
2368 return change_page_attr_set(&addr, numpages,
2369 __pgprot(_PAGE_GLOBAL), 0);
2370 }
2371
2372 /*
2373 * __set_memory_enc_pgtable() is used for the hypervisors that get
2374 * informed about "encryption" status via page tables.
2375 */
2376 static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
2377 {
2378 pgprot_t empty = __pgprot(0);
2379 struct cpa_data cpa;
2380 int ret;
2381
2382 /* Should not be working on unaligned addresses */
2383 if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
2384 addr &= PAGE_MASK;
2385
2386 memset(&cpa, 0, sizeof(cpa));
2387 cpa.vaddr = &addr;
2388 cpa.numpages = numpages;
2389 cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty);
2390 cpa.mask_clr = enc ? pgprot_decrypted(empty) : pgprot_encrypted(empty);
2391 cpa.pgd = init_mm.pgd;
2392
2393 /* Must avoid aliasing mappings in the highmem code */
2394 kmap_flush_unused();
2395 vm_unmap_aliases();
2396
2397 /* Flush the caches as needed before changing the encryption attribute. */
2398 if (x86_platform.guest.enc_tlb_flush_required(enc))
2399 cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required());
2400
2401 /* Notify hypervisor that we are about to set/clr encryption attribute. */
2402 ret = x86_platform.guest.enc_status_change_prepare(addr, numpages, enc);
2403 if (ret)
2404 goto vmm_fail;
2405
2406 ret = __change_page_attr_set_clr(&cpa, 1);
2407
2408 /*
2409 * After changing the encryption attribute, we need to flush TLBs again
2410 * in case any speculative TLB caching occurred (but no need to flush
2411 * caches again). We could just use cpa_flush_all(), but in case TLB
2412 * flushing gets optimized in the cpa_flush() path use the same logic
2413 * as above.
2414 */
2415 cpa_flush(&cpa, 0);
2416
2417 if (ret)
2418 return ret;
2419
2420 /* Notify hypervisor that we have successfully set/clr encryption attribute. */
2421 ret = x86_platform.guest.enc_status_change_finish(addr, numpages, enc);
2422 if (ret)
2423 goto vmm_fail;
2424
2425 return 0;
2426
2427 vmm_fail:
2428 WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s: %d\n",
2429 (void *)addr, numpages, enc ? "private" : "shared", ret);
2430
2431 return ret;
2432 }
2433
2434 /*
2435 * The lock serializes conversions between private and shared memory.
2436 *
2437 * It is taken for read on conversion. A write lock guarantees that no
2438 * concurrent conversions are in progress.
2439 */
2440 static DECLARE_RWSEM(mem_enc_lock);
2441
2442 /*
2443 * Stop new private<->shared conversions.
2444 *
2445 * Taking the exclusive mem_enc_lock waits for in-flight conversions to complete.
2446 * The lock is not released to prevent new conversions from being started.
2447 */
2448 bool set_memory_enc_stop_conversion(void)
2449 {
2450 /*
2451 * In a crash scenario, sleep is not allowed. Try to take the lock.
2452 * Failure indicates that there is a race with the conversion.
2453 */
2454 if (oops_in_progress)
2455 return down_write_trylock(&mem_enc_lock);
2456
2457 down_write(&mem_enc_lock);
2458
2459 return true;
2460 }
2461
2462 static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
2463 {
2464 int ret = 0;
2465
2466 if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
2467 if (!down_read_trylock(&mem_enc_lock))
2468 return -EBUSY;
2469
2470 ret = __set_memory_enc_pgtable(addr, numpages, enc);
2471
2472 up_read(&mem_enc_lock);
2473 }
2474
2475 return ret;
2476 }
2477
2478 int set_memory_encrypted(unsigned long addr, int numpages)
2479 {
2480 return __set_memory_enc_dec(addr, numpages, true);
2481 }
2482 EXPORT_SYMBOL_GPL(set_memory_encrypted);
2483
2484 int set_memory_decrypted(unsigned long addr, int numpages)
2485 {
2486 return __set_memory_enc_dec(addr, numpages, false);
2487 }
2488 EXPORT_SYMBOL_GPL(set_memory_decrypted);
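
/*
 * Illustrative sketch for confidential-computing guests (hypothetical
 * caller, not from this file): memory that the host must be able to access
 * is converted to shared before use and back to private afterwards:
 *
 *    unsigned long va = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
 *
 *    if (set_memory_decrypted(va, 1 << order))
 *        goto err;
 *    ...share the buffer with the hypervisor...
 *    if (set_memory_encrypted(va, 1 << order))
 *        return;                         <- leak rather than reuse the pages
 *    free_pages(va, order);
 *
 * Pages whose conversion back to private fails are conventionally leaked,
 * since their state is no longer trustworthy.
 */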
2489
2490 int set_pages_uc(struct page *page, int numpages)
2491 {
2492 unsigned long addr = (unsigned long)page_address(page);
2493
2494 return set_memory_uc(addr, numpages);
2495 }
2496 EXPORT_SYMBOL(set_pages_uc);
2497
2498 static int _set_pages_array(struct page **pages, int numpages,
2499 enum page_cache_mode new_type)
2500 {
2501 unsigned long start;
2502 unsigned long end;
2503 enum page_cache_mode set_type;
2504 int i;
2505 int free_idx;
2506 int ret;
2507
2508 for (i = 0; i < numpages; i++) {
2509 if (PageHighMem(pages[i]))
2510 continue;
2511 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
2512 end = start + PAGE_SIZE;
2513 if (memtype_reserve(start, end, new_type, NULL))
2514 goto err_out;
2515 }
2516
2517 /* If WC, set to UC- first and then WC */
2518 set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
2519 _PAGE_CACHE_MODE_UC_MINUS : new_type;
2520
2521 ret = cpa_set_pages_array(pages, numpages,
2522 cachemode2pgprot(set_type));
2523 if (!ret && new_type == _PAGE_CACHE_MODE_WC)
2524 ret = change_page_attr_set_clr(NULL, numpages,
2525 cachemode2pgprot(
2526 _PAGE_CACHE_MODE_WC),
2527 __pgprot(_PAGE_CACHE_MASK),
2528 0, CPA_PAGES_ARRAY, pages);
2529 if (ret)
2530 goto err_out;
2531 return 0; /* Success */
2532 err_out:
2533 free_idx = i;
2534 for (i = 0; i < free_idx; i++) {
2535 if (PageHighMem(pages[i]))
2536 continue;
2537 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
2538 end = start + PAGE_SIZE;
2539 memtype_free(start, end);
2540 }
2541 return -EINVAL;
2542 }
2543
2544 int set_pages_array_uc(struct page **pages, int numpages)
2545 {
2546 return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS);
2547 }
2548 EXPORT_SYMBOL(set_pages_array_uc);
2549
2550 int set_pages_array_wc(struct page **pages, int numpages)
2551 {
2552 return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC);
2553 }
2554 EXPORT_SYMBOL(set_pages_array_wc);
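
/*
 * Illustrative sketch (hypothetical GPU-style caller): objects built from
 * many individual pages are converted in one call so that the cache/TLB
 * flushing is batched instead of being done per page:
 *
 *    int ret = set_pages_array_wc(pages, npages);
 *    if (ret)
 *        return ret;
 *    ...
 *    set_pages_array_wb(pages, npages);
 *
 * On a memtype_reserve() failure, _set_pages_array() above rolls back the
 * reservations it already took and returns -EINVAL.
 */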
2555
2556 int set_pages_wb(struct page *page, int numpages)
2557 {
2558 unsigned long addr = (unsigned long)page_address(page);
2559
2560 return set_memory_wb(addr, numpages);
2561 }
2562 EXPORT_SYMBOL(set_pages_wb);
2563
2564 int set_pages_array_wb(struct page **pages, int numpages)
2565 {
2566 int retval;
2567 unsigned long start;
2568 unsigned long end;
2569 int i;
2570
2571 /* WB cache mode is hard wired to all cache attribute bits being 0 */
2572 retval = cpa_clear_pages_array(pages, numpages,
2573 __pgprot(_PAGE_CACHE_MASK));
2574 if (retval)
2575 return retval;
2576
2577 for (i = 0; i < numpages; i++) {
2578 if (PageHighMem(pages[i]))
2579 continue;
2580 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
2581 end = start + PAGE_SIZE;
2582 memtype_free(start, end);
2583 }
2584
2585 return 0;
2586 }
2587 EXPORT_SYMBOL(set_pages_array_wb);
2588
2589 int set_pages_ro(struct page *page, int numpages)
2590 {
2591 unsigned long addr = (unsigned long)page_address(page);
2592
2593 return set_memory_ro(addr, numpages);
2594 }
2595
2596 int set_pages_rw(struct page *page, int numpages)
2597 {
2598 unsigned long addr = (unsigned long)page_address(page);
2599
2600 return set_memory_rw(addr, numpages);
2601 }
2602
2603 static int __set_pages_p(struct page *page, int numpages)
2604 {
2605 unsigned long tempaddr = (unsigned long) page_address(page);
2606 struct cpa_data cpa = { .vaddr = &tempaddr,
2607 .pgd = NULL,
2608 .numpages = numpages,
2609 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
2610 .mask_clr = __pgprot(0),
2611 .flags = CPA_NO_CHECK_ALIAS };
2612
2613 /*
2614 * No alias checking needed for setting the present flag. Otherwise,
2615 * we may need to break large pages for 64-bit kernel text
2616 * mappings (this adds complexity, especially if we want to do this
2617 * from atomic context). Let's keep it simple!
2618 */
2619 return __change_page_attr_set_clr(&cpa, 1);
2620 }
2621
2622 static int __set_pages_np(struct page *page, int numpages)
2623 {
2624 unsigned long tempaddr = (unsigned long) page_address(page);
2625 struct cpa_data cpa = { .vaddr = &tempaddr,
2626 .pgd = NULL,
2627 .numpages = numpages,
2628 .mask_set = __pgprot(0),
2629 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY),
2630 .flags = CPA_NO_CHECK_ALIAS };
2631
2632 /*
2633 * No alias checking needed for clearing the present flag. Otherwise,
2634 * we may need to break large pages for 64-bit kernel text
2635 * mappings (this adds complexity, especially if we want to do this
2636 * from atomic context). Let's keep it simple!
2637 */
2638 return __change_page_attr_set_clr(&cpa, 1);
2639 }
2640
2641 int set_direct_map_invalid_noflush(struct page *page)
2642 {
2643 return __set_pages_np(page, 1);
2644 }
2645
2646 int set_direct_map_default_noflush(struct page *page)
2647 {
2648 return __set_pages_p(page, 1);
2649 }
2650
2651 int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
2652 {
2653 if (valid)
2654 return __set_pages_p(page, nr);
2655
2656 return __set_pages_np(page, nr);
2657 }
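
/*
 * Illustrative sketch (hypothetical caller): the *_noflush() helpers leave
 * stale TLB entries behind, so callers pair them with their own flush:
 *
 *    unsigned long addr = (unsigned long)page_address(page);
 *
 *    if (set_direct_map_invalid_noflush(page))
 *        return -EBUSY;
 *    flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 *    ...
 *    set_direct_map_default_noflush(page);
 */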
2658
2659 #ifdef CONFIG_DEBUG_PAGEALLOC
2660 void __kernel_map_pages(struct page *page, int numpages, int enable)
2661 {
2662 if (PageHighMem(page))
2663 return;
2664 if (!enable) {
2665 debug_check_no_locks_freed(page_address(page),
2666 numpages * PAGE_SIZE);
2667 }
2668
2669 /*
2670 * The return value is ignored as the calls cannot fail.
2671 * Large pages for identity mappings are not used at boot time,
2672 * so no memory allocations are needed to split large pages here.
2673 */
2674 if (enable)
2675 __set_pages_p(page, numpages);
2676 else
2677 __set_pages_np(page, numpages);
2678
2679 /*
2680 * We should perform an IPI and flush all TLBs, but that can
2681 * deadlock, so flush only the current CPU's TLB.
2682 * Preemption needs to be disabled around __flush_tlb_all() due to
2683 * the CR3 reload in __native_flush_tlb().
2684 */
2685 preempt_disable();
2686 __flush_tlb_all();
2687 preempt_enable();
2688
2689 arch_flush_lazy_mmu_mode();
2690 }
2691 #endif /* CONFIG_DEBUG_PAGEALLOC */
2692
2693 bool kernel_page_present(struct page *page)
2694 {
2695 unsigned int level;
2696 pte_t *pte;
2697
2698 if (PageHighMem(page))
2699 return false;
2700
2701 pte = lookup_address((unsigned long)page_address(page), &level);
2702 return (pte_val(*pte) & _PAGE_PRESENT);
2703 }
2704
2705 int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
2706 unsigned numpages, unsigned long page_flags)
2707 {
2708 int retval = -EINVAL;
2709
2710 struct cpa_data cpa = {
2711 .vaddr = &address,
2712 .pfn = pfn,
2713 .pgd = pgd,
2714 .numpages = numpages,
2715 .mask_set = __pgprot(0),
2716 .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW|_PAGE_DIRTY)),
2717 .flags = CPA_NO_CHECK_ALIAS,
2718 };
2719
2720 WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");
2721
2722 if (!(__supported_pte_mask & _PAGE_NX))
2723 goto out;
2724
2725 if (!(page_flags & _PAGE_ENC))
2726 cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);
2727
2728 cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
2729
2730 retval = __change_page_attr_set_clr(&cpa, 1);
2731 __flush_tlb_all();
2732
2733 out:
2734 return retval;
2735 }
2736
2737 /*
2738 * __flush_tlb_all() flushes mappings only on the current CPU and hence this
2739 * function shouldn't be used in an SMP environment. Presently, it's used only
2740 * during boot (way before smp_init()) by the EFI subsystem and hence is OK.
2741 */
2742 int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
2743 unsigned long numpages)
2744 {
2745 int retval;
2746
2747 /*
2748 * The typical sequence for unmapping is to find a pte through
2749 * lookup_address_in_pgd() (ideally, it should never return NULL because
2750 * the address is already mapped) and change its protections. As pfn is
2751 * the *target* of a mapping, it's not useful while unmapping.
2752 */
2753 struct cpa_data cpa = {
2754 .vaddr = &address,
2755 .pfn = 0,
2756 .pgd = pgd,
2757 .numpages = numpages,
2758 .mask_set = __pgprot(0),
2759 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY),
2760 .flags = CPA_NO_CHECK_ALIAS,
2761 };
2762
2763 WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");
2764
2765 retval = __change_page_attr_set_clr(&cpa, 1);
2766 __flush_tlb_all();
2767
2768 return retval;
2769 }
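
/*
 * Illustrative boot-time caller pattern (the EFI runtime code is the only
 * expected user; the names below are illustrative, not actual EFI code):
 *
 *    kernel_map_pages_in_pgd(efi_pgd, phys >> PAGE_SHIFT, virt,
 *                            npages, _PAGE_RW);
 *    ...
 *    kernel_unmap_pages_in_pgd(efi_pgd, virt, npages);
 *
 * Both helpers end with __flush_tlb_all(), hence the "single CPU, before
 * smp_init()" restriction spelled out above.
 */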
2770
2771 /*
2772 * The testcases use internal knowledge of the implementation that shouldn't
2773 * be exposed to the rest of the kernel. Include these directly here.
2774 */
2775 #ifdef CONFIG_CPA_DEBUG
2776 #include "cpa-test.c"
2777 #endif
2778