1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Page table handling routines for radix page table.
4 *
5 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
6 */
7
8 #define pr_fmt(fmt) "radix-mmu: " fmt
9
10 #include <linux/io.h>
11 #include <linux/kernel.h>
12 #include <linux/sched/mm.h>
13 #include <linux/memblock.h>
14 #include <linux/of.h>
15 #include <linux/of_fdt.h>
16 #include <linux/mm.h>
17 #include <linux/hugetlb.h>
18 #include <linux/string_helpers.h>
19 #include <linux/memory.h>
20 #include <linux/kfence.h>
21
22 #include <asm/pgalloc.h>
23 #include <asm/mmu_context.h>
24 #include <asm/dma.h>
25 #include <asm/machdep.h>
26 #include <asm/mmu.h>
27 #include <asm/firmware.h>
28 #include <asm/powernv.h>
29 #include <asm/sections.h>
30 #include <asm/smp.h>
31 #include <asm/trace.h>
32 #include <asm/uaccess.h>
33 #include <asm/ultravisor.h>
34 #include <asm/set_memory.h>
35 #include <asm/kfence.h>
36
37 #include <trace/events/thp.h>
38
39 #include <mm/mmu_decl.h>
40
41 unsigned int mmu_base_pid;
42
43 static __ref void *early_alloc_pgtable(unsigned long size, int nid,
44 unsigned long region_start, unsigned long region_end)
45 {
46 phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
47 phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
48 void *ptr;
49
50 if (region_start)
51 min_addr = region_start;
52 if (region_end)
53 max_addr = region_end;
54
55 ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
56
57 if (!ptr)
58 panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
59 __func__, size, size, nid, &min_addr, &max_addr);
60
61 return ptr;
62 }
63
64 /*
65 * When allocating pud or pmd pointers, we allocate a complete page
66 * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
67 * is to ensure that the page obtained from the memblock allocator
68 * can be completely used as page table page and can be freed
69 * correctly when the page table entries are removed.
70 */
71 static int early_map_kernel_page(unsigned long ea, unsigned long pa,
72 pgprot_t flags,
73 unsigned int map_page_size,
74 int nid,
75 unsigned long region_start, unsigned long region_end)
76 {
77 unsigned long pfn = pa >> PAGE_SHIFT;
78 pgd_t *pgdp;
79 p4d_t *p4dp;
80 pud_t *pudp;
81 pmd_t *pmdp;
82 pte_t *ptep;
83
84 pgdp = pgd_offset_k(ea);
85 p4dp = p4d_offset(pgdp, ea);
86 if (p4d_none(*p4dp)) {
87 pudp = early_alloc_pgtable(PAGE_SIZE, nid,
88 region_start, region_end);
89 p4d_populate(&init_mm, p4dp, pudp);
90 }
91 pudp = pud_offset(p4dp, ea);
92 if (map_page_size == PUD_SIZE) {
93 ptep = (pte_t *)pudp;
94 goto set_the_pte;
95 }
96 if (pud_none(*pudp)) {
97 pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
98 region_end);
99 pud_populate(&init_mm, pudp, pmdp);
100 }
101 pmdp = pmd_offset(pudp, ea);
102 if (map_page_size == PMD_SIZE) {
103 ptep = pmdp_ptep(pmdp);
104 goto set_the_pte;
105 }
106 if (!pmd_present(*pmdp)) {
107 ptep = early_alloc_pgtable(PAGE_SIZE, nid,
108 region_start, region_end);
109 pmd_populate_kernel(&init_mm, pmdp, ptep);
110 }
111 ptep = pte_offset_kernel(pmdp, ea);
112
113 set_the_pte:
114 set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
115 asm volatile("ptesync": : :"memory");
116 return 0;
117 }
118
119 /*
120 * nid, region_start, and region_end are hints to try to place the page
121 * table memory in the same node or region.
122 */
123 static int __map_kernel_page(unsigned long ea, unsigned long pa,
124 pgprot_t flags,
125 unsigned int map_page_size,
126 int nid,
127 unsigned long region_start, unsigned long region_end)
128 {
129 unsigned long pfn = pa >> PAGE_SHIFT;
130 pgd_t *pgdp;
131 p4d_t *p4dp;
132 pud_t *pudp;
133 pmd_t *pmdp;
134 pte_t *ptep;
135 /*
136 * Make sure task size is correct as per the max addr
137 */
138 BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
139
140 #ifdef CONFIG_PPC_64K_PAGES
141 BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
142 #endif
143
144 if (unlikely(!slab_is_available()))
145 return early_map_kernel_page(ea, pa, flags, map_page_size,
146 nid, region_start, region_end);
147
148 /*
149 * Should make page table allocation functions be able to take a
150 * node, so we can place kernel page tables on the right nodes after
151 * boot.
152 */
153 pgdp = pgd_offset_k(ea);
154 p4dp = p4d_offset(pgdp, ea);
155 pudp = pud_alloc(&init_mm, p4dp, ea);
156 if (!pudp)
157 return -ENOMEM;
158 if (map_page_size == PUD_SIZE) {
159 ptep = (pte_t *)pudp;
160 goto set_the_pte;
161 }
162 pmdp = pmd_alloc(&init_mm, pudp, ea);
163 if (!pmdp)
164 return -ENOMEM;
165 if (map_page_size == PMD_SIZE) {
166 ptep = pmdp_ptep(pmdp);
167 goto set_the_pte;
168 }
169 ptep = pte_alloc_kernel(pmdp, ea);
170 if (!ptep)
171 return -ENOMEM;
172
173 set_the_pte:
174 set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
175 asm volatile("ptesync": : :"memory");
176 return 0;
177 }
178
179 int radix__map_kernel_page(unsigned long ea, unsigned long pa,
180 pgprot_t flags,
181 unsigned int map_page_size)
182 {
183 return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
184 }
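/*
 * Usage sketch (illustrative only; ea and pa are placeholders and must
 * be aligned to the chosen mapping size):
 *
 *	radix__map_kernel_page(ea, pa, PAGE_KERNEL, PAGE_SIZE);
 *	radix__map_kernel_page(ea, pa, PAGE_KERNEL, PMD_SIZE);
 *
 * Before slab is available this takes the early_map_kernel_page() path
 * with memblock-backed page tables; afterwards the regular pud/pmd/pte
 * allocators are used.
 */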
185
186 #ifdef CONFIG_STRICT_KERNEL_RWX
187 static void radix__change_memory_range(unsigned long start, unsigned long end,
188 unsigned long clear)
189 {
190 unsigned long idx;
191 pgd_t *pgdp;
192 p4d_t *p4dp;
193 pud_t *pudp;
194 pmd_t *pmdp;
195 pte_t *ptep;
196
197 start = ALIGN_DOWN(start, PAGE_SIZE);
198 end = PAGE_ALIGN(end); // aligns up
199
200 pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
201 start, end, clear);
202
203 for (idx = start; idx < end; idx += PAGE_SIZE) {
204 pgdp = pgd_offset_k(idx);
205 p4dp = p4d_offset(pgdp, idx);
206 pudp = pud_alloc(&init_mm, p4dp, idx);
207 if (!pudp)
208 continue;
209 if (pud_leaf(*pudp)) {
210 ptep = (pte_t *)pudp;
211 goto update_the_pte;
212 }
213 pmdp = pmd_alloc(&init_mm, pudp, idx);
214 if (!pmdp)
215 continue;
216 if (pmd_leaf(*pmdp)) {
217 ptep = pmdp_ptep(pmdp);
218 goto update_the_pte;
219 }
220 ptep = pte_alloc_kernel(pmdp, idx);
221 if (!ptep)
222 continue;
223 update_the_pte:
224 radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
225 }
226
227 radix__flush_tlb_kernel_range(start, end);
228 }
229
230 void radix__mark_rodata_ro(void)
231 {
232 unsigned long start, end;
233
234 start = (unsigned long)_stext;
235 end = (unsigned long)__end_rodata;
236
237 radix__change_memory_range(start, end, _PAGE_WRITE);
238
239 for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) {
240 end = start + PAGE_SIZE;
241 if (overlaps_interrupt_vector_text(start, end))
242 radix__change_memory_range(start, end, _PAGE_WRITE);
243 else
244 break;
245 }
246 }
247
248 void radix__mark_initmem_nx(void)
249 {
250 unsigned long start = (unsigned long)__init_begin;
251 unsigned long end = (unsigned long)__init_end;
252
253 radix__change_memory_range(start, end, _PAGE_EXEC);
254 }
255 #endif /* CONFIG_STRICT_KERNEL_RWX */
256
257 static inline void __meminit
258 print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
259 {
260 char buf[10];
261
262 if (end <= start)
263 return;
264
265 string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
266
267 pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
268 exec ? " (exec)" : "");
269 }
270
271 static unsigned long next_boundary(unsigned long addr, unsigned long end)
272 {
273 #ifdef CONFIG_STRICT_KERNEL_RWX
274 unsigned long stext_phys;
275
276 stext_phys = __pa_symbol(_stext);
277
278 // Relocatable kernel running at non-zero real address
279 if (stext_phys != 0) {
280 // The end of interrupts code at zero is a rodata boundary
281 unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys;
282 if (addr < end_intr)
283 return end_intr;
284
285 // Start of relocated kernel text is a rodata boundary
286 if (addr < stext_phys)
287 return stext_phys;
288 }
289
290 if (addr < __pa_symbol(__srwx_boundary))
291 return __pa_symbol(__srwx_boundary);
292 #endif
293 return end;
294 }
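/*
 * Descriptive note: with STRICT_KERNEL_RWX the linear map must not use
 * a large page that straddles a permission boundary. For a relocated
 * kernel the end of the interrupt code copied to physical zero and the
 * start of the relocated kernel text are such boundaries; in all cases
 * __srwx_boundary (roughly the end of the text/rodata region that
 * radix__mark_rodata_ro() later makes read-only) is one as well.
 * Without STRICT_KERNEL_RWX the only boundary is the end of the range.
 */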
295
296 static int __meminit create_physical_mapping(unsigned long start,
297 unsigned long end,
298 int nid, pgprot_t _prot,
299 unsigned long mapping_sz_limit)
300 {
301 unsigned long vaddr, addr, mapping_size = 0;
302 bool prev_exec, exec = false;
303 pgprot_t prot;
304 int psize;
305 unsigned long max_mapping_size = memory_block_size;
306
307 if (mapping_sz_limit < max_mapping_size)
308 max_mapping_size = mapping_sz_limit;
309
310 if (debug_pagealloc_enabled())
311 max_mapping_size = PAGE_SIZE;
312
313 start = ALIGN(start, PAGE_SIZE);
314 end = ALIGN_DOWN(end, PAGE_SIZE);
315 for (addr = start; addr < end; addr += mapping_size) {
316 unsigned long gap, previous_size;
317 int rc;
318
319 gap = next_boundary(addr, end) - addr;
320 if (gap > max_mapping_size)
321 gap = max_mapping_size;
322 previous_size = mapping_size;
323 prev_exec = exec;
324
325 if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
326 mmu_psize_defs[MMU_PAGE_1G].shift) {
327 mapping_size = PUD_SIZE;
328 psize = MMU_PAGE_1G;
329 } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
330 mmu_psize_defs[MMU_PAGE_2M].shift) {
331 mapping_size = PMD_SIZE;
332 psize = MMU_PAGE_2M;
333 } else {
334 mapping_size = PAGE_SIZE;
335 psize = mmu_virtual_psize;
336 }
337
338 vaddr = (unsigned long)__va(addr);
339
340 if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
341 overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
342 prot = PAGE_KERNEL_X;
343 exec = true;
344 } else {
345 prot = _prot;
346 exec = false;
347 }
348
349 if (mapping_size != previous_size || exec != prev_exec) {
350 print_mapping(start, addr, previous_size, prev_exec);
351 start = addr;
352 }
353
354 rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
355 if (rc)
356 return rc;
357
358 update_page_count(psize, 1);
359 }
360
361 print_mapping(start, addr, mapping_size, exec);
362 return 0;
363 }
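/*
 * Mapping size selection above, summarized: a 1G aligned address with
 * at least 1G up to the next boundary uses PUD_SIZE when MMU_PAGE_1G is
 * available, a 2M aligned address with a 2M gap uses PMD_SIZE, and
 * everything else falls back to base pages. debug_pagealloc or a small
 * mapping_sz_limit (e.g. the PAGE_SIZE limit used for the KFENCE pool
 * below) forces base pages for the whole range.
 */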
364
365 #ifdef CONFIG_KFENCE
366 static inline phys_addr_t alloc_kfence_pool(void)
367 {
368 phys_addr_t kfence_pool;
369
370 /*
371 * TODO: Support for enabling KFENCE after bootup depends on the ability to
372 * split page table mappings. As such support is not currently
373 * implemented for radix pagetables, support enabling KFENCE
374 * only at system startup for now.
375 *
376 * After support for splitting mappings is available on radix,
377 * alloc_kfence_pool() & map_kfence_pool() can be dropped and
378 * mapping for __kfence_pool memory can be
379 * split during arch_kfence_init_pool().
380 */
381 if (!kfence_early_init)
382 goto no_kfence;
383
384 kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
385 if (!kfence_pool)
386 goto no_kfence;
387
388 memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);
389 return kfence_pool;
390
391 no_kfence:
392 disable_kfence();
393 return 0;
394 }
395
396 static inline void map_kfence_pool(phys_addr_t kfence_pool)
397 {
398 if (!kfence_pool)
399 return;
400
401 if (create_physical_mapping(kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
402 -1, PAGE_KERNEL, PAGE_SIZE))
403 goto err;
404
405 memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
406 __kfence_pool = __va(kfence_pool);
407 return;
408
409 err:
410 memblock_phys_free(kfence_pool, KFENCE_POOL_SIZE);
411 disable_kfence();
412 }
413 #else
414 static inline phys_addr_t alloc_kfence_pool(void) { return 0; }
415 static inline void map_kfence_pool(phys_addr_t kfence_pool) { }
416 #endif
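/*
 * Note on the early KFENCE flow above: the pool is reserved from
 * memblock and marked NOMAP before the linear map is built, then mapped
 * with a PAGE_SIZE mapping_sz_limit so every KFENCE page gets its own
 * base-page PTE that can later be protected individually, which avoids
 * the (not yet implemented) splitting of large radix mappings mentioned
 * in the TODO above.
 */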
417
418 static void __init radix_init_pgtable(void)
419 {
420 phys_addr_t kfence_pool;
421 unsigned long rts_field;
422 phys_addr_t start, end;
423 u64 i;
424
425 /* We don't support slb for radix */
426 slb_set_size(0);
427
428 kfence_pool = alloc_kfence_pool();
429
430 /*
431 * Create the linear mapping
432 */
433 for_each_mem_range(i, &start, &end) {
434 /*
435 * The memblock allocator is up at this point, so the
436 * page tables will be allocated within the range. No
437 * need for a node (which we don't have yet).
438 */
439
440 if (end >= RADIX_VMALLOC_START) {
441 pr_warn("Outside the supported range\n");
442 continue;
443 }
444
445 WARN_ON(create_physical_mapping(start, end,
446 -1, PAGE_KERNEL, ~0UL));
447 }
448
449 map_kfence_pool(kfence_pool);
450
451 if (!cpu_has_feature(CPU_FTR_HVMODE) &&
452 cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
453 /*
454 * Older versions of KVM on these machines prefer if the
455 * guest only uses the low 19 PID bits.
456 */
457 mmu_pid_bits = 19;
458 }
459 mmu_base_pid = 1;
460
461 /*
462 * Allocate Partition table and process table for the
463 * host.
464 */
465 BUG_ON(PRTB_SIZE_SHIFT > 36);
466 process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
467 /*
468 * Fill in the process table.
469 */
470 rts_field = radix__get_tree_size();
471 process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
472
473 /*
474 * The init_mm context is given the first available (non-zero) PID,
475 * which is the "guard PID" and contains no page table. PIDR should
476 * never be set to zero because that duplicates the kernel address
477 * space at the 0x0... offset (quadrant 0)!
478 *
479 * An arbitrary PID that may later be allocated by the PID allocator
480 * for userspace processes must not be used either, because that
481 * would cause stale user mappings for that PID on CPUs outside of
482 * the TLB invalidation scheme (because it won't be in mm_cpumask).
483 *
484 * So permanently carve out one PID for the purpose of a guard PID.
485 */
486 init_mm.context.id = mmu_base_pid;
487 mmu_base_pid++;
488 }
489
490 static void __init radix_init_partition_table(void)
491 {
492 unsigned long rts_field, dw0, dw1;
493
494 mmu_partition_table_init();
495 rts_field = radix__get_tree_size();
496 dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
497 dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
498 mmu_partition_table_set_entry(0, dw0, dw1, false);
499
500 pr_info("Initializing Radix MMU\n");
501 }
502
503 static int __init get_idx_from_shift(unsigned int shift)
504 {
505 int idx = -1;
506
507 switch (shift) {
508 case 0xc:
509 idx = MMU_PAGE_4K;
510 break;
511 case 0x10:
512 idx = MMU_PAGE_64K;
513 break;
514 case 0x15:
515 idx = MMU_PAGE_2M;
516 break;
517 case 0x1e:
518 idx = MMU_PAGE_1G;
519 break;
520 }
521 return idx;
522 }
523
524 static int __init radix_dt_scan_page_sizes(unsigned long node,
525 const char *uname, int depth,
526 void *data)
527 {
528 int size = 0;
529 int shift, idx;
530 unsigned int ap;
531 const __be32 *prop;
532 const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
533
534 /* We are scanning "cpu" nodes only */
535 if (type == NULL || strcmp(type, "cpu") != 0)
536 return 0;
537
538 /* Grab page size encodings */
539 prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
540 if (!prop)
541 return 0;
542
543 pr_info("Page sizes from device-tree:\n");
544 for (; size >= 4; size -= 4, ++prop) {
545
546 struct mmu_psize_def *def;
547
548 /* top 3 bits are the AP encoding */
549 shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
550 ap = be32_to_cpu(prop[0]) >> 29;
551 pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
552
553 idx = get_idx_from_shift(shift);
554 if (idx < 0)
555 continue;
556
557 def = &mmu_psize_defs[idx];
558 def->shift = shift;
559 def->ap = ap;
560 def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
561 }
562
563 /* needed ? */
564 cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
565 return 1;
566 }
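/*
 * Illustrative decode of one "ibm,processor-radix-AP-encodings" cell,
 * using the same values as the fallback below: 0xa0000010 gives
 * AP = 0xa0000010 >> 29 = 0x5 and shift = 0x10 (64K), while 0x0000000c
 * gives AP 0x0 and shift 0xc (4K). Example cells only; the real values
 * come from the device tree.
 */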
567
568 void __init radix__early_init_devtree(void)
569 {
570 int rc;
571
572 /*
573 * Try to find the available page sizes in the device-tree
574 */
575 rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
576 if (!rc) {
577 /*
578 * No page size details found in device tree.
579 * Let's assume we have 4k and 64k page support
580 */
581 mmu_psize_defs[MMU_PAGE_4K].shift = 12;
582 mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
583 mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
584 psize_to_rpti_pgsize(MMU_PAGE_4K);
585
586 mmu_psize_defs[MMU_PAGE_64K].shift = 16;
587 mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
588 mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
589 psize_to_rpti_pgsize(MMU_PAGE_64K);
590 }
591 return;
592 }
593
594 void __init radix__early_init_mmu(void)
595 {
596 unsigned long lpcr;
597
598 #ifdef CONFIG_PPC_64S_HASH_MMU
599 #ifdef CONFIG_PPC_64K_PAGES
600 /* PAGE_SIZE mappings */
601 mmu_virtual_psize = MMU_PAGE_64K;
602 #else
603 mmu_virtual_psize = MMU_PAGE_4K;
604 #endif
605 #endif
606 /*
607 * initialize page table size
608 */
609 __pte_index_size = RADIX_PTE_INDEX_SIZE;
610 __pmd_index_size = RADIX_PMD_INDEX_SIZE;
611 __pud_index_size = RADIX_PUD_INDEX_SIZE;
612 __pgd_index_size = RADIX_PGD_INDEX_SIZE;
613 __pud_cache_index = RADIX_PUD_INDEX_SIZE;
614 __pte_table_size = RADIX_PTE_TABLE_SIZE;
615 __pmd_table_size = RADIX_PMD_TABLE_SIZE;
616 __pud_table_size = RADIX_PUD_TABLE_SIZE;
617 __pgd_table_size = RADIX_PGD_TABLE_SIZE;
618
619 __pmd_val_bits = RADIX_PMD_VAL_BITS;
620 __pud_val_bits = RADIX_PUD_VAL_BITS;
621 __pgd_val_bits = RADIX_PGD_VAL_BITS;
622
623 __kernel_virt_start = RADIX_KERN_VIRT_START;
624 __vmalloc_start = RADIX_VMALLOC_START;
625 __vmalloc_end = RADIX_VMALLOC_END;
626 __kernel_io_start = RADIX_KERN_IO_START;
627 __kernel_io_end = RADIX_KERN_IO_END;
628 vmemmap = (struct page *)RADIX_VMEMMAP_START;
629 ioremap_bot = IOREMAP_BASE;
630
631 #ifdef CONFIG_PCI
632 pci_io_base = ISA_IO_BASE;
633 #endif
634 __pte_frag_nr = RADIX_PTE_FRAG_NR;
635 __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
636 __pmd_frag_nr = RADIX_PMD_FRAG_NR;
637 __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
638
639 radix_init_pgtable();
640
641 if (!firmware_has_feature(FW_FEATURE_LPAR)) {
642 lpcr = mfspr(SPRN_LPCR);
643 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
644 radix_init_partition_table();
645 } else {
646 radix_init_pseries();
647 }
648
649 memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
650
651 /* Switch to the guard PID before turning on MMU */
652 radix__switch_mmu_context(NULL, &init_mm);
653 tlbiel_all();
654 }
655
656 void radix__early_init_mmu_secondary(void)
657 {
658 unsigned long lpcr;
659 /*
660 * update partition table control register and UPRT
661 */
662 if (!firmware_has_feature(FW_FEATURE_LPAR)) {
663 lpcr = mfspr(SPRN_LPCR);
664 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
665
666 set_ptcr_when_no_uv(__pa(partition_tb) |
667 (PATB_SIZE_SHIFT - 12));
668 }
669
670 radix__switch_mmu_context(NULL, &init_mm);
671 tlbiel_all();
672
673 /* Make sure userspace can't change the AMR */
674 mtspr(SPRN_UAMOR, 0);
675 }
676
677 /* Called during kexec sequence with MMU off */
678 notrace void radix__mmu_cleanup_all(void)
679 {
680 unsigned long lpcr;
681
682 if (!firmware_has_feature(FW_FEATURE_LPAR)) {
683 lpcr = mfspr(SPRN_LPCR);
684 mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
685 set_ptcr_when_no_uv(0);
686 powernv_set_nmmu_ptcr(0);
687 radix__flush_tlb_all();
688 }
689 }
690
691 #ifdef CONFIG_MEMORY_HOTPLUG
692 static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
693 {
694 pte_t *pte;
695 int i;
696
697 for (i = 0; i < PTRS_PER_PTE; i++) {
698 pte = pte_start + i;
699 if (!pte_none(*pte))
700 return;
701 }
702
703 pte_free_kernel(&init_mm, pte_start);
704 pmd_clear(pmd);
705 }
706
707 static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
708 {
709 pmd_t *pmd;
710 int i;
711
712 for (i = 0; i < PTRS_PER_PMD; i++) {
713 pmd = pmd_start + i;
714 if (!pmd_none(*pmd))
715 return;
716 }
717
718 pmd_free(&init_mm, pmd_start);
719 pud_clear(pud);
720 }
721
722 static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
723 {
724 pud_t *pud;
725 int i;
726
727 for (i = 0; i < PTRS_PER_PUD; i++) {
728 pud = pud_start + i;
729 if (!pud_none(*pud))
730 return;
731 }
732
733 pud_free(&init_mm, pud_start);
734 p4d_clear(p4d);
735 }
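/*
 * The three helpers above free a page table page only once every entry
 * in it is none, i.e. after remove_pte_table()/remove_pmd_table() below
 * have cleared all entries covering the removed range; partially used
 * tables are left in place.
 */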
736
737 #ifdef CONFIG_SPARSEMEM_VMEMMAP
738 static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
739 {
740 unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
741
742 return !vmemmap_populated(start, PMD_SIZE);
743 }
744
745 static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
746 {
747 unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
748
749 return !vmemmap_populated(start, PAGE_SIZE);
750
751 }
752 #endif
753
754 static void __meminit free_vmemmap_pages(struct page *page,
755 struct vmem_altmap *altmap,
756 int order)
757 {
758 unsigned int nr_pages = 1 << order;
759
760 if (altmap) {
761 unsigned long alt_start, alt_end;
762 unsigned long base_pfn = page_to_pfn(page);
763
764 /*
765 * With 2M vmemmap mapping we can have things set up
766 * such that even though altmap is specified we never
767 * use altmap.
768 */
769 alt_start = altmap->base_pfn;
770 alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
771
772 if (base_pfn >= alt_start && base_pfn < alt_end) {
773 vmem_altmap_free(altmap, nr_pages);
774 return;
775 }
776 }
777
778 if (PageReserved(page)) {
779 /* allocated from memblock */
780 while (nr_pages--)
781 free_reserved_page(page++);
782 } else
783 free_pages((unsigned long)page_address(page), order);
784 }
785
786 static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
787 unsigned long end, bool direct,
788 struct vmem_altmap *altmap)
789 {
790 unsigned long next, pages = 0;
791 pte_t *pte;
792
793 pte = pte_start + pte_index(addr);
794 for (; addr < end; addr = next, pte++) {
795 next = (addr + PAGE_SIZE) & PAGE_MASK;
796 if (next > end)
797 next = end;
798
799 if (!pte_present(*pte))
800 continue;
801
802 if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
803 if (!direct)
804 free_vmemmap_pages(pte_page(*pte), altmap, 0);
805 pte_clear(&init_mm, addr, pte);
806 pages++;
807 }
808 #ifdef CONFIG_SPARSEMEM_VMEMMAP
809 else if (!direct && vmemmap_page_is_unused(addr, next)) {
810 free_vmemmap_pages(pte_page(*pte), altmap, 0);
811 pte_clear(&init_mm, addr, pte);
812 }
813 #endif
814 }
815 if (direct)
816 update_page_count(mmu_virtual_psize, -pages);
817 }
818
819 static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
820 unsigned long end, bool direct,
821 struct vmem_altmap *altmap)
822 {
823 unsigned long next, pages = 0;
824 pte_t *pte_base;
825 pmd_t *pmd;
826
827 pmd = pmd_start + pmd_index(addr);
828 for (; addr < end; addr = next, pmd++) {
829 next = pmd_addr_end(addr, end);
830
831 if (!pmd_present(*pmd))
832 continue;
833
834 if (pmd_leaf(*pmd)) {
835 if (IS_ALIGNED(addr, PMD_SIZE) &&
836 IS_ALIGNED(next, PMD_SIZE)) {
837 if (!direct)
838 free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
839 pte_clear(&init_mm, addr, (pte_t *)pmd);
840 pages++;
841 }
842 #ifdef CONFIG_SPARSEMEM_VMEMMAP
843 else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
844 free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
845 pte_clear(&init_mm, addr, (pte_t *)pmd);
846 }
847 #endif
848 continue;
849 }
850
851 pte_base = (pte_t *)pmd_page_vaddr(*pmd);
852 remove_pte_table(pte_base, addr, next, direct, altmap);
853 free_pte_table(pte_base, pmd);
854 }
855 if (direct)
856 update_page_count(MMU_PAGE_2M, -pages);
857 }
858
859 static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
860 unsigned long end, bool direct,
861 struct vmem_altmap *altmap)
862 {
863 unsigned long next, pages = 0;
864 pmd_t *pmd_base;
865 pud_t *pud;
866
867 pud = pud_start + pud_index(addr);
868 for (; addr < end; addr = next, pud++) {
869 next = pud_addr_end(addr, end);
870
871 if (!pud_present(*pud))
872 continue;
873
874 if (pud_leaf(*pud)) {
875 if (!IS_ALIGNED(addr, PUD_SIZE) ||
876 !IS_ALIGNED(next, PUD_SIZE)) {
877 WARN_ONCE(1, "%s: unaligned range\n", __func__);
878 continue;
879 }
880 pte_clear(&init_mm, addr, (pte_t *)pud);
881 pages++;
882 continue;
883 }
884
885 pmd_base = pud_pgtable(*pud);
886 remove_pmd_table(pmd_base, addr, next, direct, altmap);
887 free_pmd_table(pmd_base, pud);
888 }
889 if (direct)
890 update_page_count(MMU_PAGE_1G, -pages);
891 }
892
893 static void __meminit
894 remove_pagetable(unsigned long start, unsigned long end, bool direct,
895 struct vmem_altmap *altmap)
896 {
897 unsigned long addr, next;
898 pud_t *pud_base;
899 pgd_t *pgd;
900 p4d_t *p4d;
901
902 spin_lock(&init_mm.page_table_lock);
903
904 for (addr = start; addr < end; addr = next) {
905 next = pgd_addr_end(addr, end);
906
907 pgd = pgd_offset_k(addr);
908 p4d = p4d_offset(pgd, addr);
909 if (!p4d_present(*p4d))
910 continue;
911
912 if (p4d_leaf(*p4d)) {
913 if (!IS_ALIGNED(addr, P4D_SIZE) ||
914 !IS_ALIGNED(next, P4D_SIZE)) {
915 WARN_ONCE(1, "%s: unaligned range\n", __func__);
916 continue;
917 }
918
919 pte_clear(&init_mm, addr, (pte_t *)pgd);
920 continue;
921 }
922
923 pud_base = p4d_pgtable(*p4d);
924 remove_pud_table(pud_base, addr, next, direct, altmap);
925 free_pud_table(pud_base, p4d);
926 }
927
928 spin_unlock(&init_mm.page_table_lock);
929 radix__flush_tlb_kernel_range(start, end);
930 }
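/*
 * Teardown summary: remove_pagetable() walks top down but frees bottom
 * up. Leaf entries are cleared first, empty PTE/PMD/PUD tables are then
 * released via the free_*_table() helpers, all under
 * init_mm.page_table_lock, and the whole range gets a single
 * radix__flush_tlb_kernel_range() at the end.
 */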
931
932 int __meminit radix__create_section_mapping(unsigned long start,
933 unsigned long end, int nid,
934 pgprot_t prot)
935 {
936 if (end >= RADIX_VMALLOC_START) {
937 pr_warn("Outside the supported range\n");
938 return -1;
939 }
940
941 return create_physical_mapping(__pa(start), __pa(end),
942 nid, prot, ~0UL);
943 }
944
945 int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
946 {
947 remove_pagetable(start, end, true, NULL);
948 return 0;
949 }
950 #endif /* CONFIG_MEMORY_HOTPLUG */
951
952 #ifdef CONFIG_SPARSEMEM_VMEMMAP
953 static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
954 pgprot_t flags, unsigned int map_page_size,
955 int nid)
956 {
957 return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
958 }
959
960 int __meminit radix__vmemmap_create_mapping(unsigned long start,
961 unsigned long page_size,
962 unsigned long phys)
963 {
964 /* Create a PTE encoding */
965 int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
966 int ret;
967
968 if ((start + page_size) >= RADIX_VMEMMAP_END) {
969 pr_warn("Outside the supported range\n");
970 return -1;
971 }
972
973 ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid);
974 BUG_ON(ret);
975
976 return 0;
977 }
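/*
 * Usage sketch (illustrative, hypothetical addresses): for a vmemmap
 * chunk whose virtual start is PMD_SIZE aligned and which is backed by
 * physically contiguous memory at 'phys',
 *
 *	radix__vmemmap_create_mapping(vmemmap_start, PMD_SIZE, phys);
 *
 * installs a single 2M leaf entry, while a page_size of PAGE_SIZE
 * produces an ordinary base-page PTE.
 */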
978
979 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
980 bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
981 {
982 if (radix_enabled())
983 return __vmemmap_can_optimize(altmap, pgmap);
984
985 return false;
986 }
987 #endif
988
989 int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
990 unsigned long addr, unsigned long next)
991 {
992 int large = pmd_leaf(*pmdp);
993
994 if (large)
995 vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);
996
997 return large;
998 }
999
1000 void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
1001 unsigned long addr, unsigned long next)
1002 {
1003 pte_t entry;
1004 pte_t *ptep = pmdp_ptep(pmdp);
1005
1006 VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
1007 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
1008 set_pte_at(&init_mm, addr, ptep, entry);
1009 asm volatile("ptesync": : :"memory");
1010
1011 vmemmap_verify(ptep, node, addr, next);
1012 }
1013
1014 static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
1015 int node,
1016 struct vmem_altmap *altmap,
1017 struct page *reuse)
1018 {
1019 pte_t *pte = pte_offset_kernel(pmdp, addr);
1020
1021 if (pte_none(*pte)) {
1022 pte_t entry;
1023 void *p;
1024
1025 if (!reuse) {
1026 /*
1027 * make sure we don't create altmap mappings
1028 * covering things outside the device.
1029 */
1030 if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
1031 altmap = NULL;
1032
1033 p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
1034 if (!p && altmap)
1035 p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
1036 if (!p)
1037 return NULL;
1038 pr_debug("PAGE_SIZE vmemmap mapping\n");
1039 } else {
1040 /*
1041 * When a PTE/PMD entry is freed from the init_mm
1042 * there's a free_pages() call to this page allocated
1043 * above. Thus this get_page() is paired with the
1044 * put_page_testzero() on the freeing path.
1045 * This can only be called by certain ZONE_DEVICE paths,
1046 * and through vmemmap_populate_compound_pages() when
1047 * slab is available.
1048 */
1049 get_page(reuse);
1050 p = page_to_virt(reuse);
1051 pr_debug("Tail page reuse vmemmap mapping\n");
1052 }
1053
1054 VM_BUG_ON(!PAGE_ALIGNED(addr));
1055 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
1056 set_pte_at(&init_mm, addr, pte, entry);
1057 asm volatile("ptesync": : :"memory");
1058 }
1059 return pte;
1060 }
1061
1062 static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
1063 unsigned long address)
1064 {
1065 pud_t *pud;
1066
1067 /* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
1068 if (unlikely(p4d_none(*p4dp))) {
1069 if (unlikely(!slab_is_available())) {
1070 pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1071 p4d_populate(&init_mm, p4dp, pud);
1072 /* go to the pud_offset */
1073 } else
1074 return pud_alloc(&init_mm, p4dp, address);
1075 }
1076 return pud_offset(p4dp, address);
1077 }
1078
1079 static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
1080 unsigned long address)
1081 {
1082 pmd_t *pmd;
1083
1084 /* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
1085 if (unlikely(pud_none(*pudp))) {
1086 if (unlikely(!slab_is_available())) {
1087 pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1088 pud_populate(&init_mm, pudp, pmd);
1089 } else
1090 return pmd_alloc(&init_mm, pudp, address);
1091 }
1092 return pmd_offset(pudp, address);
1093 }
1094
1095 static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
1096 unsigned long address)
1097 {
1098 pte_t *pte;
1099
1100 /* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
1101 if (unlikely(pmd_none(*pmdp))) {
1102 if (unlikely(!slab_is_available())) {
1103 pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1104 pmd_populate(&init_mm, pmdp, pte);
1105 } else
1106 return pte_alloc_kernel(pmdp, address);
1107 }
1108 return pte_offset_kernel(pmdp, address);
1109 }
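/*
 * Common pattern of the vmemmap_*_alloc() helpers above: before slab is
 * up, missing levels are backed by PAGE_SIZE memblock pages via
 * early_alloc_pgtable(); once the normal allocators are available they
 * defer to pud_alloc()/pmd_alloc()/pte_alloc_kernel() instead.
 */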
1110
1111
1112
1113 int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
1114 struct vmem_altmap *altmap)
1115 {
1116 unsigned long addr;
1117 unsigned long next;
1118 pgd_t *pgd;
1119 p4d_t *p4d;
1120 pud_t *pud;
1121 pmd_t *pmd;
1122 pte_t *pte;
1123
1124 /*
1125 * Make sure we align the start vmemmap addr so that we calculate
1126 * the correct start_pfn in the altmap boundary check to decide whether
1127 * we should use altmap or RAM based backing memory allocation. Also
1128 * the address needs to be aligned for the set_pte operation.
1129 *
1130 * If the start addr is already PMD_SIZE aligned we will try to use
1131 * a pmd mapping. We don't want to be too aggressive here because
1132 * that will cause more allocations in RAM. So only if the namespace
1133 * vmemmap start addr is PMD_SIZE aligned we will use PMD mapping.
1134 */
1135
1136 start = ALIGN_DOWN(start, PAGE_SIZE);
1137 for (addr = start; addr < end; addr = next) {
1138 next = pmd_addr_end(addr, end);
1139
1140 pgd = pgd_offset_k(addr);
1141 p4d = p4d_offset(pgd, addr);
1142 pud = vmemmap_pud_alloc(p4d, node, addr);
1143 if (!pud)
1144 return -ENOMEM;
1145 pmd = vmemmap_pmd_alloc(pud, node, addr);
1146 if (!pmd)
1147 return -ENOMEM;
1148
1149 if (pmd_none(READ_ONCE(*pmd))) {
1150 void *p;
1151
1152 /*
1153 * keep it simple by checking addr PMD_SIZE alignment
1154 * and verifying the device boundary condition.
1155 * For us to use a pmd mapping, both addr and pfn should
1156 * be aligned. We skip if addr is not aligned and for
1157 * pfn we hope we have extra area in the altmap that
1158 * can help to find an aligned block. This can result
1159 * in altmap block allocation failures, in which case
1160 * we fall back to RAM for vmemmap allocation.
1161 */
1162 if (!IS_ALIGNED(addr, PMD_SIZE) || (altmap &&
1163 altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
1164 /*
1165 * make sure we don't create altmap mappings
1166 * covering things outside the device.
1167 */
1168 goto base_mapping;
1169 }
1170
1171 p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
1172 if (p) {
1173 vmemmap_set_pmd(pmd, p, node, addr, next);
1174 pr_debug("PMD_SIZE vmemmap mapping\n");
1175 continue;
1176 } else if (altmap) {
1177 /*
1178 * A vmemmap block allocation can fail due to
1179 * alignment requirements and our attempt to align
1180 * things aggressively, thereby running out of
1181 * space. Try base mapping on failure.
1182 */
1183 goto base_mapping;
1184 }
1185 } else if (vmemmap_check_pmd(pmd, node, addr, next)) {
1186 /*
1187 * If a huge mapping exists due to an early call to
1188 * vmemmap_populate, let's try to use that.
1189 */
1190 continue;
1191 }
1192 base_mapping:
1193 /*
1194 * Not able to allocate higher order memory to back the memmap,
1195 * or we found a pointer to a pte page. Allocate base page
1196 * size vmemmap.
1197 */
1198 pte = vmemmap_pte_alloc(pmd, node, addr);
1199 if (!pte)
1200 return -ENOMEM;
1201
1202 pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
1203 if (!pte)
1204 return -ENOMEM;
1205
1206 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1207 next = addr + PAGE_SIZE;
1208 }
1209 return 0;
1210 }
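/*
 * Sizing example (illustrative arithmetic, assuming 64K base pages and
 * sizeof(struct page) == 64): one PMD_SIZE (2M) vmemmap block holds
 * 2M / 64 = 32768 struct pages, which describe 32768 * 64K = 2G of
 * memory, so a single huge vmemmap mapping goes a long way when the
 * alignment conditions above are met.
 */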
1211
1212 static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
1213 struct vmem_altmap *altmap,
1214 struct page *reuse)
1215 {
1216 pgd_t *pgd;
1217 p4d_t *p4d;
1218 pud_t *pud;
1219 pmd_t *pmd;
1220 pte_t *pte;
1221
1222 pgd = pgd_offset_k(addr);
1223 p4d = p4d_offset(pgd, addr);
1224 pud = vmemmap_pud_alloc(p4d, node, addr);
1225 if (!pud)
1226 return NULL;
1227 pmd = vmemmap_pmd_alloc(pud, node, addr);
1228 if (!pmd)
1229 return NULL;
1230 if (pmd_leaf(*pmd))
1231 /*
1232 * The second page is mapped as a hugepage due to a nearby request.
1233 * Force our mapping to page size without deduplication
1234 */
1235 return NULL;
1236 pte = vmemmap_pte_alloc(pmd, node, addr);
1237 if (!pte)
1238 return NULL;
1239 radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1240 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1241
1242 return pte;
1243 }
1244
1245 static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
1246 unsigned long pfn_offset, int node)
1247 {
1248 pgd_t *pgd;
1249 p4d_t *p4d;
1250 pud_t *pud;
1251 pmd_t *pmd;
1252 pte_t *pte;
1253 unsigned long map_addr;
1254
1255 /* the second vmemmap page which we use for duplication */
1256 map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
1257 pgd = pgd_offset_k(map_addr);
1258 p4d = p4d_offset(pgd, map_addr);
1259 pud = vmemmap_pud_alloc(p4d, node, map_addr);
1260 if (!pud)
1261 return NULL;
1262 pmd = vmemmap_pmd_alloc(pud, node, map_addr);
1263 if (!pmd)
1264 return NULL;
1265 if (pmd_leaf(*pmd))
1266 /*
1267 * The second page is mapped as a hugepage due to a nearby request.
1268 * Force our mapping to page size without deduplication
1269 */
1270 return NULL;
1271 pte = vmemmap_pte_alloc(pmd, node, map_addr);
1272 if (!pte)
1273 return NULL;
1274 /*
1275 * Check if there exists a mapping to the left
1276 */
1277 if (pte_none(*pte)) {
1278 /*
1279 * Populate the head page vmemmap page.
1280 * It can fall in a different pmd, hence
1281 * vmemmap_populate_address()
1282 */
1283 pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
1284 if (!pte)
1285 return NULL;
1286 /*
1287 * Populate the tail pages vmemmap page
1288 */
1289 pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
1290 if (!pte)
1291 return NULL;
1292 vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
1293 return pte;
1294 }
1295 return pte;
1296 }
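/*
 * Layout note: for compound devmap pages only the head vmemmap page and
 * the second vmemmap page are backed by fresh memory; later tail
 * entries are mapped to that second page via the pte_page() reuse in
 * the caller below, which is what saves vmemmap space for large
 * ZONE_DEVICE folios.
 */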
1297
1298 int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
1299 unsigned long start,
1300 unsigned long end, int node,
1301 struct dev_pagemap *pgmap)
1302 {
1303 /*
1304 * We want to map things with base page size mappings so that
1305 * we can save space in vmemmap. We could otherwise end up with a
1306 * huge mapping covering both edges.
1307 */
1308 unsigned long addr;
1309 unsigned long addr_pfn = start_pfn;
1310 unsigned long next;
1311 pgd_t *pgd;
1312 p4d_t *p4d;
1313 pud_t *pud;
1314 pmd_t *pmd;
1315 pte_t *pte;
1316
1317 for (addr = start; addr < end; addr = next) {
1318
1319 pgd = pgd_offset_k(addr);
1320 p4d = p4d_offset(pgd, addr);
1321 pud = vmemmap_pud_alloc(p4d, node, addr);
1322 if (!pud)
1323 return -ENOMEM;
1324 pmd = vmemmap_pmd_alloc(pud, node, addr);
1325 if (!pmd)
1326 return -ENOMEM;
1327
1328 if (pmd_leaf(READ_ONCE(*pmd))) {
1329 /* existing huge mapping. Skip the range */
1330 addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
1331 next = pmd_addr_end(addr, end);
1332 continue;
1333 }
1334 pte = vmemmap_pte_alloc(pmd, node, addr);
1335 if (!pte)
1336 return -ENOMEM;
1337 if (!pte_none(*pte)) {
1338 /*
1339 * This could be because we already have a compound
1340 * page whose VMEMMAP_RESERVE_NR pages were mapped and
1341 * this request falls in those pages.
1342 */
1343 addr_pfn += 1;
1344 next = addr + PAGE_SIZE;
1345 continue;
1346 } else {
1347 unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
1348 unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
1349 pte_t *tail_page_pte;
1350
1351 /*
1352 * If the address is aligned to the huge page size, it is the
1353 * head mapping.
1354 */
1355 if (pfn_offset == 0) {
1356 /* Populate the head page vmemmap page */
1357 pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1358 if (!pte)
1359 return -ENOMEM;
1360 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1361
1362 /*
1363 * Populate the tail pages vmemmap page
1364 * It can fall in a different pmd, hence
1365 * vmemmap_populate_address()
1366 */
1367 pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
1368 if (!pte)
1369 return -ENOMEM;
1370
1371 addr_pfn += 2;
1372 next = addr + 2 * PAGE_SIZE;
1373 continue;
1374 }
1375 /*
1376 * get the 2nd mapping details
1377 * Also create it if that doesn't exist
1378 */
1379 tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
1380 if (!tail_page_pte) {
1381
1382 pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1383 if (!pte)
1384 return -ENOMEM;
1385 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1386
1387 addr_pfn += 1;
1388 next = addr + PAGE_SIZE;
1389 continue;
1390 }
1391
1392 pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
1393 if (!pte)
1394 return -ENOMEM;
1395 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1396
1397 addr_pfn += 1;
1398 next = addr + PAGE_SIZE;
1399 continue;
1400 }
1401 }
1402 return 0;
1403 }
1404
1405
1406 #ifdef CONFIG_MEMORY_HOTPLUG
1407 void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
1408 {
1409 remove_pagetable(start, start + page_size, true, NULL);
1410 }
1411
1412 void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
1413 struct vmem_altmap *altmap)
1414 {
1415 remove_pagetable(start, end, false, altmap);
1416 }
1417 #endif
1418 #endif
1419
1420 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1421
1422 unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
1423 pmd_t *pmdp, unsigned long clr,
1424 unsigned long set)
1425 {
1426 unsigned long old;
1427
1428 #ifdef CONFIG_DEBUG_VM
1429 WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
1430 assert_spin_locked(pmd_lockptr(mm, pmdp));
1431 #endif
1432
1433 old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
1434 trace_hugepage_update_pmd(addr, old, clr, set);
1435
1436 return old;
1437 }
1438
1439 unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
1440 pud_t *pudp, unsigned long clr,
1441 unsigned long set)
1442 {
1443 unsigned long old;
1444
1445 #ifdef CONFIG_DEBUG_VM
1446 WARN_ON(!pud_devmap(*pudp));
1447 assert_spin_locked(pud_lockptr(mm, pudp));
1448 #endif
1449
1450 old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
1451 trace_hugepage_update_pud(addr, old, clr, set);
1452
1453 return old;
1454 }
1455
1456 pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
1457 pmd_t *pmdp)
1458
1459 {
1460 pmd_t pmd;
1461
1462 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1463 VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
1464 VM_BUG_ON(pmd_devmap(*pmdp));
1465 /*
1466 * khugepaged calls this for normal pmd
1467 */
1468 pmd = *pmdp;
1469 pmd_clear(pmdp);
1470
1471 radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
1472
1473 return pmd;
1474 }
1475
1476 /*
1477 * For us pgtable_t is pte_t *. In order to save the deposited
1478 * page table, we consider the allocated page table as a list
1479 * head. On withdraw we need to make sure we zero out the used
1480 * list_head memory area.
1481 */
1482 void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
1483 pgtable_t pgtable)
1484 {
1485 struct list_head *lh = (struct list_head *) pgtable;
1486
1487 assert_spin_locked(pmd_lockptr(mm, pmdp));
1488
1489 /* FIFO */
1490 if (!pmd_huge_pte(mm, pmdp))
1491 INIT_LIST_HEAD(lh);
1492 else
1493 list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
1494 pmd_huge_pte(mm, pmdp) = pgtable;
1495 }
1496
1497 pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
1498 {
1499 pte_t *ptep;
1500 pgtable_t pgtable;
1501 struct list_head *lh;
1502
1503 assert_spin_locked(pmd_lockptr(mm, pmdp));
1504
1505 /* FIFO */
1506 pgtable = pmd_huge_pte(mm, pmdp);
1507 lh = (struct list_head *) pgtable;
1508 if (list_empty(lh))
1509 pmd_huge_pte(mm, pmdp) = NULL;
1510 else {
1511 pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
1512 list_del(lh);
1513 }
1514 ptep = (pte_t *) pgtable;
1515 *ptep = __pte(0);
1516 ptep++;
1517 *ptep = __pte(0);
1518 return pgtable;
1519 }
1520
1521 pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
1522 unsigned long addr, pmd_t *pmdp)
1523 {
1524 pmd_t old_pmd;
1525 unsigned long old;
1526
1527 old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
1528 old_pmd = __pmd(old);
1529 return old_pmd;
1530 }
1531
1532 pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
1533 unsigned long addr, pud_t *pudp)
1534 {
1535 pud_t old_pud;
1536 unsigned long old;
1537
1538 old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
1539 old_pud = __pud(old);
1540 return old_pud;
1541 }
1542
1543 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1544
1545 void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
1546 pte_t entry, unsigned long address, int psize)
1547 {
1548 struct mm_struct *mm = vma->vm_mm;
1549 unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY |
1550 _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
1551
1552 unsigned long change = pte_val(entry) ^ pte_val(*ptep);
1553 /*
1554 * On POWER9, the NMMU is not able to relax PTE access permissions
1555 * for a translation with a TLB. The PTE must be invalidated and the TLB
1556 * flushed before the new PTE is installed.
1557 *
1558 * This only needs to be done for radix, because hash translation does
1559 * flush when updating the linux pte (and we don't support NMMU
1560 * accelerators on HPT on POWER9 anyway XXX: do we?).
1561 *
1562 * POWER10 (and P9P) NMMU does behave as per ISA.
1563 */
1564 if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
1565 atomic_read(&mm->context.copros) > 0) {
1566 unsigned long old_pte, new_pte;
1567
1568 old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
1569 new_pte = old_pte | set;
1570 radix__flush_tlb_page_psize(mm, address, psize);
1571 __radix_pte_update(ptep, _PAGE_INVALID, new_pte);
1572 } else {
1573 __radix_pte_update(ptep, 0, set);
1574 /*
1575 * Book3S does not require a TLB flush when relaxing access
1576 * restrictions in the address space (modulo the POWER9 nest
1577 * MMU issue above), because the MMU will reload the PTE after
1578 * taking an access fault, as defined by the architecture. See
1579 * "Setting a Reference or Change Bit or Upgrading Access
1580 * Authority (PTE Subject to Atomic Hardware Updates)" in
1581 * Power ISA Version 3.1B.
1582 */
1583 }
1584 /* See ptesync comment in radix__set_pte_at */
1585 }
1586
1587 void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
1588 unsigned long addr, pte_t *ptep,
1589 pte_t old_pte, pte_t pte)
1590 {
1591 struct mm_struct *mm = vma->vm_mm;
1592
1593 /*
1594 * POWER9 NMMU must flush the TLB after clearing the PTE before
1595 * installing a PTE with more relaxed access permissions, see
1596 * radix__ptep_set_access_flags.
1597 */
1598 if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
1599 is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
1600 (atomic_read(&mm->context.copros) > 0))
1601 radix__flush_tlb_page(vma, addr);
1602
1603 set_pte_at(mm, addr, ptep, pte);
1604 }
1605
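/*
 * The pud/pmd set_huge, clear_huge and free_* helpers below are the
 * arch hooks behind huge vmalloc/ioremap mappings; they return 1 on
 * success and 0 when a huge mapping cannot be used (e.g. when radix is
 * not enabled).
 */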
1606 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
1607 {
1608 pte_t *ptep = (pte_t *)pud;
1609 pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);
1610
1611 if (!radix_enabled())
1612 return 0;
1613
1614 set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);
1615
1616 return 1;
1617 }
1618
1619 int pud_clear_huge(pud_t *pud)
1620 {
1621 if (pud_leaf(*pud)) {
1622 pud_clear(pud);
1623 return 1;
1624 }
1625
1626 return 0;
1627 }
1628
1629 int pud_free_pmd_page(pud_t *pud, unsigned long addr)
1630 {
1631 pmd_t *pmd;
1632 int i;
1633
1634 pmd = pud_pgtable(*pud);
1635 pud_clear(pud);
1636
1637 flush_tlb_kernel_range(addr, addr + PUD_SIZE);
1638
1639 for (i = 0; i < PTRS_PER_PMD; i++) {
1640 if (!pmd_none(pmd[i])) {
1641 pte_t *pte;
1642 pte = (pte_t *)pmd_page_vaddr(pmd[i]);
1643
1644 pte_free_kernel(&init_mm, pte);
1645 }
1646 }
1647
1648 pmd_free(&init_mm, pmd);
1649
1650 return 1;
1651 }
1652
1653 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
1654 {
1655 pte_t *ptep = (pte_t *)pmd;
1656 pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);
1657
1658 if (!radix_enabled())
1659 return 0;
1660
1661 set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);
1662
1663 return 1;
1664 }
1665
1666 int pmd_clear_huge(pmd_t *pmd)
1667 {
1668 if (pmd_leaf(*pmd)) {
1669 pmd_clear(pmd);
1670 return 1;
1671 }
1672
1673 return 0;
1674 }
1675
1676 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
1677 {
1678 pte_t *pte;
1679
1680 pte = (pte_t *)pmd_page_vaddr(*pmd);
1681 pmd_clear(pmd);
1682
1683 flush_tlb_kernel_range(addr, addr + PMD_SIZE);
1684
1685 pte_free_kernel(&init_mm, pte);
1686
1687 return 1;
1688 }
1689