xref: /linux/arch/powerpc/mm/book3s64/radix_pgtable.c (revision f9bff0e31881d03badf191d3b0005839391f5f2b)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Page table handling routines for radix page table.
4  *
5  * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
6  */
7 
8 #define pr_fmt(fmt) "radix-mmu: " fmt
9 
10 #include <linux/io.h>
11 #include <linux/kernel.h>
12 #include <linux/sched/mm.h>
13 #include <linux/memblock.h>
14 #include <linux/of.h>
15 #include <linux/of_fdt.h>
16 #include <linux/mm.h>
17 #include <linux/hugetlb.h>
18 #include <linux/string_helpers.h>
19 #include <linux/memory.h>
20 
21 #include <asm/pgalloc.h>
22 #include <asm/mmu_context.h>
23 #include <asm/dma.h>
24 #include <asm/machdep.h>
25 #include <asm/mmu.h>
26 #include <asm/firmware.h>
27 #include <asm/powernv.h>
28 #include <asm/sections.h>
29 #include <asm/smp.h>
30 #include <asm/trace.h>
31 #include <asm/uaccess.h>
32 #include <asm/ultravisor.h>
33 #include <asm/set_memory.h>
34 
35 #include <trace/events/thp.h>
36 
37 #include <mm/mmu_decl.h>
38 
39 unsigned int mmu_base_pid;
40 unsigned long radix_mem_block_size __ro_after_init;
41 
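/*
 * Early boot allocator for page table pages: grab naturally aligned
 * memory from memblock, optionally constrained to a NUMA node and/or a
 * physical address range, and panic if the allocation fails.
 */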
42 static __ref void *early_alloc_pgtable(unsigned long size, int nid,
43 			unsigned long region_start, unsigned long region_end)
44 {
45 	phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
46 	phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
47 	void *ptr;
48 
49 	if (region_start)
50 		min_addr = region_start;
51 	if (region_end)
52 		max_addr = region_end;
53 
54 	ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
55 
56 	if (!ptr)
57 		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
58 		      __func__, size, size, nid, &min_addr, &max_addr);
59 
60 	return ptr;
61 }
62 
63 /*
64  * When allocating pud or pmd pointers, we allocate a complete page
65  * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
66  * is to ensure that the page obtained from the memblock allocator
67  * can be used entirely as a page table page and can be freed
68  * correctly when the page table entries are removed.
69  */
70 static int early_map_kernel_page(unsigned long ea, unsigned long pa,
71 			  pgprot_t flags,
72 			  unsigned int map_page_size,
73 			  int nid,
74 			  unsigned long region_start, unsigned long region_end)
75 {
76 	unsigned long pfn = pa >> PAGE_SHIFT;
77 	pgd_t *pgdp;
78 	p4d_t *p4dp;
79 	pud_t *pudp;
80 	pmd_t *pmdp;
81 	pte_t *ptep;
82 
83 	pgdp = pgd_offset_k(ea);
84 	p4dp = p4d_offset(pgdp, ea);
85 	if (p4d_none(*p4dp)) {
86 		pudp = early_alloc_pgtable(PAGE_SIZE, nid,
87 					   region_start, region_end);
88 		p4d_populate(&init_mm, p4dp, pudp);
89 	}
90 	pudp = pud_offset(p4dp, ea);
91 	if (map_page_size == PUD_SIZE) {
92 		ptep = (pte_t *)pudp;
93 		goto set_the_pte;
94 	}
95 	if (pud_none(*pudp)) {
96 		pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
97 					   region_end);
98 		pud_populate(&init_mm, pudp, pmdp);
99 	}
100 	pmdp = pmd_offset(pudp, ea);
101 	if (map_page_size == PMD_SIZE) {
102 		ptep = pmdp_ptep(pmdp);
103 		goto set_the_pte;
104 	}
105 	if (!pmd_present(*pmdp)) {
106 		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
107 						region_start, region_end);
108 		pmd_populate_kernel(&init_mm, pmdp, ptep);
109 	}
110 	ptep = pte_offset_kernel(pmdp, ea);
111 
112 set_the_pte:
113 	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
114 	asm volatile("ptesync": : :"memory");
115 	return 0;
116 }
117 
118 /*
119  * nid, region_start, and region_end are hints to try to place the page
120  * table memory in the same node or region.
121  */
122 static int __map_kernel_page(unsigned long ea, unsigned long pa,
123 			  pgprot_t flags,
124 			  unsigned int map_page_size,
125 			  int nid,
126 			  unsigned long region_start, unsigned long region_end)
127 {
128 	unsigned long pfn = pa >> PAGE_SHIFT;
129 	pgd_t *pgdp;
130 	p4d_t *p4dp;
131 	pud_t *pudp;
132 	pmd_t *pmdp;
133 	pte_t *ptep;
134 	/*
135 	 * Make sure task size is correct as per the max addr
136 	 */
137 	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
138 
139 #ifdef CONFIG_PPC_64K_PAGES
140 	BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
141 #endif
142 
143 	if (unlikely(!slab_is_available()))
144 		return early_map_kernel_page(ea, pa, flags, map_page_size,
145 						nid, region_start, region_end);
146 
147 	/*
148 	 * Should make page table allocation functions be able to take a
149 	 * node, so we can place kernel page tables on the right nodes after
150 	 * boot.
151 	 */
152 	pgdp = pgd_offset_k(ea);
153 	p4dp = p4d_offset(pgdp, ea);
154 	pudp = pud_alloc(&init_mm, p4dp, ea);
155 	if (!pudp)
156 		return -ENOMEM;
157 	if (map_page_size == PUD_SIZE) {
158 		ptep = (pte_t *)pudp;
159 		goto set_the_pte;
160 	}
161 	pmdp = pmd_alloc(&init_mm, pudp, ea);
162 	if (!pmdp)
163 		return -ENOMEM;
164 	if (map_page_size == PMD_SIZE) {
165 		ptep = pmdp_ptep(pmdp);
166 		goto set_the_pte;
167 	}
168 	ptep = pte_alloc_kernel(pmdp, ea);
169 	if (!ptep)
170 		return -ENOMEM;
171 
172 set_the_pte:
173 	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
174 	asm volatile("ptesync": : :"memory");
175 	return 0;
176 }
177 
178 int radix__map_kernel_page(unsigned long ea, unsigned long pa,
179 			  pgprot_t flags,
180 			  unsigned int map_page_size)
181 {
182 	return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
183 }
184 
185 #ifdef CONFIG_STRICT_KERNEL_RWX
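/*
 * Walk the kernel mapping for [start, end) one page at a time, clear the
 * given bits in whichever leaf entry (PUD, PMD or PTE) maps each address,
 * then flush the TLB for the whole range.
 */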
186 static void radix__change_memory_range(unsigned long start, unsigned long end,
187 				       unsigned long clear)
188 {
189 	unsigned long idx;
190 	pgd_t *pgdp;
191 	p4d_t *p4dp;
192 	pud_t *pudp;
193 	pmd_t *pmdp;
194 	pte_t *ptep;
195 
196 	start = ALIGN_DOWN(start, PAGE_SIZE);
197 	end = PAGE_ALIGN(end); // aligns up
198 
199 	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
200 		 start, end, clear);
201 
202 	for (idx = start; idx < end; idx += PAGE_SIZE) {
203 		pgdp = pgd_offset_k(idx);
204 		p4dp = p4d_offset(pgdp, idx);
205 		pudp = pud_alloc(&init_mm, p4dp, idx);
206 		if (!pudp)
207 			continue;
208 		if (pud_is_leaf(*pudp)) {
209 			ptep = (pte_t *)pudp;
210 			goto update_the_pte;
211 		}
212 		pmdp = pmd_alloc(&init_mm, pudp, idx);
213 		if (!pmdp)
214 			continue;
215 		if (pmd_is_leaf(*pmdp)) {
216 			ptep = pmdp_ptep(pmdp);
217 			goto update_the_pte;
218 		}
219 		ptep = pte_alloc_kernel(pmdp, idx);
220 		if (!ptep)
221 			continue;
222 update_the_pte:
223 		radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
224 	}
225 
226 	radix__flush_tlb_kernel_range(start, end);
227 }
228 
229 void radix__mark_rodata_ro(void)
230 {
231 	unsigned long start, end;
232 
233 	start = (unsigned long)_stext;
234 	end = (unsigned long)__end_rodata;
235 
236 	radix__change_memory_range(start, end, _PAGE_WRITE);
237 
238 	for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) {
239 		end = start + PAGE_SIZE;
240 		if (overlaps_interrupt_vector_text(start, end))
241 			radix__change_memory_range(start, end, _PAGE_WRITE);
242 		else
243 			break;
244 	}
245 }
246 
247 void radix__mark_initmem_nx(void)
248 {
249 	unsigned long start = (unsigned long)__init_begin;
250 	unsigned long end = (unsigned long)__init_end;
251 
252 	radix__change_memory_range(start, end, _PAGE_EXEC);
253 }
254 #endif /* CONFIG_STRICT_KERNEL_RWX */
255 
256 static inline void __meminit
257 print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
258 {
259 	char buf[10];
260 
261 	if (end <= start)
262 		return;
263 
264 	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
265 
266 	pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
267 		exec ? " (exec)" : "");
268 }
269 
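/*
 * Return the next physical address at which linear-map permissions may
 * need to change (end of the interrupt vectors, start of relocated kernel
 * text, the strict-RWX boundary), so create_physical_mapping() does not
 * map a large page across a permission boundary.
 */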
270 static unsigned long next_boundary(unsigned long addr, unsigned long end)
271 {
272 #ifdef CONFIG_STRICT_KERNEL_RWX
273 	unsigned long stext_phys;
274 
275 	stext_phys = __pa_symbol(_stext);
276 
277 	// Relocatable kernel running at non-zero real address
278 	if (stext_phys != 0) {
279 		// The end of interrupts code at zero is a rodata boundary
280 		unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys;
281 		if (addr < end_intr)
282 			return end_intr;
283 
284 		// Start of relocated kernel text is a rodata boundary
285 		if (addr < stext_phys)
286 			return stext_phys;
287 	}
288 
289 	if (addr < __pa_symbol(__srwx_boundary))
290 		return __pa_symbol(__srwx_boundary);
291 #endif
292 	return end;
293 }
294 
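/*
 * Map the physical range [start, end) into the kernel linear mapping,
 * using the largest page size (1G, 2M or base page size) permitted by
 * alignment, the supported MMU page sizes and next_boundary(). Ranges
 * overlapping kernel text or the interrupt vectors are mapped executable.
 */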
295 static int __meminit create_physical_mapping(unsigned long start,
296 					     unsigned long end,
297 					     int nid, pgprot_t _prot)
298 {
299 	unsigned long vaddr, addr, mapping_size = 0;
300 	bool prev_exec, exec = false;
301 	pgprot_t prot;
302 	int psize;
303 	unsigned long max_mapping_size = radix_mem_block_size;
304 
305 	if (debug_pagealloc_enabled_or_kfence())
306 		max_mapping_size = PAGE_SIZE;
307 
308 	start = ALIGN(start, PAGE_SIZE);
309 	end   = ALIGN_DOWN(end, PAGE_SIZE);
310 	for (addr = start; addr < end; addr += mapping_size) {
311 		unsigned long gap, previous_size;
312 		int rc;
313 
314 		gap = next_boundary(addr, end) - addr;
315 		if (gap > max_mapping_size)
316 			gap = max_mapping_size;
317 		previous_size = mapping_size;
318 		prev_exec = exec;
319 
320 		if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
321 		    mmu_psize_defs[MMU_PAGE_1G].shift) {
322 			mapping_size = PUD_SIZE;
323 			psize = MMU_PAGE_1G;
324 		} else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
325 			   mmu_psize_defs[MMU_PAGE_2M].shift) {
326 			mapping_size = PMD_SIZE;
327 			psize = MMU_PAGE_2M;
328 		} else {
329 			mapping_size = PAGE_SIZE;
330 			psize = mmu_virtual_psize;
331 		}
332 
333 		vaddr = (unsigned long)__va(addr);
334 
335 		if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
336 		    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
337 			prot = PAGE_KERNEL_X;
338 			exec = true;
339 		} else {
340 			prot = _prot;
341 			exec = false;
342 		}
343 
344 		if (mapping_size != previous_size || exec != prev_exec) {
345 			print_mapping(start, addr, previous_size, prev_exec);
346 			start = addr;
347 		}
348 
349 		rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
350 		if (rc)
351 			return rc;
352 
353 		update_page_count(psize, 1);
354 	}
355 
356 	print_mapping(start, addr, mapping_size, exec);
357 	return 0;
358 }
359 
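/*
 * Build the initial radix page tables: create the linear mapping for all
 * memblock memory, allocate the process table, point entry 0 of it at the
 * kernel page tables, and reserve a guard PID for init_mm.
 */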
360 static void __init radix_init_pgtable(void)
361 {
362 	unsigned long rts_field;
363 	phys_addr_t start, end;
364 	u64 i;
365 
366 	/* We don't support slb for radix */
367 	slb_set_size(0);
368 
369 	/*
370 	 * Create the linear mapping
371 	 */
372 	for_each_mem_range(i, &start, &end) {
373 		/*
374 		 * The memblock allocator is up at this point, so the
375 		 * page tables will be allocated within the range. No
376 		 * need for a node (which we don't have yet).
377 		 */
378 
379 		if (end >= RADIX_VMALLOC_START) {
380 			pr_warn("Outside the supported range\n");
381 			continue;
382 		}
383 
384 		WARN_ON(create_physical_mapping(start, end,
385 						-1, PAGE_KERNEL));
386 	}
387 
388 	if (!cpu_has_feature(CPU_FTR_HVMODE) &&
389 			cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
390 		/*
391 		 * Older versions of KVM on these machines prefer if the
392 		 * guest only uses the low 19 PID bits.
393 		 */
394 		mmu_pid_bits = 19;
395 	}
396 	mmu_base_pid = 1;
397 
398 	/*
399 	 * Allocate Partition table and process table for the
400 	 * host.
401 	 */
402 	BUG_ON(PRTB_SIZE_SHIFT > 36);
403 	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
404 	/*
405 	 * Fill in the process table.
406 	 */
407 	rts_field = radix__get_tree_size();
408 	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
409 
410 	/*
411 	 * The init_mm context is given the first available (non-zero) PID,
412 	 * which is the "guard PID" and contains no page table. PIDR should
413 	 * never be set to zero because that duplicates the kernel address
414 	 * space at the 0x0... offset (quadrant 0)!
415 	 *
416 	 * An arbitrary PID that may later be allocated by the PID allocator
417 	 * for userspace processes must not be used either, because that
418 	 * would cause stale user mappings for that PID on CPUs outside of
419 	 * the TLB invalidation scheme (because it won't be in mm_cpumask).
420 	 *
421 	 * So permanently carve out one PID for the purpose of a guard PID.
422 	 */
423 	init_mm.context.id = mmu_base_pid;
424 	mmu_base_pid++;
425 }
426 
427 static void __init radix_init_partition_table(void)
428 {
429 	unsigned long rts_field, dw0, dw1;
430 
431 	mmu_partition_table_init();
432 	rts_field = radix__get_tree_size();
433 	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
434 	dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
435 	mmu_partition_table_set_entry(0, dw0, dw1, false);
436 
437 	pr_info("Initializing Radix MMU\n");
438 }
439 
440 static int __init get_idx_from_shift(unsigned int shift)
441 {
442 	int idx = -1;
443 
444 	switch (shift) {
445 	case 0xc:
446 		idx = MMU_PAGE_4K;
447 		break;
448 	case 0x10:
449 		idx = MMU_PAGE_64K;
450 		break;
451 	case 0x15:
452 		idx = MMU_PAGE_2M;
453 		break;
454 	case 0x1e:
455 		idx = MMU_PAGE_1G;
456 		break;
457 	}
458 	return idx;
459 }
460 
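/*
 * Flat device-tree scan callback: parse the radix AP encodings advertised
 * by the "cpu" node and fill in mmu_psize_defs for each page size found.
 */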
461 static int __init radix_dt_scan_page_sizes(unsigned long node,
462 					   const char *uname, int depth,
463 					   void *data)
464 {
465 	int size = 0;
466 	int shift, idx;
467 	unsigned int ap;
468 	const __be32 *prop;
469 	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
470 
471 	/* We are scanning "cpu" nodes only */
472 	if (type == NULL || strcmp(type, "cpu") != 0)
473 		return 0;
474 
475 	/* Grab page size encodings */
476 	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
477 	if (!prop)
478 		return 0;
479 
480 	pr_info("Page sizes from device-tree:\n");
481 	for (; size >= 4; size -= 4, ++prop) {
482 
483 		struct mmu_psize_def *def;
484 
485 		/* top 3 bits are the AP encoding */
486 		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
487 		ap = be32_to_cpu(prop[0]) >> 29;
488 		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
489 
490 		idx = get_idx_from_shift(shift);
491 		if (idx < 0)
492 			continue;
493 
494 		def = &mmu_psize_defs[idx];
495 		def->shift = shift;
496 		def->ap  = ap;
497 		def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
498 	}
499 
500 	/* needed ? */
501 	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
502 	return 1;
503 }
504 
505 #ifdef CONFIG_MEMORY_HOTPLUG
506 static int __init probe_memory_block_size(unsigned long node, const char *uname, int
507 					  depth, void *data)
508 {
509 	unsigned long *mem_block_size = (unsigned long *)data;
510 	const __be32 *prop;
511 	int len;
512 
513 	if (depth != 1)
514 		return 0;
515 
516 	if (strcmp(uname, "ibm,dynamic-reconfiguration-memory"))
517 		return 0;
518 
519 	prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
520 
521 	if (!prop || len < dt_root_size_cells * sizeof(__be32))
522 		/*
523 		 * Nothing in the device tree
524 		 */
525 		*mem_block_size = MIN_MEMORY_BLOCK_SIZE;
526 	else
527 		*mem_block_size = of_read_number(prop, dt_root_size_cells);
528 	return 1;
529 }
530 
531 static unsigned long __init radix_memory_block_size(void)
532 {
533 	unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE;
534 
535 	/*
536 	 * The OPAL firmware feature flag is set by now, so it is safe
537 	 * to test for it here.
538 	 */
539 	if (firmware_has_feature(FW_FEATURE_OPAL))
540 		mem_block_size = 1UL * 1024 * 1024 * 1024;
541 	else
542 		of_scan_flat_dt(probe_memory_block_size, &mem_block_size);
543 
544 	return mem_block_size;
545 }
546 
547 #else   /* CONFIG_MEMORY_HOTPLUG */
548 
549 static unsigned long __init radix_memory_block_size(void)
550 {
551 	return 1UL * 1024 * 1024 * 1024;
552 }
553 
554 #endif /* CONFIG_MEMORY_HOTPLUG */
555 
556 
557 void __init radix__early_init_devtree(void)
558 {
559 	int rc;
560 
561 	/*
562 	 * Try to find the available page sizes in the device-tree
563 	 */
564 	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
565 	if (!rc) {
566 		/*
567 		 * No page size details found in device tree.
568 		 * Let's assume we have page 4k and 64k support
569 		 */
570 		mmu_psize_defs[MMU_PAGE_4K].shift = 12;
571 		mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
572 		mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
573 			psize_to_rpti_pgsize(MMU_PAGE_4K);
574 
575 		mmu_psize_defs[MMU_PAGE_64K].shift = 16;
576 		mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
577 		mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
578 			psize_to_rpti_pgsize(MMU_PAGE_64K);
579 	}
580 
581 	/*
582 	 * Max mapping size used when mapping pages. We don't use
583 	 * ppc_md.memory_block_size() here because this gets called
584 	 * early, before the machine has been probed. Also, the
585 	 * pseries implementation only checks for ibm,lmb-size.
586 	 * All hypervisors supporting radix do expose that device
587 	 * tree node.
588 	 */
589 	radix_mem_block_size = radix_memory_block_size();
590 	return;
591 }
592 
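/*
 * Boot CPU radix MMU setup: select the page table geometry and kernel
 * virtual address layout, build the initial page tables, set up the
 * partition table on bare metal (or do the pseries setup under a
 * hypervisor), then switch to the guard PID and flush the TLB.
 */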
593 void __init radix__early_init_mmu(void)
594 {
595 	unsigned long lpcr;
596 
597 #ifdef CONFIG_PPC_64S_HASH_MMU
598 #ifdef CONFIG_PPC_64K_PAGES
599 	/* PAGE_SIZE mappings */
600 	mmu_virtual_psize = MMU_PAGE_64K;
601 #else
602 	mmu_virtual_psize = MMU_PAGE_4K;
603 #endif
604 #endif
605 	/*
606 	 * initialize page table size
607 	 */
608 	__pte_index_size = RADIX_PTE_INDEX_SIZE;
609 	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
610 	__pud_index_size = RADIX_PUD_INDEX_SIZE;
611 	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
612 	__pud_cache_index = RADIX_PUD_INDEX_SIZE;
613 	__pte_table_size = RADIX_PTE_TABLE_SIZE;
614 	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
615 	__pud_table_size = RADIX_PUD_TABLE_SIZE;
616 	__pgd_table_size = RADIX_PGD_TABLE_SIZE;
617 
618 	__pmd_val_bits = RADIX_PMD_VAL_BITS;
619 	__pud_val_bits = RADIX_PUD_VAL_BITS;
620 	__pgd_val_bits = RADIX_PGD_VAL_BITS;
621 
622 	__kernel_virt_start = RADIX_KERN_VIRT_START;
623 	__vmalloc_start = RADIX_VMALLOC_START;
624 	__vmalloc_end = RADIX_VMALLOC_END;
625 	__kernel_io_start = RADIX_KERN_IO_START;
626 	__kernel_io_end = RADIX_KERN_IO_END;
627 	vmemmap = (struct page *)RADIX_VMEMMAP_START;
628 	ioremap_bot = IOREMAP_BASE;
629 
630 #ifdef CONFIG_PCI
631 	pci_io_base = ISA_IO_BASE;
632 #endif
633 	__pte_frag_nr = RADIX_PTE_FRAG_NR;
634 	__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
635 	__pmd_frag_nr = RADIX_PMD_FRAG_NR;
636 	__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
637 
638 	radix_init_pgtable();
639 
640 	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
641 		lpcr = mfspr(SPRN_LPCR);
642 		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
643 		radix_init_partition_table();
644 	} else {
645 		radix_init_pseries();
646 	}
647 
648 	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
649 
650 	/* Switch to the guard PID before turning on MMU */
651 	radix__switch_mmu_context(NULL, &init_mm);
652 	tlbiel_all();
653 }
654 
655 void radix__early_init_mmu_secondary(void)
656 {
657 	unsigned long lpcr;
658 	/*
659 	 * update partition table control register and UPRT
660 	 */
661 	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
662 		lpcr = mfspr(SPRN_LPCR);
663 		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
664 
665 		set_ptcr_when_no_uv(__pa(partition_tb) |
666 				    (PATB_SIZE_SHIFT - 12));
667 	}
668 
669 	radix__switch_mmu_context(NULL, &init_mm);
670 	tlbiel_all();
671 
672 	/* Make sure userspace can't change the AMR */
673 	mtspr(SPRN_UAMOR, 0);
674 }
675 
676 /* Called during kexec sequence with MMU off */
677 notrace void radix__mmu_cleanup_all(void)
678 {
679 	unsigned long lpcr;
680 
681 	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
682 		lpcr = mfspr(SPRN_LPCR);
683 		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
684 		set_ptcr_when_no_uv(0);
685 		powernv_set_nmmu_ptcr(0);
686 		radix__flush_tlb_all();
687 	}
688 }
689 
690 #ifdef CONFIG_MEMORY_HOTPLUG
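/*
 * The free_{pte,pmd,pud}_table() helpers below free a page table page
 * once every entry in it has been cleared, and then clear the entry in
 * the level above that pointed to it.
 */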
691 static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
692 {
693 	pte_t *pte;
694 	int i;
695 
696 	for (i = 0; i < PTRS_PER_PTE; i++) {
697 		pte = pte_start + i;
698 		if (!pte_none(*pte))
699 			return;
700 	}
701 
702 	pte_free_kernel(&init_mm, pte_start);
703 	pmd_clear(pmd);
704 }
705 
706 static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
707 {
708 	pmd_t *pmd;
709 	int i;
710 
711 	for (i = 0; i < PTRS_PER_PMD; i++) {
712 		pmd = pmd_start + i;
713 		if (!pmd_none(*pmd))
714 			return;
715 	}
716 
717 	pmd_free(&init_mm, pmd_start);
718 	pud_clear(pud);
719 }
720 
721 static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
722 {
723 	pud_t *pud;
724 	int i;
725 
726 	for (i = 0; i < PTRS_PER_PUD; i++) {
727 		pud = pud_start + i;
728 		if (!pud_none(*pud))
729 			return;
730 	}
731 
732 	pud_free(&init_mm, pud_start);
733 	p4d_clear(p4d);
734 }
735 
736 #ifdef CONFIG_SPARSEMEM_VMEMMAP
737 static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
738 {
739 	unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
740 
741 	return !vmemmap_populated(start, PMD_SIZE);
742 }
743 
744 static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
745 {
746 	unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
747 
748 	return !vmemmap_populated(start, PAGE_SIZE);
749 
750 }
751 #endif
752 
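/*
 * Free pages backing the vmemmap: return them to the altmap if they were
 * carved out of the device, use free_reserved_page() if they came from
 * memblock, otherwise give them back to the page allocator.
 */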
753 static void __meminit free_vmemmap_pages(struct page *page,
754 					 struct vmem_altmap *altmap,
755 					 int order)
756 {
757 	unsigned int nr_pages = 1 << order;
758 
759 	if (altmap) {
760 		unsigned long alt_start, alt_end;
761 		unsigned long base_pfn = page_to_pfn(page);
762 
763 		/*
764 		 * With 2M vmemmap mappings we can have things set up
765 		 * such that even though an altmap is specified we never
766 		 * used the altmap.
767 		 */
768 		alt_start = altmap->base_pfn;
769 		alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
770 
771 		if (base_pfn >= alt_start && base_pfn < alt_end) {
772 			vmem_altmap_free(altmap, nr_pages);
773 			return;
774 		}
775 	}
776 
777 	if (PageReserved(page)) {
778 		/* allocated from memblock */
779 		while (nr_pages--)
780 			free_reserved_page(page++);
781 	} else
782 		free_pages((unsigned long)page_address(page), order);
783 }
784 
785 static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
786 				       unsigned long end, bool direct,
787 				       struct vmem_altmap *altmap)
788 {
789 	unsigned long next, pages = 0;
790 	pte_t *pte;
791 
792 	pte = pte_start + pte_index(addr);
793 	for (; addr < end; addr = next, pte++) {
794 		next = (addr + PAGE_SIZE) & PAGE_MASK;
795 		if (next > end)
796 			next = end;
797 
798 		if (!pte_present(*pte))
799 			continue;
800 
801 		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
802 			if (!direct)
803 				free_vmemmap_pages(pte_page(*pte), altmap, 0);
804 			pte_clear(&init_mm, addr, pte);
805 			pages++;
806 		}
807 #ifdef CONFIG_SPARSEMEM_VMEMMAP
808 		else if (!direct && vmemmap_page_is_unused(addr, next)) {
809 			free_vmemmap_pages(pte_page(*pte), altmap, 0);
810 			pte_clear(&init_mm, addr, pte);
811 		}
812 #endif
813 	}
814 	if (direct)
815 		update_page_count(mmu_virtual_psize, -pages);
816 }
817 
818 static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
819 				       unsigned long end, bool direct,
820 				       struct vmem_altmap *altmap)
821 {
822 	unsigned long next, pages = 0;
823 	pte_t *pte_base;
824 	pmd_t *pmd;
825 
826 	pmd = pmd_start + pmd_index(addr);
827 	for (; addr < end; addr = next, pmd++) {
828 		next = pmd_addr_end(addr, end);
829 
830 		if (!pmd_present(*pmd))
831 			continue;
832 
833 		if (pmd_is_leaf(*pmd)) {
834 			if (IS_ALIGNED(addr, PMD_SIZE) &&
835 			    IS_ALIGNED(next, PMD_SIZE)) {
836 				if (!direct)
837 					free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
838 				pte_clear(&init_mm, addr, (pte_t *)pmd);
839 				pages++;
840 			}
841 #ifdef CONFIG_SPARSEMEM_VMEMMAP
842 			else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
843 				free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
844 				pte_clear(&init_mm, addr, (pte_t *)pmd);
845 			}
846 #endif
847 			continue;
848 		}
849 
850 		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
851 		remove_pte_table(pte_base, addr, next, direct, altmap);
852 		free_pte_table(pte_base, pmd);
853 	}
854 	if (direct)
855 		update_page_count(MMU_PAGE_2M, -pages);
856 }
857 
858 static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
859 				       unsigned long end, bool direct,
860 				       struct vmem_altmap *altmap)
861 {
862 	unsigned long next, pages = 0;
863 	pmd_t *pmd_base;
864 	pud_t *pud;
865 
866 	pud = pud_start + pud_index(addr);
867 	for (; addr < end; addr = next, pud++) {
868 		next = pud_addr_end(addr, end);
869 
870 		if (!pud_present(*pud))
871 			continue;
872 
873 		if (pud_is_leaf(*pud)) {
874 			if (!IS_ALIGNED(addr, PUD_SIZE) ||
875 			    !IS_ALIGNED(next, PUD_SIZE)) {
876 				WARN_ONCE(1, "%s: unaligned range\n", __func__);
877 				continue;
878 			}
879 			pte_clear(&init_mm, addr, (pte_t *)pud);
880 			pages++;
881 			continue;
882 		}
883 
884 		pmd_base = pud_pgtable(*pud);
885 		remove_pmd_table(pmd_base, addr, next, direct, altmap);
886 		free_pmd_table(pmd_base, pud);
887 	}
888 	if (direct)
889 		update_page_count(MMU_PAGE_1G, -pages);
890 }
891 
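/*
 * Tear down the kernel page tables for [start, end), either for the
 * linear ("direct") mapping or for a vmemmap range, freeing page table
 * pages as they become empty and flushing the TLB at the end.
 */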
892 static void __meminit
893 remove_pagetable(unsigned long start, unsigned long end, bool direct,
894 		 struct vmem_altmap *altmap)
895 {
896 	unsigned long addr, next;
897 	pud_t *pud_base;
898 	pgd_t *pgd;
899 	p4d_t *p4d;
900 
901 	spin_lock(&init_mm.page_table_lock);
902 
903 	for (addr = start; addr < end; addr = next) {
904 		next = pgd_addr_end(addr, end);
905 
906 		pgd = pgd_offset_k(addr);
907 		p4d = p4d_offset(pgd, addr);
908 		if (!p4d_present(*p4d))
909 			continue;
910 
911 		if (p4d_is_leaf(*p4d)) {
912 			if (!IS_ALIGNED(addr, P4D_SIZE) ||
913 			    !IS_ALIGNED(next, P4D_SIZE)) {
914 				WARN_ONCE(1, "%s: unaligned range\n", __func__);
915 				continue;
916 			}
917 
918 			pte_clear(&init_mm, addr, (pte_t *)pgd);
919 			continue;
920 		}
921 
922 		pud_base = p4d_pgtable(*p4d);
923 		remove_pud_table(pud_base, addr, next, direct, altmap);
924 		free_pud_table(pud_base, p4d);
925 	}
926 
927 	spin_unlock(&init_mm.page_table_lock);
928 	radix__flush_tlb_kernel_range(start, end);
929 }
930 
931 int __meminit radix__create_section_mapping(unsigned long start,
932 					    unsigned long end, int nid,
933 					    pgprot_t prot)
934 {
935 	if (end >= RADIX_VMALLOC_START) {
936 		pr_warn("Outside the supported range\n");
937 		return -1;
938 	}
939 
940 	return create_physical_mapping(__pa(start), __pa(end),
941 				       nid, prot);
942 }
943 
944 int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
945 {
946 	remove_pagetable(start, end, true, NULL);
947 	return 0;
948 }
949 #endif /* CONFIG_MEMORY_HOTPLUG */
950 
951 #ifdef CONFIG_SPARSEMEM_VMEMMAP
952 static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
953 				 pgprot_t flags, unsigned int map_page_size,
954 				 int nid)
955 {
956 	return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
957 }
958 
959 int __meminit radix__vmemmap_create_mapping(unsigned long start,
960 				      unsigned long page_size,
961 				      unsigned long phys)
962 {
963 	/* Create a PTE encoding */
964 	int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
965 	int ret;
966 
967 	if ((start + page_size) >= RADIX_VMEMMAP_END) {
968 		pr_warn("Outside the supported range\n");
969 		return -1;
970 	}
971 
972 	ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid);
973 	BUG_ON(ret);
974 
975 	return 0;
976 }
977 
978 
979 bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
980 {
981 	if (radix_enabled())
982 		return __vmemmap_can_optimize(altmap, pgmap);
983 
984 	return false;
985 }
986 
987 int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
988 				unsigned long addr, unsigned long next)
989 {
990 	int large = pmd_large(*pmdp);
991 
992 	if (large)
993 		vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);
994 
995 	return large;
996 }
997 
998 void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
999 			       unsigned long addr, unsigned long next)
1000 {
1001 	pte_t entry;
1002 	pte_t *ptep = pmdp_ptep(pmdp);
1003 
1004 	VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
1005 	entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
1006 	set_pte_at(&init_mm, addr, ptep, entry);
1007 	asm volatile("ptesync": : :"memory");
1008 
1009 	vmemmap_verify(ptep, node, addr, next);
1010 }
1011 
1012 static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
1013 						     int node,
1014 						     struct vmem_altmap *altmap,
1015 						     struct page *reuse)
1016 {
1017 	pte_t *pte = pte_offset_kernel(pmdp, addr);
1018 
1019 	if (pte_none(*pte)) {
1020 		pte_t entry;
1021 		void *p;
1022 
1023 		if (!reuse) {
1024 			/*
1025 			 * make sure we don't create altmap mappings
1026 			 * covering things outside the device.
1027 			 */
1028 			if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
1029 				altmap = NULL;
1030 
1031 			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
1032 			if (!p && altmap)
1033 				p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
1034 			if (!p)
1035 				return NULL;
1036 			pr_debug("PAGE_SIZE vmemmap mapping\n");
1037 		} else {
1038 			/*
1039 			 * When a PTE/PMD entry is freed from the init_mm
1040 			 * there's a free_pages() call to this page allocated
1041 			 * above. Thus this get_page() is paired with the
1042 			 * put_page_testzero() on the freeing path.
1043 			 * This can only be called by certain ZONE_DEVICE paths,
1044 			 * and through vmemmap_populate_compound_pages() when
1045 			 * slab is available.
1046 			 */
1047 			get_page(reuse);
1048 			p = page_to_virt(reuse);
1049 			pr_debug("Tail page reuse vmemmap mapping\n");
1050 		}
1051 
1052 		VM_BUG_ON(!PAGE_ALIGNED(addr));
1053 		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
1054 		set_pte_at(&init_mm, addr, pte, entry);
1055 		asm volatile("ptesync": : :"memory");
1056 	}
1057 	return pte;
1058 }
1059 
1060 static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
1061 				       unsigned long address)
1062 {
1063 	pud_t *pud;
1064 
1065 	/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
1066 	if (unlikely(p4d_none(*p4dp))) {
1067 		if (unlikely(!slab_is_available())) {
1068 			pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1069 			p4d_populate(&init_mm, p4dp, pud);
1070 			/* go to the pud_offset */
1071 		} else
1072 			return pud_alloc(&init_mm, p4dp, address);
1073 	}
1074 	return pud_offset(p4dp, address);
1075 }
1076 
1077 static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
1078 				       unsigned long address)
1079 {
1080 	pmd_t *pmd;
1081 
1082 	/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
1083 	if (unlikely(pud_none(*pudp))) {
1084 		if (unlikely(!slab_is_available())) {
1085 			pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1086 			pud_populate(&init_mm, pudp, pmd);
1087 		} else
1088 			return pmd_alloc(&init_mm, pudp, address);
1089 	}
1090 	return pmd_offset(pudp, address);
1091 }
1092 
1093 static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
1094 				       unsigned long address)
1095 {
1096 	pte_t *pte;
1097 
1098 	/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
1099 	if (unlikely(pmd_none(*pmdp))) {
1100 		if (unlikely(!slab_is_available())) {
1101 			pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1102 			pmd_populate(&init_mm, pmdp, pte);
1103 		} else
1104 			return pte_alloc_kernel(pmdp, address);
1105 	}
1106 	return pte_offset_kernel(pmdp, address);
1107 }
1108 
1109 
1110 
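/*
 * Populate the vmemmap for [start, end), preferring 2M PMD mappings when
 * the virtual address (and, with an altmap, the backing block) is
 * suitably aligned, and falling back to base page size mappings
 * otherwise.
 */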
1111 int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
1112 				      struct vmem_altmap *altmap)
1113 {
1114 	unsigned long addr;
1115 	unsigned long next;
1116 	pgd_t *pgd;
1117 	p4d_t *p4d;
1118 	pud_t *pud;
1119 	pmd_t *pmd;
1120 	pte_t *pte;
1121 
1122 	for (addr = start; addr < end; addr = next) {
1123 		next = pmd_addr_end(addr, end);
1124 
1125 		pgd = pgd_offset_k(addr);
1126 		p4d = p4d_offset(pgd, addr);
1127 		pud = vmemmap_pud_alloc(p4d, node, addr);
1128 		if (!pud)
1129 			return -ENOMEM;
1130 		pmd = vmemmap_pmd_alloc(pud, node, addr);
1131 		if (!pmd)
1132 			return -ENOMEM;
1133 
1134 		if (pmd_none(READ_ONCE(*pmd))) {
1135 			void *p;
1136 
1137 			/*
1138 			 * Keep it simple by checking addr for PMD_SIZE alignment
1139 			 * and verifying the device boundary condition.
1140 			 * For us to use a pmd mapping, both addr and pfn should
1141 			 * be aligned. We skip if addr is not aligned, and for
1142 			 * the pfn we hope we have extra area in the altmap that
1143 			 * can help to find an aligned block. This can result
1144 			 * in altmap block allocation failures, in which case
1145 			 * we fall back to RAM for the vmemmap allocation.
1146 			 */
1147 			if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
1148 				       altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
1149 				/*
1150 				 * make sure we don't create altmap mappings
1151 				 * covering things outside the device.
1152 				 */
1153 				goto base_mapping;
1154 			}
1155 
1156 			p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
1157 			if (p) {
1158 				vmemmap_set_pmd(pmd, p, node, addr, next);
1159 				pr_debug("PMD_SIZE vmemmap mapping\n");
1160 				continue;
1161 			} else if (altmap) {
1162 				/*
1163 				 * A vmemmap block allocation can fail due to
1164 				 * alignment requirements, because we try to align
1165 				 * things aggressively and can thereby run out of
1166 				 * space. Try a base mapping on failure.
1167 				 */
1168 				goto base_mapping;
1169 			}
1170 		} else if (vmemmap_check_pmd(pmd, node, addr, next)) {
1171 			/*
1172 			 * If a huge mapping exists due to an early call to
1173 			 * vmemmap_populate, let's try to use that.
1174 			 */
1175 			continue;
1176 		}
1177 base_mapping:
1178 		/*
1179 		 * Not able to allocate higher-order memory to back the memmap,
1180 		 * or we found a pointer to a pte page. Allocate a base page
1181 		 * size vmemmap.
1182 		 */
1183 		pte = vmemmap_pte_alloc(pmd, node, addr);
1184 		if (!pte)
1185 			return -ENOMEM;
1186 
1187 		pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
1188 		if (!pte)
1189 			return -ENOMEM;
1190 
1191 		vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1192 		next = addr + PAGE_SIZE;
1193 	}
1194 	return 0;
1195 }
1196 
1197 static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
1198 							 struct vmem_altmap *altmap,
1199 							 struct page *reuse)
1200 {
1201 	pgd_t *pgd;
1202 	p4d_t *p4d;
1203 	pud_t *pud;
1204 	pmd_t *pmd;
1205 	pte_t *pte;
1206 
1207 	pgd = pgd_offset_k(addr);
1208 	p4d = p4d_offset(pgd, addr);
1209 	pud = vmemmap_pud_alloc(p4d, node, addr);
1210 	if (!pud)
1211 		return NULL;
1212 	pmd = vmemmap_pmd_alloc(pud, node, addr);
1213 	if (!pmd)
1214 		return NULL;
1215 	if (pmd_leaf(*pmd))
1216 		/*
1217 		 * The second page is mapped as a hugepage due to a nearby request.
1218 		 * Force our mapping to page size without deduplication
1219 		 */
1220 		return NULL;
1221 	pte = vmemmap_pte_alloc(pmd, node, addr);
1222 	if (!pte)
1223 		return NULL;
1224 	radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1225 	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1226 
1227 	return pte;
1228 }
1229 
1230 static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
1231 						    unsigned long pfn_offset, int node)
1232 {
1233 	pgd_t *pgd;
1234 	p4d_t *p4d;
1235 	pud_t *pud;
1236 	pmd_t *pmd;
1237 	pte_t *pte;
1238 	unsigned long map_addr;
1239 
1240 	/* the second vmemmap page which we use for duplication */
1241 	map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
1242 	pgd = pgd_offset_k(map_addr);
1243 	p4d = p4d_offset(pgd, map_addr);
1244 	pud = vmemmap_pud_alloc(p4d, node, map_addr);
1245 	if (!pud)
1246 		return NULL;
1247 	pmd = vmemmap_pmd_alloc(pud, node, map_addr);
1248 	if (!pmd)
1249 		return NULL;
1250 	if (pmd_leaf(*pmd))
1251 		/*
1252 		 * The second page is mapped as a hugepage due to a nearby request.
1253 		 * Force our mapping to page size without deduplication
1254 		 */
1255 		return NULL;
1256 	pte = vmemmap_pte_alloc(pmd, node, map_addr);
1257 	if (!pte)
1258 		return NULL;
1259 	/*
1260 	 * Check if there exists a mapping to the left
1261 	 */
1262 	if (pte_none(*pte)) {
1263 		/*
1264 		 * Populate the head page vmemmap page.
1265 		 * It can fall in different pmd, hence
1266 		 * vmemmap_populate_address()
1267 		 */
1268 		pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
1269 		if (!pte)
1270 			return NULL;
1271 		/*
1272 		 * Populate the tail pages vmemmap page
1273 		 */
1274 		pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
1275 		if (!pte)
1276 			return NULL;
1277 		vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
1278 		return pte;
1279 	}
1280 	return pte;
1281 }
1282 
1283 int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
1284 					      unsigned long start,
1285 					      unsigned long end, int node,
1286 					      struct dev_pagemap *pgmap)
1287 {
1288 	/*
1289 	 * We want to map things with base page size mappings so that
1290 	 * we can save space in the vmemmap. We could have huge mappings
1291 	 * covering both edges.
1292 	 */
1293 	unsigned long addr;
1294 	unsigned long addr_pfn = start_pfn;
1295 	unsigned long next;
1296 	pgd_t *pgd;
1297 	p4d_t *p4d;
1298 	pud_t *pud;
1299 	pmd_t *pmd;
1300 	pte_t *pte;
1301 
1302 	for (addr = start; addr < end; addr = next) {
1303 
1304 		pgd = pgd_offset_k(addr);
1305 		p4d = p4d_offset(pgd, addr);
1306 		pud = vmemmap_pud_alloc(p4d, node, addr);
1307 		if (!pud)
1308 			return -ENOMEM;
1309 		pmd = vmemmap_pmd_alloc(pud, node, addr);
1310 		if (!pmd)
1311 			return -ENOMEM;
1312 
1313 		if (pmd_leaf(READ_ONCE(*pmd))) {
1314 			/* existing huge mapping. Skip the range */
1315 			addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
1316 			next = pmd_addr_end(addr, end);
1317 			continue;
1318 		}
1319 		pte = vmemmap_pte_alloc(pmd, node, addr);
1320 		if (!pte)
1321 			return -ENOMEM;
1322 		if (!pte_none(*pte)) {
1323 			/*
1324 			 * This could be because we already have a compound
1325 			 * page whose VMEMMAP_RESERVE_NR pages were mapped and
1326 			 * this request falls within those pages.
1327 			 */
1328 			addr_pfn += 1;
1329 			next = addr + PAGE_SIZE;
1330 			continue;
1331 		} else {
1332 			unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
1333 			unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
1334 			pte_t *tail_page_pte;
1335 
1336 			/*
1337 			 * If the address is aligned to the huge page size, it is
1338 			 * the head mapping.
1339 			 */
1340 			if (pfn_offset == 0) {
1341 				/* Populate the head page vmemmap page */
1342 				pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1343 				if (!pte)
1344 					return -ENOMEM;
1345 				vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1346 
1347 				/*
1348 				 * Populate the tail pages vmemmap page
1349 				 * It can fall in different pmd, hence
1350 				 * vmemmap_populate_address()
1351 				 */
1352 				pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
1353 				if (!pte)
1354 					return -ENOMEM;
1355 
1356 				addr_pfn += 2;
1357 				next = addr + 2 * PAGE_SIZE;
1358 				continue;
1359 			}
1360 			/*
1361 			 * Get the details of the 2nd mapping.
1362 			 * Also create it if it doesn't exist.
1363 			 */
1364 			tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
1365 			if (!tail_page_pte) {
1366 
1367 				pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1368 				if (!pte)
1369 					return -ENOMEM;
1370 				vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1371 
1372 				addr_pfn += 1;
1373 				next = addr + PAGE_SIZE;
1374 				continue;
1375 			}
1376 
1377 			pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
1378 			if (!pte)
1379 				return -ENOMEM;
1380 			vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1381 
1382 			addr_pfn += 1;
1383 			next = addr + PAGE_SIZE;
1384 			continue;
1385 		}
1386 	}
1387 	return 0;
1388 }
1389 
1390 
1391 #ifdef CONFIG_MEMORY_HOTPLUG
1392 void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
1393 {
1394 	remove_pagetable(start, start + page_size, true, NULL);
1395 }
1396 
1397 void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
1398 			       struct vmem_altmap *altmap)
1399 {
1400 	remove_pagetable(start, end, false, altmap);
1401 }
1402 #endif
1403 #endif
1404 
1405 #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
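/*
 * DEBUG_PAGEALLOC / KFENCE hook: make the linear mapping for these pages
 * valid or invalid so that stray accesses fault.
 */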
1406 void radix__kernel_map_pages(struct page *page, int numpages, int enable)
1407 {
1408 	unsigned long addr;
1409 
1410 	addr = (unsigned long)page_address(page);
1411 
1412 	if (enable)
1413 		set_memory_p(addr, numpages);
1414 	else
1415 		set_memory_np(addr, numpages);
1416 }
1417 #endif
1418 
1419 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1420 
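/*
 * Atomically clear and set bits in a huge page PMD and return the old
 * value. The caller must hold the PMD lock.
 */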
1421 unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
1422 				  pmd_t *pmdp, unsigned long clr,
1423 				  unsigned long set)
1424 {
1425 	unsigned long old;
1426 
1427 #ifdef CONFIG_DEBUG_VM
1428 	WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
1429 	assert_spin_locked(pmd_lockptr(mm, pmdp));
1430 #endif
1431 
1432 	old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
1433 	trace_hugepage_update_pmd(addr, old, clr, set);
1434 
1435 	return old;
1436 }
1437 
1438 unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
1439 					 pud_t *pudp, unsigned long clr,
1440 					 unsigned long set)
1441 {
1442 	unsigned long old;
1443 
1444 #ifdef CONFIG_DEBUG_VM
1445 	WARN_ON(!pud_devmap(*pudp));
1446 	assert_spin_locked(pud_lockptr(mm, pudp));
1447 #endif
1448 
1449 	old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
1450 	trace_hugepage_update_pud(addr, old, clr, set);
1451 
1452 	return old;
1453 }
1454 
1455 pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
1456 			pmd_t *pmdp)
1457 
1458 {
1459 	pmd_t pmd;
1460 
1461 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1462 	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
1463 	VM_BUG_ON(pmd_devmap(*pmdp));
1464 	/*
1465 	 * khugepaged calls this for normal pmd
1466 	 */
1467 	pmd = *pmdp;
1468 	pmd_clear(pmdp);
1469 
1470 	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
1471 
1472 	return pmd;
1473 }
1474 
1475 /*
1476  * For us pgtable_t is pte_t *. In order to save the deposited
1477  * page table, we consider the allocated page table as a list
1478  * head. On withdraw we need to make sure we zero out the used
1479  * list_head memory area.
1480  */
1481 void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
1482 				 pgtable_t pgtable)
1483 {
1484 	struct list_head *lh = (struct list_head *) pgtable;
1485 
1486 	assert_spin_locked(pmd_lockptr(mm, pmdp));
1487 
1488 	/* FIFO */
1489 	if (!pmd_huge_pte(mm, pmdp))
1490 		INIT_LIST_HEAD(lh);
1491 	else
1492 		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
1493 	pmd_huge_pte(mm, pmdp) = pgtable;
1494 }
1495 
1496 pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
1497 {
1498 	pte_t *ptep;
1499 	pgtable_t pgtable;
1500 	struct list_head *lh;
1501 
1502 	assert_spin_locked(pmd_lockptr(mm, pmdp));
1503 
1504 	/* FIFO */
1505 	pgtable = pmd_huge_pte(mm, pmdp);
1506 	lh = (struct list_head *) pgtable;
1507 	if (list_empty(lh))
1508 		pmd_huge_pte(mm, pmdp) = NULL;
1509 	else {
1510 		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
1511 		list_del(lh);
1512 	}
1513 	ptep = (pte_t *) pgtable;
1514 	*ptep = __pte(0);
1515 	ptep++;
1516 	*ptep = __pte(0);
1517 	return pgtable;
1518 }
1519 
1520 pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
1521 				     unsigned long addr, pmd_t *pmdp)
1522 {
1523 	pmd_t old_pmd;
1524 	unsigned long old;
1525 
1526 	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
1527 	old_pmd = __pmd(old);
1528 	return old_pmd;
1529 }
1530 
1531 pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
1532 				     unsigned long addr, pud_t *pudp)
1533 {
1534 	pud_t old_pud;
1535 	unsigned long old;
1536 
1537 	old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
1538 	old_pud = __pud(old);
1539 	return old_pud;
1540 }
1541 
1542 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1543 
1544 void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
1545 				  pte_t entry, unsigned long address, int psize)
1546 {
1547 	struct mm_struct *mm = vma->vm_mm;
1548 	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY |
1549 					      _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
1550 
1551 	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
1552 	/*
1553 	 * On POWER9, the NMMU is not able to relax PTE access permissions
1554 	 * for a translation cached in the TLB. The PTE must be invalidated
1555 	 * and the TLB flushed before the new PTE is installed.
1556 	 *
1557 	 * This only needs to be done for radix, because hash translation does
1558 	 * flush when updating the linux pte (and we don't support NMMU
1559 	 * accelerators on HPT on POWER9 anyway XXX: do we?).
1560 	 *
1561 	 * POWER10 (and P9P) NMMU does behave as per ISA.
1562 	 */
1563 	if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
1564 	    atomic_read(&mm->context.copros) > 0) {
1565 		unsigned long old_pte, new_pte;
1566 
1567 		old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
1568 		new_pte = old_pte | set;
1569 		radix__flush_tlb_page_psize(mm, address, psize);
1570 		__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
1571 	} else {
1572 		__radix_pte_update(ptep, 0, set);
1573 		/*
1574 		 * Book3S does not require a TLB flush when relaxing access
1575 		 * restrictions (modulo the POWER9 nest
1576 		 * MMU issue above), because the MMU will reload the PTE after
1577 		 * taking an access fault, as defined by the architecture. See
1578 		 * "Setting a Reference or Change Bit or Upgrading Access
1579 		 *  Authority (PTE Subject to Atomic Hardware Updates)" in
1580 		 *  Power ISA Version 3.1B.
1581 		 */
1582 	}
1583 	/* See ptesync comment in radix__set_pte_at */
1584 }
1585 
1586 void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
1587 				    unsigned long addr, pte_t *ptep,
1588 				    pte_t old_pte, pte_t pte)
1589 {
1590 	struct mm_struct *mm = vma->vm_mm;
1591 
1592 	/*
1593 	 * POWER9 NMMU must flush the TLB after clearing the PTE before
1594 	 * installing a PTE with more relaxed access permissions, see
1595 	 * radix__ptep_set_access_flags.
1596 	 */
1597 	if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
1598 	    is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
1599 	    (atomic_read(&mm->context.copros) > 0))
1600 		radix__flush_tlb_page(vma, addr);
1601 
1602 	set_pte_at(mm, addr, ptep, pte);
1603 }
1604 
1605 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
1606 {
1607 	pte_t *ptep = (pte_t *)pud;
1608 	pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);
1609 
1610 	if (!radix_enabled())
1611 		return 0;
1612 
1613 	set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);
1614 
1615 	return 1;
1616 }
1617 
1618 int pud_clear_huge(pud_t *pud)
1619 {
1620 	if (pud_is_leaf(*pud)) {
1621 		pud_clear(pud);
1622 		return 1;
1623 	}
1624 
1625 	return 0;
1626 }
1627 
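/*
 * Free the page tables hanging off a PUD (typically so the range can be
 * remapped with a huge page): clear the PUD, flush the kernel TLB for the
 * range, then free any PTE tables referenced by the PMD table and the PMD
 * table itself.
 */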
1628 int pud_free_pmd_page(pud_t *pud, unsigned long addr)
1629 {
1630 	pmd_t *pmd;
1631 	int i;
1632 
1633 	pmd = pud_pgtable(*pud);
1634 	pud_clear(pud);
1635 
1636 	flush_tlb_kernel_range(addr, addr + PUD_SIZE);
1637 
1638 	for (i = 0; i < PTRS_PER_PMD; i++) {
1639 		if (!pmd_none(pmd[i])) {
1640 			pte_t *pte;
1641 			pte = (pte_t *)pmd_page_vaddr(pmd[i]);
1642 
1643 			pte_free_kernel(&init_mm, pte);
1644 		}
1645 	}
1646 
1647 	pmd_free(&init_mm, pmd);
1648 
1649 	return 1;
1650 }
1651 
1652 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
1653 {
1654 	pte_t *ptep = (pte_t *)pmd;
1655 	pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);
1656 
1657 	if (!radix_enabled())
1658 		return 0;
1659 
1660 	set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);
1661 
1662 	return 1;
1663 }
1664 
1665 int pmd_clear_huge(pmd_t *pmd)
1666 {
1667 	if (pmd_is_leaf(*pmd)) {
1668 		pmd_clear(pmd);
1669 		return 1;
1670 	}
1671 
1672 	return 0;
1673 }
1674 
1675 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
1676 {
1677 	pte_t *pte;
1678 
1679 	pte = (pte_t *)pmd_page_vaddr(*pmd);
1680 	pmd_clear(pmd);
1681 
1682 	flush_tlb_kernel_range(addr, addr + PMD_SIZE);
1683 
1684 	pte_free_kernel(&init_mm, pte);
1685 
1686 	return 1;
1687 }
1688