1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * PowerPC64 port by Mike Corrigan and Dave Engebretsen 4 * {mikejc|engebret}@us.ibm.com 5 * 6 * Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com> 7 * 8 * SMP scalability work: 9 * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM 10 * 11 * Module name: htab.c 12 * 13 * Description: 14 * PowerPC Hashed Page Table functions 15 */ 16 17 #undef DEBUG 18 #undef DEBUG_LOW 19 20 #define pr_fmt(fmt) "hash-mmu: " fmt 21 #include <linux/spinlock.h> 22 #include <linux/errno.h> 23 #include <linux/sched/mm.h> 24 #include <linux/proc_fs.h> 25 #include <linux/stat.h> 26 #include <linux/sysctl.h> 27 #include <linux/export.h> 28 #include <linux/ctype.h> 29 #include <linux/cache.h> 30 #include <linux/init.h> 31 #include <linux/signal.h> 32 #include <linux/memblock.h> 33 #include <linux/context_tracking.h> 34 #include <linux/libfdt.h> 35 #include <linux/pkeys.h> 36 #include <linux/hugetlb.h> 37 #include <linux/cpu.h> 38 #include <linux/pgtable.h> 39 40 #include <asm/debugfs.h> 41 #include <asm/interrupt.h> 42 #include <asm/processor.h> 43 #include <asm/mmu.h> 44 #include <asm/mmu_context.h> 45 #include <asm/page.h> 46 #include <asm/types.h> 47 #include <linux/uaccess.h> 48 #include <asm/machdep.h> 49 #include <asm/prom.h> 50 #include <asm/io.h> 51 #include <asm/eeh.h> 52 #include <asm/tlb.h> 53 #include <asm/cacheflush.h> 54 #include <asm/cputable.h> 55 #include <asm/sections.h> 56 #include <asm/copro.h> 57 #include <asm/udbg.h> 58 #include <asm/code-patching.h> 59 #include <asm/fadump.h> 60 #include <asm/firmware.h> 61 #include <asm/tm.h> 62 #include <asm/trace.h> 63 #include <asm/ps3.h> 64 #include <asm/pte-walk.h> 65 #include <asm/asm-prototypes.h> 66 #include <asm/ultravisor.h> 67 68 #include <mm/mmu_decl.h> 69 70 #include "internal.h" 71 72 73 #ifdef DEBUG 74 #define DBG(fmt...) udbg_printf(fmt) 75 #else 76 #define DBG(fmt...) 77 #endif 78 79 #ifdef DEBUG_LOW 80 #define DBG_LOW(fmt...) udbg_printf(fmt) 81 #else 82 #define DBG_LOW(fmt...) 83 #endif 84 85 #define KB (1024) 86 #define MB (1024*KB) 87 #define GB (1024L*MB) 88 89 /* 90 * Note: pte --> Linux PTE 91 * HPTE --> PowerPC Hashed Page Table Entry 92 * 93 * Execution context: 94 * htab_initialize is called with the MMU off (of course), but 95 * the kernel has been copied down to zero so it can directly 96 * reference global data. At this point it is very difficult 97 * to print debug info. 98 * 99 */ 100 101 static unsigned long _SDR1; 102 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; 103 EXPORT_SYMBOL_GPL(mmu_psize_defs); 104 105 u8 hpte_page_sizes[1 << LP_BITS]; 106 EXPORT_SYMBOL_GPL(hpte_page_sizes); 107 108 struct hash_pte *htab_address; 109 unsigned long htab_size_bytes; 110 unsigned long htab_hash_mask; 111 EXPORT_SYMBOL_GPL(htab_hash_mask); 112 int mmu_linear_psize = MMU_PAGE_4K; 113 EXPORT_SYMBOL_GPL(mmu_linear_psize); 114 int mmu_virtual_psize = MMU_PAGE_4K; 115 int mmu_vmalloc_psize = MMU_PAGE_4K; 116 EXPORT_SYMBOL_GPL(mmu_vmalloc_psize); 117 #ifdef CONFIG_SPARSEMEM_VMEMMAP 118 int mmu_vmemmap_psize = MMU_PAGE_4K; 119 #endif 120 int mmu_io_psize = MMU_PAGE_4K; 121 int mmu_kernel_ssize = MMU_SEGSIZE_256M; 122 EXPORT_SYMBOL_GPL(mmu_kernel_ssize); 123 int mmu_highuser_ssize = MMU_SEGSIZE_256M; 124 u16 mmu_slb_size = 64; 125 EXPORT_SYMBOL_GPL(mmu_slb_size); 126 #ifdef CONFIG_PPC_64K_PAGES 127 int mmu_ci_restrictions; 128 #endif 129 #ifdef CONFIG_DEBUG_PAGEALLOC 130 static u8 *linear_map_hash_slots; 131 static unsigned long linear_map_hash_count; 132 static DEFINE_SPINLOCK(linear_map_hash_lock); 133 #endif /* CONFIG_DEBUG_PAGEALLOC */ 134 struct mmu_hash_ops mmu_hash_ops; 135 EXPORT_SYMBOL(mmu_hash_ops); 136 137 /* 138 * These are definitions of page sizes arrays to be used when none 139 * is provided by the firmware. 140 */ 141 142 /* 143 * Fallback (4k pages only) 144 */ 145 static struct mmu_psize_def mmu_psize_defaults[] = { 146 [MMU_PAGE_4K] = { 147 .shift = 12, 148 .sllp = 0, 149 .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, 150 .avpnm = 0, 151 .tlbiel = 0, 152 }, 153 }; 154 155 /* 156 * POWER4, GPUL, POWER5 157 * 158 * Support for 16Mb large pages 159 */ 160 static struct mmu_psize_def mmu_psize_defaults_gp[] = { 161 [MMU_PAGE_4K] = { 162 .shift = 12, 163 .sllp = 0, 164 .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, 165 .avpnm = 0, 166 .tlbiel = 1, 167 }, 168 [MMU_PAGE_16M] = { 169 .shift = 24, 170 .sllp = SLB_VSID_L, 171 .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0, 172 [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 }, 173 .avpnm = 0x1UL, 174 .tlbiel = 0, 175 }, 176 }; 177 178 /* 179 * 'R' and 'C' update notes: 180 * - Under pHyp or KVM, the updatepp path will not set C, thus it *will* 181 * create writeable HPTEs without C set, because the hcall H_PROTECT 182 * that we use in that case will not update C 183 * - The above is however not a problem, because we also don't do that 184 * fancy "no flush" variant of eviction and we use H_REMOVE which will 185 * do the right thing and thus we don't have the race I described earlier 186 * 187 * - Under bare metal, we do have the race, so we need R and C set 188 * - We make sure R is always set and never lost 189 * - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping 190 */ 191 unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags) 192 { 193 unsigned long rflags = 0; 194 195 /* _PAGE_EXEC -> NOEXEC */ 196 if ((pteflags & _PAGE_EXEC) == 0) 197 rflags |= HPTE_R_N; 198 /* 199 * PPP bits: 200 * Linux uses slb key 0 for kernel and 1 for user. 201 * kernel RW areas are mapped with PPP=0b000 202 * User area is mapped with PPP=0b010 for read/write 203 * or PPP=0b011 for read-only (including writeable but clean pages). 204 */ 205 if (pteflags & _PAGE_PRIVILEGED) { 206 /* 207 * Kernel read only mapped with ppp bits 0b110 208 */ 209 if (!(pteflags & _PAGE_WRITE)) { 210 if (mmu_has_feature(MMU_FTR_KERNEL_RO)) 211 rflags |= (HPTE_R_PP0 | 0x2); 212 else 213 rflags |= 0x3; 214 } 215 } else { 216 if (pteflags & _PAGE_RWX) 217 rflags |= 0x2; 218 if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) 219 rflags |= 0x1; 220 } 221 /* 222 * We can't allow hardware to update hpte bits. Hence always 223 * set 'R' bit and set 'C' if it is a write fault 224 */ 225 rflags |= HPTE_R_R; 226 227 if (pteflags & _PAGE_DIRTY) 228 rflags |= HPTE_R_C; 229 /* 230 * Add in WIG bits 231 */ 232 233 if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) 234 rflags |= HPTE_R_I; 235 else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT) 236 rflags |= (HPTE_R_I | HPTE_R_G); 237 else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO) 238 rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M); 239 else 240 /* 241 * Add memory coherence if cache inhibited is not set 242 */ 243 rflags |= HPTE_R_M; 244 245 rflags |= pte_to_hpte_pkey_bits(pteflags, flags); 246 return rflags; 247 } 248 249 int htab_bolt_mapping(unsigned long vstart, unsigned long vend, 250 unsigned long pstart, unsigned long prot, 251 int psize, int ssize) 252 { 253 unsigned long vaddr, paddr; 254 unsigned int step, shift; 255 int ret = 0; 256 257 shift = mmu_psize_defs[psize].shift; 258 step = 1 << shift; 259 260 prot = htab_convert_pte_flags(prot, HPTE_USE_KERNEL_KEY); 261 262 DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n", 263 vstart, vend, pstart, prot, psize, ssize); 264 265 /* Carefully map only the possible range */ 266 vaddr = ALIGN(vstart, step); 267 paddr = ALIGN(pstart, step); 268 vend = ALIGN_DOWN(vend, step); 269 270 for (; vaddr < vend; vaddr += step, paddr += step) { 271 unsigned long hash, hpteg; 272 unsigned long vsid = get_kernel_vsid(vaddr, ssize); 273 unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); 274 unsigned long tprot = prot; 275 bool secondary_hash = false; 276 277 /* 278 * If we hit a bad address return error. 279 */ 280 if (!vsid) 281 return -1; 282 /* Make kernel text executable */ 283 if (overlaps_kernel_text(vaddr, vaddr + step)) 284 tprot &= ~HPTE_R_N; 285 286 /* 287 * If relocatable, check if it overlaps interrupt vectors that 288 * are copied down to real 0. For relocatable kernel 289 * (e.g. kdump case) we copy interrupt vectors down to real 290 * address 0. Mark that region as executable. This is 291 * because on p8 system with relocation on exception feature 292 * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence 293 * in order to execute the interrupt handlers in virtual 294 * mode the vector region need to be marked as executable. 295 */ 296 if ((PHYSICAL_START > MEMORY_START) && 297 overlaps_interrupt_vector_text(vaddr, vaddr + step)) 298 tprot &= ~HPTE_R_N; 299 300 hash = hpt_hash(vpn, shift, ssize); 301 hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); 302 303 BUG_ON(!mmu_hash_ops.hpte_insert); 304 repeat: 305 ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot, 306 HPTE_V_BOLTED, psize, psize, 307 ssize); 308 if (ret == -1) { 309 /* 310 * Try to to keep bolted entries in primary. 311 * Remove non bolted entries and try insert again 312 */ 313 ret = mmu_hash_ops.hpte_remove(hpteg); 314 if (ret != -1) 315 ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot, 316 HPTE_V_BOLTED, psize, psize, 317 ssize); 318 if (ret == -1 && !secondary_hash) { 319 secondary_hash = true; 320 hpteg = ((~hash & htab_hash_mask) * HPTES_PER_GROUP); 321 goto repeat; 322 } 323 } 324 325 if (ret < 0) 326 break; 327 328 cond_resched(); 329 #ifdef CONFIG_DEBUG_PAGEALLOC 330 if (debug_pagealloc_enabled() && 331 (paddr >> PAGE_SHIFT) < linear_map_hash_count) 332 linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; 333 #endif /* CONFIG_DEBUG_PAGEALLOC */ 334 } 335 return ret < 0 ? ret : 0; 336 } 337 338 int htab_remove_mapping(unsigned long vstart, unsigned long vend, 339 int psize, int ssize) 340 { 341 unsigned long vaddr, time_limit; 342 unsigned int step, shift; 343 int rc; 344 int ret = 0; 345 346 shift = mmu_psize_defs[psize].shift; 347 step = 1 << shift; 348 349 if (!mmu_hash_ops.hpte_removebolted) 350 return -ENODEV; 351 352 /* Unmap the full range specificied */ 353 vaddr = ALIGN_DOWN(vstart, step); 354 time_limit = jiffies + HZ; 355 356 for (;vaddr < vend; vaddr += step) { 357 rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize); 358 359 /* 360 * For large number of mappings introduce a cond_resched() 361 * to prevent softlockup warnings. 362 */ 363 if (time_after(jiffies, time_limit)) { 364 cond_resched(); 365 time_limit = jiffies + HZ; 366 } 367 if (rc == -ENOENT) { 368 ret = -ENOENT; 369 continue; 370 } 371 if (rc < 0) 372 return rc; 373 } 374 375 return ret; 376 } 377 378 static bool disable_1tb_segments = false; 379 380 static int __init parse_disable_1tb_segments(char *p) 381 { 382 disable_1tb_segments = true; 383 return 0; 384 } 385 early_param("disable_1tb_segments", parse_disable_1tb_segments); 386 387 static int __init htab_dt_scan_seg_sizes(unsigned long node, 388 const char *uname, int depth, 389 void *data) 390 { 391 const char *type = of_get_flat_dt_prop(node, "device_type", NULL); 392 const __be32 *prop; 393 int size = 0; 394 395 /* We are scanning "cpu" nodes only */ 396 if (type == NULL || strcmp(type, "cpu") != 0) 397 return 0; 398 399 prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size); 400 if (prop == NULL) 401 return 0; 402 for (; size >= 4; size -= 4, ++prop) { 403 if (be32_to_cpu(prop[0]) == 40) { 404 DBG("1T segment support detected\n"); 405 406 if (disable_1tb_segments) { 407 DBG("1T segments disabled by command line\n"); 408 break; 409 } 410 411 cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT; 412 return 1; 413 } 414 } 415 cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; 416 return 0; 417 } 418 419 static int __init get_idx_from_shift(unsigned int shift) 420 { 421 int idx = -1; 422 423 switch (shift) { 424 case 0xc: 425 idx = MMU_PAGE_4K; 426 break; 427 case 0x10: 428 idx = MMU_PAGE_64K; 429 break; 430 case 0x14: 431 idx = MMU_PAGE_1M; 432 break; 433 case 0x18: 434 idx = MMU_PAGE_16M; 435 break; 436 case 0x22: 437 idx = MMU_PAGE_16G; 438 break; 439 } 440 return idx; 441 } 442 443 static int __init htab_dt_scan_page_sizes(unsigned long node, 444 const char *uname, int depth, 445 void *data) 446 { 447 const char *type = of_get_flat_dt_prop(node, "device_type", NULL); 448 const __be32 *prop; 449 int size = 0; 450 451 /* We are scanning "cpu" nodes only */ 452 if (type == NULL || strcmp(type, "cpu") != 0) 453 return 0; 454 455 prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size); 456 if (!prop) 457 return 0; 458 459 pr_info("Page sizes from device-tree:\n"); 460 size /= 4; 461 cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE); 462 while(size > 0) { 463 unsigned int base_shift = be32_to_cpu(prop[0]); 464 unsigned int slbenc = be32_to_cpu(prop[1]); 465 unsigned int lpnum = be32_to_cpu(prop[2]); 466 struct mmu_psize_def *def; 467 int idx, base_idx; 468 469 size -= 3; prop += 3; 470 base_idx = get_idx_from_shift(base_shift); 471 if (base_idx < 0) { 472 /* skip the pte encoding also */ 473 prop += lpnum * 2; size -= lpnum * 2; 474 continue; 475 } 476 def = &mmu_psize_defs[base_idx]; 477 if (base_idx == MMU_PAGE_16M) 478 cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE; 479 480 def->shift = base_shift; 481 if (base_shift <= 23) 482 def->avpnm = 0; 483 else 484 def->avpnm = (1 << (base_shift - 23)) - 1; 485 def->sllp = slbenc; 486 /* 487 * We don't know for sure what's up with tlbiel, so 488 * for now we only set it for 4K and 64K pages 489 */ 490 if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K) 491 def->tlbiel = 1; 492 else 493 def->tlbiel = 0; 494 495 while (size > 0 && lpnum) { 496 unsigned int shift = be32_to_cpu(prop[0]); 497 int penc = be32_to_cpu(prop[1]); 498 499 prop += 2; size -= 2; 500 lpnum--; 501 502 idx = get_idx_from_shift(shift); 503 if (idx < 0) 504 continue; 505 506 if (penc == -1) 507 pr_err("Invalid penc for base_shift=%d " 508 "shift=%d\n", base_shift, shift); 509 510 def->penc[idx] = penc; 511 pr_info("base_shift=%d: shift=%d, sllp=0x%04lx," 512 " avpnm=0x%08lx, tlbiel=%d, penc=%d\n", 513 base_shift, shift, def->sllp, 514 def->avpnm, def->tlbiel, def->penc[idx]); 515 } 516 } 517 518 return 1; 519 } 520 521 #ifdef CONFIG_HUGETLB_PAGE 522 /* 523 * Scan for 16G memory blocks that have been set aside for huge pages 524 * and reserve those blocks for 16G huge pages. 525 */ 526 static int __init htab_dt_scan_hugepage_blocks(unsigned long node, 527 const char *uname, int depth, 528 void *data) { 529 const char *type = of_get_flat_dt_prop(node, "device_type", NULL); 530 const __be64 *addr_prop; 531 const __be32 *page_count_prop; 532 unsigned int expected_pages; 533 long unsigned int phys_addr; 534 long unsigned int block_size; 535 536 /* We are scanning "memory" nodes only */ 537 if (type == NULL || strcmp(type, "memory") != 0) 538 return 0; 539 540 /* 541 * This property is the log base 2 of the number of virtual pages that 542 * will represent this memory block. 543 */ 544 page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL); 545 if (page_count_prop == NULL) 546 return 0; 547 expected_pages = (1 << be32_to_cpu(page_count_prop[0])); 548 addr_prop = of_get_flat_dt_prop(node, "reg", NULL); 549 if (addr_prop == NULL) 550 return 0; 551 phys_addr = be64_to_cpu(addr_prop[0]); 552 block_size = be64_to_cpu(addr_prop[1]); 553 if (block_size != (16 * GB)) 554 return 0; 555 printk(KERN_INFO "Huge page(16GB) memory: " 556 "addr = 0x%lX size = 0x%lX pages = %d\n", 557 phys_addr, block_size, expected_pages); 558 if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) { 559 memblock_reserve(phys_addr, block_size * expected_pages); 560 pseries_add_gpage(phys_addr, block_size, expected_pages); 561 } 562 return 0; 563 } 564 #endif /* CONFIG_HUGETLB_PAGE */ 565 566 static void mmu_psize_set_default_penc(void) 567 { 568 int bpsize, apsize; 569 for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) 570 for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++) 571 mmu_psize_defs[bpsize].penc[apsize] = -1; 572 } 573 574 #ifdef CONFIG_PPC_64K_PAGES 575 576 static bool might_have_hea(void) 577 { 578 /* 579 * The HEA ethernet adapter requires awareness of the 580 * GX bus. Without that awareness we can easily assume 581 * we will never see an HEA ethernet device. 582 */ 583 #ifdef CONFIG_IBMEBUS 584 return !cpu_has_feature(CPU_FTR_ARCH_207S) && 585 firmware_has_feature(FW_FEATURE_SPLPAR); 586 #else 587 return false; 588 #endif 589 } 590 591 #endif /* #ifdef CONFIG_PPC_64K_PAGES */ 592 593 static void __init htab_scan_page_sizes(void) 594 { 595 int rc; 596 597 /* se the invalid penc to -1 */ 598 mmu_psize_set_default_penc(); 599 600 /* Default to 4K pages only */ 601 memcpy(mmu_psize_defs, mmu_psize_defaults, 602 sizeof(mmu_psize_defaults)); 603 604 /* 605 * Try to find the available page sizes in the device-tree 606 */ 607 rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL); 608 if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) { 609 /* 610 * Nothing in the device-tree, but the CPU supports 16M pages, 611 * so let's fallback on a known size list for 16M capable CPUs. 612 */ 613 memcpy(mmu_psize_defs, mmu_psize_defaults_gp, 614 sizeof(mmu_psize_defaults_gp)); 615 } 616 617 #ifdef CONFIG_HUGETLB_PAGE 618 if (!hugetlb_disabled && !early_radix_enabled() ) { 619 /* Reserve 16G huge page memory sections for huge pages */ 620 of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); 621 } 622 #endif /* CONFIG_HUGETLB_PAGE */ 623 } 624 625 /* 626 * Fill in the hpte_page_sizes[] array. 627 * We go through the mmu_psize_defs[] array looking for all the 628 * supported base/actual page size combinations. Each combination 629 * has a unique pagesize encoding (penc) value in the low bits of 630 * the LP field of the HPTE. For actual page sizes less than 1MB, 631 * some of the upper LP bits are used for RPN bits, meaning that 632 * we need to fill in several entries in hpte_page_sizes[]. 633 * 634 * In diagrammatic form, with r = RPN bits and z = page size bits: 635 * PTE LP actual page size 636 * rrrr rrrz >=8KB 637 * rrrr rrzz >=16KB 638 * rrrr rzzz >=32KB 639 * rrrr zzzz >=64KB 640 * ... 641 * 642 * The zzzz bits are implementation-specific but are chosen so that 643 * no encoding for a larger page size uses the same value in its 644 * low-order N bits as the encoding for the 2^(12+N) byte page size 645 * (if it exists). 646 */ 647 static void init_hpte_page_sizes(void) 648 { 649 long int ap, bp; 650 long int shift, penc; 651 652 for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) { 653 if (!mmu_psize_defs[bp].shift) 654 continue; /* not a supported page size */ 655 for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) { 656 penc = mmu_psize_defs[bp].penc[ap]; 657 if (penc == -1 || !mmu_psize_defs[ap].shift) 658 continue; 659 shift = mmu_psize_defs[ap].shift - LP_SHIFT; 660 if (shift <= 0) 661 continue; /* should never happen */ 662 /* 663 * For page sizes less than 1MB, this loop 664 * replicates the entry for all possible values 665 * of the rrrr bits. 666 */ 667 while (penc < (1 << LP_BITS)) { 668 hpte_page_sizes[penc] = (ap << 4) | bp; 669 penc += 1 << shift; 670 } 671 } 672 } 673 } 674 675 static void __init htab_init_page_sizes(void) 676 { 677 bool aligned = true; 678 init_hpte_page_sizes(); 679 680 if (!debug_pagealloc_enabled()) { 681 /* 682 * Pick a size for the linear mapping. Currently, we only 683 * support 16M, 1M and 4K which is the default 684 */ 685 if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && 686 (unsigned long)_stext % 0x1000000) { 687 if (mmu_psize_defs[MMU_PAGE_16M].shift) 688 pr_warn("Kernel not 16M aligned, disabling 16M linear map alignment\n"); 689 aligned = false; 690 } 691 692 if (mmu_psize_defs[MMU_PAGE_16M].shift && aligned) 693 mmu_linear_psize = MMU_PAGE_16M; 694 else if (mmu_psize_defs[MMU_PAGE_1M].shift) 695 mmu_linear_psize = MMU_PAGE_1M; 696 } 697 698 #ifdef CONFIG_PPC_64K_PAGES 699 /* 700 * Pick a size for the ordinary pages. Default is 4K, we support 701 * 64K for user mappings and vmalloc if supported by the processor. 702 * We only use 64k for ioremap if the processor 703 * (and firmware) support cache-inhibited large pages. 704 * If not, we use 4k and set mmu_ci_restrictions so that 705 * hash_page knows to switch processes that use cache-inhibited 706 * mappings to 4k pages. 707 */ 708 if (mmu_psize_defs[MMU_PAGE_64K].shift) { 709 mmu_virtual_psize = MMU_PAGE_64K; 710 mmu_vmalloc_psize = MMU_PAGE_64K; 711 if (mmu_linear_psize == MMU_PAGE_4K) 712 mmu_linear_psize = MMU_PAGE_64K; 713 if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) { 714 /* 715 * When running on pSeries using 64k pages for ioremap 716 * would stop us accessing the HEA ethernet. So if we 717 * have the chance of ever seeing one, stay at 4k. 718 */ 719 if (!might_have_hea()) 720 mmu_io_psize = MMU_PAGE_64K; 721 } else 722 mmu_ci_restrictions = 1; 723 } 724 #endif /* CONFIG_PPC_64K_PAGES */ 725 726 #ifdef CONFIG_SPARSEMEM_VMEMMAP 727 /* 728 * We try to use 16M pages for vmemmap if that is supported 729 * and we have at least 1G of RAM at boot 730 */ 731 if (mmu_psize_defs[MMU_PAGE_16M].shift && 732 memblock_phys_mem_size() >= 0x40000000) 733 mmu_vmemmap_psize = MMU_PAGE_16M; 734 else 735 mmu_vmemmap_psize = mmu_virtual_psize; 736 #endif /* CONFIG_SPARSEMEM_VMEMMAP */ 737 738 printk(KERN_DEBUG "Page orders: linear mapping = %d, " 739 "virtual = %d, io = %d" 740 #ifdef CONFIG_SPARSEMEM_VMEMMAP 741 ", vmemmap = %d" 742 #endif 743 "\n", 744 mmu_psize_defs[mmu_linear_psize].shift, 745 mmu_psize_defs[mmu_virtual_psize].shift, 746 mmu_psize_defs[mmu_io_psize].shift 747 #ifdef CONFIG_SPARSEMEM_VMEMMAP 748 ,mmu_psize_defs[mmu_vmemmap_psize].shift 749 #endif 750 ); 751 } 752 753 static int __init htab_dt_scan_pftsize(unsigned long node, 754 const char *uname, int depth, 755 void *data) 756 { 757 const char *type = of_get_flat_dt_prop(node, "device_type", NULL); 758 const __be32 *prop; 759 760 /* We are scanning "cpu" nodes only */ 761 if (type == NULL || strcmp(type, "cpu") != 0) 762 return 0; 763 764 prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL); 765 if (prop != NULL) { 766 /* pft_size[0] is the NUMA CEC cookie */ 767 ppc64_pft_size = be32_to_cpu(prop[1]); 768 return 1; 769 } 770 return 0; 771 } 772 773 unsigned htab_shift_for_mem_size(unsigned long mem_size) 774 { 775 unsigned memshift = __ilog2(mem_size); 776 unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift; 777 unsigned pteg_shift; 778 779 /* round mem_size up to next power of 2 */ 780 if ((1UL << memshift) < mem_size) 781 memshift += 1; 782 783 /* aim for 2 pages / pteg */ 784 pteg_shift = memshift - (pshift + 1); 785 786 /* 787 * 2^11 PTEGS of 128 bytes each, ie. 2^18 bytes is the minimum htab 788 * size permitted by the architecture. 789 */ 790 return max(pteg_shift + 7, 18U); 791 } 792 793 static unsigned long __init htab_get_table_size(void) 794 { 795 /* 796 * If hash size isn't already provided by the platform, we try to 797 * retrieve it from the device-tree. If it's not there neither, we 798 * calculate it now based on the total RAM size 799 */ 800 if (ppc64_pft_size == 0) 801 of_scan_flat_dt(htab_dt_scan_pftsize, NULL); 802 if (ppc64_pft_size) 803 return 1UL << ppc64_pft_size; 804 805 return 1UL << htab_shift_for_mem_size(memblock_phys_mem_size()); 806 } 807 808 #ifdef CONFIG_MEMORY_HOTPLUG 809 static int resize_hpt_for_hotplug(unsigned long new_mem_size) 810 { 811 unsigned target_hpt_shift; 812 813 if (!mmu_hash_ops.resize_hpt) 814 return 0; 815 816 target_hpt_shift = htab_shift_for_mem_size(new_mem_size); 817 818 /* 819 * To avoid lots of HPT resizes if memory size is fluctuating 820 * across a boundary, we deliberately have some hysterisis 821 * here: we immediately increase the HPT size if the target 822 * shift exceeds the current shift, but we won't attempt to 823 * reduce unless the target shift is at least 2 below the 824 * current shift 825 */ 826 if (target_hpt_shift > ppc64_pft_size || 827 target_hpt_shift < ppc64_pft_size - 1) 828 return mmu_hash_ops.resize_hpt(target_hpt_shift); 829 830 return 0; 831 } 832 833 int hash__create_section_mapping(unsigned long start, unsigned long end, 834 int nid, pgprot_t prot) 835 { 836 int rc; 837 838 if (end >= H_VMALLOC_START) { 839 pr_warn("Outside the supported range\n"); 840 return -1; 841 } 842 843 resize_hpt_for_hotplug(memblock_phys_mem_size()); 844 845 rc = htab_bolt_mapping(start, end, __pa(start), 846 pgprot_val(prot), mmu_linear_psize, 847 mmu_kernel_ssize); 848 849 if (rc < 0) { 850 int rc2 = htab_remove_mapping(start, end, mmu_linear_psize, 851 mmu_kernel_ssize); 852 BUG_ON(rc2 && (rc2 != -ENOENT)); 853 } 854 return rc; 855 } 856 857 int hash__remove_section_mapping(unsigned long start, unsigned long end) 858 { 859 int rc = htab_remove_mapping(start, end, mmu_linear_psize, 860 mmu_kernel_ssize); 861 862 if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC) 863 pr_warn("Hash collision while resizing HPT\n"); 864 865 return rc; 866 } 867 #endif /* CONFIG_MEMORY_HOTPLUG */ 868 869 static void __init hash_init_partition_table(phys_addr_t hash_table, 870 unsigned long htab_size) 871 { 872 mmu_partition_table_init(); 873 874 /* 875 * PS field (VRMA page size) is not used for LPID 0, hence set to 0. 876 * For now, UPRT is 0 and we have no segment table. 877 */ 878 htab_size = __ilog2(htab_size) - 18; 879 mmu_partition_table_set_entry(0, hash_table | htab_size, 0, false); 880 pr_info("Partition table %p\n", partition_tb); 881 } 882 883 static void __init htab_initialize(void) 884 { 885 unsigned long table; 886 unsigned long pteg_count; 887 unsigned long prot; 888 phys_addr_t base = 0, size = 0, end; 889 u64 i; 890 891 DBG(" -> htab_initialize()\n"); 892 893 if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { 894 mmu_kernel_ssize = MMU_SEGSIZE_1T; 895 mmu_highuser_ssize = MMU_SEGSIZE_1T; 896 printk(KERN_INFO "Using 1TB segments\n"); 897 } 898 899 if (stress_slb_enabled) 900 static_branch_enable(&stress_slb_key); 901 902 /* 903 * Calculate the required size of the htab. We want the number of 904 * PTEGs to equal one half the number of real pages. 905 */ 906 htab_size_bytes = htab_get_table_size(); 907 pteg_count = htab_size_bytes >> 7; 908 909 htab_hash_mask = pteg_count - 1; 910 911 if (firmware_has_feature(FW_FEATURE_LPAR) || 912 firmware_has_feature(FW_FEATURE_PS3_LV1)) { 913 /* Using a hypervisor which owns the htab */ 914 htab_address = NULL; 915 _SDR1 = 0; 916 #ifdef CONFIG_FA_DUMP 917 /* 918 * If firmware assisted dump is active firmware preserves 919 * the contents of htab along with entire partition memory. 920 * Clear the htab if firmware assisted dump is active so 921 * that we dont end up using old mappings. 922 */ 923 if (is_fadump_active() && mmu_hash_ops.hpte_clear_all) 924 mmu_hash_ops.hpte_clear_all(); 925 #endif 926 } else { 927 unsigned long limit = MEMBLOCK_ALLOC_ANYWHERE; 928 929 #ifdef CONFIG_PPC_CELL 930 /* 931 * Cell may require the hash table down low when using the 932 * Axon IOMMU in order to fit the dynamic region over it, see 933 * comments in cell/iommu.c 934 */ 935 if (fdt_subnode_offset(initial_boot_params, 0, "axon") > 0) { 936 limit = 0x80000000; 937 pr_info("Hash table forced below 2G for Axon IOMMU\n"); 938 } 939 #endif /* CONFIG_PPC_CELL */ 940 941 table = memblock_phys_alloc_range(htab_size_bytes, 942 htab_size_bytes, 943 0, limit); 944 if (!table) 945 panic("ERROR: Failed to allocate %pa bytes below %pa\n", 946 &htab_size_bytes, &limit); 947 948 DBG("Hash table allocated at %lx, size: %lx\n", table, 949 htab_size_bytes); 950 951 htab_address = __va(table); 952 953 /* htab absolute addr + encoded htabsize */ 954 _SDR1 = table + __ilog2(htab_size_bytes) - 18; 955 956 /* Initialize the HPT with no entries */ 957 memset((void *)table, 0, htab_size_bytes); 958 959 if (!cpu_has_feature(CPU_FTR_ARCH_300)) 960 /* Set SDR1 */ 961 mtspr(SPRN_SDR1, _SDR1); 962 else 963 hash_init_partition_table(table, htab_size_bytes); 964 } 965 966 prot = pgprot_val(PAGE_KERNEL); 967 968 #ifdef CONFIG_DEBUG_PAGEALLOC 969 if (debug_pagealloc_enabled()) { 970 linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; 971 linear_map_hash_slots = memblock_alloc_try_nid( 972 linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT, 973 ppc64_rma_size, NUMA_NO_NODE); 974 if (!linear_map_hash_slots) 975 panic("%s: Failed to allocate %lu bytes max_addr=%pa\n", 976 __func__, linear_map_hash_count, &ppc64_rma_size); 977 } 978 #endif /* CONFIG_DEBUG_PAGEALLOC */ 979 980 /* create bolted the linear mapping in the hash table */ 981 for_each_mem_range(i, &base, &end) { 982 size = end - base; 983 base = (unsigned long)__va(base); 984 985 DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", 986 base, size, prot); 987 988 if ((base + size) >= H_VMALLOC_START) { 989 pr_warn("Outside the supported range\n"); 990 continue; 991 } 992 993 BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), 994 prot, mmu_linear_psize, mmu_kernel_ssize)); 995 } 996 memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); 997 998 /* 999 * If we have a memory_limit and we've allocated TCEs then we need to 1000 * explicitly map the TCE area at the top of RAM. We also cope with the 1001 * case that the TCEs start below memory_limit. 1002 * tce_alloc_start/end are 16MB aligned so the mapping should work 1003 * for either 4K or 16MB pages. 1004 */ 1005 if (tce_alloc_start) { 1006 tce_alloc_start = (unsigned long)__va(tce_alloc_start); 1007 tce_alloc_end = (unsigned long)__va(tce_alloc_end); 1008 1009 if (base + size >= tce_alloc_start) 1010 tce_alloc_start = base + size + 1; 1011 1012 BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end, 1013 __pa(tce_alloc_start), prot, 1014 mmu_linear_psize, mmu_kernel_ssize)); 1015 } 1016 1017 1018 DBG(" <- htab_initialize()\n"); 1019 } 1020 #undef KB 1021 #undef MB 1022 1023 void __init hash__early_init_devtree(void) 1024 { 1025 /* Initialize segment sizes */ 1026 of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL); 1027 1028 /* Initialize page sizes */ 1029 htab_scan_page_sizes(); 1030 } 1031 1032 static struct hash_mm_context init_hash_mm_context; 1033 void __init hash__early_init_mmu(void) 1034 { 1035 #ifndef CONFIG_PPC_64K_PAGES 1036 /* 1037 * We have code in __hash_page_4K() and elsewhere, which assumes it can 1038 * do the following: 1039 * new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX); 1040 * 1041 * Where the slot number is between 0-15, and values of 8-15 indicate 1042 * the secondary bucket. For that code to work H_PAGE_F_SECOND and 1043 * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and 1044 * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here 1045 * with a BUILD_BUG_ON(). 1046 */ 1047 BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3))); 1048 #endif /* CONFIG_PPC_64K_PAGES */ 1049 1050 htab_init_page_sizes(); 1051 1052 /* 1053 * initialize page table size 1054 */ 1055 __pte_frag_nr = H_PTE_FRAG_NR; 1056 __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; 1057 __pmd_frag_nr = H_PMD_FRAG_NR; 1058 __pmd_frag_size_shift = H_PMD_FRAG_SIZE_SHIFT; 1059 1060 __pte_index_size = H_PTE_INDEX_SIZE; 1061 __pmd_index_size = H_PMD_INDEX_SIZE; 1062 __pud_index_size = H_PUD_INDEX_SIZE; 1063 __pgd_index_size = H_PGD_INDEX_SIZE; 1064 __pud_cache_index = H_PUD_CACHE_INDEX; 1065 __pte_table_size = H_PTE_TABLE_SIZE; 1066 __pmd_table_size = H_PMD_TABLE_SIZE; 1067 __pud_table_size = H_PUD_TABLE_SIZE; 1068 __pgd_table_size = H_PGD_TABLE_SIZE; 1069 /* 1070 * 4k use hugepd format, so for hash set then to 1071 * zero 1072 */ 1073 __pmd_val_bits = HASH_PMD_VAL_BITS; 1074 __pud_val_bits = HASH_PUD_VAL_BITS; 1075 __pgd_val_bits = HASH_PGD_VAL_BITS; 1076 1077 __kernel_virt_start = H_KERN_VIRT_START; 1078 __vmalloc_start = H_VMALLOC_START; 1079 __vmalloc_end = H_VMALLOC_END; 1080 __kernel_io_start = H_KERN_IO_START; 1081 __kernel_io_end = H_KERN_IO_END; 1082 vmemmap = (struct page *)H_VMEMMAP_START; 1083 ioremap_bot = IOREMAP_BASE; 1084 1085 #ifdef CONFIG_PCI 1086 pci_io_base = ISA_IO_BASE; 1087 #endif 1088 1089 /* Select appropriate backend */ 1090 if (firmware_has_feature(FW_FEATURE_PS3_LV1)) 1091 ps3_early_mm_init(); 1092 else if (firmware_has_feature(FW_FEATURE_LPAR)) 1093 hpte_init_pseries(); 1094 else if (IS_ENABLED(CONFIG_PPC_NATIVE)) 1095 hpte_init_native(); 1096 1097 if (!mmu_hash_ops.hpte_insert) 1098 panic("hash__early_init_mmu: No MMU hash ops defined!\n"); 1099 1100 /* 1101 * Initialize the MMU Hash table and create the linear mapping 1102 * of memory. Has to be done before SLB initialization as this is 1103 * currently where the page size encoding is obtained. 1104 */ 1105 htab_initialize(); 1106 1107 init_mm.context.hash_context = &init_hash_mm_context; 1108 mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT); 1109 1110 pr_info("Initializing hash mmu with SLB\n"); 1111 /* Initialize SLB management */ 1112 slb_initialize(); 1113 1114 if (cpu_has_feature(CPU_FTR_ARCH_206) 1115 && cpu_has_feature(CPU_FTR_HVMODE)) 1116 tlbiel_all(); 1117 } 1118 1119 #ifdef CONFIG_SMP 1120 void hash__early_init_mmu_secondary(void) 1121 { 1122 /* Initialize hash table for that CPU */ 1123 if (!firmware_has_feature(FW_FEATURE_LPAR)) { 1124 1125 if (!cpu_has_feature(CPU_FTR_ARCH_300)) 1126 mtspr(SPRN_SDR1, _SDR1); 1127 else 1128 set_ptcr_when_no_uv(__pa(partition_tb) | 1129 (PATB_SIZE_SHIFT - 12)); 1130 } 1131 /* Initialize SLB */ 1132 slb_initialize(); 1133 1134 if (cpu_has_feature(CPU_FTR_ARCH_206) 1135 && cpu_has_feature(CPU_FTR_HVMODE)) 1136 tlbiel_all(); 1137 1138 #ifdef CONFIG_PPC_MEM_KEYS 1139 if (mmu_has_feature(MMU_FTR_PKEY)) 1140 mtspr(SPRN_UAMOR, default_uamor); 1141 #endif 1142 } 1143 #endif /* CONFIG_SMP */ 1144 1145 /* 1146 * Called by asm hashtable.S for doing lazy icache flush 1147 */ 1148 unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) 1149 { 1150 struct page *page; 1151 1152 if (!pfn_valid(pte_pfn(pte))) 1153 return pp; 1154 1155 page = pte_page(pte); 1156 1157 /* page is dirty */ 1158 if (!test_bit(PG_dcache_clean, &page->flags) && !PageReserved(page)) { 1159 if (trap == 0x400) { 1160 flush_dcache_icache_page(page); 1161 set_bit(PG_dcache_clean, &page->flags); 1162 } else 1163 pp |= HPTE_R_N; 1164 } 1165 return pp; 1166 } 1167 1168 #ifdef CONFIG_PPC_MM_SLICES 1169 static unsigned int get_paca_psize(unsigned long addr) 1170 { 1171 unsigned char *psizes; 1172 unsigned long index, mask_index; 1173 1174 if (addr < SLICE_LOW_TOP) { 1175 psizes = get_paca()->mm_ctx_low_slices_psize; 1176 index = GET_LOW_SLICE_INDEX(addr); 1177 } else { 1178 psizes = get_paca()->mm_ctx_high_slices_psize; 1179 index = GET_HIGH_SLICE_INDEX(addr); 1180 } 1181 mask_index = index & 0x1; 1182 return (psizes[index >> 1] >> (mask_index * 4)) & 0xF; 1183 } 1184 1185 #else 1186 unsigned int get_paca_psize(unsigned long addr) 1187 { 1188 return get_paca()->mm_ctx_user_psize; 1189 } 1190 #endif 1191 1192 /* 1193 * Demote a segment to using 4k pages. 1194 * For now this makes the whole process use 4k pages. 1195 */ 1196 #ifdef CONFIG_PPC_64K_PAGES 1197 void demote_segment_4k(struct mm_struct *mm, unsigned long addr) 1198 { 1199 if (get_slice_psize(mm, addr) == MMU_PAGE_4K) 1200 return; 1201 slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); 1202 copro_flush_all_slbs(mm); 1203 if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { 1204 1205 copy_mm_to_paca(mm); 1206 slb_flush_and_restore_bolted(); 1207 } 1208 } 1209 #endif /* CONFIG_PPC_64K_PAGES */ 1210 1211 #ifdef CONFIG_PPC_SUBPAGE_PROT 1212 /* 1213 * This looks up a 2-bit protection code for a 4k subpage of a 64k page. 1214 * Userspace sets the subpage permissions using the subpage_prot system call. 1215 * 1216 * Result is 0: full permissions, _PAGE_RW: read-only, 1217 * _PAGE_RWX: no access. 1218 */ 1219 static int subpage_protection(struct mm_struct *mm, unsigned long ea) 1220 { 1221 struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); 1222 u32 spp = 0; 1223 u32 **sbpm, *sbpp; 1224 1225 if (!spt) 1226 return 0; 1227 1228 if (ea >= spt->maxaddr) 1229 return 0; 1230 if (ea < 0x100000000UL) { 1231 /* addresses below 4GB use spt->low_prot */ 1232 sbpm = spt->low_prot; 1233 } else { 1234 sbpm = spt->protptrs[ea >> SBP_L3_SHIFT]; 1235 if (!sbpm) 1236 return 0; 1237 } 1238 sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; 1239 if (!sbpp) 1240 return 0; 1241 spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)]; 1242 1243 /* extract 2-bit bitfield for this 4k subpage */ 1244 spp >>= 30 - 2 * ((ea >> 12) & 0xf); 1245 1246 /* 1247 * 0 -> full premission 1248 * 1 -> Read only 1249 * 2 -> no access. 1250 * We return the flag that need to be cleared. 1251 */ 1252 spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? _PAGE_WRITE : 0); 1253 return spp; 1254 } 1255 1256 #else /* CONFIG_PPC_SUBPAGE_PROT */ 1257 static inline int subpage_protection(struct mm_struct *mm, unsigned long ea) 1258 { 1259 return 0; 1260 } 1261 #endif 1262 1263 void hash_failure_debug(unsigned long ea, unsigned long access, 1264 unsigned long vsid, unsigned long trap, 1265 int ssize, int psize, int lpsize, unsigned long pte) 1266 { 1267 if (!printk_ratelimit()) 1268 return; 1269 pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n", 1270 ea, access, current->comm); 1271 pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n", 1272 trap, vsid, ssize, psize, lpsize, pte); 1273 } 1274 1275 static void check_paca_psize(unsigned long ea, struct mm_struct *mm, 1276 int psize, bool user_region) 1277 { 1278 if (user_region) { 1279 if (psize != get_paca_psize(ea)) { 1280 copy_mm_to_paca(mm); 1281 slb_flush_and_restore_bolted(); 1282 } 1283 } else if (get_paca()->vmalloc_sllp != 1284 mmu_psize_defs[mmu_vmalloc_psize].sllp) { 1285 get_paca()->vmalloc_sllp = 1286 mmu_psize_defs[mmu_vmalloc_psize].sllp; 1287 slb_vmalloc_update(); 1288 } 1289 } 1290 1291 /* 1292 * Result code is: 1293 * 0 - handled 1294 * 1 - normal page fault 1295 * -1 - critical hash insertion error 1296 * -2 - access not permitted by subpage protection mechanism 1297 */ 1298 int hash_page_mm(struct mm_struct *mm, unsigned long ea, 1299 unsigned long access, unsigned long trap, 1300 unsigned long flags) 1301 { 1302 bool is_thp; 1303 pgd_t *pgdir; 1304 unsigned long vsid; 1305 pte_t *ptep; 1306 unsigned hugeshift; 1307 int rc, user_region = 0; 1308 int psize, ssize; 1309 1310 DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", 1311 ea, access, trap); 1312 trace_hash_fault(ea, access, trap); 1313 1314 /* Get region & vsid */ 1315 switch (get_region_id(ea)) { 1316 case USER_REGION_ID: 1317 user_region = 1; 1318 if (! mm) { 1319 DBG_LOW(" user region with no mm !\n"); 1320 rc = 1; 1321 goto bail; 1322 } 1323 psize = get_slice_psize(mm, ea); 1324 ssize = user_segment_size(ea); 1325 vsid = get_user_vsid(&mm->context, ea, ssize); 1326 break; 1327 case VMALLOC_REGION_ID: 1328 vsid = get_kernel_vsid(ea, mmu_kernel_ssize); 1329 psize = mmu_vmalloc_psize; 1330 ssize = mmu_kernel_ssize; 1331 flags |= HPTE_USE_KERNEL_KEY; 1332 break; 1333 1334 case IO_REGION_ID: 1335 vsid = get_kernel_vsid(ea, mmu_kernel_ssize); 1336 psize = mmu_io_psize; 1337 ssize = mmu_kernel_ssize; 1338 flags |= HPTE_USE_KERNEL_KEY; 1339 break; 1340 default: 1341 /* 1342 * Not a valid range 1343 * Send the problem up to do_page_fault() 1344 */ 1345 rc = 1; 1346 goto bail; 1347 } 1348 DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid); 1349 1350 /* Bad address. */ 1351 if (!vsid) { 1352 DBG_LOW("Bad address!\n"); 1353 rc = 1; 1354 goto bail; 1355 } 1356 /* Get pgdir */ 1357 pgdir = mm->pgd; 1358 if (pgdir == NULL) { 1359 rc = 1; 1360 goto bail; 1361 } 1362 1363 /* Check CPU locality */ 1364 if (user_region && mm_is_thread_local(mm)) 1365 flags |= HPTE_LOCAL_UPDATE; 1366 1367 #ifndef CONFIG_PPC_64K_PAGES 1368 /* 1369 * If we use 4K pages and our psize is not 4K, then we might 1370 * be hitting a special driver mapping, and need to align the 1371 * address before we fetch the PTE. 1372 * 1373 * It could also be a hugepage mapping, in which case this is 1374 * not necessary, but it's not harmful, either. 1375 */ 1376 if (psize != MMU_PAGE_4K) 1377 ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); 1378 #endif /* CONFIG_PPC_64K_PAGES */ 1379 1380 /* Get PTE and page size from page tables */ 1381 ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift); 1382 if (ptep == NULL || !pte_present(*ptep)) { 1383 DBG_LOW(" no PTE !\n"); 1384 rc = 1; 1385 goto bail; 1386 } 1387 1388 /* 1389 * Add _PAGE_PRESENT to the required access perm. If there are parallel 1390 * updates to the pte that can possibly clear _PAGE_PTE, catch that too. 1391 * 1392 * We can safely use the return pte address in rest of the function 1393 * because we do set H_PAGE_BUSY which prevents further updates to pte 1394 * from generic code. 1395 */ 1396 access |= _PAGE_PRESENT | _PAGE_PTE; 1397 1398 /* 1399 * Pre-check access permissions (will be re-checked atomically 1400 * in __hash_page_XX but this pre-check is a fast path 1401 */ 1402 if (!check_pte_access(access, pte_val(*ptep))) { 1403 DBG_LOW(" no access !\n"); 1404 rc = 1; 1405 goto bail; 1406 } 1407 1408 if (hugeshift) { 1409 if (is_thp) 1410 rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep, 1411 trap, flags, ssize, psize); 1412 #ifdef CONFIG_HUGETLB_PAGE 1413 else 1414 rc = __hash_page_huge(ea, access, vsid, ptep, trap, 1415 flags, ssize, hugeshift, psize); 1416 #else 1417 else { 1418 /* 1419 * if we have hugeshift, and is not transhuge with 1420 * hugetlb disabled, something is really wrong. 1421 */ 1422 rc = 1; 1423 WARN_ON(1); 1424 } 1425 #endif 1426 if (current->mm == mm) 1427 check_paca_psize(ea, mm, psize, user_region); 1428 1429 goto bail; 1430 } 1431 1432 #ifndef CONFIG_PPC_64K_PAGES 1433 DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); 1434 #else 1435 DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep), 1436 pte_val(*(ptep + PTRS_PER_PTE))); 1437 #endif 1438 /* Do actual hashing */ 1439 #ifdef CONFIG_PPC_64K_PAGES 1440 /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */ 1441 if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) { 1442 demote_segment_4k(mm, ea); 1443 psize = MMU_PAGE_4K; 1444 } 1445 1446 /* 1447 * If this PTE is non-cacheable and we have restrictions on 1448 * using non cacheable large pages, then we switch to 4k 1449 */ 1450 if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) { 1451 if (user_region) { 1452 demote_segment_4k(mm, ea); 1453 psize = MMU_PAGE_4K; 1454 } else if (ea < VMALLOC_END) { 1455 /* 1456 * some driver did a non-cacheable mapping 1457 * in vmalloc space, so switch vmalloc 1458 * to 4k pages 1459 */ 1460 printk(KERN_ALERT "Reducing vmalloc segment " 1461 "to 4kB pages because of " 1462 "non-cacheable mapping\n"); 1463 psize = mmu_vmalloc_psize = MMU_PAGE_4K; 1464 copro_flush_all_slbs(mm); 1465 } 1466 } 1467 1468 #endif /* CONFIG_PPC_64K_PAGES */ 1469 1470 if (current->mm == mm) 1471 check_paca_psize(ea, mm, psize, user_region); 1472 1473 #ifdef CONFIG_PPC_64K_PAGES 1474 if (psize == MMU_PAGE_64K) 1475 rc = __hash_page_64K(ea, access, vsid, ptep, trap, 1476 flags, ssize); 1477 else 1478 #endif /* CONFIG_PPC_64K_PAGES */ 1479 { 1480 int spp = subpage_protection(mm, ea); 1481 if (access & spp) 1482 rc = -2; 1483 else 1484 rc = __hash_page_4K(ea, access, vsid, ptep, trap, 1485 flags, ssize, spp); 1486 } 1487 1488 /* 1489 * Dump some info in case of hash insertion failure, they should 1490 * never happen so it is really useful to know if/when they do 1491 */ 1492 if (rc == -1) 1493 hash_failure_debug(ea, access, vsid, trap, ssize, psize, 1494 psize, pte_val(*ptep)); 1495 #ifndef CONFIG_PPC_64K_PAGES 1496 DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); 1497 #else 1498 DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep), 1499 pte_val(*(ptep + PTRS_PER_PTE))); 1500 #endif 1501 DBG_LOW(" -> rc=%d\n", rc); 1502 1503 bail: 1504 return rc; 1505 } 1506 EXPORT_SYMBOL_GPL(hash_page_mm); 1507 1508 int hash_page(unsigned long ea, unsigned long access, unsigned long trap, 1509 unsigned long dsisr) 1510 { 1511 unsigned long flags = 0; 1512 struct mm_struct *mm = current->mm; 1513 1514 if ((get_region_id(ea) == VMALLOC_REGION_ID) || 1515 (get_region_id(ea) == IO_REGION_ID)) 1516 mm = &init_mm; 1517 1518 if (dsisr & DSISR_NOHPTE) 1519 flags |= HPTE_NOHPTE_UPDATE; 1520 1521 return hash_page_mm(mm, ea, access, trap, flags); 1522 } 1523 EXPORT_SYMBOL_GPL(hash_page); 1524 1525 DECLARE_INTERRUPT_HANDLER_RET(__do_hash_fault); 1526 DEFINE_INTERRUPT_HANDLER_RET(__do_hash_fault) 1527 { 1528 unsigned long ea = regs->dar; 1529 unsigned long dsisr = regs->dsisr; 1530 unsigned long access = _PAGE_PRESENT | _PAGE_READ; 1531 unsigned long flags = 0; 1532 struct mm_struct *mm; 1533 unsigned int region_id; 1534 long err; 1535 1536 region_id = get_region_id(ea); 1537 if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID)) 1538 mm = &init_mm; 1539 else 1540 mm = current->mm; 1541 1542 if (dsisr & DSISR_NOHPTE) 1543 flags |= HPTE_NOHPTE_UPDATE; 1544 1545 if (dsisr & DSISR_ISSTORE) 1546 access |= _PAGE_WRITE; 1547 /* 1548 * We set _PAGE_PRIVILEGED only when 1549 * kernel mode access kernel space. 1550 * 1551 * _PAGE_PRIVILEGED is NOT set 1552 * 1) when kernel mode access user space 1553 * 2) user space access kernel space. 1554 */ 1555 access |= _PAGE_PRIVILEGED; 1556 if (user_mode(regs) || (region_id == USER_REGION_ID)) 1557 access &= ~_PAGE_PRIVILEGED; 1558 1559 if (TRAP(regs) == 0x400) 1560 access |= _PAGE_EXEC; 1561 1562 err = hash_page_mm(mm, ea, access, TRAP(regs), flags); 1563 if (unlikely(err < 0)) { 1564 // failed to instert a hash PTE due to an hypervisor error 1565 if (user_mode(regs)) { 1566 if (IS_ENABLED(CONFIG_PPC_SUBPAGE_PROT) && err == -2) 1567 _exception(SIGSEGV, regs, SEGV_ACCERR, ea); 1568 else 1569 _exception(SIGBUS, regs, BUS_ADRERR, ea); 1570 } else { 1571 bad_page_fault(regs, SIGBUS); 1572 } 1573 err = 0; 1574 } 1575 1576 return err; 1577 } 1578 1579 /* 1580 * The _RAW interrupt entry checks for the in_nmi() case before 1581 * running the full handler. 1582 */ 1583 DEFINE_INTERRUPT_HANDLER_RAW(do_hash_fault) 1584 { 1585 unsigned long dsisr = regs->dsisr; 1586 long err; 1587 1588 if (unlikely(dsisr & (DSISR_BAD_FAULT_64S | DSISR_KEYFAULT))) 1589 goto page_fault; 1590 1591 /* 1592 * If we are in an "NMI" (e.g., an interrupt when soft-disabled), then 1593 * don't call hash_page, just fail the fault. This is required to 1594 * prevent re-entrancy problems in the hash code, namely perf 1595 * interrupts hitting while something holds H_PAGE_BUSY, and taking a 1596 * hash fault. See the comment in hash_preload(). 1597 * 1598 * We come here as a result of a DSI at a point where we don't want 1599 * to call hash_page, such as when we are accessing memory (possibly 1600 * user memory) inside a PMU interrupt that occurred while interrupts 1601 * were soft-disabled. We want to invoke the exception handler for 1602 * the access, or panic if there isn't a handler. 1603 */ 1604 if (unlikely(in_nmi())) { 1605 do_bad_page_fault_segv(regs); 1606 return 0; 1607 } 1608 1609 err = __do_hash_fault(regs); 1610 if (err) { 1611 page_fault: 1612 err = hash__do_page_fault(regs); 1613 } 1614 1615 return err; 1616 } 1617 1618 #ifdef CONFIG_PPC_MM_SLICES 1619 static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) 1620 { 1621 int psize = get_slice_psize(mm, ea); 1622 1623 /* We only prefault standard pages for now */ 1624 if (unlikely(psize != mm_ctx_user_psize(&mm->context))) 1625 return false; 1626 1627 /* 1628 * Don't prefault if subpage protection is enabled for the EA. 1629 */ 1630 if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea))) 1631 return false; 1632 1633 return true; 1634 } 1635 #else 1636 static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) 1637 { 1638 return true; 1639 } 1640 #endif 1641 1642 static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, 1643 bool is_exec, unsigned long trap) 1644 { 1645 unsigned long vsid; 1646 pgd_t *pgdir; 1647 int rc, ssize, update_flags = 0; 1648 unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0); 1649 unsigned long flags; 1650 1651 BUG_ON(get_region_id(ea) != USER_REGION_ID); 1652 1653 if (!should_hash_preload(mm, ea)) 1654 return; 1655 1656 DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," 1657 " trap=%lx\n", mm, mm->pgd, ea, access, trap); 1658 1659 /* Get Linux PTE if available */ 1660 pgdir = mm->pgd; 1661 if (pgdir == NULL) 1662 return; 1663 1664 /* Get VSID */ 1665 ssize = user_segment_size(ea); 1666 vsid = get_user_vsid(&mm->context, ea, ssize); 1667 if (!vsid) 1668 return; 1669 1670 #ifdef CONFIG_PPC_64K_PAGES 1671 /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on 1672 * a 64K kernel), then we don't preload, hash_page() will take 1673 * care of it once we actually try to access the page. 1674 * That way we don't have to duplicate all of the logic for segment 1675 * page size demotion here 1676 * Called with PTL held, hence can be sure the value won't change in 1677 * between. 1678 */ 1679 if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep)) 1680 return; 1681 #endif /* CONFIG_PPC_64K_PAGES */ 1682 1683 /* 1684 * __hash_page_* must run with interrupts off, as it sets the 1685 * H_PAGE_BUSY bit. It's possible for perf interrupts to hit at any 1686 * time and may take a hash fault reading the user stack, see 1687 * read_user_stack_slow() in the powerpc/perf code. 1688 * 1689 * If that takes a hash fault on the same page as we lock here, it 1690 * will bail out when seeing H_PAGE_BUSY set, and retry the access 1691 * leading to an infinite loop. 1692 * 1693 * Disabling interrupts here does not prevent perf interrupts, but it 1694 * will prevent them taking hash faults (see the NMI test in 1695 * do_hash_page), then read_user_stack's copy_from_user_nofault will 1696 * fail and perf will fall back to read_user_stack_slow(), which 1697 * walks the Linux page tables. 1698 * 1699 * Interrupts must also be off for the duration of the 1700 * mm_is_thread_local test and update, to prevent preempt running the 1701 * mm on another CPU (XXX: this may be racy vs kthread_use_mm). 1702 */ 1703 local_irq_save(flags); 1704 1705 /* Is that local to this CPU ? */ 1706 if (mm_is_thread_local(mm)) 1707 update_flags |= HPTE_LOCAL_UPDATE; 1708 1709 /* Hash it in */ 1710 #ifdef CONFIG_PPC_64K_PAGES 1711 if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K) 1712 rc = __hash_page_64K(ea, access, vsid, ptep, trap, 1713 update_flags, ssize); 1714 else 1715 #endif /* CONFIG_PPC_64K_PAGES */ 1716 rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags, 1717 ssize, subpage_protection(mm, ea)); 1718 1719 /* Dump some info in case of hash insertion failure, they should 1720 * never happen so it is really useful to know if/when they do 1721 */ 1722 if (rc == -1) 1723 hash_failure_debug(ea, access, vsid, trap, ssize, 1724 mm_ctx_user_psize(&mm->context), 1725 mm_ctx_user_psize(&mm->context), 1726 pte_val(*ptep)); 1727 1728 local_irq_restore(flags); 1729 } 1730 1731 /* 1732 * This is called at the end of handling a user page fault, when the 1733 * fault has been handled by updating a PTE in the linux page tables. 1734 * We use it to preload an HPTE into the hash table corresponding to 1735 * the updated linux PTE. 1736 * 1737 * This must always be called with the pte lock held. 1738 */ 1739 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, 1740 pte_t *ptep) 1741 { 1742 /* 1743 * We don't need to worry about _PAGE_PRESENT here because we are 1744 * called with either mm->page_table_lock held or ptl lock held 1745 */ 1746 unsigned long trap; 1747 bool is_exec; 1748 1749 if (radix_enabled()) 1750 return; 1751 1752 /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ 1753 if (!pte_young(*ptep) || address >= TASK_SIZE) 1754 return; 1755 1756 /* 1757 * We try to figure out if we are coming from an instruction 1758 * access fault and pass that down to __hash_page so we avoid 1759 * double-faulting on execution of fresh text. We have to test 1760 * for regs NULL since init will get here first thing at boot. 1761 * 1762 * We also avoid filling the hash if not coming from a fault. 1763 */ 1764 1765 trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; 1766 switch (trap) { 1767 case 0x300: 1768 is_exec = false; 1769 break; 1770 case 0x400: 1771 is_exec = true; 1772 break; 1773 default: 1774 return; 1775 } 1776 1777 hash_preload(vma->vm_mm, ptep, address, is_exec, trap); 1778 } 1779 1780 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 1781 static inline void tm_flush_hash_page(int local) 1782 { 1783 /* 1784 * Transactions are not aborted by tlbiel, only tlbie. Without, syncing a 1785 * page back to a block device w/PIO could pick up transactional data 1786 * (bad!) so we force an abort here. Before the sync the page will be 1787 * made read-only, which will flush_hash_page. BIG ISSUE here: if the 1788 * kernel uses a page from userspace without unmapping it first, it may 1789 * see the speculated version. 1790 */ 1791 if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs && 1792 MSR_TM_ACTIVE(current->thread.regs->msr)) { 1793 tm_enable(); 1794 tm_abort(TM_CAUSE_TLBI); 1795 } 1796 } 1797 #else 1798 static inline void tm_flush_hash_page(int local) 1799 { 1800 } 1801 #endif 1802 1803 /* 1804 * Return the global hash slot, corresponding to the given PTE, which contains 1805 * the HPTE. 1806 */ 1807 unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift, 1808 int ssize, real_pte_t rpte, unsigned int subpg_index) 1809 { 1810 unsigned long hash, gslot, hidx; 1811 1812 hash = hpt_hash(vpn, shift, ssize); 1813 hidx = __rpte_to_hidx(rpte, subpg_index); 1814 if (hidx & _PTEIDX_SECONDARY) 1815 hash = ~hash; 1816 gslot = (hash & htab_hash_mask) * HPTES_PER_GROUP; 1817 gslot += hidx & _PTEIDX_GROUP_IX; 1818 return gslot; 1819 } 1820 1821 void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, 1822 unsigned long flags) 1823 { 1824 unsigned long index, shift, gslot; 1825 int local = flags & HPTE_LOCAL_UPDATE; 1826 1827 DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn); 1828 pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { 1829 gslot = pte_get_hash_gslot(vpn, shift, ssize, pte, index); 1830 DBG_LOW(" sub %ld: gslot=%lx\n", index, gslot); 1831 /* 1832 * We use same base page size and actual psize, because we don't 1833 * use these functions for hugepage 1834 */ 1835 mmu_hash_ops.hpte_invalidate(gslot, vpn, psize, psize, 1836 ssize, local); 1837 } pte_iterate_hashed_end(); 1838 1839 tm_flush_hash_page(local); 1840 } 1841 1842 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1843 void flush_hash_hugepage(unsigned long vsid, unsigned long addr, 1844 pmd_t *pmdp, unsigned int psize, int ssize, 1845 unsigned long flags) 1846 { 1847 int i, max_hpte_count, valid; 1848 unsigned long s_addr; 1849 unsigned char *hpte_slot_array; 1850 unsigned long hidx, shift, vpn, hash, slot; 1851 int local = flags & HPTE_LOCAL_UPDATE; 1852 1853 s_addr = addr & HPAGE_PMD_MASK; 1854 hpte_slot_array = get_hpte_slot_array(pmdp); 1855 /* 1856 * IF we try to do a HUGE PTE update after a withdraw is done. 1857 * we will find the below NULL. This happens when we do 1858 * split_huge_pmd 1859 */ 1860 if (!hpte_slot_array) 1861 return; 1862 1863 if (mmu_hash_ops.hugepage_invalidate) { 1864 mmu_hash_ops.hugepage_invalidate(vsid, s_addr, hpte_slot_array, 1865 psize, ssize, local); 1866 goto tm_abort; 1867 } 1868 /* 1869 * No bluk hpte removal support, invalidate each entry 1870 */ 1871 shift = mmu_psize_defs[psize].shift; 1872 max_hpte_count = HPAGE_PMD_SIZE >> shift; 1873 for (i = 0; i < max_hpte_count; i++) { 1874 /* 1875 * 8 bits per each hpte entries 1876 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit] 1877 */ 1878 valid = hpte_valid(hpte_slot_array, i); 1879 if (!valid) 1880 continue; 1881 hidx = hpte_hash_index(hpte_slot_array, i); 1882 1883 /* get the vpn */ 1884 addr = s_addr + (i * (1ul << shift)); 1885 vpn = hpt_vpn(addr, vsid, ssize); 1886 hash = hpt_hash(vpn, shift, ssize); 1887 if (hidx & _PTEIDX_SECONDARY) 1888 hash = ~hash; 1889 1890 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; 1891 slot += hidx & _PTEIDX_GROUP_IX; 1892 mmu_hash_ops.hpte_invalidate(slot, vpn, psize, 1893 MMU_PAGE_16M, ssize, local); 1894 } 1895 tm_abort: 1896 tm_flush_hash_page(local); 1897 } 1898 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1899 1900 void flush_hash_range(unsigned long number, int local) 1901 { 1902 if (mmu_hash_ops.flush_hash_range) 1903 mmu_hash_ops.flush_hash_range(number, local); 1904 else { 1905 int i; 1906 struct ppc64_tlb_batch *batch = 1907 this_cpu_ptr(&ppc64_tlb_batch); 1908 1909 for (i = 0; i < number; i++) 1910 flush_hash_page(batch->vpn[i], batch->pte[i], 1911 batch->psize, batch->ssize, local); 1912 } 1913 } 1914 1915 long hpte_insert_repeating(unsigned long hash, unsigned long vpn, 1916 unsigned long pa, unsigned long rflags, 1917 unsigned long vflags, int psize, int ssize) 1918 { 1919 unsigned long hpte_group; 1920 long slot; 1921 1922 repeat: 1923 hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; 1924 1925 /* Insert into the hash table, primary slot */ 1926 slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, vflags, 1927 psize, psize, ssize); 1928 1929 /* Primary is full, try the secondary */ 1930 if (unlikely(slot == -1)) { 1931 hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; 1932 slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 1933 vflags | HPTE_V_SECONDARY, 1934 psize, psize, ssize); 1935 if (slot == -1) { 1936 if (mftb() & 0x1) 1937 hpte_group = (hash & htab_hash_mask) * 1938 HPTES_PER_GROUP; 1939 1940 mmu_hash_ops.hpte_remove(hpte_group); 1941 goto repeat; 1942 } 1943 } 1944 1945 return slot; 1946 } 1947 1948 #ifdef CONFIG_DEBUG_PAGEALLOC 1949 static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) 1950 { 1951 unsigned long hash; 1952 unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); 1953 unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); 1954 unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY); 1955 long ret; 1956 1957 hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); 1958 1959 /* Don't create HPTE entries for bad address */ 1960 if (!vsid) 1961 return; 1962 1963 ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, 1964 HPTE_V_BOLTED, 1965 mmu_linear_psize, mmu_kernel_ssize); 1966 1967 BUG_ON (ret < 0); 1968 spin_lock(&linear_map_hash_lock); 1969 BUG_ON(linear_map_hash_slots[lmi] & 0x80); 1970 linear_map_hash_slots[lmi] = ret | 0x80; 1971 spin_unlock(&linear_map_hash_lock); 1972 } 1973 1974 static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) 1975 { 1976 unsigned long hash, hidx, slot; 1977 unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); 1978 unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); 1979 1980 hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); 1981 spin_lock(&linear_map_hash_lock); 1982 BUG_ON(!(linear_map_hash_slots[lmi] & 0x80)); 1983 hidx = linear_map_hash_slots[lmi] & 0x7f; 1984 linear_map_hash_slots[lmi] = 0; 1985 spin_unlock(&linear_map_hash_lock); 1986 if (hidx & _PTEIDX_SECONDARY) 1987 hash = ~hash; 1988 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; 1989 slot += hidx & _PTEIDX_GROUP_IX; 1990 mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize, 1991 mmu_linear_psize, 1992 mmu_kernel_ssize, 0); 1993 } 1994 1995 void __kernel_map_pages(struct page *page, int numpages, int enable) 1996 { 1997 unsigned long flags, vaddr, lmi; 1998 int i; 1999 2000 local_irq_save(flags); 2001 for (i = 0; i < numpages; i++, page++) { 2002 vaddr = (unsigned long)page_address(page); 2003 lmi = __pa(vaddr) >> PAGE_SHIFT; 2004 if (lmi >= linear_map_hash_count) 2005 continue; 2006 if (enable) 2007 kernel_map_linear_page(vaddr, lmi); 2008 else 2009 kernel_unmap_linear_page(vaddr, lmi); 2010 } 2011 local_irq_restore(flags); 2012 } 2013 #endif /* CONFIG_DEBUG_PAGEALLOC */ 2014 2015 void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, 2016 phys_addr_t first_memblock_size) 2017 { 2018 /* 2019 * We don't currently support the first MEMBLOCK not mapping 0 2020 * physical on those processors 2021 */ 2022 BUG_ON(first_memblock_base != 0); 2023 2024 /* 2025 * On virtualized systems the first entry is our RMA region aka VRMA, 2026 * non-virtualized 64-bit hash MMU systems don't have a limitation 2027 * on real mode access. 2028 * 2029 * For guests on platforms before POWER9, we clamp the it limit to 1G 2030 * to avoid some funky things such as RTAS bugs etc... 2031 * 2032 * On POWER9 we limit to 1TB in case the host erroneously told us that 2033 * the RMA was >1TB. Effective address bits 0:23 are treated as zero 2034 * (meaning the access is aliased to zero i.e. addr = addr % 1TB) 2035 * for virtual real mode addressing and so it doesn't make sense to 2036 * have an area larger than 1TB as it can't be addressed. 2037 */ 2038 if (!early_cpu_has_feature(CPU_FTR_HVMODE)) { 2039 ppc64_rma_size = first_memblock_size; 2040 if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) 2041 ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000); 2042 else 2043 ppc64_rma_size = min_t(u64, ppc64_rma_size, 2044 1UL << SID_SHIFT_1T); 2045 2046 /* Finally limit subsequent allocations */ 2047 memblock_set_current_limit(ppc64_rma_size); 2048 } else { 2049 ppc64_rma_size = ULONG_MAX; 2050 } 2051 } 2052 2053 #ifdef CONFIG_DEBUG_FS 2054 2055 static int hpt_order_get(void *data, u64 *val) 2056 { 2057 *val = ppc64_pft_size; 2058 return 0; 2059 } 2060 2061 static int hpt_order_set(void *data, u64 val) 2062 { 2063 int ret; 2064 2065 if (!mmu_hash_ops.resize_hpt) 2066 return -ENODEV; 2067 2068 cpus_read_lock(); 2069 ret = mmu_hash_ops.resize_hpt(val); 2070 cpus_read_unlock(); 2071 2072 return ret; 2073 } 2074 2075 DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); 2076 2077 static int __init hash64_debugfs(void) 2078 { 2079 debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root, NULL, 2080 &fops_hpt_order); 2081 return 0; 2082 } 2083 machine_device_initcall(pseries, hash64_debugfs); 2084 #endif /* CONFIG_DEBUG_FS */ 2085 2086 void __init print_system_hash_info(void) 2087 { 2088 pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size); 2089 2090 if (htab_hash_mask) 2091 pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask); 2092 } 2093