1 /* 2 * PPC Huge TLB Page Support for Kernel. 3 * 4 * Copyright (C) 2003 David Gibson, IBM Corporation. 5 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor 6 * 7 * Based on the IA-32 version: 8 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> 9 */ 10 11 #include <linux/mm.h> 12 #include <linux/io.h> 13 #include <linux/slab.h> 14 #include <linux/hugetlb.h> 15 #include <linux/export.h> 16 #include <linux/of_fdt.h> 17 #include <linux/memblock.h> 18 #include <linux/moduleparam.h> 19 #include <linux/swap.h> 20 #include <linux/swapops.h> 21 #include <linux/kmemleak.h> 22 #include <asm/pgtable.h> 23 #include <asm/pgalloc.h> 24 #include <asm/tlb.h> 25 #include <asm/setup.h> 26 #include <asm/hugetlb.h> 27 #include <asm/pte-walk.h> 28 29 bool hugetlb_disabled = false; 30 31 #define hugepd_none(hpd) (hpd_val(hpd) == 0) 32 33 #define PTE_T_ORDER (__builtin_ffs(sizeof(pte_t)) - __builtin_ffs(sizeof(void *))) 34 35 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) 36 { 37 /* 38 * Only called for hugetlbfs pages, hence can ignore THP and the 39 * irq disabled walk. 40 */ 41 return __find_linux_pte(mm->pgd, addr, NULL, NULL); 42 } 43 44 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, 45 unsigned long address, unsigned int pdshift, 46 unsigned int pshift, spinlock_t *ptl) 47 { 48 struct kmem_cache *cachep; 49 pte_t *new; 50 int i; 51 int num_hugepd; 52 53 if (pshift >= pdshift) { 54 cachep = PGT_CACHE(PTE_T_ORDER); 55 num_hugepd = 1 << (pshift - pdshift); 56 } else if (IS_ENABLED(CONFIG_PPC_8xx)) { 57 cachep = PGT_CACHE(PTE_INDEX_SIZE); 58 num_hugepd = 1; 59 } else { 60 cachep = PGT_CACHE(pdshift - pshift); 61 num_hugepd = 1; 62 } 63 64 if (!cachep) { 65 WARN_ONCE(1, "No page table cache created for hugetlb tables"); 66 return -ENOMEM; 67 } 68 69 new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL)); 70 71 BUG_ON(pshift > HUGEPD_SHIFT_MASK); 72 BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); 73 74 if (!new) 75 return -ENOMEM; 76 77 /* 78 * Make sure other cpus find the hugepd set only after a 79 * properly initialized page table is visible to them. 80 * For more details look for comment in __pte_alloc(). 81 */ 82 smp_wmb(); 83 84 spin_lock(ptl); 85 /* 86 * We have multiple higher-level entries that point to the same 87 * actual pte location. Fill in each as we go and backtrack on error. 88 * We need all of these so the DTLB pgtable walk code can find the 89 * right higher-level entry without knowing if it's a hugepage or not. 90 */ 91 for (i = 0; i < num_hugepd; i++, hpdp++) { 92 if (unlikely(!hugepd_none(*hpdp))) 93 break; 94 hugepd_populate(hpdp, new, pshift); 95 } 96 /* If we bailed from the for loop early, an error occurred, clean up */ 97 if (i < num_hugepd) { 98 for (i = i - 1 ; i >= 0; i--, hpdp--) 99 *hpdp = __hugepd(0); 100 kmem_cache_free(cachep, new); 101 } else { 102 kmemleak_ignore(new); 103 } 104 spin_unlock(ptl); 105 return 0; 106 } 107 108 /* 109 * At this point we do the placement change only for BOOK3S 64. This would 110 * possibly work on other subarchs. 111 */ 112 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) 113 { 114 pgd_t *pg; 115 pud_t *pu; 116 pmd_t *pm; 117 hugepd_t *hpdp = NULL; 118 unsigned pshift = __ffs(sz); 119 unsigned pdshift = PGDIR_SHIFT; 120 spinlock_t *ptl; 121 122 addr &= ~(sz-1); 123 pg = pgd_offset(mm, addr); 124 125 #ifdef CONFIG_PPC_BOOK3S_64 126 if (pshift == PGDIR_SHIFT) 127 /* 16GB huge page */ 128 return (pte_t *) pg; 129 else if (pshift > PUD_SHIFT) { 130 /* 131 * We need to use hugepd table 132 */ 133 ptl = &mm->page_table_lock; 134 hpdp = (hugepd_t *)pg; 135 } else { 136 pdshift = PUD_SHIFT; 137 pu = pud_alloc(mm, pg, addr); 138 if (!pu) 139 return NULL; 140 if (pshift == PUD_SHIFT) 141 return (pte_t *)pu; 142 else if (pshift > PMD_SHIFT) { 143 ptl = pud_lockptr(mm, pu); 144 hpdp = (hugepd_t *)pu; 145 } else { 146 pdshift = PMD_SHIFT; 147 pm = pmd_alloc(mm, pu, addr); 148 if (!pm) 149 return NULL; 150 if (pshift == PMD_SHIFT) 151 /* 16MB hugepage */ 152 return (pte_t *)pm; 153 else { 154 ptl = pmd_lockptr(mm, pm); 155 hpdp = (hugepd_t *)pm; 156 } 157 } 158 } 159 #else 160 if (pshift >= PGDIR_SHIFT) { 161 ptl = &mm->page_table_lock; 162 hpdp = (hugepd_t *)pg; 163 } else { 164 pdshift = PUD_SHIFT; 165 pu = pud_alloc(mm, pg, addr); 166 if (!pu) 167 return NULL; 168 if (pshift >= PUD_SHIFT) { 169 ptl = pud_lockptr(mm, pu); 170 hpdp = (hugepd_t *)pu; 171 } else { 172 pdshift = PMD_SHIFT; 173 pm = pmd_alloc(mm, pu, addr); 174 if (!pm) 175 return NULL; 176 ptl = pmd_lockptr(mm, pm); 177 hpdp = (hugepd_t *)pm; 178 } 179 } 180 #endif 181 if (!hpdp) 182 return NULL; 183 184 BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); 185 186 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, 187 pdshift, pshift, ptl)) 188 return NULL; 189 190 return hugepte_offset(*hpdp, addr, pdshift); 191 } 192 193 #ifdef CONFIG_PPC_BOOK3S_64 194 /* 195 * Tracks gpages after the device tree is scanned and before the 196 * huge_boot_pages list is ready on pseries. 197 */ 198 #define MAX_NUMBER_GPAGES 1024 199 __initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES]; 200 __initdata static unsigned nr_gpages; 201 202 /* 203 * Build list of addresses of gigantic pages. This function is used in early 204 * boot before the buddy allocator is setup. 205 */ 206 void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) 207 { 208 if (!addr) 209 return; 210 while (number_of_pages > 0) { 211 gpage_freearray[nr_gpages] = addr; 212 nr_gpages++; 213 number_of_pages--; 214 addr += page_size; 215 } 216 } 217 218 int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate) 219 { 220 struct huge_bootmem_page *m; 221 if (nr_gpages == 0) 222 return 0; 223 m = phys_to_virt(gpage_freearray[--nr_gpages]); 224 gpage_freearray[nr_gpages] = 0; 225 list_add(&m->list, &huge_boot_pages); 226 m->hstate = hstate; 227 return 1; 228 } 229 #endif 230 231 232 int __init alloc_bootmem_huge_page(struct hstate *h) 233 { 234 235 #ifdef CONFIG_PPC_BOOK3S_64 236 if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled()) 237 return pseries_alloc_bootmem_huge_page(h); 238 #endif 239 return __alloc_bootmem_huge_page(h); 240 } 241 242 #ifndef CONFIG_PPC_BOOK3S_64 243 #define HUGEPD_FREELIST_SIZE \ 244 ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) 245 246 struct hugepd_freelist { 247 struct rcu_head rcu; 248 unsigned int index; 249 void *ptes[0]; 250 }; 251 252 static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur); 253 254 static void hugepd_free_rcu_callback(struct rcu_head *head) 255 { 256 struct hugepd_freelist *batch = 257 container_of(head, struct hugepd_freelist, rcu); 258 unsigned int i; 259 260 for (i = 0; i < batch->index; i++) 261 kmem_cache_free(PGT_CACHE(PTE_T_ORDER), batch->ptes[i]); 262 263 free_page((unsigned long)batch); 264 } 265 266 static void hugepd_free(struct mmu_gather *tlb, void *hugepte) 267 { 268 struct hugepd_freelist **batchp; 269 270 batchp = &get_cpu_var(hugepd_freelist_cur); 271 272 if (atomic_read(&tlb->mm->mm_users) < 2 || 273 mm_is_thread_local(tlb->mm)) { 274 kmem_cache_free(PGT_CACHE(PTE_T_ORDER), hugepte); 275 put_cpu_var(hugepd_freelist_cur); 276 return; 277 } 278 279 if (*batchp == NULL) { 280 *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC); 281 (*batchp)->index = 0; 282 } 283 284 (*batchp)->ptes[(*batchp)->index++] = hugepte; 285 if ((*batchp)->index == HUGEPD_FREELIST_SIZE) { 286 call_rcu(&(*batchp)->rcu, hugepd_free_rcu_callback); 287 *batchp = NULL; 288 } 289 put_cpu_var(hugepd_freelist_cur); 290 } 291 #else 292 static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {} 293 #endif 294 295 static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, 296 unsigned long start, unsigned long end, 297 unsigned long floor, unsigned long ceiling) 298 { 299 pte_t *hugepte = hugepd_page(*hpdp); 300 int i; 301 302 unsigned long pdmask = ~((1UL << pdshift) - 1); 303 unsigned int num_hugepd = 1; 304 unsigned int shift = hugepd_shift(*hpdp); 305 306 /* Note: On fsl the hpdp may be the first of several */ 307 if (shift > pdshift) 308 num_hugepd = 1 << (shift - pdshift); 309 310 start &= pdmask; 311 if (start < floor) 312 return; 313 if (ceiling) { 314 ceiling &= pdmask; 315 if (! ceiling) 316 return; 317 } 318 if (end - 1 > ceiling - 1) 319 return; 320 321 for (i = 0; i < num_hugepd; i++, hpdp++) 322 *hpdp = __hugepd(0); 323 324 if (shift >= pdshift) 325 hugepd_free(tlb, hugepte); 326 else if (IS_ENABLED(CONFIG_PPC_8xx)) 327 pgtable_free_tlb(tlb, hugepte, 328 get_hugepd_cache_index(PTE_INDEX_SIZE)); 329 else 330 pgtable_free_tlb(tlb, hugepte, 331 get_hugepd_cache_index(pdshift - shift)); 332 } 333 334 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 335 unsigned long addr, unsigned long end, 336 unsigned long floor, unsigned long ceiling) 337 { 338 pmd_t *pmd; 339 unsigned long next; 340 unsigned long start; 341 342 start = addr; 343 do { 344 unsigned long more; 345 346 pmd = pmd_offset(pud, addr); 347 next = pmd_addr_end(addr, end); 348 if (!is_hugepd(__hugepd(pmd_val(*pmd)))) { 349 /* 350 * if it is not hugepd pointer, we should already find 351 * it cleared. 352 */ 353 WARN_ON(!pmd_none_or_clear_bad(pmd)); 354 continue; 355 } 356 /* 357 * Increment next by the size of the huge mapping since 358 * there may be more than one entry at this level for a 359 * single hugepage, but all of them point to 360 * the same kmem cache that holds the hugepte. 361 */ 362 more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd)); 363 if (more > next) 364 next = more; 365 366 free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT, 367 addr, next, floor, ceiling); 368 } while (addr = next, addr != end); 369 370 start &= PUD_MASK; 371 if (start < floor) 372 return; 373 if (ceiling) { 374 ceiling &= PUD_MASK; 375 if (!ceiling) 376 return; 377 } 378 if (end - 1 > ceiling - 1) 379 return; 380 381 pmd = pmd_offset(pud, start); 382 pud_clear(pud); 383 pmd_free_tlb(tlb, pmd, start); 384 mm_dec_nr_pmds(tlb->mm); 385 } 386 387 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 388 unsigned long addr, unsigned long end, 389 unsigned long floor, unsigned long ceiling) 390 { 391 pud_t *pud; 392 unsigned long next; 393 unsigned long start; 394 395 start = addr; 396 do { 397 pud = pud_offset(pgd, addr); 398 next = pud_addr_end(addr, end); 399 if (!is_hugepd(__hugepd(pud_val(*pud)))) { 400 if (pud_none_or_clear_bad(pud)) 401 continue; 402 hugetlb_free_pmd_range(tlb, pud, addr, next, floor, 403 ceiling); 404 } else { 405 unsigned long more; 406 /* 407 * Increment next by the size of the huge mapping since 408 * there may be more than one entry at this level for a 409 * single hugepage, but all of them point to 410 * the same kmem cache that holds the hugepte. 411 */ 412 more = addr + (1 << hugepd_shift(*(hugepd_t *)pud)); 413 if (more > next) 414 next = more; 415 416 free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT, 417 addr, next, floor, ceiling); 418 } 419 } while (addr = next, addr != end); 420 421 start &= PGDIR_MASK; 422 if (start < floor) 423 return; 424 if (ceiling) { 425 ceiling &= PGDIR_MASK; 426 if (!ceiling) 427 return; 428 } 429 if (end - 1 > ceiling - 1) 430 return; 431 432 pud = pud_offset(pgd, start); 433 pgd_clear(pgd); 434 pud_free_tlb(tlb, pud, start); 435 mm_dec_nr_puds(tlb->mm); 436 } 437 438 /* 439 * This function frees user-level page tables of a process. 440 */ 441 void hugetlb_free_pgd_range(struct mmu_gather *tlb, 442 unsigned long addr, unsigned long end, 443 unsigned long floor, unsigned long ceiling) 444 { 445 pgd_t *pgd; 446 unsigned long next; 447 448 /* 449 * Because there are a number of different possible pagetable 450 * layouts for hugepage ranges, we limit knowledge of how 451 * things should be laid out to the allocation path 452 * (huge_pte_alloc(), above). Everything else works out the 453 * structure as it goes from information in the hugepd 454 * pointers. That means that we can't here use the 455 * optimization used in the normal page free_pgd_range(), of 456 * checking whether we're actually covering a large enough 457 * range to have to do anything at the top level of the walk 458 * instead of at the bottom. 459 * 460 * To make sense of this, you should probably go read the big 461 * block comment at the top of the normal free_pgd_range(), 462 * too. 463 */ 464 465 do { 466 next = pgd_addr_end(addr, end); 467 pgd = pgd_offset(tlb->mm, addr); 468 if (!is_hugepd(__hugepd(pgd_val(*pgd)))) { 469 if (pgd_none_or_clear_bad(pgd)) 470 continue; 471 hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); 472 } else { 473 unsigned long more; 474 /* 475 * Increment next by the size of the huge mapping since 476 * there may be more than one entry at the pgd level 477 * for a single hugepage, but all of them point to the 478 * same kmem cache that holds the hugepte. 479 */ 480 more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd)); 481 if (more > next) 482 next = more; 483 484 free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT, 485 addr, next, floor, ceiling); 486 } 487 } while (addr = next, addr != end); 488 } 489 490 struct page *follow_huge_pd(struct vm_area_struct *vma, 491 unsigned long address, hugepd_t hpd, 492 int flags, int pdshift) 493 { 494 pte_t *ptep; 495 spinlock_t *ptl; 496 struct page *page = NULL; 497 unsigned long mask; 498 int shift = hugepd_shift(hpd); 499 struct mm_struct *mm = vma->vm_mm; 500 501 retry: 502 /* 503 * hugepage directory entries are protected by mm->page_table_lock 504 * Use this instead of huge_pte_lockptr 505 */ 506 ptl = &mm->page_table_lock; 507 spin_lock(ptl); 508 509 ptep = hugepte_offset(hpd, address, pdshift); 510 if (pte_present(*ptep)) { 511 mask = (1UL << shift) - 1; 512 page = pte_page(*ptep); 513 page += ((address & mask) >> PAGE_SHIFT); 514 if (flags & FOLL_GET) 515 get_page(page); 516 } else { 517 if (is_hugetlb_entry_migration(*ptep)) { 518 spin_unlock(ptl); 519 __migration_entry_wait(mm, ptep, ptl); 520 goto retry; 521 } 522 } 523 spin_unlock(ptl); 524 return page; 525 } 526 527 #ifdef CONFIG_PPC_MM_SLICES 528 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 529 unsigned long len, unsigned long pgoff, 530 unsigned long flags) 531 { 532 struct hstate *hstate = hstate_file(file); 533 int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); 534 535 #ifdef CONFIG_PPC_RADIX_MMU 536 if (radix_enabled()) 537 return radix__hugetlb_get_unmapped_area(file, addr, len, 538 pgoff, flags); 539 #endif 540 return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1); 541 } 542 #endif 543 544 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) 545 { 546 /* With radix we don't use slice, so derive it from vma*/ 547 if (IS_ENABLED(CONFIG_PPC_MM_SLICES) && !radix_enabled()) { 548 unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); 549 550 return 1UL << mmu_psize_to_shift(psize); 551 } 552 return vma_kernel_pagesize(vma); 553 } 554 555 static int __init add_huge_page_size(unsigned long long size) 556 { 557 int shift = __ffs(size); 558 int mmu_psize; 559 560 /* Check that it is a page size supported by the hardware and 561 * that it fits within pagetable and slice limits. */ 562 if (size <= PAGE_SIZE || !is_power_of_2(size)) 563 return -EINVAL; 564 565 mmu_psize = check_and_get_huge_psize(shift); 566 if (mmu_psize < 0) 567 return -EINVAL; 568 569 BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); 570 571 /* Return if huge page size has already been setup */ 572 if (size_to_hstate(size)) 573 return 0; 574 575 hugetlb_add_hstate(shift - PAGE_SHIFT); 576 577 return 0; 578 } 579 580 static int __init hugepage_setup_sz(char *str) 581 { 582 unsigned long long size; 583 584 size = memparse(str, &str); 585 586 if (add_huge_page_size(size) != 0) { 587 hugetlb_bad_size(); 588 pr_err("Invalid huge page size specified(%llu)\n", size); 589 } 590 591 return 1; 592 } 593 __setup("hugepagesz=", hugepage_setup_sz); 594 595 static int __init hugetlbpage_init(void) 596 { 597 bool configured = false; 598 int psize; 599 600 if (hugetlb_disabled) { 601 pr_info("HugeTLB support is disabled!\n"); 602 return 0; 603 } 604 605 if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() && 606 !mmu_has_feature(MMU_FTR_16M_PAGE)) 607 return -ENODEV; 608 609 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { 610 unsigned shift; 611 unsigned pdshift; 612 613 if (!mmu_psize_defs[psize].shift) 614 continue; 615 616 shift = mmu_psize_to_shift(psize); 617 618 #ifdef CONFIG_PPC_BOOK3S_64 619 if (shift > PGDIR_SHIFT) 620 continue; 621 else if (shift > PUD_SHIFT) 622 pdshift = PGDIR_SHIFT; 623 else if (shift > PMD_SHIFT) 624 pdshift = PUD_SHIFT; 625 else 626 pdshift = PMD_SHIFT; 627 #else 628 if (shift < PUD_SHIFT) 629 pdshift = PMD_SHIFT; 630 else if (shift < PGDIR_SHIFT) 631 pdshift = PUD_SHIFT; 632 else 633 pdshift = PGDIR_SHIFT; 634 #endif 635 636 if (add_huge_page_size(1ULL << shift) < 0) 637 continue; 638 /* 639 * if we have pdshift and shift value same, we don't 640 * use pgt cache for hugepd. 641 */ 642 if (pdshift > shift && IS_ENABLED(CONFIG_PPC_8xx)) 643 pgtable_cache_add(PTE_INDEX_SIZE); 644 else if (pdshift > shift) 645 pgtable_cache_add(pdshift - shift); 646 else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || IS_ENABLED(CONFIG_PPC_8xx)) 647 pgtable_cache_add(PTE_T_ORDER); 648 649 configured = true; 650 } 651 652 if (configured) { 653 if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE)) 654 hugetlbpage_init_default(); 655 } else 656 pr_info("Failed to initialize. Disabling HugeTLB"); 657 658 return 0; 659 } 660 661 arch_initcall(hugetlbpage_init); 662 663 void flush_dcache_icache_hugepage(struct page *page) 664 { 665 int i; 666 void *start; 667 668 BUG_ON(!PageCompound(page)); 669 670 for (i = 0; i < (1UL << compound_order(page)); i++) { 671 if (!PageHighMem(page)) { 672 __flush_dcache_icache(page_address(page+i)); 673 } else { 674 start = kmap_atomic(page+i); 675 __flush_dcache_icache(start); 676 kunmap_atomic(start); 677 } 678 } 679 } 680