/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/moduleparam.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>

#ifdef CONFIG_HUGETLB_PAGE

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

unsigned int HPAGE_SHIFT;

/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready.  On non-Freescale implementations, this is
 * just used to track 16G pages and so is a single array.  FSL-based
 * implementations may have more than one gpage size, so we need multiple
 * arrays.
 */
#ifdef CONFIG_PPC_FSL_BOOK3E
#define MAX_NUMBER_GPAGES	128
struct psize_gpages {
	u64 gpage_list[MAX_NUMBER_GPAGES];
	unsigned int nr_gpages;
};
static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
#else
#define MAX_NUMBER_GPAGES	1024
static u64 gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;
#endif

#define hugepd_none(hpd)	((hpd).pd == 0)

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	/* Only called for hugetlbfs pages, hence can ignore THP */
	return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
}

/*
 * Allocate a new hugepte table and point the hugepd entry (or, on FSL,
 * the whole run of entries covering the huge page) at hpdp to it.
 */
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned pdshift, unsigned pshift)
{
	struct kmem_cache *cachep;
	pte_t *new;

#ifdef CONFIG_PPC_FSL_BOOK3E
	int i;
	int num_hugepd = 1 << (pshift - pdshift);
	cachep = hugepte_cache;
#else
	cachep = PGT_CACHE(pdshift - pshift);
#endif

	new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT);

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
#ifdef CONFIG_PPC_FSL_BOOK3E
	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		else
			/* We use the old format for PPC_FSL_BOOK3E */
			hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
	}
	/* If we bailed from the for loop early, an error occurred; clean up */
	if (i < num_hugepd) {
		for (i = i - 1; i >= 0; i--, hpdp--)
			hpdp->pd = 0;
		kmem_cache_free(cachep, new);
	}
#else
	if (!hugepd_none(*hpdp))
		kmem_cache_free(cachep, new);
	else {
#ifdef CONFIG_PPC_BOOK3S_64
		hpdp->pd = __pa(new) | (shift_to_mmu_psize(pshift) << 2);
#else
		hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
#endif
	}
#endif
	spin_unlock(&mm->page_table_lock);
	return 0;
}

/*
 * These macros define how to determine which level of the page table holds
 * the hpdp.
 */
#ifdef CONFIG_PPC_FSL_BOOK3E
#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT PUD_SHIFT
#else
#define HUGEPD_PGD_SHIFT PUD_SHIFT
#define HUGEPD_PUD_SHIFT PMD_SHIFT
#endif

#ifdef CONFIG_PPC_BOOK3S_64
/*
 * At this point we do the placement change only for BOOK3S 64. This would
 * possibly work on other subarchs.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;

	addr &= ~(sz-1);
	pg = pgd_offset(mm, addr);

	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *) pg;
	else if (pshift > PUD_SHIFT)
		/*
		 * We need to use a hugepd table
		 */
		hpdp = (hugepd_t *)pg;
	else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT)
			hpdp = (hugepd_t *)pu;
		else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else
				hpdp = (hugepd_t *)pm;
		}
	}
	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
		return NULL;

	return hugepte_offset(*hpdp, addr, pdshift);
}

#else

pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;

	addr &= ~(sz-1);

	pg = pgd_offset(mm, addr);

	if (pshift >= HUGEPD_PGD_SHIFT) {
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= HUGEPD_PUD_SHIFT) {
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			hpdp = (hugepd_t *)pm;
		}
	}

	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
		return NULL;

	return hugepte_offset(*hpdp, addr, pdshift);
}
#endif

#ifdef CONFIG_PPC_FSL_BOOK3E
/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy allocator is set up.
 */
void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
	int i;

	if (addr == 0)
		return;

	gpage_freearray[idx].nr_gpages = number_of_pages;

	for (i = 0; i < number_of_pages; i++) {
		gpage_freearray[idx].gpage_list[i] = addr;
		addr += page_size;
	}
}

/*
 * Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	int idx = shift_to_mmu_psize(huge_page_shift(hstate));
	int nr_gpages = gpage_freearray[idx].nr_gpages;

	if (nr_gpages == 0)
		return 0;

#ifdef CONFIG_HIGHMEM
	/*
	 * If gpages can be in highmem we can't use the trick of storing the
	 * data structure in the page; allocate space for this
	 */
	m = memblock_virt_alloc(sizeof(struct huge_bootmem_page), 0);
	m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
#else
	m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
#endif

	list_add(&m->list, &huge_boot_pages);
	gpage_freearray[idx].nr_gpages = nr_gpages;
	gpage_freearray[idx].gpage_list[nr_gpages] = 0;
	m->hstate = hstate;

	return 1;
}

/*
 * Scan the command line hugepagesz= options for gigantic pages; store those in
 * a list that we use to allocate the memory once all options are parsed.
 */

unsigned long gpage_npages[MMU_PAGE_COUNT];

static int __init do_gpage_early_setup(char *param, char *val,
				       const char *unused, void *arg)
{
	static phys_addr_t size;
	unsigned long npages;

	/*
	 * The hugepagesz and hugepages cmdline options are interleaved.  We
	 * use the size variable to keep track of whether or not this was done
	 * properly and skip over instances where it is incorrect.  Other
	 * command-line parsing code will issue warnings, so we don't need to.
	 */
	if ((strcmp(param, "default_hugepagesz") == 0) ||
	    (strcmp(param, "hugepagesz") == 0)) {
		size = memparse(val, NULL);
	} else if (strcmp(param, "hugepages") == 0) {
		if (size != 0) {
			if (sscanf(val, "%lu", &npages) <= 0)
				npages = 0;
			if (npages > MAX_NUMBER_GPAGES) {
				pr_warn("MMU: %lu pages requested for page "
					"size %llu KB, limiting to "
					__stringify(MAX_NUMBER_GPAGES) "\n",
					npages, size / 1024);
				npages = MAX_NUMBER_GPAGES;
			}
			gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
			size = 0;
		}
	}
	return 0;
}

/*
 * This function allocates physical space for pages that are larger than the
 * buddy allocator can handle.  We want to allocate these in highmem because
 * the amount of lowmem is limited.  This means that this function MUST be
 * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
 * allocation to grab highmem.
 */
void __init reserve_hugetlb_gpages(void)
{
	static __initdata char cmdline[COMMAND_LINE_SIZE];
	phys_addr_t size, base;
	int i;

	strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
	parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
		   NULL, &do_gpage_early_setup);

	/*
	 * Walk gpage list in reverse, allocating larger page sizes first.
	 * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
	 * When we reach the point in the list where pages are no longer
	 * considered gpages, we're done.
	 */
	for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
		if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
			continue;
		else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
			break;

		size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
		base = memblock_alloc_base(size * gpage_npages[i], size,
					   MEMBLOCK_ALLOC_ANYWHERE);
		add_gpage(base, size, gpage_npages[i]);
	}
}

#else /* !PPC_FSL_BOOK3E */

/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy allocator is set up.
 */
void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

/* Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
#endif

#ifdef CONFIG_PPC_FSL_BOOK3E
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head rcu;
	unsigned int index;
	void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(hugepte_cache, batch->ptes[i]);

	free_page((unsigned long)batch);
}

/*
 * Free a hugepte table: immediately if no other CPU can be walking this
 * mm, otherwise batch it up and free the batch after an RCU grace period
 * so that concurrent lockless page table walkers stay safe.
 */
static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    cpumask_equal(mm_cpumask(tlb->mm),
			  cpumask_of(smp_processor_id()))) {
		kmem_cache_free(hugepte_cache, hugepte);
		put_cpu_var(hugepd_freelist_cur);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
	put_cpu_var(hugepd_freelist_cur);
}
#endif
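/*
 * Clear the hugepd entry (or, on FSL, the run of identical entries) at
 * hpdp and free the hugepte table it points to, but only if the
 * floor/ceiling limits allow freeing the whole pdshift-sized region.
 */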
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;

#ifdef CONFIG_PPC_FSL_BOOK3E
	/* Note: On fsl the hpdp may be the first of several */
	num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift));
#else
	unsigned int shift = hugepd_shift(*hpdp);
#endif

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		hpdp->pd = 0;

#ifdef CONFIG_PPC_FSL_BOOK3E
	hugepd_free(tlb, hugepte);
#else
	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
#endif
}

static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			/*
			 * If it is not a hugepd pointer, we should already
			 * find it cleared.
			 */
			WARN_ON(!pmd_none_or_clear_bad(pmd));
			continue;
		}
#ifdef CONFIG_PPC_FSL_BOOK3E
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 */
		next = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
#endif
		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
#ifdef CONFIG_PPC_FSL_BOOK3E
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 */
			next = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
#endif
			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.
	 * That means that here we can't use the optimization from the
	 * normal free_pgd_range(), of checking whether we're actually
	 * covering a large enough range to have to do anything at the
	 * top level of the walk instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */

	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
#ifdef CONFIG_PPC_FSL_BOOK3E
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
			 */
			next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
#endif
			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}

/*
 * We are holding mmap_sem, so a parallel huge page collapse cannot run.
 * To prevent hugepage split, disable irq.
 */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	bool is_thp;
	pte_t *ptep, pte;
	unsigned shift;
	unsigned long mask, flags;
	struct page *page = ERR_PTR(-EINVAL);

	local_irq_save(flags);
	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &is_thp, &shift);
	if (!ptep)
		goto no_page;
	pte = READ_ONCE(*ptep);
	/*
	 * Verify it is a huge page else bail.
	 * Transparent hugepages are handled by generic code. We can skip them
	 * here.
	 */
	if (!shift || is_thp)
		goto no_page;

	if (!pte_present(pte)) {
		page = NULL;
		goto no_page;
	}
	mask = (1UL << shift) - 1;
	page = pte_page(pte);
	if (page)
		page += (address & mask) / PAGE_SIZE;

no_page:
	local_irq_restore(flags);
	return page;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}

struct page *
follow_huge_pud(struct mm_struct *mm, unsigned long address,
		pud_t *pud, int write)
{
	BUG();
	return NULL;
}
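/*
 * Return the end of the hugepte covering addr: the next sz-aligned
 * boundary, clamped to end (the comparison is written to be safe if the
 * boundary wraps to zero).
 */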
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz-1);
	return (__boundary - 1 < end - 1) ? __boundary : end;
}

int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
		unsigned long end, int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}

#ifdef CONFIG_PPC_MM_SLICES
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

	if (radix_enabled())
		return radix__hugetlb_get_unmapped_area(file, addr, len,
							pgoff, flags);
	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
	/* With radix we don't use slices, so derive it from the vma */
	if (!radix_enabled())
		return 1UL << mmu_psize_to_shift(psize);
#endif
	if (!is_vm_hugetlb_page(vma))
		return PAGE_SIZE;

	return huge_page_size(hstate_vma(vma));
}

static inline bool is_power_of_4(unsigned long x)
{
	if (is_power_of_2(x))
		return (__ilog2(x) % 2) ? false : true;
	return false;
}

static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
#ifdef CONFIG_PPC_FSL_BOOK3E
	if ((size < PAGE_SIZE) || !is_power_of_4(size))
		return -EINVAL;
#else
	if (!is_power_of_2(size)
	    || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
		return -EINVAL;
#endif

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been set up */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0) {
		hugetlb_bad_size();
		pr_err("Invalid huge page size specified (%llu)\n", size);
	}

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);
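/*
 * Register every huge page size the MMU supports with the hugetlb core and
 * pick a default HPAGE_SHIFT.  Two variants follow: the FSL Book3E one also
 * creates the kmem cache used for hugepte tables, while the other adds the
 * pgtable caches that __hugepte_alloc() draws from.
 */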
#ifdef CONFIG_PPC_FSL_BOOK3E
struct kmem_cache *hugepte_cache;
static int __init hugetlbpage_init(void)
{
	int psize;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

		/* Don't treat normal page sizes as huge... */
		if (shift != PAGE_SHIFT)
			if (add_huge_page_size(1ULL << shift) < 0)
				continue;
	}

	/*
	 * Create a kmem cache for hugeptes.  The bottom bits in the pte have
	 * size information encoded in them, so align them to allow this.
	 */
	hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t),
					  HUGEPD_SHIFT_MASK + 1, 0, NULL);
	if (hugepte_cache == NULL)
		panic("%s: Unable to create kmem cache for hugeptes\n",
		      __func__);

	/* Default hpage size = 4M */
	if (mmu_psize_defs[MMU_PAGE_4M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
	else
		panic("%s: Unable to set default huge page size\n", __func__);

	return 0;
}
#else
static int __init hugetlbpage_init(void)
{
	int psize;

	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;

		if (shift < PMD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < PUD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;
		/*
		 * If pdshift and shift are the same, we don't use the pgt
		 * cache for the hugepd.
		 */
		if (pdshift != shift) {
			pgtable_cache_add(pdshift - shift, NULL);
			if (!PGT_CACHE(pdshift - shift))
				panic("hugetlbpage_init(): could not create "
				      "pgtable cache for %d bit pagesize\n", shift);
		}
	}

	/* Set default large page size. Currently, we pick 16M or 1M
	 * depending on what is available.
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;

	return 0;
}
#endif
arch_initcall(hugetlbpage_init);

void flush_dcache_icache_hugepage(struct page *page)
{
	int i;
	void *start;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < (1UL << compound_order(page)); i++) {
		if (!PageHighMem(page)) {
			__flush_dcache_icache(page_address(page+i));
		} else {
			start = kmap_atomic(page+i);
			__flush_dcache_icache(start);
			kunmap_atomic(start);
		}
	}
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page, _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against
 * teardown, and we can follow the address down to the page and take a ref
 * on it.  This function needs to be called with interrupts disabled.  We
 * use this variant when we have MSR[EE] = 0 but the paca->soft_enabled = 1.
 */

pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
				   bool *is_thp, unsigned *shift)
{
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ret_pte;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (shift)
		*shift = 0;

	if (is_thp)
		*is_thp = false;

	pgdp = pgdir + pgd_index(ea);
	pgd = READ_ONCE(*pgdp);
	/*
	 * Always operate on the local stack value.  This makes sure the
	 * value doesn't get updated by a parallel THP split/collapse,
	 * page fault or page unmap.  The returned pte_t * is still not
	 * stable, so the caller must check for those conditions as well.
	 */
	if (pgd_none(pgd))
		return NULL;
	else if (pgd_huge(pgd)) {
		ret_pte = (pte_t *) pgdp;
		goto out;
	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
		hpdp = (hugepd_t *)&pgd;
	else {
		/*
		 * Even if we end up with an unmap, the pgtable will not
		 * be freed, because we do an RCU free and we are running
		 * here with interrupts disabled.
		 */
		pdshift = PUD_SHIFT;
		pudp = pud_offset(&pgd, ea);
		pud = READ_ONCE(*pudp);

		if (pud_none(pud))
			return NULL;
		else if (pud_huge(pud)) {
			ret_pte = (pte_t *) pudp;
			goto out;
		} else if (is_hugepd(__hugepd(pud_val(pud))))
			hpdp = (hugepd_t *)&pud;
		else {
			pdshift = PMD_SHIFT;
			pmdp = pmd_offset(&pud, ea);
			pmd = READ_ONCE(*pmdp);
			/*
			 * A hugepage collapse is captured by pmd_none, because
			 * it marks the pmd none and does a hpte invalidate.
			 */
			if (pmd_none(pmd))
				return NULL;

			if (pmd_trans_huge(pmd)) {
				if (is_thp)
					*is_thp = true;
				ret_pte = (pte_t *) pmdp;
				goto out;
			}

			if (pmd_huge(pmd)) {
				ret_pte = (pte_t *) pmdp;
				goto out;
			} else if (is_hugepd(__hugepd(pmd_val(pmd))))
				hpdp = (hugepd_t *)&pmd;
			else
				return pte_offset_kernel(&pmd, ea);
		}
	}
	if (!hpdp)
		return NULL;

	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
	pdshift = hugepd_shift(*hpdp);
out:
	if (shift)
		*shift = pdshift;
	return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte);

int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask;
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = READ_ONCE(*ptep);
	mask = _PAGE_PRESENT | _PAGE_READ;
	if (write)
		mask |= _PAGE_WRITE;

	if ((pte_val(pte) & mask) != mask)
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}