1 /* 2 * PPC Huge TLB Page Support for Kernel. 3 * 4 * Copyright (C) 2003 David Gibson, IBM Corporation. 5 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor 6 * 7 * Based on the IA-32 version: 8 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> 9 */ 10 11 #include <linux/mm.h> 12 #include <linux/io.h> 13 #include <linux/slab.h> 14 #include <linux/hugetlb.h> 15 #include <linux/export.h> 16 #include <linux/of_fdt.h> 17 #include <linux/memblock.h> 18 #include <linux/bootmem.h> 19 #include <linux/moduleparam.h> 20 #include <asm/pgtable.h> 21 #include <asm/pgalloc.h> 22 #include <asm/tlb.h> 23 #include <asm/setup.h> 24 #include <asm/hugetlb.h> 25 26 #ifdef CONFIG_HUGETLB_PAGE 27 28 #define PAGE_SHIFT_64K 16 29 #define PAGE_SHIFT_512K 19 30 #define PAGE_SHIFT_8M 23 31 #define PAGE_SHIFT_16M 24 32 #define PAGE_SHIFT_16G 34 33 34 unsigned int HPAGE_SHIFT; 35 36 /* 37 * Tracks gpages after the device tree is scanned and before the 38 * huge_boot_pages list is ready. On non-Freescale implementations, this is 39 * just used to track 16G pages and so is a single array. FSL-based 40 * implementations may have more than one gpage size, so we need multiple 41 * arrays 42 */ 43 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) 44 #define MAX_NUMBER_GPAGES 128 45 struct psize_gpages { 46 u64 gpage_list[MAX_NUMBER_GPAGES]; 47 unsigned int nr_gpages; 48 }; 49 static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT]; 50 #else 51 #define MAX_NUMBER_GPAGES 1024 52 static u64 gpage_freearray[MAX_NUMBER_GPAGES]; 53 static unsigned nr_gpages; 54 #endif 55 56 #define hugepd_none(hpd) (hpd_val(hpd) == 0) 57 58 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) 59 { 60 /* Only called for hugetlbfs pages, hence can ignore THP */ 61 return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL); 62 } 63 64 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, 65 unsigned long address, unsigned pdshift, unsigned pshift) 66 { 67 struct kmem_cache *cachep; 68 pte_t *new; 69 int i; 70 int num_hugepd; 71 72 if (pshift >= pdshift) { 73 cachep = hugepte_cache; 74 num_hugepd = 1 << (pshift - pdshift); 75 } else { 76 cachep = PGT_CACHE(pdshift - pshift); 77 num_hugepd = 1; 78 } 79 80 new = kmem_cache_zalloc(cachep, GFP_KERNEL); 81 82 BUG_ON(pshift > HUGEPD_SHIFT_MASK); 83 BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); 84 85 if (! new) 86 return -ENOMEM; 87 88 /* 89 * Make sure other cpus find the hugepd set only after a 90 * properly initialized page table is visible to them. 91 * For more details look for comment in __pte_alloc(). 92 */ 93 smp_wmb(); 94 95 spin_lock(&mm->page_table_lock); 96 97 /* 98 * We have multiple higher-level entries that point to the same 99 * actual pte location. Fill in each as we go and backtrack on error. 100 * We need all of these so the DTLB pgtable walk code can find the 101 * right higher-level entry without knowing if it's a hugepage or not. 102 */ 103 for (i = 0; i < num_hugepd; i++, hpdp++) { 104 if (unlikely(!hugepd_none(*hpdp))) 105 break; 106 else { 107 #ifdef CONFIG_PPC_BOOK3S_64 108 *hpdp = __hugepd(__pa(new) | 109 (shift_to_mmu_psize(pshift) << 2)); 110 #elif defined(CONFIG_PPC_8xx) 111 *hpdp = __hugepd(__pa(new) | 112 (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M : 113 _PMD_PAGE_512K) | _PMD_PRESENT); 114 #else 115 /* We use the old format for PPC_FSL_BOOK3E */ 116 *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift); 117 #endif 118 } 119 } 120 /* If we bailed from the for loop early, an error occurred, clean up */ 121 if (i < num_hugepd) { 122 for (i = i - 1 ; i >= 0; i--, hpdp--) 123 *hpdp = __hugepd(0); 124 kmem_cache_free(cachep, new); 125 } 126 spin_unlock(&mm->page_table_lock); 127 return 0; 128 } 129 130 /* 131 * These macros define how to determine which level of the page table holds 132 * the hpdp. 133 */ 134 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) 135 #define HUGEPD_PGD_SHIFT PGDIR_SHIFT 136 #define HUGEPD_PUD_SHIFT PUD_SHIFT 137 #else 138 #define HUGEPD_PGD_SHIFT PUD_SHIFT 139 #define HUGEPD_PUD_SHIFT PMD_SHIFT 140 #endif 141 142 /* 143 * At this point we do the placement change only for BOOK3S 64. This would 144 * possibly work on other subarchs. 145 */ 146 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) 147 { 148 pgd_t *pg; 149 pud_t *pu; 150 pmd_t *pm; 151 hugepd_t *hpdp = NULL; 152 unsigned pshift = __ffs(sz); 153 unsigned pdshift = PGDIR_SHIFT; 154 155 addr &= ~(sz-1); 156 pg = pgd_offset(mm, addr); 157 158 #ifdef CONFIG_PPC_BOOK3S_64 159 if (pshift == PGDIR_SHIFT) 160 /* 16GB huge page */ 161 return (pte_t *) pg; 162 else if (pshift > PUD_SHIFT) 163 /* 164 * We need to use hugepd table 165 */ 166 hpdp = (hugepd_t *)pg; 167 else { 168 pdshift = PUD_SHIFT; 169 pu = pud_alloc(mm, pg, addr); 170 if (pshift == PUD_SHIFT) 171 return (pte_t *)pu; 172 else if (pshift > PMD_SHIFT) 173 hpdp = (hugepd_t *)pu; 174 else { 175 pdshift = PMD_SHIFT; 176 pm = pmd_alloc(mm, pu, addr); 177 if (pshift == PMD_SHIFT) 178 /* 16MB hugepage */ 179 return (pte_t *)pm; 180 else 181 hpdp = (hugepd_t *)pm; 182 } 183 } 184 #else 185 if (pshift >= HUGEPD_PGD_SHIFT) { 186 hpdp = (hugepd_t *)pg; 187 } else { 188 pdshift = PUD_SHIFT; 189 pu = pud_alloc(mm, pg, addr); 190 if (pshift >= HUGEPD_PUD_SHIFT) { 191 hpdp = (hugepd_t *)pu; 192 } else { 193 pdshift = PMD_SHIFT; 194 pm = pmd_alloc(mm, pu, addr); 195 hpdp = (hugepd_t *)pm; 196 } 197 } 198 #endif 199 if (!hpdp) 200 return NULL; 201 202 BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); 203 204 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift)) 205 return NULL; 206 207 return hugepte_offset(*hpdp, addr, pdshift); 208 } 209 210 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) 211 /* Build list of addresses of gigantic pages. This function is used in early 212 * boot before the buddy allocator is setup. 213 */ 214 void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) 215 { 216 unsigned int idx = shift_to_mmu_psize(__ffs(page_size)); 217 int i; 218 219 if (addr == 0) 220 return; 221 222 gpage_freearray[idx].nr_gpages = number_of_pages; 223 224 for (i = 0; i < number_of_pages; i++) { 225 gpage_freearray[idx].gpage_list[i] = addr; 226 addr += page_size; 227 } 228 } 229 230 /* 231 * Moves the gigantic page addresses from the temporary list to the 232 * huge_boot_pages list. 233 */ 234 int alloc_bootmem_huge_page(struct hstate *hstate) 235 { 236 struct huge_bootmem_page *m; 237 int idx = shift_to_mmu_psize(huge_page_shift(hstate)); 238 int nr_gpages = gpage_freearray[idx].nr_gpages; 239 240 if (nr_gpages == 0) 241 return 0; 242 243 #ifdef CONFIG_HIGHMEM 244 /* 245 * If gpages can be in highmem we can't use the trick of storing the 246 * data structure in the page; allocate space for this 247 */ 248 m = memblock_virt_alloc(sizeof(struct huge_bootmem_page), 0); 249 m->phys = gpage_freearray[idx].gpage_list[--nr_gpages]; 250 #else 251 m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]); 252 #endif 253 254 list_add(&m->list, &huge_boot_pages); 255 gpage_freearray[idx].nr_gpages = nr_gpages; 256 gpage_freearray[idx].gpage_list[nr_gpages] = 0; 257 m->hstate = hstate; 258 259 return 1; 260 } 261 /* 262 * Scan the command line hugepagesz= options for gigantic pages; store those in 263 * a list that we use to allocate the memory once all options are parsed. 264 */ 265 266 unsigned long gpage_npages[MMU_PAGE_COUNT]; 267 268 static int __init do_gpage_early_setup(char *param, char *val, 269 const char *unused, void *arg) 270 { 271 static phys_addr_t size; 272 unsigned long npages; 273 274 /* 275 * The hugepagesz and hugepages cmdline options are interleaved. We 276 * use the size variable to keep track of whether or not this was done 277 * properly and skip over instances where it is incorrect. Other 278 * command-line parsing code will issue warnings, so we don't need to. 279 * 280 */ 281 if ((strcmp(param, "default_hugepagesz") == 0) || 282 (strcmp(param, "hugepagesz") == 0)) { 283 size = memparse(val, NULL); 284 } else if (strcmp(param, "hugepages") == 0) { 285 if (size != 0) { 286 if (sscanf(val, "%lu", &npages) <= 0) 287 npages = 0; 288 if (npages > MAX_NUMBER_GPAGES) { 289 pr_warn("MMU: %lu pages requested for page " 290 #ifdef CONFIG_PHYS_ADDR_T_64BIT 291 "size %llu KB, limiting to " 292 #else 293 "size %u KB, limiting to " 294 #endif 295 __stringify(MAX_NUMBER_GPAGES) "\n", 296 npages, size / 1024); 297 npages = MAX_NUMBER_GPAGES; 298 } 299 gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages; 300 size = 0; 301 } 302 } 303 return 0; 304 } 305 306 307 /* 308 * This function allocates physical space for pages that are larger than the 309 * buddy allocator can handle. We want to allocate these in highmem because 310 * the amount of lowmem is limited. This means that this function MUST be 311 * called before lowmem_end_addr is set up in MMU_init() in order for the lmb 312 * allocate to grab highmem. 313 */ 314 void __init reserve_hugetlb_gpages(void) 315 { 316 static __initdata char cmdline[COMMAND_LINE_SIZE]; 317 phys_addr_t size, base; 318 int i; 319 320 strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE); 321 parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0, 322 NULL, &do_gpage_early_setup); 323 324 /* 325 * Walk gpage list in reverse, allocating larger page sizes first. 326 * Skip over unsupported sizes, or sizes that have 0 gpages allocated. 327 * When we reach the point in the list where pages are no longer 328 * considered gpages, we're done. 329 */ 330 for (i = MMU_PAGE_COUNT-1; i >= 0; i--) { 331 if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0) 332 continue; 333 else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT)) 334 break; 335 336 size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i)); 337 base = memblock_alloc_base(size * gpage_npages[i], size, 338 MEMBLOCK_ALLOC_ANYWHERE); 339 add_gpage(base, size, gpage_npages[i]); 340 } 341 } 342 343 #else /* !PPC_FSL_BOOK3E */ 344 345 /* Build list of addresses of gigantic pages. This function is used in early 346 * boot before the buddy allocator is setup. 347 */ 348 void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) 349 { 350 if (!addr) 351 return; 352 while (number_of_pages > 0) { 353 gpage_freearray[nr_gpages] = addr; 354 nr_gpages++; 355 number_of_pages--; 356 addr += page_size; 357 } 358 } 359 360 /* Moves the gigantic page addresses from the temporary list to the 361 * huge_boot_pages list. 362 */ 363 int alloc_bootmem_huge_page(struct hstate *hstate) 364 { 365 struct huge_bootmem_page *m; 366 if (nr_gpages == 0) 367 return 0; 368 m = phys_to_virt(gpage_freearray[--nr_gpages]); 369 gpage_freearray[nr_gpages] = 0; 370 list_add(&m->list, &huge_boot_pages); 371 m->hstate = hstate; 372 return 1; 373 } 374 #endif 375 376 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) 377 #define HUGEPD_FREELIST_SIZE \ 378 ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) 379 380 struct hugepd_freelist { 381 struct rcu_head rcu; 382 unsigned int index; 383 void *ptes[0]; 384 }; 385 386 static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur); 387 388 static void hugepd_free_rcu_callback(struct rcu_head *head) 389 { 390 struct hugepd_freelist *batch = 391 container_of(head, struct hugepd_freelist, rcu); 392 unsigned int i; 393 394 for (i = 0; i < batch->index; i++) 395 kmem_cache_free(hugepte_cache, batch->ptes[i]); 396 397 free_page((unsigned long)batch); 398 } 399 400 static void hugepd_free(struct mmu_gather *tlb, void *hugepte) 401 { 402 struct hugepd_freelist **batchp; 403 404 batchp = &get_cpu_var(hugepd_freelist_cur); 405 406 if (atomic_read(&tlb->mm->mm_users) < 2 || 407 cpumask_equal(mm_cpumask(tlb->mm), 408 cpumask_of(smp_processor_id()))) { 409 kmem_cache_free(hugepte_cache, hugepte); 410 put_cpu_var(hugepd_freelist_cur); 411 return; 412 } 413 414 if (*batchp == NULL) { 415 *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC); 416 (*batchp)->index = 0; 417 } 418 419 (*batchp)->ptes[(*batchp)->index++] = hugepte; 420 if ((*batchp)->index == HUGEPD_FREELIST_SIZE) { 421 call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback); 422 *batchp = NULL; 423 } 424 put_cpu_var(hugepd_freelist_cur); 425 } 426 #else 427 static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {} 428 #endif 429 430 static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, 431 unsigned long start, unsigned long end, 432 unsigned long floor, unsigned long ceiling) 433 { 434 pte_t *hugepte = hugepd_page(*hpdp); 435 int i; 436 437 unsigned long pdmask = ~((1UL << pdshift) - 1); 438 unsigned int num_hugepd = 1; 439 unsigned int shift = hugepd_shift(*hpdp); 440 441 /* Note: On fsl the hpdp may be the first of several */ 442 if (shift > pdshift) 443 num_hugepd = 1 << (shift - pdshift); 444 445 start &= pdmask; 446 if (start < floor) 447 return; 448 if (ceiling) { 449 ceiling &= pdmask; 450 if (! ceiling) 451 return; 452 } 453 if (end - 1 > ceiling - 1) 454 return; 455 456 for (i = 0; i < num_hugepd; i++, hpdp++) 457 *hpdp = __hugepd(0); 458 459 if (shift >= pdshift) 460 hugepd_free(tlb, hugepte); 461 else 462 pgtable_free_tlb(tlb, hugepte, pdshift - shift); 463 } 464 465 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 466 unsigned long addr, unsigned long end, 467 unsigned long floor, unsigned long ceiling) 468 { 469 pmd_t *pmd; 470 unsigned long next; 471 unsigned long start; 472 473 start = addr; 474 do { 475 unsigned long more; 476 477 pmd = pmd_offset(pud, addr); 478 next = pmd_addr_end(addr, end); 479 if (!is_hugepd(__hugepd(pmd_val(*pmd)))) { 480 /* 481 * if it is not hugepd pointer, we should already find 482 * it cleared. 483 */ 484 WARN_ON(!pmd_none_or_clear_bad(pmd)); 485 continue; 486 } 487 /* 488 * Increment next by the size of the huge mapping since 489 * there may be more than one entry at this level for a 490 * single hugepage, but all of them point to 491 * the same kmem cache that holds the hugepte. 492 */ 493 more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd)); 494 if (more > next) 495 next = more; 496 497 free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT, 498 addr, next, floor, ceiling); 499 } while (addr = next, addr != end); 500 501 start &= PUD_MASK; 502 if (start < floor) 503 return; 504 if (ceiling) { 505 ceiling &= PUD_MASK; 506 if (!ceiling) 507 return; 508 } 509 if (end - 1 > ceiling - 1) 510 return; 511 512 pmd = pmd_offset(pud, start); 513 pud_clear(pud); 514 pmd_free_tlb(tlb, pmd, start); 515 mm_dec_nr_pmds(tlb->mm); 516 } 517 518 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 519 unsigned long addr, unsigned long end, 520 unsigned long floor, unsigned long ceiling) 521 { 522 pud_t *pud; 523 unsigned long next; 524 unsigned long start; 525 526 start = addr; 527 do { 528 pud = pud_offset(pgd, addr); 529 next = pud_addr_end(addr, end); 530 if (!is_hugepd(__hugepd(pud_val(*pud)))) { 531 if (pud_none_or_clear_bad(pud)) 532 continue; 533 hugetlb_free_pmd_range(tlb, pud, addr, next, floor, 534 ceiling); 535 } else { 536 unsigned long more; 537 /* 538 * Increment next by the size of the huge mapping since 539 * there may be more than one entry at this level for a 540 * single hugepage, but all of them point to 541 * the same kmem cache that holds the hugepte. 542 */ 543 more = addr + (1 << hugepd_shift(*(hugepd_t *)pud)); 544 if (more > next) 545 next = more; 546 547 free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT, 548 addr, next, floor, ceiling); 549 } 550 } while (addr = next, addr != end); 551 552 start &= PGDIR_MASK; 553 if (start < floor) 554 return; 555 if (ceiling) { 556 ceiling &= PGDIR_MASK; 557 if (!ceiling) 558 return; 559 } 560 if (end - 1 > ceiling - 1) 561 return; 562 563 pud = pud_offset(pgd, start); 564 pgd_clear(pgd); 565 pud_free_tlb(tlb, pud, start); 566 } 567 568 /* 569 * This function frees user-level page tables of a process. 570 */ 571 void hugetlb_free_pgd_range(struct mmu_gather *tlb, 572 unsigned long addr, unsigned long end, 573 unsigned long floor, unsigned long ceiling) 574 { 575 pgd_t *pgd; 576 unsigned long next; 577 578 /* 579 * Because there are a number of different possible pagetable 580 * layouts for hugepage ranges, we limit knowledge of how 581 * things should be laid out to the allocation path 582 * (huge_pte_alloc(), above). Everything else works out the 583 * structure as it goes from information in the hugepd 584 * pointers. That means that we can't here use the 585 * optimization used in the normal page free_pgd_range(), of 586 * checking whether we're actually covering a large enough 587 * range to have to do anything at the top level of the walk 588 * instead of at the bottom. 589 * 590 * To make sense of this, you should probably go read the big 591 * block comment at the top of the normal free_pgd_range(), 592 * too. 593 */ 594 595 do { 596 next = pgd_addr_end(addr, end); 597 pgd = pgd_offset(tlb->mm, addr); 598 if (!is_hugepd(__hugepd(pgd_val(*pgd)))) { 599 if (pgd_none_or_clear_bad(pgd)) 600 continue; 601 hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); 602 } else { 603 unsigned long more; 604 /* 605 * Increment next by the size of the huge mapping since 606 * there may be more than one entry at the pgd level 607 * for a single hugepage, but all of them point to the 608 * same kmem cache that holds the hugepte. 609 */ 610 more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd)); 611 if (more > next) 612 next = more; 613 614 free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT, 615 addr, next, floor, ceiling); 616 } 617 } while (addr = next, addr != end); 618 } 619 620 /* 621 * We are holding mmap_sem, so a parallel huge page collapse cannot run. 622 * To prevent hugepage split, disable irq. 623 */ 624 struct page * 625 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) 626 { 627 bool is_thp; 628 pte_t *ptep, pte; 629 unsigned shift; 630 unsigned long mask, flags; 631 struct page *page = ERR_PTR(-EINVAL); 632 633 local_irq_save(flags); 634 ptep = find_linux_pte_or_hugepte(mm->pgd, address, &is_thp, &shift); 635 if (!ptep) 636 goto no_page; 637 pte = READ_ONCE(*ptep); 638 /* 639 * Verify it is a huge page else bail. 640 * Transparent hugepages are handled by generic code. We can skip them 641 * here. 642 */ 643 if (!shift || is_thp) 644 goto no_page; 645 646 if (!pte_present(pte)) { 647 page = NULL; 648 goto no_page; 649 } 650 mask = (1UL << shift) - 1; 651 page = pte_page(pte); 652 if (page) 653 page += (address & mask) / PAGE_SIZE; 654 655 no_page: 656 local_irq_restore(flags); 657 return page; 658 } 659 660 struct page * 661 follow_huge_pmd(struct mm_struct *mm, unsigned long address, 662 pmd_t *pmd, int write) 663 { 664 BUG(); 665 return NULL; 666 } 667 668 struct page * 669 follow_huge_pud(struct mm_struct *mm, unsigned long address, 670 pud_t *pud, int write) 671 { 672 BUG(); 673 return NULL; 674 } 675 676 static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, 677 unsigned long sz) 678 { 679 unsigned long __boundary = (addr + sz) & ~(sz-1); 680 return (__boundary - 1 < end - 1) ? __boundary : end; 681 } 682 683 int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift, 684 unsigned long end, int write, struct page **pages, int *nr) 685 { 686 pte_t *ptep; 687 unsigned long sz = 1UL << hugepd_shift(hugepd); 688 unsigned long next; 689 690 ptep = hugepte_offset(hugepd, addr, pdshift); 691 do { 692 next = hugepte_addr_end(addr, end, sz); 693 if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) 694 return 0; 695 } while (ptep++, addr = next, addr != end); 696 697 return 1; 698 } 699 700 #ifdef CONFIG_PPC_MM_SLICES 701 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 702 unsigned long len, unsigned long pgoff, 703 unsigned long flags) 704 { 705 struct hstate *hstate = hstate_file(file); 706 int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); 707 708 if (radix_enabled()) 709 return radix__hugetlb_get_unmapped_area(file, addr, len, 710 pgoff, flags); 711 return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1); 712 } 713 #endif 714 715 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) 716 { 717 #ifdef CONFIG_PPC_MM_SLICES 718 unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); 719 /* With radix we don't use slice, so derive it from vma*/ 720 if (!radix_enabled()) 721 return 1UL << mmu_psize_to_shift(psize); 722 #endif 723 if (!is_vm_hugetlb_page(vma)) 724 return PAGE_SIZE; 725 726 return huge_page_size(hstate_vma(vma)); 727 } 728 729 static inline bool is_power_of_4(unsigned long x) 730 { 731 if (is_power_of_2(x)) 732 return (__ilog2(x) % 2) ? false : true; 733 return false; 734 } 735 736 static int __init add_huge_page_size(unsigned long long size) 737 { 738 int shift = __ffs(size); 739 int mmu_psize; 740 741 /* Check that it is a page size supported by the hardware and 742 * that it fits within pagetable and slice limits. */ 743 if (size <= PAGE_SIZE) 744 return -EINVAL; 745 #if defined(CONFIG_PPC_FSL_BOOK3E) 746 if (!is_power_of_4(size)) 747 return -EINVAL; 748 #elif !defined(CONFIG_PPC_8xx) 749 if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT)) 750 return -EINVAL; 751 #endif 752 753 if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) 754 return -EINVAL; 755 756 #ifdef CONFIG_PPC_BOOK3S_64 757 /* 758 * We need to make sure that for different page sizes reported by 759 * firmware we only add hugetlb support for page sizes that can be 760 * supported by linux page table layout. 761 * For now we have 762 * Radix: 2M 763 * Hash: 16M and 16G 764 */ 765 if (radix_enabled()) { 766 if (mmu_psize != MMU_PAGE_2M) 767 return -EINVAL; 768 } else { 769 if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G) 770 return -EINVAL; 771 } 772 #endif 773 774 BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); 775 776 /* Return if huge page size has already been setup */ 777 if (size_to_hstate(size)) 778 return 0; 779 780 hugetlb_add_hstate(shift - PAGE_SHIFT); 781 782 return 0; 783 } 784 785 static int __init hugepage_setup_sz(char *str) 786 { 787 unsigned long long size; 788 789 size = memparse(str, &str); 790 791 if (add_huge_page_size(size) != 0) { 792 hugetlb_bad_size(); 793 pr_err("Invalid huge page size specified(%llu)\n", size); 794 } 795 796 return 1; 797 } 798 __setup("hugepagesz=", hugepage_setup_sz); 799 800 struct kmem_cache *hugepte_cache; 801 static int __init hugetlbpage_init(void) 802 { 803 int psize; 804 805 #if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx) 806 if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE)) 807 return -ENODEV; 808 #endif 809 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { 810 unsigned shift; 811 unsigned pdshift; 812 813 if (!mmu_psize_defs[psize].shift) 814 continue; 815 816 shift = mmu_psize_to_shift(psize); 817 818 if (add_huge_page_size(1ULL << shift) < 0) 819 continue; 820 821 if (shift < HUGEPD_PUD_SHIFT) 822 pdshift = PMD_SHIFT; 823 else if (shift < HUGEPD_PGD_SHIFT) 824 pdshift = PUD_SHIFT; 825 else 826 pdshift = PGDIR_SHIFT; 827 /* 828 * if we have pdshift and shift value same, we don't 829 * use pgt cache for hugepd. 830 */ 831 if (pdshift > shift) 832 pgtable_cache_add(pdshift - shift, NULL); 833 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) 834 else if (!hugepte_cache) { 835 /* 836 * Create a kmem cache for hugeptes. The bottom bits in 837 * the pte have size information encoded in them, so 838 * align them to allow this 839 */ 840 hugepte_cache = kmem_cache_create("hugepte-cache", 841 sizeof(pte_t), 842 HUGEPD_SHIFT_MASK + 1, 843 0, NULL); 844 if (hugepte_cache == NULL) 845 panic("%s: Unable to create kmem cache " 846 "for hugeptes\n", __func__); 847 848 } 849 #endif 850 } 851 852 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) 853 /* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */ 854 if (mmu_psize_defs[MMU_PAGE_4M].shift) 855 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift; 856 else if (mmu_psize_defs[MMU_PAGE_512K].shift) 857 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift; 858 #else 859 /* Set default large page size. Currently, we pick 16M or 1M 860 * depending on what is available 861 */ 862 if (mmu_psize_defs[MMU_PAGE_16M].shift) 863 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; 864 else if (mmu_psize_defs[MMU_PAGE_1M].shift) 865 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; 866 else if (mmu_psize_defs[MMU_PAGE_2M].shift) 867 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift; 868 #endif 869 return 0; 870 } 871 872 arch_initcall(hugetlbpage_init); 873 874 void flush_dcache_icache_hugepage(struct page *page) 875 { 876 int i; 877 void *start; 878 879 BUG_ON(!PageCompound(page)); 880 881 for (i = 0; i < (1UL << compound_order(page)); i++) { 882 if (!PageHighMem(page)) { 883 __flush_dcache_icache(page_address(page+i)); 884 } else { 885 start = kmap_atomic(page+i); 886 __flush_dcache_icache(start); 887 kunmap_atomic(start); 888 } 889 } 890 } 891 892 #endif /* CONFIG_HUGETLB_PAGE */ 893 894 /* 895 * We have 4 cases for pgds and pmds: 896 * (1) invalid (all zeroes) 897 * (2) pointer to next table, as normal; bottom 6 bits == 0 898 * (3) leaf pte for huge page _PAGE_PTE set 899 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table 900 * 901 * So long as we atomically load page table pointers we are safe against teardown, 902 * we can follow the address down to the the page and take a ref on it. 903 * This function need to be called with interrupts disabled. We use this variant 904 * when we have MSR[EE] = 0 but the paca->soft_enabled = 1 905 */ 906 907 pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, 908 bool *is_thp, unsigned *shift) 909 { 910 pgd_t pgd, *pgdp; 911 pud_t pud, *pudp; 912 pmd_t pmd, *pmdp; 913 pte_t *ret_pte; 914 hugepd_t *hpdp = NULL; 915 unsigned pdshift = PGDIR_SHIFT; 916 917 if (shift) 918 *shift = 0; 919 920 if (is_thp) 921 *is_thp = false; 922 923 pgdp = pgdir + pgd_index(ea); 924 pgd = READ_ONCE(*pgdp); 925 /* 926 * Always operate on the local stack value. This make sure the 927 * value don't get updated by a parallel THP split/collapse, 928 * page fault or a page unmap. The return pte_t * is still not 929 * stable. So should be checked there for above conditions. 930 */ 931 if (pgd_none(pgd)) 932 return NULL; 933 else if (pgd_huge(pgd)) { 934 ret_pte = (pte_t *) pgdp; 935 goto out; 936 } else if (is_hugepd(__hugepd(pgd_val(pgd)))) 937 hpdp = (hugepd_t *)&pgd; 938 else { 939 /* 940 * Even if we end up with an unmap, the pgtable will not 941 * be freed, because we do an rcu free and here we are 942 * irq disabled 943 */ 944 pdshift = PUD_SHIFT; 945 pudp = pud_offset(&pgd, ea); 946 pud = READ_ONCE(*pudp); 947 948 if (pud_none(pud)) 949 return NULL; 950 else if (pud_huge(pud)) { 951 ret_pte = (pte_t *) pudp; 952 goto out; 953 } else if (is_hugepd(__hugepd(pud_val(pud)))) 954 hpdp = (hugepd_t *)&pud; 955 else { 956 pdshift = PMD_SHIFT; 957 pmdp = pmd_offset(&pud, ea); 958 pmd = READ_ONCE(*pmdp); 959 /* 960 * A hugepage collapse is captured by pmd_none, because 961 * it mark the pmd none and do a hpte invalidate. 962 */ 963 if (pmd_none(pmd)) 964 return NULL; 965 966 if (pmd_trans_huge(pmd)) { 967 if (is_thp) 968 *is_thp = true; 969 ret_pte = (pte_t *) pmdp; 970 goto out; 971 } 972 973 if (pmd_huge(pmd)) { 974 ret_pte = (pte_t *) pmdp; 975 goto out; 976 } else if (is_hugepd(__hugepd(pmd_val(pmd)))) 977 hpdp = (hugepd_t *)&pmd; 978 else 979 return pte_offset_kernel(&pmd, ea); 980 } 981 } 982 if (!hpdp) 983 return NULL; 984 985 ret_pte = hugepte_offset(*hpdp, ea, pdshift); 986 pdshift = hugepd_shift(*hpdp); 987 out: 988 if (shift) 989 *shift = pdshift; 990 return ret_pte; 991 } 992 EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte); 993 994 int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, 995 unsigned long end, int write, struct page **pages, int *nr) 996 { 997 unsigned long mask; 998 unsigned long pte_end; 999 struct page *head, *page; 1000 pte_t pte; 1001 int refs; 1002 1003 pte_end = (addr + sz) & ~(sz-1); 1004 if (pte_end < end) 1005 end = pte_end; 1006 1007 pte = READ_ONCE(*ptep); 1008 mask = _PAGE_PRESENT | _PAGE_READ; 1009 1010 /* 1011 * On some CPUs like the 8xx, _PAGE_RW hence _PAGE_WRITE is defined 1012 * as 0 and _PAGE_RO has to be set when a page is not writable 1013 */ 1014 if (write) 1015 mask |= _PAGE_WRITE; 1016 else 1017 mask |= _PAGE_RO; 1018 1019 if ((pte_val(pte) & mask) != mask) 1020 return 0; 1021 1022 /* hugepages are never "special" */ 1023 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 1024 1025 refs = 0; 1026 head = pte_page(pte); 1027 1028 page = head + ((addr & (sz-1)) >> PAGE_SHIFT); 1029 do { 1030 VM_BUG_ON(compound_head(page) != head); 1031 pages[*nr] = page; 1032 (*nr)++; 1033 page++; 1034 refs++; 1035 } while (addr += PAGE_SIZE, addr != end); 1036 1037 if (!page_cache_add_speculative(head, refs)) { 1038 *nr -= refs; 1039 return 0; 1040 } 1041 1042 if (unlikely(pte_val(pte) != pte_val(*ptep))) { 1043 /* Could be optimized better */ 1044 *nr -= refs; 1045 while (refs--) 1046 put_page(head); 1047 return 0; 1048 } 1049 1050 return 1; 1051 } 1052