/*
 * IA-32 Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>

static unsigned long page_table_shareable(struct vm_area_struct *svma,
					  struct vm_area_struct *vma,
					  unsigned long addr, pgoff_t idx)
{
	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
				svma->vm_start;
	unsigned long sbase = saddr & PUD_MASK;
	unsigned long s_end = sbase + PUD_SIZE;

	/* Allow segments to share if only one is marked locked */
	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;

	/*
	 * match the virtual addresses, permission and the alignment of the
	 * page table page.
	 */
	if (pmd_index(addr) != pmd_index(saddr) ||
	    vm_flags != svm_flags ||
	    sbase < svma->vm_start || svma->vm_end < s_end)
		return 0;

	return saddr;
}

static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
	unsigned long base = addr & PUD_MASK;
	unsigned long end = base + PUD_SIZE;

	/*
	 * check on proper vm_flags and page table alignment
	 */
	if (vma->vm_flags & VM_MAYSHARE &&
	    vma->vm_start <= base && end <= vma->vm_end)
		return 1;
	return 0;
}

/*
 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
 * and returns the corresponding pte. While this is not necessary for the
 * !shared pmd case because we can allocate the pmd later as well, it makes the
 * code much cleaner. pmd allocation is essential for the shared case because
 * pud has to be populated inside the same i_mmap_mutex section - otherwise
 * racing tasks could either miss the sharing (see huge_pte_offset) or select a
 * bad pmd for sharing.
 */
static pte_t *
huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
	struct vm_area_struct *vma = find_vma(mm, addr);
	struct address_space *mapping = vma->vm_file->f_mapping;
	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
			vma->vm_pgoff;
	struct vm_area_struct *svma;
	unsigned long saddr;
	pte_t *spte = NULL;
	pte_t *pte;

	if (!vma_shareable(vma, addr))
		return (pte_t *)pmd_alloc(mm, pud, addr);

	mutex_lock(&mapping->i_mmap_mutex);
	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
		if (svma == vma)
			continue;

		saddr = page_table_shareable(svma, vma, addr, idx);
		if (saddr) {
			spte = huge_pte_offset(svma->vm_mm, saddr);
			if (spte) {
				get_page(virt_to_page(spte));
				break;
			}
		}
	}

	if (!spte)
		goto out;

	spin_lock(&mm->page_table_lock);
	if (pud_none(*pud))
		pud_populate(mm, pud,
			     (pmd_t *)((unsigned long)spte & PAGE_MASK));
	else
		put_page(virt_to_page(spte));
	spin_unlock(&mm->page_table_lock);
out:
	pte = (pte_t *)pmd_alloc(mm, pud, addr);
	mutex_unlock(&mapping->i_mmap_mutex);
	return pte;
}
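/*
 * Illustrative sketch only (not built): a worked example of the sharing
 * check above.  page_table_shareable() maps the faulting address to a file
 * index and back to a candidate address (saddr) in another VMA of the same
 * file; a pmd page can only be shared when the whole PUD-aligned window
 * around saddr lies inside that VMA.  The helper name below is hypothetical
 * and not part of this file's interface.
 */
#if 0
static int example_covers_pud_window(struct vm_area_struct *svma,
				     unsigned long saddr)
{
	unsigned long sbase = saddr & PUD_MASK;

	/* same condition as the "sbase < svma->vm_start || ..." test above */
	return svma->vm_start <= sbase && sbase + PUD_SIZE <= svma->vm_end;
}
#endif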
/*
 * unmap huge page backed by shared pte.
 *
 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared,
 * indicated by page_count > 1, unmap is achieved by clearing pud and
 * decrementing the ref count. If count == 1, the pte page is not shared.
 *
 * called with vma->vm_mm->page_table_lock held.
 *
 * returns: 1 successfully unmapped a shared pte page
 *	    0 the underlying pte page is not shared, or it is the last user
 */
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	pgd_t *pgd = pgd_offset(mm, *addr);
	pud_t *pud = pud_offset(pgd, *addr);

	BUG_ON(page_count(virt_to_page(ptep)) == 0);
	if (page_count(virt_to_page(ptep)) == 1)
		return 0;

	pud_clear(pud);
	put_page(virt_to_page(ptep));
	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
	return 1;
}

pte_t *huge_pte_alloc(struct mm_struct *mm,
			unsigned long addr, unsigned long sz)
{
	pgd_t *pgd;
	pud_t *pud;
	pte_t *pte = NULL;

	pgd = pgd_offset(mm, addr);
	pud = pud_alloc(mm, pgd, addr);
	if (pud) {
		if (sz == PUD_SIZE) {
			pte = (pte_t *)pud;
		} else {
			BUG_ON(sz != PMD_SIZE);
			if (pud_none(*pud))
				pte = huge_pmd_share(mm, addr, pud);
			else
				pte = (pte_t *)pmd_alloc(mm, pud, addr);
		}
	}
	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));

	return pte;
}

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd = NULL;

	pgd = pgd_offset(mm, addr);
	if (pgd_present(*pgd)) {
		pud = pud_offset(pgd, addr);
		if (pud_present(*pud)) {
			if (pud_large(*pud))
				return (pte_t *)pud;
			pmd = pmd_offset(pud, addr);
		}
	}
	return (pte_t *)pmd;
}

#if 0	/* This is just for testing */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	struct vm_area_struct *vma;
	struct page *page;
	pte_t *pte;

	vma = find_vma(mm, address);
	if (!vma || !is_vm_hugetlb_page(vma))
		return ERR_PTR(-EINVAL);

	pte = huge_pte_offset(mm, address);

	/* hugetlb should be locked, and hence, prefaulted */
	WARN_ON(!pte || pte_none(*pte));

	/* offset of the base page within the huge page */
	page = &pte_page(*pte)[(address & ~HPAGE_MASK) >> PAGE_SHIFT];

	WARN_ON(!PageHead(page));

	return page;
}

int pmd_huge(pmd_t pmd)
{
	return 0;
}

int pud_huge(pud_t pud)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	return NULL;
}

#else

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	return ERR_PTR(-EINVAL);
}

int pmd_huge(pmd_t pmd)
{
	return !!(pmd_val(pmd) & _PAGE_PSE);
}

int pud_huge(pud_t pud)
{
	return !!(pud_val(pud) & _PAGE_PSE);
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	struct page *page;

	page = pte_page(*(pte_t *)pmd);
	if (page)
		page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
	return page;
}

struct page *
follow_huge_pud(struct mm_struct *mm, unsigned long address,
		pud_t *pud, int write)
{
	struct page *page;

	page = pte_page(*(pte_t *)pud);
	if (page)
		page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
	return page;
}

#endif
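/*
 * Illustrative sketch only (not built): a minimal example of how the helpers
 * above fit together - walk to an existing huge mapping and turn it into a
 * struct page, assuming a PMD-sized (2 MB) page.  The function name is
 * hypothetical and not part of this file's interface.
 */
#if 0
static struct page *example_lookup_huge_page(struct mm_struct *mm,
					     unsigned long addr)
{
	pte_t *ptep = huge_pte_offset(mm, addr);

	if (!ptep || pte_none(*ptep))
		return NULL;
	/* for a PMD-sized page the "pte" is really a pmd entry */
	if (!pmd_huge(*(pmd_t *)ptep))
		return NULL;
	return pte_page(*ptep);
}
#endif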
/* x86_64 also uses this file */

#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
		unsigned long addr, unsigned long len,
		unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long start_addr;

	if (len > mm->cached_hole_size) {
		start_addr = mm->free_area_cache;
	} else {
		start_addr = TASK_UNMAPPED_BASE;
		mm->cached_hole_size = 0;
	}

full_search:
	addr = ALIGN(start_addr, huge_page_size(h));

	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
		/* At this point: (!vma || addr < vma->vm_end). */
		if (TASK_SIZE - len < addr) {
			/*
			 * Start a new search - just in case we missed
			 * some holes.
			 */
			if (start_addr != TASK_UNMAPPED_BASE) {
				start_addr = TASK_UNMAPPED_BASE;
				mm->cached_hole_size = 0;
				goto full_search;
			}
			return -ENOMEM;
		}
		if (!vma || addr + len <= vma->vm_start) {
			mm->free_area_cache = addr + len;
			return addr;
		}
		if (addr + mm->cached_hole_size < vma->vm_start)
			mm->cached_hole_size = vma->vm_start - addr;
		addr = ALIGN(vma->vm_end, huge_page_size(h));
	}
}
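/*
 * Illustrative sketch only (not built): the gap test used by the searches
 * above and below, made explicit.  A candidate address is accepted when the
 * huge-page-aligned range [addr, addr + len) ends at or before the next VMA
 * and stays below TASK_SIZE.  The helper name is hypothetical.
 */
#if 0
static int example_range_fits(unsigned long addr, unsigned long len,
			      struct vm_area_struct *next_vma)
{
	if (TASK_SIZE - len < addr)
		return 0;
	return !next_vma || addr + len <= next_vma->vm_start;
}
#endif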
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
		unsigned long addr0, unsigned long len,
		unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long base = mm->mmap_base;
	unsigned long addr = addr0;
	unsigned long largest_hole = mm->cached_hole_size;
	unsigned long start_addr;

	/* don't allow allocations above current base */
	if (mm->free_area_cache > base)
		mm->free_area_cache = base;

	if (len <= largest_hole) {
		largest_hole = 0;
		mm->free_area_cache = base;
	}
try_again:
	start_addr = mm->free_area_cache;

	/* make sure it can fit in the remaining address space */
	if (mm->free_area_cache < len)
		goto fail;

	/* either no address requested or can't fit in requested address hole */
	addr = (mm->free_area_cache - len) & huge_page_mask(h);
	do {
		/*
		 * Lookup failure means no vma is above this address,
		 * i.e. return with success:
		 */
		vma = find_vma(mm, addr);
		if (!vma)
			return addr;

		if (addr + len <= vma->vm_start) {
			/* remember the address as a hint for next time */
			mm->cached_hole_size = largest_hole;
			return (mm->free_area_cache = addr);
		} else if (mm->free_area_cache == vma->vm_end) {
			/* pull free_area_cache down to the first hole */
			mm->free_area_cache = vma->vm_start;
			mm->cached_hole_size = largest_hole;
		}

		/* remember the largest hole we saw so far */
		if (addr + largest_hole < vma->vm_start)
			largest_hole = vma->vm_start - addr;

		/* try just below the current vma->vm_start */
		addr = (vma->vm_start - len) & huge_page_mask(h);
	} while (len <= vma->vm_start);

fail:
	/*
	 * if hint left us with no space for the requested
	 * mapping then try again:
	 */
	if (start_addr != base) {
		mm->free_area_cache = base;
		largest_hole = 0;
		goto try_again;
	}
	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	mm->free_area_cache = TASK_UNMAPPED_BASE;
	mm->cached_hole_size = ~0UL;
	addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
			len, pgoff, flags);

	/*
	 * Restore the topdown base:
	 */
	mm->free_area_cache = base;
	mm->cached_hole_size = ~0UL;

	return addr;
}

unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	if (len & ~huge_page_mask(h))
		return -EINVAL;
	if (len > TASK_SIZE)
		return -ENOMEM;

	if (flags & MAP_FIXED) {
		if (prepare_hugepage_range(file, addr, len))
			return -EINVAL;
		return addr;
	}

	if (addr) {
		addr = ALIGN(addr, huge_page_size(h));
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start))
			return addr;
	}
	if (mm->get_unmapped_area == arch_get_unmapped_area)
		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
				pgoff, flags);
	else
		return hugetlb_get_unmapped_area_topdown(file, addr, len,
				pgoff, flags);
}

#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */

#ifdef CONFIG_X86_64
static __init int setup_hugepagesz(char *opt)
{
	unsigned long ps = memparse(opt, &opt);

	if (ps == PMD_SIZE) {
		hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
	} else if (ps == PUD_SIZE && cpu_has_gbpages) {
		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
	} else {
		printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
			ps >> 20);
		return 0;
	}
	return 1;
}
__setup("hugepagesz=", setup_hugepagesz);
#endif
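/*
 * Illustrative only (not built): a user-space sketch of how the code above
 * gets exercised.  Booting with "hugepagesz=1G hugepages=4" registers the
 * 1 GB hstate via setup_hugepagesz() (gbpages-capable CPUs only), while
 * "hugepagesz=2M" selects the PMD-sized hstate.  A mapping like the one
 * below then goes through hugetlb_get_unmapped_area() and, on first touch,
 * huge_pte_alloc().  The 2 MB length assumes the default x86 huge page size.
 */
#if 0
#include <sys/mman.h>

static void *example_map_huge_page(void)
{
	return mmap(NULL, 2UL << 20, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
}
#endif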