/*
 * IA-32 Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>

static unsigned long page_table_shareable(struct vm_area_struct *svma,
				struct vm_area_struct *vma,
				unsigned long addr, pgoff_t idx)
{
	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
				svma->vm_start;
	unsigned long sbase = saddr & PUD_MASK;
	unsigned long s_end = sbase + PUD_SIZE;

	/* Allow segments to share if only one is marked locked */
	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;

	/*
	 * match the virtual addresses, permission and the alignment of the
	 * page table page.
	 */
	if (pmd_index(addr) != pmd_index(saddr) ||
	    vm_flags != svm_flags ||
	    sbase < svma->vm_start || svma->vm_end < s_end)
		return 0;

	return saddr;
}

static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
	unsigned long base = addr & PUD_MASK;
	unsigned long end = base + PUD_SIZE;

	/*
	 * check on proper vm_flags and page table alignment
	 */
	if (vma->vm_flags & VM_MAYSHARE &&
	    vma->vm_start <= base && end <= vma->vm_end)
		return 1;
	return 0;
}

/*
 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
 * and returns the corresponding pte. While this is not necessary for the
 * !shared pmd case because we can allocate the pmd later as well, it makes the
 * code much cleaner. pmd allocation is essential for the shared case because
 * pud has to be populated inside the same i_mmap_mutex section - otherwise
 * racing tasks could either miss the sharing (see huge_pte_offset) or select a
 * bad pmd for sharing.
 */
static pte_t *
huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
	struct vm_area_struct *vma = find_vma(mm, addr);
	struct address_space *mapping = vma->vm_file->f_mapping;
	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
			vma->vm_pgoff;
	struct vm_area_struct *svma;
	unsigned long saddr;
	pte_t *spte = NULL;
	pte_t *pte;

	if (!vma_shareable(vma, addr))
		return (pte_t *)pmd_alloc(mm, pud, addr);

	mutex_lock(&mapping->i_mmap_mutex);
	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
		if (svma == vma)
			continue;

		saddr = page_table_shareable(svma, vma, addr, idx);
		if (saddr) {
			spte = huge_pte_offset(svma->vm_mm, saddr);
			if (spte) {
				get_page(virt_to_page(spte));
				break;
			}
		}
	}

	if (!spte)
		goto out;

	spin_lock(&mm->page_table_lock);
	if (pud_none(*pud))
		pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
	else
		put_page(virt_to_page(spte));
	spin_unlock(&mm->page_table_lock);
out:
	pte = (pte_t *)pmd_alloc(mm, pud, addr);
	mutex_unlock(&mapping->i_mmap_mutex);
	return pte;
}
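
/*
 * Example of the sharing scheme above: two tasks mapping the same hugetlbfs
 * file over a fully covered, PUD_SIZE-aligned range end up with their puds
 * pointing at the same pmd page.  Each sharer holds a reference on that pmd
 * page (get_page() above), so page_count() of the pmd page is the number of
 * mms currently sharing it; huge_pmd_unshare() below drops that reference
 * again.
 */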

/*
 * unmap huge page backed by shared pte.
 *
 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
 * indicated by page_count > 1, unmap is achieved by clearing pud and
 * decrementing the ref count. If count == 1, the pte page is not shared.
 *
 * called with vma->vm_mm->page_table_lock held.
 *
 * returns: 1 successfully unmapped a shared pte page
 *	    0 the underlying pte page is not shared, or it is the last user
 */
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	pgd_t *pgd = pgd_offset(mm, *addr);
	pud_t *pud = pud_offset(pgd, *addr);

	BUG_ON(page_count(virt_to_page(ptep)) == 0);
	if (page_count(virt_to_page(ptep)) == 1)
		return 0;

	pud_clear(pud);
	put_page(virt_to_page(ptep));
	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
	return 1;
}

pte_t *huge_pte_alloc(struct mm_struct *mm,
			unsigned long addr, unsigned long sz)
{
	pgd_t *pgd;
	pud_t *pud;
	pte_t *pte = NULL;

	pgd = pgd_offset(mm, addr);
	pud = pud_alloc(mm, pgd, addr);
	if (pud) {
		if (sz == PUD_SIZE) {
			pte = (pte_t *)pud;
		} else {
			BUG_ON(sz != PMD_SIZE);
			if (pud_none(*pud))
				pte = huge_pmd_share(mm, addr, pud);
			else
				pte = (pte_t *)pmd_alloc(mm, pud, addr);
		}
	}
	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));

	return pte;
}

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd = NULL;

	pgd = pgd_offset(mm, addr);
	if (pgd_present(*pgd)) {
		pud = pud_offset(pgd, addr);
		if (pud_present(*pud)) {
			if (pud_large(*pud))
				return (pte_t *)pud;
			pmd = pmd_offset(pud, addr);
		}
	}
	return (pte_t *)pmd;
}

#if 0	/* This is just for testing */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	struct vm_area_struct *vma;
	struct page *page;
	pte_t *pte;
	/* virtual page frame number of the faulting address */
	unsigned long vpfn = address >> PAGE_SHIFT;

	vma = find_vma(mm, address);
	if (!vma || !is_vm_hugetlb_page(vma))
		return ERR_PTR(-EINVAL);

	pte = huge_pte_offset(mm, address);

	/* hugetlb should be locked, and hence, prefaulted */
	WARN_ON(!pte || pte_none(*pte));

	page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];

	WARN_ON(!PageHead(page));

	return page;
}

int pmd_huge(pmd_t pmd)
{
	return 0;
}

int pud_huge(pud_t pud)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	return NULL;
}

#else

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	return ERR_PTR(-EINVAL);
}

int pmd_huge(pmd_t pmd)
{
	return !!(pmd_val(pmd) & _PAGE_PSE);
}

int pud_huge(pud_t pud)
{
	return !!(pud_val(pud) & _PAGE_PSE);
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	struct page *page;

	page = pte_page(*(pte_t *)pmd);
	if (page)
		page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
	return page;
}

struct page *
follow_huge_pud(struct mm_struct *mm, unsigned long address,
		pud_t *pud, int write)
{
	struct page *page;

	page = pte_page(*(pte_t *)pud);
	if (page)
		page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
	return page;
}

#endif

/* x86_64 also uses this file */

#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
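
/*
 * The helpers below pick a free virtual range for a hugetlb mapping via
 * vm_unmapped_area().  Setting align_mask to PAGE_MASK & ~huge_page_mask(h)
 * forces the returned address to be aligned to the hstate's huge page size,
 * so the range can be backed directly by PMD (2MB) or PUD (1GB) entries.
 */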
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
		unsigned long addr, unsigned long len,
		unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct vm_unmapped_area_info info;

	info.flags = 0;
	info.length = len;
	info.low_limit = TASK_UNMAPPED_BASE;
	info.high_limit = TASK_SIZE;
	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
	info.align_offset = 0;
	return vm_unmapped_area(&info);
}

static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
		unsigned long addr0, unsigned long len,
		unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct vm_unmapped_area_info info;
	unsigned long addr;

	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
	info.length = len;
	info.low_limit = PAGE_SIZE;
	info.high_limit = current->mm->mmap_base;
	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
	info.align_offset = 0;
	addr = vm_unmapped_area(&info);

	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	if (addr & ~PAGE_MASK) {
		VM_BUG_ON(addr != -ENOMEM);
		info.flags = 0;
		info.low_limit = TASK_UNMAPPED_BASE;
		info.high_limit = TASK_SIZE;
		addr = vm_unmapped_area(&info);
	}

	return addr;
}

unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	if (len & ~huge_page_mask(h))
		return -EINVAL;
	if (len > TASK_SIZE)
		return -ENOMEM;

	if (flags & MAP_FIXED) {
		if (prepare_hugepage_range(file, addr, len))
			return -EINVAL;
		return addr;
	}

	if (addr) {
		addr = ALIGN(addr, huge_page_size(h));
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start))
			return addr;
	}
	if (mm->get_unmapped_area == arch_get_unmapped_area)
		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
				pgoff, flags);
	else
		return hugetlb_get_unmapped_area_topdown(file, addr, len,
				pgoff, flags);
}

#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/

#ifdef CONFIG_X86_64
static __init int setup_hugepagesz(char *opt)
{
	unsigned long ps = memparse(opt, &opt);
	if (ps == PMD_SIZE) {
		hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
	} else if (ps == PUD_SIZE && cpu_has_gbpages) {
		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
	} else {
		printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
			ps >> 20);
		return 0;
	}
	return 1;
}
__setup("hugepagesz=", setup_hugepagesz);
#endif
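
/*
 * Example command lines handled by setup_hugepagesz() above, e.g.:
 *
 *	hugepagesz=2M hugepages=512
 *	hugepagesz=1G hugepages=4	(needs gbpages support in the CPU)
 *
 * The hugepages= count itself is parsed by the generic hugetlb code, not by
 * this file.
 */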