// SPDX-License-Identifier: GPL-2.0
/*
 * mm/pgtable-generic.c
 *
 * Generic pgtable methods declared in linux/pgtable.h
 *
 * Copyright (C) 2010 Linus Torvalds
 */

#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mm_inline.h>
#include <linux/iommu.h>
#include <linux/pgalloc.h>

#include <asm/tlb.h>

/*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none. Usually (but
 * very seldom) called out from the p?d_none_or_clear_bad macros.
 */

/* Report a corrupt pgd entry, then reset it to pgd_none. */
void pgd_clear_bad(pgd_t *pgd)
{
	pgd_ERROR(*pgd);	/* log the bogus value before it is lost */
	pgd_clear(pgd);
}

#ifndef __PAGETABLE_P4D_FOLDED
/* As pgd_clear_bad(), one level down; compiled out when p4d is folded. */
void p4d_clear_bad(p4d_t *p4d)
{
	p4d_ERROR(*p4d);
	p4d_clear(p4d);
}
#endif

#ifndef __PAGETABLE_PUD_FOLDED
/* As pgd_clear_bad(), for pud; compiled out when pud is folded. */
void pud_clear_bad(pud_t *pud)
{
	pud_ERROR(*pud);
	pud_clear(pud);
}
#endif

/*
 * Note that the pmd variant below can't be stub'ed out just as for p4d/pud
 * above. pmd folding is special and typically pmd_* macros refer to upper
 * level even when folded
 */
void pmd_clear_bad(pmd_t *pmd)
{
	pmd_ERROR(*pmd);
	pmd_clear(pmd);
}

#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
 * Only sets the access flags (dirty, accessed), as well as write
 * permission. Furthermore, we know it always gets set to a "more
 * permissive" setting, which allows most architectures to optimize
 * this. We return whether the PTE actually changed, which in turn
 * instructs the caller to do things like update_mmu_cache.
 * This used to be done in the caller, but sparc needs minor faults to
 * force that call on sun4c so we changed this macro slightly
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pte_t *ptep,
			  pte_t entry, int dirty)
{
	/* Nothing to write (and no TLB fixup needed) if the pte is unchanged. */
	int changed = !pte_same(ptep_get(ptep), entry);
	if (changed) {
		set_pte_at(vma->vm_mm, address, ptep, entry);
		flush_tlb_fix_spurious_fault(vma, address, ptep);
	}
	return changed;
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
/*
 * Clear the "accessed" (young) bit of a pte and, only if it was set,
 * flush the stale TLB entry.  Returns whether the pte was young.
 */
bool ptep_clear_flush_young(struct vm_area_struct *vma,
			    unsigned long address, pte_t *ptep)
{
	bool young;

	young = ptep_test_and_clear_young(vma, address, ptep);
	if (young)
		flush_tlb_page(vma, address);
	return young;
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
/*
 * Atomically clear a pte and flush its TLB entry; returns the old pte.
 * The flush is skipped when the old pte could not have been cached by
 * the TLB (!pte_accessible()).
 */
pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
		       pte_t *ptep)
{
	struct mm_struct *mm = (vma)->vm_mm;
	pte_t pte;
	pte = ptep_get_and_clear(mm, address, ptep);
	if (pte_accessible(mm, pte))
		flush_tlb_page(vma, address);
	return pte;
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
/* PMD-sized (huge page) counterpart of ptep_set_access_flags(). */
int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	int changed = !pmd_same(*pmdp, entry);
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (changed) {
		set_pmd_at(vma->vm_mm, address, pmdp, entry);
		flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	}
	return changed;
}
#endif

#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
/* PMD-sized counterpart of ptep_clear_flush_young(). */
bool pmdp_clear_flush_young(struct vm_area_struct *vma,
			    unsigned long address, pmd_t *pmdp)
{
	bool young;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	young = pmdp_test_and_clear_young(vma, address, pmdp);
	if (young)
		flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return young;
}
#endif

#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
/*
 * Clear a huge pmd and flush its TLB range; returns the old pmd.  The
 * entry must be huge (or not present), never a page table pointer - see
 * the VM_BUG_ON below.
 */
pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
			    pmd_t *pmdp)
{
	pmd_t pmd;
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp));
	pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return pmd;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/* PUD-sized counterpart of pmdp_huge_clear_flush(). */
pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
			    pud_t *pudp)
{
	pud_t pud;

	VM_BUG_ON(address & ~HPAGE_PUD_MASK);
	VM_BUG_ON(!pud_trans_huge(*pudp));
	pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
	flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
	return pud;
}
#endif
#endif

#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
/*
 * Stash a preallocated page table for this pmd, to be used later (e.g.
 * when splitting the huge pmd).  Caller must hold the pmd lock.
 */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(&pgtable->lru);
	else
		list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
	pmd_huge_pte(mm, pmdp) = pgtable;
}
#endif

#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
/* no "address" argument so destroys page coloring of some arch */
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pgtable_t pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	/* Promote the next deposited table (if any) to head of the stash. */
	pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru,
							  struct page, lru);
	if (pmd_huge_pte(mm, pmdp))
		list_del(&pgtable->lru);
	return pgtable;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE
/*
 * Atomically replace a huge pmd with an invalidated copy of itself
 * (pmd_mkinvalid()) and flush the TLB range; returns the old pmd value.
 */
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
		      pmd_t *pmdp)
{
	VM_WARN_ON_ONCE(!pmd_present(*pmdp));
	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return old;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD
/*
 * Variant of pmdp_invalidate() for Accessed/Dirty-tracking callers;
 * the generic version simply reuses pmdp_invalidate().
 */
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
			 pmd_t *pmdp)
{
	VM_WARN_ON_ONCE(!pmd_present(*pmdp));
	return pmdp_invalidate(vma, address, pmdp);
}
#endif

#ifndef pmdp_collapse_flush
/*
 * Clear a pmd that points to a page table which is about to be collapsed
 * into a huge page; returns the old (table-pointing) pmd.
 */
pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	/*
	 * pmd and hugepage pte format are same. So we could
	 * use the same function.
	 */
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(pmd_trans_huge(*pmdp));
	pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);

	/* collapse entails shooting down ptes not pmd */
	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return pmd;
}
#endif

/* arch define pte_free_defer in asm/pgalloc.h for its own implementation */
#ifndef pte_free_defer
/* RCU callback: actually free a page table page queued by pte_free_defer(). */
static void pte_free_now(struct rcu_head *head)
{
	struct page *page;

	page = container_of(head, struct page, rcu_head);
	pte_free(NULL /* mm not passed and not used */, (pgtable_t)page);
}

/*
 * Free a page table page only after an RCU grace period, so that lockless
 * walkers under rcu_read_lock() cannot be left touching freed memory.
 */
void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
{
	struct page *page;

	page = pgtable;
	call_rcu(&page->rcu_head, pte_free_now);
}
#endif /* pte_free_defer */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \
	(defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU))
/*
 * See the comment above ptep_get_lockless() in include/linux/pgtable.h:
 * the barriers in pmdp_get_lockless() cannot guarantee that the value in
 * pmd_high actually belongs with the value in pmd_low; but holding interrupts
 * off blocks the TLB flush between present updates, which guarantees that a
 * successful
 * __pte_offset_map() points to a page from matched halves.
 */
static unsigned long pmdp_get_lockless_start(void)
{
	unsigned long irqflags;

	local_irq_save(irqflags);
	return irqflags;
}
static void pmdp_get_lockless_end(unsigned long irqflags)
{
	local_irq_restore(irqflags);
}
#else
/* pmdp_get_lockless() is atomic here: no need to disable interrupts. */
static unsigned long pmdp_get_lockless_start(void) { return 0; }
static void pmdp_get_lockless_end(unsigned long irqflags) { }
#endif

/*
 * Lockless lookup of the page table mapped at pmd: returns the pte pointer
 * for addr, or NULL if no page table is mapped there (none, huge, or bad
 * entry).  On success the caller is left holding rcu_read_lock(), dropped
 * later by pte_unmap(); if pmdvalp is non-NULL it is filled with the pmd
 * value seen.  See the big comment below for the full contract.
 */
pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
{
	unsigned long irqflags;
	pmd_t pmdval;

	rcu_read_lock();
	irqflags = pmdp_get_lockless_start();
	pmdval = pmdp_get_lockless(pmd);
	pmdp_get_lockless_end(irqflags);

	if (pmdvalp)
		*pmdvalp = pmdval;
	if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
		goto nomap;
	if (unlikely(pmd_trans_huge(pmdval)))
		goto nomap;
	if (unlikely(pmd_bad(pmdval))) {
		pmd_clear_bad(pmd);
		goto nomap;
	}
	return __pte_map(&pmdval, addr);
nomap:
	rcu_read_unlock();
	return NULL;
}

/*
 * Like pte_offset_map(), but also outputs (without taking) the pte
 * spinlock pointer in *ptlp; read-only use only - see comment below.
 */
pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
				unsigned long addr, spinlock_t **ptlp)
{
	pmd_t pmdval;
	pte_t *pte;

	pte = __pte_offset_map(pmd, addr, &pmdval);
	if (likely(pte))
		*ptlp = pte_lockptr(mm, &pmdval);
	return pte;
}

/*
 * Like pte_offset_map_ro_nolock(), but also outputs the pmd value seen in
 * *pmdvalp (mandatory), for the caller's later pmd_same() stability check.
 */
pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
				unsigned long addr, pmd_t *pmdvalp,
				spinlock_t **ptlp)
{
	pte_t *pte;

	VM_WARN_ON_ONCE(!pmdvalp);
	pte = __pte_offset_map(pmd, addr, pmdvalp);
	if (likely(pte))
		*ptlp = pte_lockptr(mm, pmdvalp);
	return pte;
}

/*
 * pte_offset_map_lock(mm, pmd, addr, ptlp) is usually called with the pmd
 * pointer for addr, reached by walking down the mm's pgd, p4d, pud for addr:
 * either while holding mmap_lock or vma lock for read or for write; or in
 * truncate or rmap context, while holding file's i_mmap_lock or anon_vma lock
 * for read (or for write). In a few cases, it may be used with pmd pointing to
 * a pmd_t already copied to or constructed on the stack.
 *
 * When successful, it returns the pte pointer for addr, with its page table
 * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent
 * modification by software, with a pointer to that spinlock in ptlp (in some
 * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in table's
 * struct page). pte_unmap_unlock(pte, ptl) to unlock and unmap afterwards.
 *
 * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no
 * page table at *pmd: if, for example, the page table has just been removed,
 * or replaced by the huge pmd of a THP. (When successful, *pmd is rechecked
 * after acquiring the ptlock, and retried internally if it changed: so that a
 * page table can be safely removed or replaced by THP while holding its lock.)
 *
 * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above,
 * just returns the pte pointer for addr, its page table kmapped if necessary;
 * or NULL if there is no page table at *pmd. It does not attempt to lock the
 * page table, so cannot normally be used when the page table is to be updated,
 * or when entries read must be stable. But it does take rcu_read_lock(): so
 * that even when page table is racily removed, it remains a valid though empty
 * and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s
 * afterwards.
 *
 * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
 * but when successful, it also outputs a pointer to the spinlock in ptlp - as
 * pte_offset_map_lock() does, but in this case without locking it. This helps
 * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
 * act on a changed *pmd: pte_offset_map_ro_nolock() provides the correct spinlock
 * pointer for the page table that it returns. Even after grabbing the spinlock,
 * we might be looking either at a page table that is still mapped or one that
 * was unmapped and is about to get freed. But for R/O access this is sufficient.
 * So it is only applicable for read-only cases where any modification operations
 * to the page table are not allowed even if the corresponding spinlock is held
 * afterwards.
 *
 * pte_offset_map_rw_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like
 * pte_offset_map_ro_nolock(); but when successful, it also outputs the pmdval.
 * It is applicable for may-write cases where any modification operations to the
 * page table may happen after the corresponding spinlock is held afterwards.
 * But the users should make sure the page table is stable like checking pte_same()
 * or checking pmd_same() by using the output pmdval before performing the write
 * operations.
 *
 * Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will
 * be read-only/read-write protected.
 *
 * Note that free_pgtables(), used after unmapping detached vmas, or when
 * exiting the whole mm, does not take page table lock before freeing a page
 * table, and may not use RCU at all: "outsiders" like khugepaged should avoid
 * pte_offset_map() and co once the vma is detached from mm or mm_users is zero.
 */
pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
			   unsigned long addr, spinlock_t **ptlp)
{
	spinlock_t *ptl;
	pmd_t pmdval;
	pte_t *pte;
again:
	pte = __pte_offset_map(pmd, addr, &pmdval);
	if (unlikely(!pte))
		return pte;
	ptl = pte_lockptr(mm, &pmdval);
	spin_lock(ptl);
	if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
		*ptlp = ptl;
		return pte;
	}
	/* *pmd changed under us: unlock, unmap and retry from scratch. */
	pte_unmap_unlock(pte, ptl);
	goto again;
}

#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
static void kernel_pgtable_work_func(struct work_struct *work);

/*
 * Deferred-free queue for kernel page tables: entries queued by
 * pagetable_free_kernel() are freed from a workqueue, after invalidating
 * IOMMU SVA translations for the kernel address range.
 */
static struct {
	struct list_head list;
	/* protect above ptdesc lists */
	spinlock_t lock;
	struct work_struct work;
} kernel_pgtable_work = {
	.list = LIST_HEAD_INIT(kernel_pgtable_work.list),
	.lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock),
	.work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func),
};

static void kernel_pgtable_work_func(struct work_struct *work)
{
	struct ptdesc *pt, *next;
	LIST_HEAD(page_list);

	/* Detach the whole pending list, then work on it outside the lock. */
	spin_lock(&kernel_pgtable_work.lock);
	list_splice_tail_init(&kernel_pgtable_work.list, &page_list);
	spin_unlock(&kernel_pgtable_work.lock);

	/* Invalidate before freeing, so the IOMMU cannot use a stale table. */
	iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL);
	list_for_each_entry_safe(pt, next, &page_list, pt_list)
		__pagetable_free(pt);
}

/* Queue a kernel page table for asynchronous freeing (see above). */
void pagetable_free_kernel(struct ptdesc *pt)
{
	spin_lock(&kernel_pgtable_work.lock);
	list_add(&pt->pt_list, &kernel_pgtable_work.list);
	spin_unlock(&kernel_pgtable_work.lock);

	schedule_work(&kernel_pgtable_work.work);
}
#endif