// SPDX-License-Identifier: GPL-2.0
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/swap.h>
#include <linux/rmap.h>
#include <linux/pgalloc.h>
#include <linux/hugetlb.h>

#include <asm/tlb.h>

#ifndef CONFIG_MMU_GATHER_NO_GATHER

/*
 * Make tlb->active point at a batch that can take more entries.
 *
 * An already linked successor of the active batch is reused if there is
 * one; otherwise a new page is allocated with GFP_NOWAIT, initialised as an
 * empty batch and appended to the list.
 *
 * Returns true if tlb->active was advanced. Returns false if delayed rmaps
 * are pending and the active batch is not the local one, if
 * MAX_GATHER_BATCH_COUNT batches have already been allocated, or if the
 * page allocation failed.
 */
static bool tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	/* Limit batching if we have delayed rmaps pending */
	if (tlb->delayed_rmap && tlb->active != &tlb->local)
		return false;

	batch = tlb->active;
	if (batch->next) {
		tlb->active = batch->next;
		return true;
	}

	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
		return false;

	batch = (void *)__get_free_page(GFP_NOWAIT);
	if (!batch)
		return false;

	tlb->batch_count++;
	batch->next = NULL;
	batch->nr = 0;
	batch->max = MAX_GATHER_BATCH;

	tlb->active->next = batch;
	tlb->active = batch;

	return true;
}

#ifdef CONFIG_SMP
/*
 * Walk all entries of @batch and remove the rmap for every entry that is
 * flagged with ENCODED_PAGE_BIT_DELAY_RMAP.
 *
 * An entry flagged with ENCODED_PAGE_BIT_NR_PAGES_NEXT is followed by a
 * second entry that encodes the number of pages; that second entry is
 * consumed here so the loop does not interpret it as a page.
 */
static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
{
	struct encoded_page **pages = batch->encoded_pages;

	for (int i = 0; i < batch->nr; i++) {
		struct encoded_page *enc = pages[i];

		if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) {
			struct page *page = encoded_page_ptr(enc);
			unsigned int nr_pages = 1;

			/* The page count lives in the following entry. */
			if (unlikely(encoded_page_flags(enc) &
				     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
				nr_pages = encoded_nr_pages(pages[++i]);

			folio_remove_rmap_ptes(page_folio(page), page, nr_pages,
					       vma);
		}
	}
}

/**
 * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
 * @tlb: the current mmu_gather
 * @vma: The memory area from which the pages are being removed.
 *
 * Note that because of how tlb_next_batch() above works, we will
 * never start multiple new batches with pending delayed rmaps, so
 * we only need to walk through the current active batch and the
 * original local one.
 */
void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
	if (!tlb->delayed_rmap)
		return;

	tlb_flush_rmap_batch(&tlb->local, vma);
	if (tlb->active != &tlb->local)
		tlb_flush_rmap_batch(tlb->active, vma);
	tlb->delayed_rmap = 0;
}
#endif /* CONFIG_SMP */

/*
 * We might end up freeing a lot of pages. Reschedule on a regular
 * basis to avoid soft lockups in configurations without full
 * preemption enabled. The magic number of 512 folios seems to work.
 */
#define MAX_NR_FOLIOS_PER_FREE		512

/*
 * Free all pages recorded in @batch, in chunks, calling cond_resched()
 * after each chunk. batch->nr is decremented as entries are released and
 * is zero on return.
 */
static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
{
	struct encoded_page **pages = batch->encoded_pages;
	unsigned int nr, nr_pages;

	while (batch->nr) {
		if (!page_poisoning_enabled_static() && !want_init_on_free()) {
			/* Chunk size is bounded by the number of entries. */
			nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr);

			/*
			 * Make sure we cover page + nr_pages, and don't leave
			 * nr_pages behind when capping the number of entries.
			 */
			if (unlikely(encoded_page_flags(pages[nr - 1]) &
				     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
				nr++;
		} else {
			/*
			 * With page poisoning and init_on_free, the time it
			 * takes to free memory grows proportionally with the
			 * actual memory size. Therefore, limit based on the
			 * actual memory size and not the number of involved
			 * folios.
			 */
			for (nr = 0, nr_pages = 0;
			     nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE;
			     nr++) {
				/* A page + nr_pages pair advances nr twice. */
				if (unlikely(encoded_page_flags(pages[nr]) &
					     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
					nr_pages += encoded_nr_pages(pages[++nr]);
				else
					nr_pages++;
			}
		}

		free_pages_and_swap_cache(pages, nr);
		pages += nr;
		batch->nr -= nr;

		cond_resched();
	}
}

/*
 * Free the pages gathered in every batch, starting with the local one and
 * stopping at the first batch that holds no entries, then make the local
 * batch the active one again. The batch pages themselves stay linked.
 */
static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	for (batch = &tlb->local; batch && batch->nr; batch = batch->next)
		__tlb_batch_free_encoded_pages(batch);
	tlb->active = &tlb->local;
}

/*
 * Release every batch page that was linked behind the local batch and
 * unlink them from it.
 */
static void tlb_batch_list_free(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch, *next;

	for (batch = tlb->local.next; batch; batch = next) {
		/* Read the link before the page holding it is freed. */
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}

/*
 * Record @page (and, if @nr_pages != 1, the page count in a second entry)
 * in the active batch. @delay_rmap marks the entry with
 * ENCODED_PAGE_BIT_DELAY_RMAP.
 *
 * Returns true if, after adding, the active batch cannot take another
 * "page" + "nr_pages" pair and tlb_next_batch() could not provide a new
 * batch; returns false otherwise.
 */
static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb,
		struct page *page, unsigned int nr_pages, bool delay_rmap,
		int page_size)
{
	int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0;
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->end);

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
	VM_WARN_ON(tlb->page_size != page_size);
	VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE);
	VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
#endif

	batch = tlb->active;
	/*
	 * Add the page and check if we are full. If so
	 * force a flush.
	 */
	if (likely(nr_pages == 1)) {
		batch->encoded_pages[batch->nr++] = encode_page(page, flags);
	} else {
		flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT;
		batch->encoded_pages[batch->nr++] = encode_page(page, flags);
		batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages);
	}
	/*
	 * Make sure that we can always add another "page" + "nr_pages",
	 * requiring two entries instead of only a single one.
	 */
	if (batch->nr >= batch->max - 1) {
		if (!tlb_next_batch(tlb))
			return true;
		batch = tlb->active;
	}
	VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page);

	return false;
}

/*
 * Record @nr_pages pages starting at @page, using PAGE_SIZE as the page
 * size. See __tlb_remove_folio_pages_size() for the return value.
 */
bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page,
		unsigned int nr_pages, bool delay_rmap)
{
	return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap,
					     PAGE_SIZE);
}

/*
 * Record a single page of size @page_size without delayed rmap handling.
 * See __tlb_remove_folio_pages_size() for the return value.
 */
bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
{
	return __tlb_remove_folio_pages_size(tlb, page, 1, false, page_size);
}

#endif /* !CONFIG_MMU_GATHER_NO_GATHER */

#ifdef CONFIG_MMU_GATHER_TABLE_FREE

/*
 * Hand every table recorded in @batch to __tlb_remove_table() and then
 * free the page that holds the batch itself.
 */
static void __tlb_remove_table_free(struct mmu_table_batch *batch)
{
	int i;

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE

/*
 * Semi RCU freeing of the page directories.
 *
 * This is needed by some architectures to implement software pagetable walkers.
 *
 * gup_fast() and other software pagetable walkers do a lockless page-table
 * walk and therefore need some synchronization with the freeing of the page
 * directories. The chosen means to accomplish that is by disabling IRQs over
 * the walk.
 *
 * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 * since we unlink the page, flush TLBs, free the page. Since the disabling of
 * IRQs delays the completion of the TLB flush we can never observe an already
 * freed page.
 *
 * Not all systems IPI every CPU for this purpose:
 *
 * - Some architectures have HW support for cross-CPU synchronisation of TLB
 *   flushes, so there's no IPI at all.
 *
 * - Paravirt guests can do this TLB flushing in the hypervisor, or coordinate
 *   with the hypervisor to defer flushing on preempted vCPUs.
 *
 * Such systems need to delay the freeing by some other means, this is that
 * means.
 *
 * What we do is batch the freed directory pages (tables) and RCU free them.
 * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 * holds off grace periods.
 *
 * However, in order to batch these pages we need to allocate storage, this
 * allocation is deep inside the MM code and can thus easily fail on memory
 * pressure. To guarantee progress we fall back to single table freeing, see
 * the implementation of tlb_remove_table_one().
 *
 */

/* Intentionally empty callback used by tlb_remove_table_sync_one(). */
static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

/*
 * Invoke the empty tlb_remove_table_smp_sync() callback through
 * smp_call_function(); the purpose is the interrupt delivery itself.
 */
void tlb_remove_table_sync_one(void)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely on
	 * IRQ disabling.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}

/* RCU callback: free the table batch that embeds @head. */
static void tlb_remove_table_rcu(struct rcu_head *head)
{
	__tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
}

/* Defer freeing of @batch and its tables to an RCU callback. */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
	call_rcu(&batch->rcu, tlb_remove_table_rcu);
}

#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/* Without RCU table freeing, @batch and its tables are freed immediately. */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
	__tlb_remove_table_free(batch);
}

#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/*
 * If we want tlb_remove_table() to imply TLB invalidates.
 */
static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
	if (tlb_needs_table_invalidate()) {
		/*
		 * Invalidate page-table caches used by hardware walkers. Then
		 * we still need to RCU-sched wait while freeing the pages
		 * because software walkers can still be in-flight.
		 */
		tlb_flush_mmu_tlbonly(tlb);
	}
}

#ifdef CONFIG_PT_RECLAIM
/* RCU callback: free the single page table whose ptdesc embeds @head. */
static inline void __tlb_remove_table_one_rcu(struct rcu_head *head)
{
	struct ptdesc *ptdesc;

	ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
	__tlb_remove_table(ptdesc);
}

/*
 * Free a single page table through an RCU callback, using the rcu_head
 * embedded in its ptdesc. @table is treated as a struct ptdesc pointer.
 */
static inline void __tlb_remove_table_one(void *table)
{
	struct ptdesc *ptdesc;

	ptdesc = table;
	call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu);
}
#else
/*
 * Free a single page table right away, after tlb_remove_table_sync_one()
 * has been called.
 */
static inline void __tlb_remove_table_one(void *table)
{
	tlb_remove_table_sync_one();
	__tlb_remove_table(table);
}
#endif /* CONFIG_PT_RECLAIM */

/* Unbatched fallback used when no batch page could be allocated. */
static void tlb_remove_table_one(void *table)
{
	__tlb_remove_table_one(table);
}

/*
 * If a table batch is pending, invalidate (when required), hand the batch
 * over for freeing and clear tlb->batch.
 */
static void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		tlb_table_invalidate(tlb);
		tlb_remove_table_free(*batch);
		*batch = NULL;
	}
}

/*
 * Queue @table for freeing.
 *
 * The table is appended to tlb->batch, allocating the batch page with
 * GFP_NOWAIT on first use. If that allocation fails, the table is
 * invalidated and freed on its own instead. A batch that reaches
 * MAX_TABLE_BATCH entries is flushed immediately.
 */
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT);
		if (*batch == NULL) {
			/* No storage for batching: free this table by itself. */
			tlb_table_invalidate(tlb);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}

	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

/* Start with no table batch allocated. */
static inline void tlb_table_init(struct mmu_gather *tlb)
{
	tlb->batch = NULL;
}

#else /* !CONFIG_MMU_GATHER_TABLE_FREE */

/* No table batching configured: nothing to flush or initialise. */
static inline void tlb_table_flush(struct mmu_gather *tlb) { }
static inline void tlb_table_init(struct mmu_gather *tlb) { }

#endif /* CONFIG_MMU_GATHER_TABLE_FREE */

/*
 * Release what has been gathered so far: first the pending page-table
 * batch, then (unless CONFIG_MMU_GATHER_NO_GATHER) the gathered pages.
 */
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
	tlb_table_flush(tlb);
#ifndef CONFIG_MMU_GATHER_NO_GATHER
	tlb_batch_pages_flush(tlb);
#endif
}

/* Flush the TLB first, then free the gathered tables and pages. */
void tlb_flush_mmu(struct mmu_gather *tlb)
{
	tlb_flush_mmu_tlbonly(tlb);
	tlb_flush_mmu_free(tlb);
}

/*
 * Common initialisation for the tlb_gather_mmu*() variants: record @mm and
 * @fullmm, reset the batching state and the gathered range, and call
 * inc_tlb_flush_pending() on @mm.
 */
static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
			     bool fullmm)
{
	tlb->mm = mm;
	tlb->fullmm = fullmm;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
	tlb->need_flush_all = 0;
	tlb->local.next = NULL;
	tlb->local.nr = 0;
	tlb->local.max = ARRAY_SIZE(tlb->__pages);
	tlb->active = &tlb->local;
	tlb->batch_count = 0;
#endif
	tlb->delayed_rmap = 0;

	tlb_table_init(tlb);
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
	tlb->page_size = 0;
#endif
	tlb->vma_pfn = 0;

	tlb->fully_unshared_tables = 0;
	__tlb_reset_range(tlb);
	inc_tlb_flush_pending(tlb->mm);
}

/**
 * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
{
	__tlb_gather_mmu(tlb, mm, false);
}

/**
 * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * In this case, @mm is without users and we're going to destroy the
 * full address space (exit/execve).
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 */
void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
{
	__tlb_gather_mmu(tlb, mm, true);
}

/**
 * tlb_gather_mmu_vma - initialize an mmu_gather structure for operating on a
 *			single VMA
 * @tlb: the mmu_gather structure to initialize
 * @vma: the vm_area_struct
 *
 * Called to initialize an (on-stack) mmu_gather structure for operating on
 * a single VMA. In contrast to tlb_gather_mmu(), calling this function will
 * not require another call to tlb_start_vma(). In contrast to tlb_start_vma(),
 * this function will *not* call flush_cache_range().
 *
 * For hugetlb VMAs, this function will also initialize the mmu_gather
 * page_size accordingly, not requiring a separate call to
 * tlb_change_page_size().
 *
 */
void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
	tlb_gather_mmu(tlb, vma->vm_mm);
	tlb_update_vma_flags(tlb, vma);
	if (is_vm_hugetlb_page(vma))
		/* All entries have the same size. */
		tlb_change_page_size(tlb, huge_page_size(hstate_vma(vma)));
}

/**
 * tlb_finish_mmu - finish an mmu_gather structure
 * @tlb: the mmu_gather structure to finish
 *
 * Called at the end of the shootdown operation to free up any resources that
 * were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb)
{
	/*
	 * We expect an earlier huge_pmd_unshare_flush() call to sort this out,
	 * due to complicated locking requirements with page table unsharing.
	 */
	VM_WARN_ON_ONCE(tlb->fully_unshared_tables);

	/*
	 * If parallel threads are doing PTE changes on the same range
	 * under a non-exclusive lock (e.g., mmap_lock read-side) but defer the
	 * TLB flush by batching, one thread may end up seeing inconsistent
	 * PTEs and be left with stale TLB entries. So flush the TLB forcefully
	 * if we detect parallel PTE batching threads.
	 *
	 * However, some syscalls, e.g. munmap(), may free page tables, which
	 * requires a forced flush of everything in the given range. Otherwise
	 * this may result in stale TLB entries on some architectures,
	 * e.g. aarch64, that can specify which TLB level to flush.
	 */
	if (mm_tlb_flush_nested(tlb->mm)) {
		/*
		 * The aarch64 yields better performance with fullmm by
		 * avoiding multiple CPUs spamming TLBI messages at the
		 * same time.
		 *
		 * On x86 non-fullmm doesn't yield significant difference
		 * against fullmm.
		 */
		tlb->fullmm = 1;
		__tlb_reset_range(tlb);
		tlb->freed_tables = 1;
	}

	tlb_flush_mmu(tlb);

#ifndef CONFIG_MMU_GATHER_NO_GATHER
	/* The gathered pages are gone; now drop the extra batch pages. */
	tlb_batch_list_free(tlb);
#endif
	dec_tlb_flush_pending(tlb->mm);
}