1 #include <linux/gfp.h> 2 #include <linux/highmem.h> 3 #include <linux/kernel.h> 4 #include <linux/mmdebug.h> 5 #include <linux/mm_types.h> 6 #include <linux/mm_inline.h> 7 #include <linux/pagemap.h> 8 #include <linux/rcupdate.h> 9 #include <linux/smp.h> 10 #include <linux/swap.h> 11 #include <linux/rmap.h> 12 #include <linux/pgalloc.h> 13 #include <linux/hugetlb.h> 14 15 #include <asm/tlb.h> 16 17 #ifndef CONFIG_MMU_GATHER_NO_GATHER 18 19 static bool tlb_next_batch(struct mmu_gather *tlb) 20 { 21 struct mmu_gather_batch *batch; 22 23 /* Limit batching if we have delayed rmaps pending */ 24 if (tlb->delayed_rmap && tlb->active != &tlb->local) 25 return false; 26 27 batch = tlb->active; 28 if (batch->next) { 29 tlb->active = batch->next; 30 return true; 31 } 32 33 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) 34 return false; 35 36 batch = (void *)__get_free_page(GFP_NOWAIT); 37 if (!batch) 38 return false; 39 40 tlb->batch_count++; 41 batch->next = NULL; 42 batch->nr = 0; 43 batch->max = MAX_GATHER_BATCH; 44 45 tlb->active->next = batch; 46 tlb->active = batch; 47 48 return true; 49 } 50 51 #ifdef CONFIG_SMP 52 static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma) 53 { 54 struct encoded_page **pages = batch->encoded_pages; 55 56 for (int i = 0; i < batch->nr; i++) { 57 struct encoded_page *enc = pages[i]; 58 59 if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) { 60 struct page *page = encoded_page_ptr(enc); 61 unsigned int nr_pages = 1; 62 63 if (unlikely(encoded_page_flags(enc) & 64 ENCODED_PAGE_BIT_NR_PAGES_NEXT)) 65 nr_pages = encoded_nr_pages(pages[++i]); 66 67 folio_remove_rmap_ptes(page_folio(page), page, nr_pages, 68 vma); 69 } 70 } 71 } 72 73 /** 74 * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB 75 * @tlb: the current mmu_gather 76 * @vma: The memory area from which the pages are being removed. 77 * 78 * Note that because of how tlb_next_batch() above works, we will 79 * never start multiple new batches with pending delayed rmaps, so 80 * we only need to walk through the current active batch and the 81 * original local one. 82 */ 83 void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) 84 { 85 if (!tlb->delayed_rmap) 86 return; 87 88 tlb_flush_rmap_batch(&tlb->local, vma); 89 if (tlb->active != &tlb->local) 90 tlb_flush_rmap_batch(tlb->active, vma); 91 tlb->delayed_rmap = 0; 92 } 93 #endif 94 95 /* 96 * We might end up freeing a lot of pages. Reschedule on a regular 97 * basis to avoid soft lockups in configurations without full 98 * preemption enabled. The magic number of 512 folios seems to work. 99 */ 100 #define MAX_NR_FOLIOS_PER_FREE 512 101 102 static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch) 103 { 104 struct encoded_page **pages = batch->encoded_pages; 105 unsigned int nr, nr_pages; 106 107 while (batch->nr) { 108 if (!page_poisoning_enabled_static() && !want_init_on_free()) { 109 nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr); 110 111 /* 112 * Make sure we cover page + nr_pages, and don't leave 113 * nr_pages behind when capping the number of entries. 114 */ 115 if (unlikely(encoded_page_flags(pages[nr - 1]) & 116 ENCODED_PAGE_BIT_NR_PAGES_NEXT)) 117 nr++; 118 } else { 119 /* 120 * With page poisoning and init_on_free, the time it 121 * takes to free memory grows proportionally with the 122 * actual memory size. Therefore, limit based on the 123 * actual memory size and not the number of involved 124 * folios. 125 */ 126 for (nr = 0, nr_pages = 0; 127 nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE; 128 nr++) { 129 if (unlikely(encoded_page_flags(pages[nr]) & 130 ENCODED_PAGE_BIT_NR_PAGES_NEXT)) 131 nr_pages += encoded_nr_pages(pages[++nr]); 132 else 133 nr_pages++; 134 } 135 } 136 137 free_pages_and_swap_cache(pages, nr); 138 pages += nr; 139 batch->nr -= nr; 140 141 cond_resched(); 142 } 143 } 144 145 static void tlb_batch_pages_flush(struct mmu_gather *tlb) 146 { 147 struct mmu_gather_batch *batch; 148 149 for (batch = &tlb->local; batch && batch->nr; batch = batch->next) 150 __tlb_batch_free_encoded_pages(batch); 151 tlb->active = &tlb->local; 152 } 153 154 static void tlb_batch_list_free(struct mmu_gather *tlb) 155 { 156 struct mmu_gather_batch *batch, *next; 157 158 for (batch = tlb->local.next; batch; batch = next) { 159 next = batch->next; 160 free_pages((unsigned long)batch, 0); 161 } 162 tlb->local.next = NULL; 163 } 164 165 static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb, 166 struct page *page, unsigned int nr_pages, bool delay_rmap, 167 int page_size) 168 { 169 int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0; 170 struct mmu_gather_batch *batch; 171 172 VM_BUG_ON(!tlb->end); 173 174 #ifdef CONFIG_MMU_GATHER_PAGE_SIZE 175 VM_WARN_ON(tlb->page_size != page_size); 176 VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE); 177 VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1)); 178 #endif 179 180 batch = tlb->active; 181 /* 182 * Add the page and check if we are full. If so 183 * force a flush. 184 */ 185 if (likely(nr_pages == 1)) { 186 batch->encoded_pages[batch->nr++] = encode_page(page, flags); 187 } else { 188 flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT; 189 batch->encoded_pages[batch->nr++] = encode_page(page, flags); 190 batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages); 191 } 192 /* 193 * Make sure that we can always add another "page" + "nr_pages", 194 * requiring two entries instead of only a single one. 195 */ 196 if (batch->nr >= batch->max - 1) { 197 if (!tlb_next_batch(tlb)) 198 return true; 199 batch = tlb->active; 200 } 201 VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page); 202 203 return false; 204 } 205 206 bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, 207 unsigned int nr_pages, bool delay_rmap) 208 { 209 return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap, 210 PAGE_SIZE); 211 } 212 213 bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) 214 { 215 return __tlb_remove_folio_pages_size(tlb, page, 1, false, page_size); 216 } 217 218 #endif /* MMU_GATHER_NO_GATHER */ 219 220 #ifdef CONFIG_MMU_GATHER_TABLE_FREE 221 222 static void __tlb_remove_table_free(struct mmu_table_batch *batch) 223 { 224 int i; 225 226 for (i = 0; i < batch->nr; i++) 227 __tlb_remove_table(batch->tables[i]); 228 229 free_page((unsigned long)batch); 230 } 231 232 #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE 233 234 /* 235 * Semi RCU freeing of the page directories. 236 * 237 * This is needed by some architectures to implement software pagetable walkers. 238 * 239 * gup_fast() and other software pagetable walkers do a lockless page-table 240 * walk and therefore needs some synchronization with the freeing of the page 241 * directories. The chosen means to accomplish that is by disabling IRQs over 242 * the walk. 243 * 244 * Architectures that use IPIs to flush TLBs will then automagically DTRT, 245 * since we unlink the page, flush TLBs, free the page. Since the disabling of 246 * IRQs delays the completion of the TLB flush we can never observe an already 247 * freed page. 248 * 249 * Not all systems IPI every CPU for this purpose: 250 * 251 * - Some architectures have HW support for cross-CPU synchronisation of TLB 252 * flushes, so there's no IPI at all. 253 * 254 * - Paravirt guests can do this TLB flushing in the hypervisor, or coordinate 255 * with the hypervisor to defer flushing on preempted vCPUs. 256 * 257 * Such systems need to delay the freeing by some other means, this is that 258 * means. 259 * 260 * What we do is batch the freed directory pages (tables) and RCU free them. 261 * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling 262 * holds off grace periods. 263 * 264 * However, in order to batch these pages we need to allocate storage, this 265 * allocation is deep inside the MM code and can thus easily fail on memory 266 * pressure. To guarantee progress we fall back to single table freeing, see 267 * the implementation of tlb_remove_table_one(). 268 * 269 */ 270 271 static void tlb_remove_table_smp_sync(void *arg) 272 { 273 /* Simply deliver the interrupt */ 274 } 275 276 void tlb_remove_table_sync_one(void) 277 { 278 /* 279 * This isn't an RCU grace period and hence the page-tables cannot be 280 * assumed to be actually RCU-freed. 281 * 282 * It is however sufficient for software page-table walkers that rely on 283 * IRQ disabling. 284 */ 285 smp_call_function(tlb_remove_table_smp_sync, NULL, 1); 286 } 287 288 static void tlb_remove_table_rcu(struct rcu_head *head) 289 { 290 __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu)); 291 } 292 293 static void tlb_remove_table_free(struct mmu_table_batch *batch) 294 { 295 call_rcu(&batch->rcu, tlb_remove_table_rcu); 296 } 297 298 #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */ 299 300 static void tlb_remove_table_free(struct mmu_table_batch *batch) 301 { 302 __tlb_remove_table_free(batch); 303 } 304 305 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ 306 307 /* 308 * If we want tlb_remove_table() to imply TLB invalidates. 309 */ 310 static inline void tlb_table_invalidate(struct mmu_gather *tlb) 311 { 312 if (tlb_needs_table_invalidate()) { 313 /* 314 * Invalidate page-table caches used by hardware walkers. Then 315 * we still need to RCU-sched wait while freeing the pages 316 * because software walkers can still be in-flight. 317 */ 318 tlb_flush_mmu_tlbonly(tlb); 319 } 320 } 321 322 #ifdef CONFIG_PT_RECLAIM 323 static inline void __tlb_remove_table_one_rcu(struct rcu_head *head) 324 { 325 struct ptdesc *ptdesc; 326 327 ptdesc = container_of(head, struct ptdesc, pt_rcu_head); 328 __tlb_remove_table(ptdesc); 329 } 330 331 static inline void __tlb_remove_table_one(void *table) 332 { 333 struct ptdesc *ptdesc; 334 335 ptdesc = table; 336 call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu); 337 } 338 #else 339 static inline void __tlb_remove_table_one(void *table) 340 { 341 tlb_remove_table_sync_one(); 342 __tlb_remove_table(table); 343 } 344 #endif /* CONFIG_PT_RECLAIM */ 345 346 static void tlb_remove_table_one(void *table) 347 { 348 __tlb_remove_table_one(table); 349 } 350 351 static void tlb_table_flush(struct mmu_gather *tlb) 352 { 353 struct mmu_table_batch **batch = &tlb->batch; 354 355 if (*batch) { 356 tlb_table_invalidate(tlb); 357 tlb_remove_table_free(*batch); 358 *batch = NULL; 359 } 360 } 361 362 void tlb_remove_table(struct mmu_gather *tlb, void *table) 363 { 364 struct mmu_table_batch **batch = &tlb->batch; 365 366 if (*batch == NULL) { 367 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT); 368 if (*batch == NULL) { 369 tlb_table_invalidate(tlb); 370 tlb_remove_table_one(table); 371 return; 372 } 373 (*batch)->nr = 0; 374 } 375 376 (*batch)->tables[(*batch)->nr++] = table; 377 if ((*batch)->nr == MAX_TABLE_BATCH) 378 tlb_table_flush(tlb); 379 } 380 381 static inline void tlb_table_init(struct mmu_gather *tlb) 382 { 383 tlb->batch = NULL; 384 } 385 386 #else /* !CONFIG_MMU_GATHER_TABLE_FREE */ 387 388 static inline void tlb_table_flush(struct mmu_gather *tlb) { } 389 static inline void tlb_table_init(struct mmu_gather *tlb) { } 390 391 #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ 392 393 static void tlb_flush_mmu_free(struct mmu_gather *tlb) 394 { 395 tlb_table_flush(tlb); 396 #ifndef CONFIG_MMU_GATHER_NO_GATHER 397 tlb_batch_pages_flush(tlb); 398 #endif 399 } 400 401 void tlb_flush_mmu(struct mmu_gather *tlb) 402 { 403 tlb_flush_mmu_tlbonly(tlb); 404 tlb_flush_mmu_free(tlb); 405 } 406 407 static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, 408 bool fullmm) 409 { 410 tlb->mm = mm; 411 tlb->fullmm = fullmm; 412 413 #ifndef CONFIG_MMU_GATHER_NO_GATHER 414 tlb->need_flush_all = 0; 415 tlb->local.next = NULL; 416 tlb->local.nr = 0; 417 tlb->local.max = ARRAY_SIZE(tlb->__pages); 418 tlb->active = &tlb->local; 419 tlb->batch_count = 0; 420 #endif 421 tlb->delayed_rmap = 0; 422 423 tlb_table_init(tlb); 424 #ifdef CONFIG_MMU_GATHER_PAGE_SIZE 425 tlb->page_size = 0; 426 #endif 427 tlb->vma_pfn = 0; 428 429 tlb->fully_unshared_tables = 0; 430 __tlb_reset_range(tlb); 431 inc_tlb_flush_pending(tlb->mm); 432 } 433 434 /** 435 * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down 436 * @tlb: the mmu_gather structure to initialize 437 * @mm: the mm_struct of the target address space 438 * 439 * Called to initialize an (on-stack) mmu_gather structure for page-table 440 * tear-down from @mm. 441 */ 442 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) 443 { 444 __tlb_gather_mmu(tlb, mm, false); 445 } 446 447 /** 448 * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down 449 * @tlb: the mmu_gather structure to initialize 450 * @mm: the mm_struct of the target address space 451 * 452 * In this case, @mm is without users and we're going to destroy the 453 * full address space (exit/execve). 454 * 455 * Called to initialize an (on-stack) mmu_gather structure for page-table 456 * tear-down from @mm. 457 */ 458 void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) 459 { 460 __tlb_gather_mmu(tlb, mm, true); 461 } 462 463 /** 464 * tlb_gather_mmu_vma - initialize an mmu_gather structure for operating on a 465 * single VMA 466 * @tlb: the mmu_gather structure to initialize 467 * @vma: the vm_area_struct 468 * 469 * Called to initialize an (on-stack) mmu_gather structure for operating on 470 * a single VMA. In contrast to tlb_gather_mmu(), calling this function will 471 * not require another call to tlb_start_vma(). In contrast to tlb_start_vma(), 472 * this function will *not* call flush_cache_range(). 473 * 474 * For hugetlb VMAs, this function will also initialize the mmu_gather 475 * page_size accordingly, not requiring a separate call to 476 * tlb_change_page_size(). 477 * 478 */ 479 void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) 480 { 481 tlb_gather_mmu(tlb, vma->vm_mm); 482 tlb_update_vma_flags(tlb, vma); 483 if (is_vm_hugetlb_page(vma)) 484 /* All entries have the same size. */ 485 tlb_change_page_size(tlb, huge_page_size(hstate_vma(vma))); 486 } 487 488 /** 489 * tlb_finish_mmu - finish an mmu_gather structure 490 * @tlb: the mmu_gather structure to finish 491 * 492 * Called at the end of the shootdown operation to free up any resources that 493 * were required. 494 */ 495 void tlb_finish_mmu(struct mmu_gather *tlb) 496 { 497 /* 498 * We expect an earlier huge_pmd_unshare_flush() call to sort this out, 499 * due to complicated locking requirements with page table unsharing. 500 */ 501 VM_WARN_ON_ONCE(tlb->fully_unshared_tables); 502 503 /* 504 * If there are parallel threads are doing PTE changes on same range 505 * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB 506 * flush by batching, one thread may end up seeing inconsistent PTEs 507 * and result in having stale TLB entries. So flush TLB forcefully 508 * if we detect parallel PTE batching threads. 509 * 510 * However, some syscalls, e.g. munmap(), may free page tables, this 511 * needs force flush everything in the given range. Otherwise this 512 * may result in having stale TLB entries for some architectures, 513 * e.g. aarch64, that could specify flush what level TLB. 514 */ 515 if (mm_tlb_flush_nested(tlb->mm)) { 516 /* 517 * The aarch64 yields better performance with fullmm by 518 * avoiding multiple CPUs spamming TLBI messages at the 519 * same time. 520 * 521 * On x86 non-fullmm doesn't yield significant difference 522 * against fullmm. 523 */ 524 tlb->fullmm = 1; 525 __tlb_reset_range(tlb); 526 tlb->freed_tables = 1; 527 } 528 529 tlb_flush_mmu(tlb); 530 531 #ifndef CONFIG_MMU_GATHER_NO_GATHER 532 tlb_batch_list_free(tlb); 533 #endif 534 dec_tlb_flush_pending(tlb->mm); 535 } 536