#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/swap.h>
#include <linux/rmap.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>

#ifndef CONFIG_MMU_GATHER_NO_GATHER

static bool tlb_next_batch(struct mmu_gather *tlb)
{
        struct mmu_gather_batch *batch;

        /* Limit batching if we have delayed rmaps pending */
        if (tlb->delayed_rmap && tlb->active != &tlb->local)
                return false;

        batch = tlb->active;
        if (batch->next) {
                tlb->active = batch->next;
                return true;
        }

        if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
                return false;

        batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
        if (!batch)
                return false;

        tlb->batch_count++;
        batch->next = NULL;
        batch->nr = 0;
        batch->max = MAX_GATHER_BATCH;

        tlb->active->next = batch;
        tlb->active = batch;

        return true;
}

#ifdef CONFIG_SMP
static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
{
        struct encoded_page **pages = batch->encoded_pages;

        for (int i = 0; i < batch->nr; i++) {
                struct encoded_page *enc = pages[i];

                if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) {
                        struct page *page = encoded_page_ptr(enc);
                        unsigned int nr_pages = 1;

                        if (unlikely(encoded_page_flags(enc) &
                                     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
                                nr_pages = encoded_nr_pages(pages[++i]);

                        folio_remove_rmap_ptes(page_folio(page), page, nr_pages,
                                               vma);
                }
        }
}

/**
 * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
 * @tlb: the current mmu_gather
 * @vma: The memory area from which the pages are being removed.
 *
 * Note that because of how tlb_next_batch() above works, we will
 * never start multiple new batches with pending delayed rmaps, so
 * we only need to walk through the current active batch and the
 * original local one.
 */
void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        if (!tlb->delayed_rmap)
                return;

        tlb_flush_rmap_batch(&tlb->local, vma);
        if (tlb->active != &tlb->local)
                tlb_flush_rmap_batch(tlb->active, vma);
        tlb->delayed_rmap = 0;
}
#endif
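
/*
 * Illustrative sketch (kept under #if 0, not compiled): how the delayed-rmap
 * machinery above is meant to be used by a zap path. The helper name
 * example_zap_ptes() and its argument list are hypothetical; the real caller
 * is the PTE zapping code in mm/memory.c. The ordering that matters is:
 * queue the pages with delay_rmap set, flush the TLB while still holding the
 * page-table lock, only then drop the rmaps via tlb_flush_rmaps(), and only
 * after that release the lock.
 */
#if 0
static void example_zap_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma,
                             struct page *page, unsigned int nr, spinlock_t *ptl)
{
        bool force_flush = false;

        /* Queue the pages but keep their rmaps until after the TLB flush. */
        if (__tlb_remove_folio_pages(tlb, page, nr, true))
                force_flush = true;             /* batch is full */
        force_flush |= tlb->delayed_rmap;       /* rmaps must not outlive the PTL */

        if (force_flush) {
                tlb_flush_mmu_tlbonly(tlb);     /* invalidate stale TLB entries first */
                tlb_flush_rmaps(tlb, vma);      /* then do the deferred rmap removals */
        }
        spin_unlock(ptl);
}
#endif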

/*
 * We might end up freeing a lot of pages. Reschedule on a regular
 * basis to avoid soft lockups in configurations without full
 * preemption enabled. The magic number of 512 folios seems to work.
 */
#define MAX_NR_FOLIOS_PER_FREE  512

static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
{
        struct encoded_page **pages = batch->encoded_pages;
        unsigned int nr, nr_pages;

        while (batch->nr) {
                if (!page_poisoning_enabled_static() && !want_init_on_free()) {
                        nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr);

                        /*
                         * Make sure we cover page + nr_pages, and don't leave
                         * nr_pages behind when capping the number of entries.
                         */
                        if (unlikely(encoded_page_flags(pages[nr - 1]) &
                                     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
                                nr++;
                } else {
                        /*
                         * With page poisoning and init_on_free, the time it
                         * takes to free memory grows proportionally with the
                         * actual memory size. Therefore, limit based on the
                         * actual memory size and not the number of involved
                         * folios.
                         */
                        for (nr = 0, nr_pages = 0;
                             nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE;
                             nr++) {
                                if (unlikely(encoded_page_flags(pages[nr]) &
                                             ENCODED_PAGE_BIT_NR_PAGES_NEXT))
                                        nr_pages += encoded_nr_pages(pages[++nr]);
                                else
                                        nr_pages++;
                        }
                }

                free_pages_and_swap_cache(pages, nr);
                pages += nr;
                batch->nr -= nr;

                cond_resched();
        }
}

static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
        struct mmu_gather_batch *batch;

        for (batch = &tlb->local; batch && batch->nr; batch = batch->next)
                __tlb_batch_free_encoded_pages(batch);
        tlb->active = &tlb->local;
}

static void tlb_batch_list_free(struct mmu_gather *tlb)
{
        struct mmu_gather_batch *batch, *next;

        for (batch = tlb->local.next; batch; batch = next) {
                next = batch->next;
                free_pages((unsigned long)batch, 0);
        }
        tlb->local.next = NULL;
}

static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb,
                struct page *page, unsigned int nr_pages, bool delay_rmap,
                int page_size)
{
        int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0;
        struct mmu_gather_batch *batch;

        VM_BUG_ON(!tlb->end);

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        VM_WARN_ON(tlb->page_size != page_size);
        VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE);
        VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
#endif

        batch = tlb->active;
        /*
         * Add the page and check if we are full. If so
         * force a flush.
         */
        if (likely(nr_pages == 1)) {
                batch->encoded_pages[batch->nr++] = encode_page(page, flags);
        } else {
                flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT;
                batch->encoded_pages[batch->nr++] = encode_page(page, flags);
                batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages);
        }
        /*
         * Make sure that we can always add another "page" + "nr_pages",
         * requiring two entries instead of only a single one.
         */
        if (batch->nr >= batch->max - 1) {
                if (!tlb_next_batch(tlb))
                        return true;
                batch = tlb->active;
        }
        VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page);

        return false;
}

bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page,
                unsigned int nr_pages, bool delay_rmap)
{
        return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap,
                                             PAGE_SIZE);
}

bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
                bool delay_rmap, int page_size)
{
        return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size);
}

#endif /* MMU_GATHER_NO_GATHER */
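
/*
 * Illustrative sketch (kept under #if 0, not compiled): the contract of the
 * removal helpers above. A "true" return means the gather batch could not be
 * extended, so the caller must flush, which also frees the queued pages,
 * before it may queue more. The generic tlb_remove_page_size() wrapper in
 * asm-generic/tlb.h has essentially this shape; the helper name below is
 * hypothetical.
 */
#if 0
static inline void example_remove_page(struct mmu_gather *tlb, struct page *page)
{
        if (__tlb_remove_page_size(tlb, page, false, PAGE_SIZE))
                tlb_flush_mmu(tlb);     /* batch full: flush TLB, then free the gathered pages */
}
#endif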

#ifdef CONFIG_MMU_GATHER_TABLE_FREE

static void __tlb_remove_table_free(struct mmu_table_batch *batch)
{
        int i;

        for (i = 0; i < batch->nr; i++)
                __tlb_remove_table(batch->tables[i]);

        free_page((unsigned long)batch);
}

#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE

/*
 * Semi RCU freeing of the page directories.
 *
 * This is needed by some architectures to implement software pagetable walkers.
 *
 * gup_fast() and other software pagetable walkers do a lockless page-table
 * walk and therefore need some synchronization with the freeing of the page
 * directories. The chosen means to accomplish that is by disabling IRQs over
 * the walk.
 *
 * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 * since we unlink the page, flush TLBs, free the page. Since the disabling of
 * IRQs delays the completion of the TLB flush we can never observe an already
 * freed page.
 *
 * Architectures that do not have this (PPC) need to delay the freeing by some
 * other means; this is that means.
 *
 * What we do is batch the freed directory pages (tables) and RCU free them.
 * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 * holds off grace periods.
 *
 * However, in order to batch these pages we need to allocate storage; this
 * allocation is deep inside the MM code and can thus easily fail on memory
 * pressure. To guarantee progress we fall back to single table freeing, see
 * the implementation of tlb_remove_table_one().
 */

static void tlb_remove_table_smp_sync(void *arg)
{
        /* Simply deliver the interrupt */
}

void tlb_remove_table_sync_one(void)
{
        /*
         * This isn't an RCU grace period and hence the page-tables cannot be
         * assumed to be actually RCU-freed.
         *
         * It is however sufficient for software page-table walkers that rely on
         * IRQ disabling.
         */
        smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}
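
/*
 * Illustrative sketch (kept under #if 0, not compiled): the walker side of
 * the synchronization described above. gup_fast() does essentially this: it
 * disables IRQs around the lockless walk, so the IPI sent by
 * tlb_remove_table_sync_one() (or by an IPI-based TLB flush) cannot complete
 * until the walk has finished, and the page-table pages being dereferenced
 * cannot yet have been freed. The function name is hypothetical.
 */
#if 0
static unsigned long example_lockless_walk(struct mm_struct *mm, unsigned long addr)
{
        unsigned long flags, pfn = 0;

        local_irq_save(flags);
        /* ... walk pgd/p4d/pud/pmd/pte without taking any locks ... */
        local_irq_restore(flags);

        return pfn;
}
#endif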

static void tlb_remove_table_rcu(struct rcu_head *head)
{
        __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
}

static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
        call_rcu(&batch->rcu, tlb_remove_table_rcu);
}

#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */

static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
        __tlb_remove_table_free(batch);
}

#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/*
 * If we want tlb_remove_table() to imply TLB invalidates.
 */
static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
        if (tlb_needs_table_invalidate()) {
                /*
                 * Invalidate page-table caches used by hardware walkers. Then
                 * we still need to RCU-sched wait while freeing the pages
                 * because software walkers can still be in-flight.
                 */
                tlb_flush_mmu_tlbonly(tlb);
        }
}

#ifdef CONFIG_PT_RECLAIM
static inline void __tlb_remove_table_one_rcu(struct rcu_head *head)
{
        struct ptdesc *ptdesc;

        ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
        __tlb_remove_table(ptdesc);
}

static inline void __tlb_remove_table_one(void *table)
{
        struct ptdesc *ptdesc;

        ptdesc = table;
        call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu);
}
#else
static inline void __tlb_remove_table_one(void *table)
{
        tlb_remove_table_sync_one();
        __tlb_remove_table(table);
}
#endif /* CONFIG_PT_RECLAIM */

static void tlb_remove_table_one(void *table)
{
        __tlb_remove_table_one(table);
}

static void tlb_table_flush(struct mmu_gather *tlb)
{
        struct mmu_table_batch **batch = &tlb->batch;

        if (*batch) {
                tlb_table_invalidate(tlb);
                tlb_remove_table_free(*batch);
                *batch = NULL;
        }
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        struct mmu_table_batch **batch = &tlb->batch;

        if (*batch == NULL) {
                *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
                if (*batch == NULL) {
                        tlb_table_invalidate(tlb);
                        tlb_remove_table_one(table);
                        return;
                }
                (*batch)->nr = 0;
        }

        (*batch)->tables[(*batch)->nr++] = table;
        if ((*batch)->nr == MAX_TABLE_BATCH)
                tlb_table_flush(tlb);
}

static inline void tlb_table_init(struct mmu_gather *tlb)
{
        tlb->batch = NULL;
}

#else /* !CONFIG_MMU_GATHER_TABLE_FREE */

static inline void tlb_table_flush(struct mmu_gather *tlb) { }
static inline void tlb_table_init(struct mmu_gather *tlb) { }

#endif /* CONFIG_MMU_GATHER_TABLE_FREE */
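
/*
 * Illustrative sketch (kept under #if 0, not compiled): how a page-table page
 * typically reaches tlb_remove_table(). The generic pte_free_tlb() in
 * asm-generic/tlb.h records the flush range and sets tlb->freed_tables, and
 * the architecture's __pte_free_tlb() hook then hands the table to
 * tlb_remove_table() when CONFIG_MMU_GATHER_TABLE_FREE is enabled, so it is
 * only freed after TLBs and walker caches have been invalidated. The helper
 * name below is hypothetical and the details are per-architecture.
 */
#if 0
static inline void example_pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
                                        unsigned long address)
{
        tlb_flush_pmd_range(tlb, address, PAGE_SIZE);   /* the PMD entry pointing at it changed */
        tlb->freed_tables = 1;                          /* force walker-cache invalidation */
        tlb_remove_table(tlb, pte);                     /* batched, semi-RCU freeing above */
}
#endif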

static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
        tlb_table_flush(tlb);
#ifndef CONFIG_MMU_GATHER_NO_GATHER
        tlb_batch_pages_flush(tlb);
#endif
}

void tlb_flush_mmu(struct mmu_gather *tlb)
{
        tlb_flush_mmu_tlbonly(tlb);
        tlb_flush_mmu_free(tlb);
}

static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
                             bool fullmm)
{
        tlb->mm = mm;
        tlb->fullmm = fullmm;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
        tlb->need_flush_all = 0;
        tlb->local.next = NULL;
        tlb->local.nr = 0;
        tlb->local.max = ARRAY_SIZE(tlb->__pages);
        tlb->active = &tlb->local;
        tlb->batch_count = 0;
#endif
        tlb->delayed_rmap = 0;

        tlb_table_init(tlb);
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        tlb->page_size = 0;
#endif

        __tlb_reset_range(tlb);
        inc_tlb_flush_pending(tlb->mm);
}

/**
 * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
{
        __tlb_gather_mmu(tlb, mm, false);
}

/**
 * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * In this case, @mm is without users and we're going to destroy the
 * full address space (exit/execve).
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 */
void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
{
        __tlb_gather_mmu(tlb, mm, true);
}

/**
 * tlb_finish_mmu - finish an mmu_gather structure
 * @tlb: the mmu_gather structure to finish
 *
 * Called at the end of the shootdown operation to free up any resources that
 * were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb)
{
        /*
         * If parallel threads are doing PTE changes on the same range under a
         * non-exclusive lock (e.g., mmap_lock read-side) but defer the TLB
         * flush by batching, one thread may end up seeing inconsistent PTEs
         * and be left with stale TLB entries. So flush the TLB forcefully if
         * we detect parallel PTE batching threads.
         *
         * However, some syscalls, e.g. munmap(), may free page tables; this
         * requires a forced flush of everything in the given range. Otherwise
         * we may be left with stale TLB entries on architectures, e.g.
         * aarch64, whose TLB invalidation instructions can target a specific
         * page-table level.
         */
        if (mm_tlb_flush_nested(tlb->mm)) {
                /*
                 * aarch64 yields better performance with fullmm by avoiding
                 * multiple CPUs spamming TLBI messages at the same time.
                 *
                 * On x86, non-fullmm doesn't yield a significant difference
                 * against fullmm.
                 */
                tlb->fullmm = 1;
                __tlb_reset_range(tlb);
                tlb->freed_tables = 1;
        }

        tlb_flush_mmu(tlb);

#ifndef CONFIG_MMU_GATHER_NO_GATHER
        tlb_batch_list_free(tlb);
#endif
        dec_tlb_flush_pending(tlb->mm);
}
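
/*
 * Illustrative sketch (kept under #if 0, not compiled): the overall life
 * cycle of an mmu_gather as driven by an unmap path. The function name and
 * the middle step are simplified, hypothetical stand-ins for the real callers
 * in mm/memory.c and mm/mmap.c.
 */
#if 0
static void example_unmap_range(struct vm_area_struct *vma,
                                unsigned long start, unsigned long end)
{
        struct mmu_gather tlb;

        tlb_gather_mmu(&tlb, vma->vm_mm);       /* init + inc_tlb_flush_pending() */

        /*
         * Walk the page tables under the page-table lock: clear PTEs, record
         * the flush range with tlb_remove_tlb_entry(), queue the underlying
         * pages with tlb_remove_page() / __tlb_remove_folio_pages(), and send
         * freed page-table pages through pte_free_tlb() and friends.
         */

        tlb_finish_mmu(&tlb);   /* final TLB flush, then free pages and tables */
}
#endif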