// SPDX-License-Identifier: GPL-2.0
/*
 * DAMON Primitives for Virtual Address Spaces
 *
 * Author: SeongJae Park <sj@kernel.org>
 */

#define pr_fmt(fmt) "damon-va: " fmt

#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>

#include "ops-common.h"

#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
#undef DAMON_MIN_REGION
#define DAMON_MIN_REGION 1
#endif

/*
 * 't->pid' should be the pointer to the relevant 'struct pid' having reference
 * count.  Caller must put the returned task, unless it is NULL.
 */
static inline struct task_struct *damon_get_task_struct(struct damon_target *t)
{
	return get_pid_task(t->pid, PIDTYPE_PID);
}

/*
 * Get the mm_struct of the given target
 *
 * Caller _must_ put the mm_struct after use, unless it is NULL.
 *
 * Returns the mm_struct of the target on success, NULL on failure
 */
static struct mm_struct *damon_get_mm(struct damon_target *t)
{
	struct task_struct *task;
	struct mm_struct *mm;

	task = damon_get_task_struct(t);
	if (!task)
		return NULL;

	mm = get_task_mm(task);
	put_task_struct(task);
	return mm;
}

/*
 * Functions for the initial monitoring target regions construction
 */

/*
 * Size-evenly split a region into 'nr_pieces' small regions
 *
 * Returns 0 on success, or negative error code otherwise.
 */
static int damon_va_evenly_split_region(struct damon_target *t,
		struct damon_region *r, unsigned int nr_pieces)
{
	unsigned long sz_orig, sz_piece, orig_end;
	struct damon_region *n = NULL, *next;
	unsigned long start;
	unsigned int i;

	if (!r || !nr_pieces)
		return -EINVAL;

	if (nr_pieces == 1)
		return 0;

	orig_end = r->ar.end;
	sz_orig = damon_sz_region(r);
	sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION);

	if (!sz_piece)
		return -EINVAL;

	r->ar.end = r->ar.start + sz_piece;
	next = damon_next_region(r);
	for (start = r->ar.end, i = 1; i < nr_pieces; start += sz_piece, i++) {
		n = damon_new_region(start, start + sz_piece);
		if (!n)
			return -ENOMEM;
		damon_insert_region(n, r, next, t);
		r = n;
	}
	/* complement last region for possible rounding error */
	if (n)
		n->ar.end = orig_end;

	return 0;
}
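
/*
 * Example: calling damon_va_evenly_split_region() for a region of
 * 10 * DAMON_MIN_REGION bytes with 'nr_pieces' of three results in pieces of
 * three, three, and four DAMON_MIN_REGION bytes; the last piece absorbs the
 * rounding remainder.
 */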

static unsigned long sz_range(struct damon_addr_range *r)
{
	return r->end - r->start;
}

/*
 * Find three regions separated by two biggest unmapped regions
 *
 * mm		'mm_struct' of the target address space
 * regions	an array of three address ranges in which the results will be
 *		saved
 *
 * This function receives an address space and finds three regions in it which
 * are separated by the two biggest unmapped regions in the space.  Please
 * refer to the comments of '__damon_va_init_regions()' below to know why this
 * is necessary.
 *
 * Returns 0 on success, or negative error code otherwise.
 */
static int __damon_va_three_regions(struct mm_struct *mm,
				       struct damon_addr_range regions[3])
{
	struct damon_addr_range first_gap = {0}, second_gap = {0};
	VMA_ITERATOR(vmi, mm, 0);
	struct vm_area_struct *vma, *prev = NULL;
	unsigned long start;

	/*
	 * Find the two biggest gaps so that first_gap > second_gap > others.
	 * If this is too slow, it can be optimised to examine the maple
	 * tree gaps.
	 */
	rcu_read_lock();
	for_each_vma(vmi, vma) {
		unsigned long gap;

		if (!prev) {
			start = vma->vm_start;
			goto next;
		}
		gap = vma->vm_start - prev->vm_end;

		if (gap > sz_range(&first_gap)) {
			second_gap = first_gap;
			first_gap.start = prev->vm_end;
			first_gap.end = vma->vm_start;
		} else if (gap > sz_range(&second_gap)) {
			second_gap.start = prev->vm_end;
			second_gap.end = vma->vm_start;
		}
next:
		prev = vma;
	}
	rcu_read_unlock();

	if (!sz_range(&second_gap) || !sz_range(&first_gap))
		return -EINVAL;

	/* Sort the two biggest gaps by address */
	if (first_gap.start > second_gap.start)
		swap(first_gap, second_gap);

	/* Store the result */
	regions[0].start = ALIGN(start, DAMON_MIN_REGION);
	regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION);
	regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION);
	regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION);
	regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION);
	regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION);

	return 0;
}

/*
 * Get the three regions in the given target (task)
 *
 * Returns 0 on success, negative error code otherwise.
 */
static int damon_va_three_regions(struct damon_target *t,
				struct damon_addr_range regions[3])
{
	struct mm_struct *mm;
	int rc;

	mm = damon_get_mm(t);
	if (!mm)
		return -EINVAL;

	mmap_read_lock(mm);
	rc = __damon_va_three_regions(mm, regions);
	mmap_read_unlock(mm);

	mmput(mm);
	return rc;
}

/*
 * Initialize the monitoring target regions for the given target (task)
 *
 * t	the given target
 *
 * Because only small portions of the entire address space are actually mapped
 * to memory and accessed, monitoring the unmapped regions is wasteful.
 * That said, because we can deal with a small amount of noise, tracking every
 * mapping is not strictly required; it could even incur a high overhead if
 * the mappings frequently change or the number of mappings is high.  The
 * adaptive regions adjustment mechanism will further help to deal with the
 * noise by simply identifying the unmapped areas as regions that have no
 * access.  Moreover, applying the real mappings that would have many unmapped
 * areas inside would make the adaptive mechanism quite complex.  Hence, too
 * huge unmapped areas inside the monitoring target should be removed so that
 * the adaptive mechanism does not waste time on them.
 *
 * For this reason, we convert the complex mappings to three distinct regions
 * that cover every mapped area of the address space.  The two gaps between
 * the three regions are the two biggest unmapped areas in the given address
 * space.  In detail, this function first identifies the start and the end of
 * the mappings and the two biggest unmapped areas of the address space.
 * Then, it constructs the three regions as below:
 *
 *     [mappings[0]->start, big_two_unmapped_areas[0]->start)
 *     [big_two_unmapped_areas[0]->end, big_two_unmapped_areas[1]->start)
 *     [big_two_unmapped_areas[1]->end, mappings[nr_mappings - 1]->end)
 *
 * As the usual memory map of processes is as below, the gap between the heap
 * and the uppermost mmap()-ed region, and the gap between the lowermost
 * mmap()-ed region and the stack, will be the two biggest unmapped regions.
 * Because these gaps are exceptionally huge in a usual address space,
 * excluding these two biggest unmapped regions will be sufficient as a
 * trade-off.
 *
 *   <heap>
 *   <BIG UNMAPPED REGION 1>
 *   <uppermost mmap()-ed region>
 *   (other mmap()-ed regions and small unmapped regions)
 *   <lowermost mmap()-ed region>
 *   <BIG UNMAPPED REGION 2>
 *   <stack>
 */
static void __damon_va_init_regions(struct damon_ctx *ctx,
				     struct damon_target *t)
{
	struct damon_target *ti;
	struct damon_region *r;
	struct damon_addr_range regions[3];
	unsigned long sz = 0, nr_pieces;
	int i, tidx = 0;

	if (damon_va_three_regions(t, regions)) {
		damon_for_each_target(ti, ctx) {
			if (ti == t)
				break;
			tidx++;
		}
		pr_debug("Failed to get three regions of %dth target\n", tidx);
		return;
	}

	for (i = 0; i < 3; i++)
		sz += regions[i].end - regions[i].start;
	if (ctx->attrs.min_nr_regions)
		sz /= ctx->attrs.min_nr_regions;
	if (sz < DAMON_MIN_REGION)
		sz = DAMON_MIN_REGION;

	/* Set the initial three regions of the target */
	for (i = 0; i < 3; i++) {
		r = damon_new_region(regions[i].start, regions[i].end);
		if (!r) {
			pr_err("%d'th init region creation failed\n", i);
			return;
		}
		damon_add_region(r, t);

		nr_pieces = (regions[i].end - regions[i].start) / sz;
		damon_va_evenly_split_region(t, r, nr_pieces);
	}
}
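
/*
 * Example: if the three initial regions cover 300 MiB in total and
 * 'min_nr_regions' is 10, 'sz' above becomes 30 MiB, so a 120 MiB region is
 * split into four pieces and the total number of initial regions
 * approximates 'min_nr_regions'.
 */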

/* Initialize '->regions_list' of every target (task) */
static void damon_va_init(struct damon_ctx *ctx)
{
	struct damon_target *t;

	damon_for_each_target(t, ctx) {
		/* the user may set the target regions as they want */
		if (!damon_nr_regions(t))
			__damon_va_init_regions(ctx, t);
	}
}

/*
 * Update regions for current memory mappings
 */
static void damon_va_update(struct damon_ctx *ctx)
{
	struct damon_addr_range three_regions[3];
	struct damon_target *t;

	damon_for_each_target(t, ctx) {
		if (damon_va_three_regions(t, three_regions))
			continue;
		damon_set_regions(t, three_regions, 3);
	}
}

static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
		unsigned long next, struct mm_walk *walk)
{
	pte_t *pte;
	pmd_t pmde;
	spinlock_t *ptl;

	if (pmd_trans_huge(pmdp_get(pmd))) {
		ptl = pmd_lock(walk->mm, pmd);
		pmde = pmdp_get(pmd);

		if (!pmd_present(pmde)) {
			spin_unlock(ptl);
			return 0;
		}

		if (pmd_trans_huge(pmde)) {
			damon_pmdp_mkold(pmd, walk->vma, addr);
			spin_unlock(ptl);
			return 0;
		}
		spin_unlock(ptl);
	}

	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte) {
		walk->action = ACTION_AGAIN;
		return 0;
	}
	if (!pte_present(ptep_get(pte)))
		goto out;
	damon_ptep_mkold(pte, walk->vma, addr);
out:
	pte_unmap_unlock(pte, ptl);
	return 0;
}

#ifdef CONFIG_HUGETLB_PAGE
static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
				struct vm_area_struct *vma, unsigned long addr)
{
	bool referenced = false;
	pte_t entry = huge_ptep_get(mm, addr, pte);
	struct folio *folio = pfn_folio(pte_pfn(entry));
	unsigned long psize = huge_page_size(hstate_vma(vma));

	folio_get(folio);

	if (pte_young(entry)) {
		referenced = true;
		entry = pte_mkold(entry);
		set_huge_pte_at(mm, addr, pte, entry, psize);
	}

	if (mmu_notifier_clear_young(mm, addr,
				     addr + huge_page_size(hstate_vma(vma))))
		referenced = true;

	if (referenced)
		folio_set_young(folio);

	folio_set_idle(folio);
	folio_put(folio);
}

static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask,
				     unsigned long addr, unsigned long end,
				     struct mm_walk *walk)
{
	struct hstate *h = hstate_vma(walk->vma);
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(h, walk->mm, pte);
	entry = huge_ptep_get(walk->mm, addr, pte);
	if (!pte_present(entry))
		goto out;

	damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr);

out:
	spin_unlock(ptl);
	return 0;
}
#else
#define damon_mkold_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static const struct mm_walk_ops damon_mkold_ops = {
	.pmd_entry = damon_mkold_pmd_entry,
	.hugetlb_entry = damon_mkold_hugetlb_entry,
	.walk_lock = PGWALK_RDLOCK,
};
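
/*
 * Reset the access tracking state for the single page containing 'addr'
 *
 * The one byte long walk range makes walk_page_range() visit only the page
 * table entries mapping 'addr'.  The mkold callbacks above clear the young
 * bits of those entries and mark the backing folio idle, which the following
 * access check (damon_va_young()) relies on.
 */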
static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
{
	mmap_read_lock(mm);
	walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL);
	mmap_read_unlock(mm);
}

/*
 * Functions for the access checking of the regions
 */

static void __damon_va_prepare_access_check(struct mm_struct *mm,
					struct damon_region *r)
{
	r->sampling_addr = damon_rand(r->ar.start, r->ar.end);

	damon_va_mkold(mm, r->sampling_addr);
}

static void damon_va_prepare_access_checks(struct damon_ctx *ctx)
{
	struct damon_target *t;
	struct mm_struct *mm;
	struct damon_region *r;

	damon_for_each_target(t, ctx) {
		mm = damon_get_mm(t);
		if (!mm)
			continue;
		damon_for_each_region(r, t)
			__damon_va_prepare_access_check(mm, r);
		mmput(mm);
	}
}

struct damon_young_walk_private {
	/* size of the folio for the access checked virtual memory address */
	unsigned long *folio_sz;
	bool young;
};

static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
		unsigned long next, struct mm_walk *walk)
{
	pte_t *pte;
	pte_t ptent;
	spinlock_t *ptl;
	struct folio *folio;
	struct damon_young_walk_private *priv = walk->private;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(pmdp_get(pmd))) {
		pmd_t pmde;

		ptl = pmd_lock(walk->mm, pmd);
		pmde = pmdp_get(pmd);

		if (!pmd_present(pmde)) {
			spin_unlock(ptl);
			return 0;
		}

		if (!pmd_trans_huge(pmde)) {
			spin_unlock(ptl);
			goto regular_page;
		}
		folio = damon_get_folio(pmd_pfn(pmde));
		if (!folio)
			goto huge_out;
		if (pmd_young(pmde) || !folio_test_idle(folio) ||
		    mmu_notifier_test_young(walk->mm, addr))
			priv->young = true;
		*priv->folio_sz = HPAGE_PMD_SIZE;
		folio_put(folio);
huge_out:
		spin_unlock(ptl);
		return 0;
	}

regular_page:
#endif	/* CONFIG_TRANSPARENT_HUGEPAGE */

	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte) {
		walk->action = ACTION_AGAIN;
		return 0;
	}
	ptent = ptep_get(pte);
	if (!pte_present(ptent))
		goto out;
	folio = damon_get_folio(pte_pfn(ptent));
	if (!folio)
		goto out;
	if (pte_young(ptent) || !folio_test_idle(folio) ||
	    mmu_notifier_test_young(walk->mm, addr))
		priv->young = true;
	*priv->folio_sz = folio_size(folio);
	folio_put(folio);
out:
	pte_unmap_unlock(pte, ptl);
	return 0;
}

#ifdef CONFIG_HUGETLB_PAGE
static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
				     unsigned long addr, unsigned long end,
				     struct mm_walk *walk)
{
	struct damon_young_walk_private *priv = walk->private;
	struct hstate *h = hstate_vma(walk->vma);
	struct folio *folio;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(h, walk->mm, pte);
	entry = huge_ptep_get(walk->mm, addr, pte);
	if (!pte_present(entry))
		goto out;

	folio = pfn_folio(pte_pfn(entry));
	folio_get(folio);

	if (pte_young(entry) || !folio_test_idle(folio) ||
	    mmu_notifier_test_young(walk->mm, addr))
		priv->young = true;
	*priv->folio_sz = huge_page_size(h);

	folio_put(folio);

out:
	spin_unlock(ptl);
	return 0;
}
#else
#define damon_young_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static const struct mm_walk_ops damon_young_ops = {
	.pmd_entry = damon_young_pmd_entry,
	.hugetlb_entry = damon_young_hugetlb_entry,
	.walk_lock = PGWALK_RDLOCK,
};
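
/*
 * Check whether the page containing 'addr' has been accessed since the last
 * damon_va_mkold() call for the address
 *
 * Returns true if an access is found from the page table entry, the idle flag
 * of the backing folio, or the mmu notifiers.  The size of the backing folio
 * is stored in '*folio_sz' so that the caller can reuse the result for other
 * sampling addresses falling into the same folio.
 */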
static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
		unsigned long *folio_sz)
{
	struct damon_young_walk_private arg = {
		.folio_sz = folio_sz,
		.young = false,
	};

	mmap_read_lock(mm);
	walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg);
	mmap_read_unlock(mm);
	return arg.young;
}

/*
 * Check whether the region was accessed after the last preparation
 *
 * mm	'mm_struct' for the given virtual address space
 * r	the region to be checked
 */
static void __damon_va_check_access(struct mm_struct *mm,
				struct damon_region *r, bool same_target,
				struct damon_attrs *attrs)
{
	static unsigned long last_addr;
	static unsigned long last_folio_sz = PAGE_SIZE;
	static bool last_accessed;

	if (!mm) {
		damon_update_region_access_rate(r, false, attrs);
		return;
	}

	/* If the region is in the last checked page, reuse the result */
	if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) ==
				ALIGN_DOWN(r->sampling_addr, last_folio_sz))) {
		damon_update_region_access_rate(r, last_accessed, attrs);
		return;
	}

	last_accessed = damon_va_young(mm, r->sampling_addr, &last_folio_sz);
	damon_update_region_access_rate(r, last_accessed, attrs);

	last_addr = r->sampling_addr;
}

static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
{
	struct damon_target *t;
	struct mm_struct *mm;
	struct damon_region *r;
	unsigned int max_nr_accesses = 0;
	bool same_target;

	damon_for_each_target(t, ctx) {
		mm = damon_get_mm(t);
		same_target = false;
		damon_for_each_region(r, t) {
			__damon_va_check_access(mm, r, same_target,
					&ctx->attrs);
			max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
			same_target = true;
		}
		if (mm)
			mmput(mm);
	}

	return max_nr_accesses;
}

/*
 * Functions for the target validity check and cleanup
 */

static bool damon_va_target_valid(struct damon_target *t)
{
	struct task_struct *task;

	task = damon_get_task_struct(t);
	if (task) {
		put_task_struct(task);
		return true;
	}

	return false;
}

#ifndef CONFIG_ADVISE_SYSCALLS
static unsigned long damos_madvise(struct damon_target *target,
		struct damon_region *r, int behavior)
{
	return 0;
}
#else
static unsigned long damos_madvise(struct damon_target *target,
		struct damon_region *r, int behavior)
{
	struct mm_struct *mm;
	unsigned long start = PAGE_ALIGN(r->ar.start);
	unsigned long len = PAGE_ALIGN(damon_sz_region(r));
	unsigned long applied;

	mm = damon_get_mm(target);
	if (!mm)
		return 0;

	applied = do_madvise(mm, start, len, behavior) ? 0 : len;
	mmput(mm);

	return applied;
}
#endif	/* CONFIG_ADVISE_SYSCALLS */
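
/*
 * Apply a DAMOS action to a region by calling madvise() with the matching
 * advice
 *
 * Returns the number of bytes the action was applied to, or zero when the
 * action is DAMOS_STAT or not supported by 'vaddr', when do_madvise() fails,
 * or when CONFIG_ADVISE_SYSCALLS is disabled.
 */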
static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
		struct damon_target *t, struct damon_region *r,
		struct damos *scheme)
{
	int madv_action;

	switch (scheme->action) {
	case DAMOS_WILLNEED:
		madv_action = MADV_WILLNEED;
		break;
	case DAMOS_COLD:
		madv_action = MADV_COLD;
		break;
	case DAMOS_PAGEOUT:
		madv_action = MADV_PAGEOUT;
		break;
	case DAMOS_HUGEPAGE:
		madv_action = MADV_HUGEPAGE;
		break;
	case DAMOS_NOHUGEPAGE:
		madv_action = MADV_NOHUGEPAGE;
		break;
	case DAMOS_STAT:
		return 0;
	default:
		/*
		 * DAMOS actions that are not yet supported by 'vaddr'.
		 */
		return 0;
	}

	return damos_madvise(t, r, madv_action);
}

static int damon_va_scheme_score(struct damon_ctx *context,
		struct damon_target *t, struct damon_region *r,
		struct damos *scheme)
{

	switch (scheme->action) {
	case DAMOS_PAGEOUT:
		return damon_cold_score(context, r, scheme);
	default:
		break;
	}

	return DAMOS_MAX_SCORE;
}

static int __init damon_va_initcall(void)
{
	struct damon_operations ops = {
		.id = DAMON_OPS_VADDR,
		.init = damon_va_init,
		.update = damon_va_update,
		.prepare_access_checks = damon_va_prepare_access_checks,
		.check_accesses = damon_va_check_accesses,
		.reset_aggregated = NULL,
		.target_valid = damon_va_target_valid,
		.cleanup = NULL,
		.apply_scheme = damon_va_apply_scheme,
		.get_scheme_score = damon_va_scheme_score,
	};
	/* ops for fixed virtual address ranges */
	struct damon_operations ops_fvaddr = ops;
	int err;

	/* Don't set the monitoring target regions for the entire mapping */
	ops_fvaddr.id = DAMON_OPS_FVADDR;
	ops_fvaddr.init = NULL;
	ops_fvaddr.update = NULL;

	err = damon_register_ops(&ops);
	if (err)
		return err;
	return damon_register_ops(&ops_fvaddr);
}

subsys_initcall(damon_va_initcall);

#include "tests/vaddr-kunit.h"