xref: /linux/mm/damon/vaddr.c (revision 971370a88c3b1be1144c11468b4c84e3ed17af6d)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * DAMON Code for Virtual Address Spaces
4  *
5  * Author: SeongJae Park <sj@kernel.org>
6  */
7 
8 #define pr_fmt(fmt) "damon-va: " fmt
9 
10 #include <linux/highmem.h>
11 #include <linux/hugetlb.h>
12 #include <linux/mman.h>
13 #include <linux/mmu_notifier.h>
14 #include <linux/page_idle.h>
15 #include <linux/pagewalk.h>
16 #include <linux/sched/mm.h>
17 
18 #include "../internal.h"
19 #include "ops-common.h"
20 
21 #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
22 #undef DAMON_MIN_REGION
23 #define DAMON_MIN_REGION 1
24 #endif
25 
26 /*
27  * 't->pid' should be a pointer to the relevant 'struct pid' on which a
28  * reference is held.  The caller must put the returned task, unless it is NULL.
29  */
30 static inline struct task_struct *damon_get_task_struct(struct damon_target *t)
31 {
32 	return get_pid_task(t->pid, PIDTYPE_PID);
33 }
34 
35 /*
36  * Get the mm_struct of the given target
37  *
38  * Caller _must_ put the mm_struct after use, unless it is NULL.
39  *
40  * Returns the mm_struct of the target on success, NULL on failure
41  */
42 static struct mm_struct *damon_get_mm(struct damon_target *t)
43 {
44 	struct task_struct *task;
45 	struct mm_struct *mm;
46 
47 	task = damon_get_task_struct(t);
48 	if (!task)
49 		return NULL;
50 
51 	mm = get_task_mm(task);
52 	put_task_struct(task);
53 	return mm;
54 }
55 
56 /*
57  * Functions for the initial monitoring target regions construction
58  */
59 
60 /*
61  * Size-evenly split a region into 'nr_pieces' small regions
62  *
63  * Returns 0 on success, or negative error code otherwise.
64  */
65 static int damon_va_evenly_split_region(struct damon_target *t,
66 		struct damon_region *r, unsigned int nr_pieces)
67 {
68 	unsigned long sz_orig, sz_piece, orig_end;
69 	struct damon_region *n = NULL, *next;
70 	unsigned long start;
71 	unsigned int i;
72 
73 	if (!r || !nr_pieces)
74 		return -EINVAL;
75 
76 	if (nr_pieces == 1)
77 		return 0;
78 
79 	orig_end = r->ar.end;
80 	sz_orig = damon_sz_region(r);
81 	sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION);
82 
83 	if (!sz_piece)
84 		return -EINVAL;
85 
86 	r->ar.end = r->ar.start + sz_piece;
87 	next = damon_next_region(r);
88 	for (start = r->ar.end, i = 1; i < nr_pieces; start += sz_piece, i++) {
89 		n = damon_new_region(start, start + sz_piece);
90 		if (!n)
91 			return -ENOMEM;
92 		damon_insert_region(n, r, next, t);
93 		r = n;
94 	}
95 	/* extend the last region to compensate for rounding error */
96 	if (n)
97 		n->ar.end = orig_end;
98 
99 	return 0;
100 }
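
/*
 * A worked example for the split above, assuming the usual 4 KiB
 * DAMON_MIN_REGION: splitting a 40 KiB region [0, 40960) into three pieces
 * computes sz_piece = ALIGN_DOWN(40960 / 3, 4096) = 12288, producing
 * [0, 12288), [12288, 24576) and [24576, 36864), and the last region is then
 * extended to the saved 'orig_end' (40960) to absorb the rounding remainder.
 */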
101 
102 static unsigned long sz_range(struct damon_addr_range *r)
103 {
104 	return r->end - r->start;
105 }
106 
107 /*
108  * Find three regions separated by the two biggest unmapped regions
109  *
110  * mm		'mm_struct' of the target address space
111  * regions	an array of three address ranges in which the results will be saved
112  *
113  * This function receives an address space and finds three regions in it which
114  * are separated by the two biggest unmapped regions in the space.  Please
115  * refer to the comments of '__damon_va_init_regions()' below to see why this
116  * is necessary.
117  *
118  * Returns 0 on success, or negative error code otherwise.
119  */
120 static int __damon_va_three_regions(struct mm_struct *mm,
121 				       struct damon_addr_range regions[3])
122 {
123 	struct damon_addr_range first_gap = {0}, second_gap = {0};
124 	VMA_ITERATOR(vmi, mm, 0);
125 	struct vm_area_struct *vma, *prev = NULL;
126 	unsigned long start;
127 
128 	/*
129 	 * Find the two biggest gaps so that first_gap > second_gap > others.
130 	 * If this is too slow, it can be optimised to examine the maple
131 	 * tree gaps.
132 	 */
133 	rcu_read_lock();
134 	for_each_vma(vmi, vma) {
135 		unsigned long gap;
136 
137 		if (!prev) {
138 			start = vma->vm_start;
139 			goto next;
140 		}
141 		gap = vma->vm_start - prev->vm_end;
142 
143 		if (gap > sz_range(&first_gap)) {
144 			second_gap = first_gap;
145 			first_gap.start = prev->vm_end;
146 			first_gap.end = vma->vm_start;
147 		} else if (gap > sz_range(&second_gap)) {
148 			second_gap.start = prev->vm_end;
149 			second_gap.end = vma->vm_start;
150 		}
151 next:
152 		prev = vma;
153 	}
154 	rcu_read_unlock();
155 
156 	if (!sz_range(&second_gap) || !sz_range(&first_gap))
157 		return -EINVAL;
158 
159 	/* Sort the two biggest gaps by address */
160 	if (first_gap.start > second_gap.start)
161 		swap(first_gap, second_gap);
162 
163 	/* Store the result */
164 	regions[0].start = ALIGN(start, DAMON_MIN_REGION);
165 	regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION);
166 	regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION);
167 	regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION);
168 	regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION);
169 	regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION);
170 
171 	return 0;
172 }
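
/*
 * An illustrative sketch of the gap search above: for mappings at
 * [0x1000, 0x2000), [0x3000, 0x4000), [0x8000, 0x9000) and [0x20000, 0x21000),
 * the two biggest gaps are [0x9000, 0x20000) and [0x4000, 0x8000).  After
 * sorting the gaps by address, the three resulting regions are
 * [0x1000, 0x4000), [0x8000, 0x9000) and [0x20000, 0x21000), assuming a 4 KiB
 * DAMON_MIN_REGION so that the ALIGN() calls leave the boundaries unchanged.
 */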
173 
174 /*
175  * Get the three regions in the given target (task)
176  *
177  * Returns 0 on success, negative error code otherwise.
178  */
179 static int damon_va_three_regions(struct damon_target *t,
180 				struct damon_addr_range regions[3])
181 {
182 	struct mm_struct *mm;
183 	int rc;
184 
185 	mm = damon_get_mm(t);
186 	if (!mm)
187 		return -EINVAL;
188 
189 	mmap_read_lock(mm);
190 	rc = __damon_va_three_regions(mm, regions);
191 	mmap_read_unlock(mm);
192 
193 	mmput(mm);
194 	return rc;
195 }
196 
197 /*
198  * Initialize the monitoring target regions for the given target (task)
199  *
200  * t	the given target
201  *
202  * Because only a few small portions of the entire address space are
203  * actually mapped to memory and accessed, monitoring the unmapped regions
204  * is wasteful.  Because we can tolerate a small amount of noise, tracking
205  * every mapping is not strictly required, and doing so could even incur a
206  * high overhead if the mappings change frequently or the number of mappings
207  * is high.  The adaptive regions adjustment mechanism further helps deal
208  * with the noise by simply identifying the unmapped areas as regions that
209  * have no access.  Moreover, using the real mappings, which would have many
210  * unmapped areas inside, would make the adaptive mechanism quite complex.
211  * That said, excessively large unmapped areas inside the monitoring target
212  * should be removed so that the adaptive mechanism does not waste time on them.
213  *
214  * For this reason, we convert the complex mappings to three distinct regions
215  * that cover every mapped area of the address space.  Also the two gaps
216  * between the three regions are the two biggest unmapped areas in the given
217  * address space.  In detail, this function first identifies the start and the
218  * end of the mappings and the two biggest unmapped areas of the address space.
219  * Then, it constructs the three regions as below:
220  *
221  *     [mappings[0]->start, big_two_unmapped_areas[0]->start)
222  *     [big_two_unmapped_areas[0]->end, big_two_unmapped_areas[1]->start)
223  *     [big_two_unmapped_areas[1]->end, mappings[nr_mappings - 1]->end)
224  *
225  * As the usual memory map of processes is as below, the gap between the heap
226  * and the uppermost mmap()-ed region, and the gap between the lowermost
227  * mmap()-ed region and the stack, will be the two biggest unmapped regions.
228  * Because these gaps are exceptionally huge in a usual address space,
229  * excluding these two biggest unmapped regions is a sufficient trade-off.
230  *
231  *   <heap>
232  *   <BIG UNMAPPED REGION 1>
233  *   <uppermost mmap()-ed region>
234  *   (other mmap()-ed regions and small unmapped regions)
235  *   <lowermost mmap()-ed region>
236  *   <BIG UNMAPPED REGION 2>
237  *   <stack>
238  */
239 static void __damon_va_init_regions(struct damon_ctx *ctx,
240 				     struct damon_target *t)
241 {
242 	struct damon_target *ti;
243 	struct damon_region *r;
244 	struct damon_addr_range regions[3];
245 	unsigned long sz = 0, nr_pieces;
246 	int i, tidx = 0;
247 
248 	if (damon_va_three_regions(t, regions)) {
249 		damon_for_each_target(ti, ctx) {
250 			if (ti == t)
251 				break;
252 			tidx++;
253 		}
254 		pr_debug("Failed to get three regions of %dth target\n", tidx);
255 		return;
256 	}
257 
258 	for (i = 0; i < 3; i++)
259 		sz += regions[i].end - regions[i].start;
260 	if (ctx->attrs.min_nr_regions)
261 		sz /= ctx->attrs.min_nr_regions;
262 	if (sz < DAMON_MIN_REGION)
263 		sz = DAMON_MIN_REGION;
264 
265 	/* Set the initial three regions of the target */
266 	for (i = 0; i < 3; i++) {
267 		r = damon_new_region(regions[i].start, regions[i].end);
268 		if (!r) {
269 			pr_err("%d'th init region creation failed\n", i);
270 			return;
271 		}
272 		damon_add_region(r, t);
273 
274 		nr_pieces = (regions[i].end - regions[i].start) / sz;
275 		damon_va_evenly_split_region(t, r, nr_pieces);
276 	}
277 }
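
/*
 * A sketch of the sizing above, with assumed numbers: if the three initial
 * regions cover 200 MiB in total and 'min_nr_regions' is 10, 'sz' becomes
 * 20 MiB, so a 60 MiB region is further split into 60 MiB / 20 MiB = 3 pieces
 * by damon_va_evenly_split_region().  The target therefore starts with
 * roughly 'min_nr_regions' regions of similar sizes.
 */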
278 
279 /* Initialize '->regions_list' of every target (task) */
280 static void damon_va_init(struct damon_ctx *ctx)
281 {
282 	struct damon_target *t;
283 
284 	damon_for_each_target(t, ctx) {
285 		/* the user may set the target regions as they want */
286 		if (!damon_nr_regions(t))
287 			__damon_va_init_regions(ctx, t);
288 	}
289 }
290 
291 /*
292  * Update regions for current memory mappings
293  */
294 static void damon_va_update(struct damon_ctx *ctx)
295 {
296 	struct damon_addr_range three_regions[3];
297 	struct damon_target *t;
298 
299 	damon_for_each_target(t, ctx) {
300 		if (damon_va_three_regions(t, three_regions))
301 			continue;
302 		damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION);
303 	}
304 }
305 
306 static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
307 		unsigned long next, struct mm_walk *walk)
308 {
309 	pte_t *pte;
310 	pmd_t pmde;
311 	spinlock_t *ptl;
312 
313 	if (pmd_trans_huge(pmdp_get(pmd))) {
314 		ptl = pmd_lock(walk->mm, pmd);
315 		pmde = pmdp_get(pmd);
316 
317 		if (!pmd_present(pmde)) {
318 			spin_unlock(ptl);
319 			return 0;
320 		}
321 
322 		if (pmd_trans_huge(pmde)) {
323 			damon_pmdp_mkold(pmd, walk->vma, addr);
324 			spin_unlock(ptl);
325 			return 0;
326 		}
327 		spin_unlock(ptl);
328 	}
329 
330 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
331 	if (!pte)
332 		return 0;
333 	if (!pte_present(ptep_get(pte)))
334 		goto out;
335 	damon_ptep_mkold(pte, walk->vma, addr);
336 out:
337 	pte_unmap_unlock(pte, ptl);
338 	return 0;
339 }
340 
341 #ifdef CONFIG_HUGETLB_PAGE
342 static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
343 				struct vm_area_struct *vma, unsigned long addr)
344 {
345 	bool referenced = false;
346 	pte_t entry = huge_ptep_get(mm, addr, pte);
347 	struct folio *folio = pfn_folio(pte_pfn(entry));
348 	unsigned long psize = huge_page_size(hstate_vma(vma));
349 
350 	folio_get(folio);
351 
352 	if (pte_young(entry)) {
353 		referenced = true;
354 		entry = pte_mkold(entry);
355 		set_huge_pte_at(mm, addr, pte, entry, psize);
356 	}
357 
358 	if (mmu_notifier_clear_young(mm, addr,
359 				     addr + huge_page_size(hstate_vma(vma))))
360 		referenced = true;
361 
362 	if (referenced)
363 		folio_set_young(folio);
364 
365 	folio_set_idle(folio);
366 	folio_put(folio);
367 }
368 
369 static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask,
370 				     unsigned long addr, unsigned long end,
371 				     struct mm_walk *walk)
372 {
373 	struct hstate *h = hstate_vma(walk->vma);
374 	spinlock_t *ptl;
375 	pte_t entry;
376 
377 	ptl = huge_pte_lock(h, walk->mm, pte);
378 	entry = huge_ptep_get(walk->mm, addr, pte);
379 	if (!pte_present(entry))
380 		goto out;
381 
382 	damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr);
383 
384 out:
385 	spin_unlock(ptl);
386 	return 0;
387 }
388 #else
389 #define damon_mkold_hugetlb_entry NULL
390 #endif /* CONFIG_HUGETLB_PAGE */
391 
392 static const struct mm_walk_ops damon_mkold_ops = {
393 	.pmd_entry = damon_mkold_pmd_entry,
394 	.hugetlb_entry = damon_mkold_hugetlb_entry,
395 	.walk_lock = PGWALK_RDLOCK,
396 };
397 
398 static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
399 {
400 	mmap_read_lock(mm);
401 	walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL);
402 	mmap_read_unlock(mm);
403 }
404 
405 /*
406  * Functions for the access checking of the regions
407  */
408 
409 static void __damon_va_prepare_access_check(struct mm_struct *mm,
410 					struct damon_region *r)
411 {
412 	r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
413 
414 	damon_va_mkold(mm, r->sampling_addr);
415 }
416 
417 static void damon_va_prepare_access_checks(struct damon_ctx *ctx)
418 {
419 	struct damon_target *t;
420 	struct mm_struct *mm;
421 	struct damon_region *r;
422 
423 	damon_for_each_target(t, ctx) {
424 		mm = damon_get_mm(t);
425 		if (!mm)
426 			continue;
427 		damon_for_each_region(r, t)
428 			__damon_va_prepare_access_check(mm, r);
429 		mmput(mm);
430 	}
431 }
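
/*
 * In short, each monitoring preparation picks one random address per region
 * (r->sampling_addr) and marks the page table entry mapping it as old via
 * damon_va_mkold().  The following damon_va_check_accesses() call then reads
 * the accessed information back with damon_va_young() to decide whether the
 * region was accessed in between.
 */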
432 
433 struct damon_young_walk_private {
434 	/* size of the folio for the access checked virtual memory address */
435 	unsigned long *folio_sz;
436 	bool young;
437 };
438 
439 static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
440 		unsigned long next, struct mm_walk *walk)
441 {
442 	pte_t *pte;
443 	pte_t ptent;
444 	spinlock_t *ptl;
445 	struct folio *folio;
446 	struct damon_young_walk_private *priv = walk->private;
447 
448 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
449 	if (pmd_trans_huge(pmdp_get(pmd))) {
450 		pmd_t pmde;
451 
452 		ptl = pmd_lock(walk->mm, pmd);
453 		pmde = pmdp_get(pmd);
454 
455 		if (!pmd_present(pmde)) {
456 			spin_unlock(ptl);
457 			return 0;
458 		}
459 
460 		if (!pmd_trans_huge(pmde)) {
461 			spin_unlock(ptl);
462 			goto regular_page;
463 		}
464 		folio = damon_get_folio(pmd_pfn(pmde));
465 		if (!folio)
466 			goto huge_out;
467 		if (pmd_young(pmde) || !folio_test_idle(folio) ||
468 					mmu_notifier_test_young(walk->mm,
469 						addr))
470 			priv->young = true;
471 		*priv->folio_sz = HPAGE_PMD_SIZE;
472 		folio_put(folio);
473 huge_out:
474 		spin_unlock(ptl);
475 		return 0;
476 	}
477 
478 regular_page:
479 #endif	/* CONFIG_TRANSPARENT_HUGEPAGE */
480 
481 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
482 	if (!pte)
483 		return 0;
484 	ptent = ptep_get(pte);
485 	if (!pte_present(ptent))
486 		goto out;
487 	folio = damon_get_folio(pte_pfn(ptent));
488 	if (!folio)
489 		goto out;
490 	if (pte_young(ptent) || !folio_test_idle(folio) ||
491 			mmu_notifier_test_young(walk->mm, addr))
492 		priv->young = true;
493 	*priv->folio_sz = folio_size(folio);
494 	folio_put(folio);
495 out:
496 	pte_unmap_unlock(pte, ptl);
497 	return 0;
498 }
499 
500 #ifdef CONFIG_HUGETLB_PAGE
501 static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
502 				     unsigned long addr, unsigned long end,
503 				     struct mm_walk *walk)
504 {
505 	struct damon_young_walk_private *priv = walk->private;
506 	struct hstate *h = hstate_vma(walk->vma);
507 	struct folio *folio;
508 	spinlock_t *ptl;
509 	pte_t entry;
510 
511 	ptl = huge_pte_lock(h, walk->mm, pte);
512 	entry = huge_ptep_get(walk->mm, addr, pte);
513 	if (!pte_present(entry))
514 		goto out;
515 
516 	folio = pfn_folio(pte_pfn(entry));
517 	folio_get(folio);
518 
519 	if (pte_young(entry) || !folio_test_idle(folio) ||
520 	    mmu_notifier_test_young(walk->mm, addr))
521 		priv->young = true;
522 	*priv->folio_sz = huge_page_size(h);
523 
524 	folio_put(folio);
525 
526 out:
527 	spin_unlock(ptl);
528 	return 0;
529 }
530 #else
531 #define damon_young_hugetlb_entry NULL
532 #endif /* CONFIG_HUGETLB_PAGE */
533 
534 static const struct mm_walk_ops damon_young_ops = {
535 	.pmd_entry = damon_young_pmd_entry,
536 	.hugetlb_entry = damon_young_hugetlb_entry,
537 	.walk_lock = PGWALK_RDLOCK,
538 };
539 
540 static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
541 		unsigned long *folio_sz)
542 {
543 	struct damon_young_walk_private arg = {
544 		.folio_sz = folio_sz,
545 		.young = false,
546 	};
547 
548 	mmap_read_lock(mm);
549 	walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg);
550 	mmap_read_unlock(mm);
551 	return arg.young;
552 }
553 
554 /*
555  * Check whether the region was accessed after the last preparation
556  *
557  * mm	'mm_struct' for the given virtual address space
558  * r	the region to be checked
559  */
560 static void __damon_va_check_access(struct mm_struct *mm,
561 				struct damon_region *r, bool same_target,
562 				struct damon_attrs *attrs)
563 {
564 	static unsigned long last_addr;
565 	static unsigned long last_folio_sz = PAGE_SIZE;
566 	static bool last_accessed;
567 
568 	if (!mm) {
569 		damon_update_region_access_rate(r, false, attrs);
570 		return;
571 	}
572 
573 	/* If the region is in the last checked folio, reuse the result */
574 	if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) ==
575 				ALIGN_DOWN(r->sampling_addr, last_folio_sz))) {
576 		damon_update_region_access_rate(r, last_accessed, attrs);
577 		return;
578 	}
579 
580 	last_accessed = damon_va_young(mm, r->sampling_addr, &last_folio_sz);
581 	damon_update_region_access_rate(r, last_accessed, attrs);
582 
583 	last_addr = r->sampling_addr;
584 }
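
/*
 * An example of the result reuse above: if the previous sampling address fell
 * in a 2 MiB THP (so 'last_folio_sz' is 2 MiB) and the current region's
 * sampling address falls in the same 2 MiB-aligned range of the same target,
 * the previous 'last_accessed' result is applied again instead of walking the
 * page table a second time.
 */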
585 
586 static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
587 {
588 	struct damon_target *t;
589 	struct mm_struct *mm;
590 	struct damon_region *r;
591 	unsigned int max_nr_accesses = 0;
592 	bool same_target;
593 
594 	damon_for_each_target(t, ctx) {
595 		mm = damon_get_mm(t);
596 		same_target = false;
597 		damon_for_each_region(r, t) {
598 			__damon_va_check_access(mm, r, same_target,
599 					&ctx->attrs);
600 			max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
601 			same_target = true;
602 		}
603 		if (mm)
604 			mmput(mm);
605 	}
606 
607 	return max_nr_accesses;
608 }
609 
610 static bool damos_va_filter_young_match(struct damos_filter *filter,
611 		struct folio *folio, struct vm_area_struct *vma,
612 		unsigned long addr, pte_t *ptep, pmd_t *pmdp)
613 {
614 	bool young = false;
615 
616 	if (ptep)
617 		young = pte_young(ptep_get(ptep));
618 	else if (pmdp)
619 		young = pmd_young(pmdp_get(pmdp));
620 
621 	young = young || !folio_test_idle(folio) ||
622 		mmu_notifier_test_young(vma->vm_mm, addr);
623 
624 	if (young && ptep)
625 		damon_ptep_mkold(ptep, vma, addr);
626 	else if (young && pmdp)
627 		damon_pmdp_mkold(pmdp, vma, addr);
628 
629 	return young == filter->matching;
630 }
631 
632 static bool damos_va_filter_out(struct damos *scheme, struct folio *folio,
633 		struct vm_area_struct *vma, unsigned long addr,
634 		pte_t *ptep, pmd_t *pmdp)
635 {
636 	struct damos_filter *filter;
637 	bool matched;
638 
639 	if (scheme->core_filters_allowed)
640 		return false;
641 
642 	damos_for_each_ops_filter(filter, scheme) {
643 		/*
644 		 * damos_folio_filter_match() checks the young filter by doing an
645 		 * rmap walk on the folio to find its page table entries.  However,
646 		 * since this is the vaddr operations set, we have direct access to
647 		 * the page tables, so use that instead.
648 		 */
649 		if (filter->type == DAMOS_FILTER_TYPE_YOUNG)
650 			matched = damos_va_filter_young_match(filter, folio,
651 				vma, addr, ptep, pmdp);
652 		else
653 			matched = damos_folio_filter_match(filter, folio);
654 
655 		if (matched)
656 			return !filter->allow;
657 	}
658 	return scheme->ops_filters_default_reject;
659 }
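
/*
 * For example, a DAMOS_FILTER_TYPE_YOUNG filter with 'matching' set and
 * 'allow' cleared makes this return true (filter out) for folios found young
 * above, while folios matching no installed filter fall back to
 * 'ops_filters_default_reject'.
 */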
660 
661 struct damos_va_migrate_private {
662 	struct list_head *migration_lists;
663 	struct damos *scheme;
664 };
665 
666 /*
667  * Place the given folio in the migration_list corresponding to where the folio
668  * should be migrated.
669  *
670  * The algorithm used here is similar to weighted_interleave_nid()
671  */
672 static void damos_va_migrate_dests_add(struct folio *folio,
673 		struct vm_area_struct *vma, unsigned long addr,
674 		struct damos_migrate_dests *dests,
675 		struct list_head *migration_lists)
676 {
677 	pgoff_t ilx;
678 	int order;
679 	unsigned int target;
680 	unsigned int weight_total = 0;
681 	int i;
682 
683 	/*
684 	 * If dests is empty, there is only one migration list corresponding
685 	 * to s->target_nid.
686 	 */
687 	if (!dests->nr_dests) {
688 		i = 0;
689 		goto isolate;
690 	}
691 
692 	order = folio_order(folio);
693 	ilx = vma->vm_pgoff >> order;
694 	ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
695 
696 	for (i = 0; i < dests->nr_dests; i++)
697 		weight_total += dests->weight_arr[i];
698 
699 	/* If the total weights are somehow 0, don't migrate at all */
700 	if (!weight_total)
701 		return;
702 
703 	target = ilx % weight_total;
704 	for (i = 0; i < dests->nr_dests; i++) {
705 		if (target < dests->weight_arr[i])
706 			break;
707 		target -= dests->weight_arr[i];
708 	}
709 
710 	/* If the folio is already in the right node, don't do anything */
711 	if (folio_nid(folio) == dests->node_id_arr[i])
712 		return;
713 
714 isolate:
715 	if (!folio_isolate_lru(folio))
716 		return;
717 
718 	list_add(&folio->lru, &migration_lists[i]);
719 }
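
/*
 * A worked example of the weight selection above, with an assumed destination
 * setup: for node_id_arr = {0, 1} and weight_arr = {3, 1}, weight_total is 4,
 * so folios whose interleave index 'ilx' is 0, 1 or 2 modulo 4 are queued for
 * migration to node 0 and those with ilx equal to 3 modulo 4 to node 1,
 * giving an approximate 3:1 placement ratio, unless the folio already resides
 * on the selected node.
 */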
720 
721 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
722 static int damos_va_migrate_pmd_entry(pmd_t *pmd, unsigned long addr,
723 		unsigned long next, struct mm_walk *walk)
724 {
725 	struct damos_va_migrate_private *priv = walk->private;
726 	struct list_head *migration_lists = priv->migration_lists;
727 	struct damos *s = priv->scheme;
728 	struct damos_migrate_dests *dests = &s->migrate_dests;
729 	struct folio *folio;
730 	spinlock_t *ptl;
731 	pmd_t pmde;
732 
733 	ptl = pmd_lock(walk->mm, pmd);
734 	pmde = pmdp_get(pmd);
735 
736 	if (!pmd_present(pmde) || !pmd_trans_huge(pmde))
737 		goto unlock;
738 
739 	/* Tell page walk code to not split the PMD */
740 	walk->action = ACTION_CONTINUE;
741 
742 	folio = damon_get_folio(pmd_pfn(pmde));
743 	if (!folio)
744 		goto unlock;
745 
746 	if (damos_va_filter_out(s, folio, walk->vma, addr, NULL, pmd))
747 		goto put_folio;
748 
749 	damos_va_migrate_dests_add(folio, walk->vma, addr, dests,
750 		migration_lists);
751 
752 put_folio:
753 	folio_put(folio);
754 unlock:
755 	spin_unlock(ptl);
756 	return 0;
757 }
758 #else
759 #define damos_va_migrate_pmd_entry NULL
760 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
761 
762 static int damos_va_migrate_pte_entry(pte_t *pte, unsigned long addr,
763 		unsigned long next, struct mm_walk *walk)
764 {
765 	struct damos_va_migrate_private *priv = walk->private;
766 	struct list_head *migration_lists = priv->migration_lists;
767 	struct damos *s = priv->scheme;
768 	struct damos_migrate_dests *dests = &s->migrate_dests;
769 	struct folio *folio;
770 	pte_t ptent;
771 
772 	ptent = ptep_get(pte);
773 	if (pte_none(ptent) || !pte_present(ptent))
774 		return 0;
775 
776 	folio = damon_get_folio(pte_pfn(ptent));
777 	if (!folio)
778 		return 0;
779 
780 	if (damos_va_filter_out(s, folio, walk->vma, addr, pte, NULL))
781 		goto put_folio;
782 
783 	damos_va_migrate_dests_add(folio, walk->vma, addr, dests,
784 		migration_lists);
785 
786 put_folio:
787 	folio_put(folio);
788 	return 0;
789 }
790 
791 /*
792  * Functions for the target validity check and cleanup
793  */
794 
795 static bool damon_va_target_valid(struct damon_target *t)
796 {
797 	struct task_struct *task;
798 
799 	task = damon_get_task_struct(t);
800 	if (task) {
801 		put_task_struct(task);
802 		return true;
803 	}
804 
805 	return false;
806 }
807 
808 static void damon_va_cleanup_target(struct damon_target *t)
809 {
810 	put_pid(t->pid);
811 }
812 
813 #ifndef CONFIG_ADVISE_SYSCALLS
814 static unsigned long damos_madvise(struct damon_target *target,
815 		struct damon_region *r, int behavior)
816 {
817 	return 0;
818 }
819 #else
820 static unsigned long damos_madvise(struct damon_target *target,
821 		struct damon_region *r, int behavior)
822 {
823 	struct mm_struct *mm;
824 	unsigned long start = PAGE_ALIGN(r->ar.start);
825 	unsigned long len = PAGE_ALIGN(damon_sz_region(r));
826 	unsigned long applied;
827 
828 	mm = damon_get_mm(target);
829 	if (!mm)
830 		return 0;
831 
832 	applied = do_madvise(mm, start, len, behavior) ? 0 : len;
833 	mmput(mm);
834 
835 	return applied;
836 }
837 #endif	/* CONFIG_ADVISE_SYSCALLS */
838 
839 static unsigned long damos_va_migrate(struct damon_target *target,
840 		struct damon_region *r, struct damos *s,
841 		unsigned long *sz_filter_passed)
842 {
843 	LIST_HEAD(folio_list);
844 	struct damos_va_migrate_private priv;
845 	struct mm_struct *mm;
846 	int nr_dests;
847 	int nid;
848 	bool use_target_nid;
849 	unsigned long applied = 0;
850 	struct damos_migrate_dests *dests = &s->migrate_dests;
851 	struct mm_walk_ops walk_ops = {
852 		.pmd_entry = damos_va_migrate_pmd_entry,
853 		.pte_entry = damos_va_migrate_pte_entry,
854 		.walk_lock = PGWALK_RDLOCK,
855 	};
856 
857 	use_target_nid = dests->nr_dests == 0;
858 	nr_dests = use_target_nid ? 1 : dests->nr_dests;
859 	priv.scheme = s;
860 	priv.migration_lists = kmalloc_array(nr_dests,
861 		sizeof(*priv.migration_lists), GFP_KERNEL);
862 	if (!priv.migration_lists)
863 		return 0;
864 
865 	for (int i = 0; i < nr_dests; i++)
866 		INIT_LIST_HEAD(&priv.migration_lists[i]);
867 
868 
869 	mm = damon_get_mm(target);
870 	if (!mm)
871 		goto free_lists;
872 
873 	mmap_read_lock(mm);
874 	walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv);
875 	mmap_read_unlock(mm);
876 	mmput(mm);
877 
878 	for (int i = 0; i < nr_dests; i++) {
879 		nid = use_target_nid ? s->target_nid : dests->node_id_arr[i];
880 		applied += damon_migrate_pages(&priv.migration_lists[i], nid);
881 		cond_resched();
882 	}
883 
884 free_lists:
885 	kfree(priv.migration_lists);
886 	return applied * PAGE_SIZE;
887 }
888 
889 struct damos_va_stat_private {
890 	struct damos *scheme;
891 	unsigned long *sz_filter_passed;
892 };
893 
894 static inline bool damos_va_invalid_folio(struct folio *folio,
895 		struct damos *s)
896 {
897 	return !folio || folio == s->last_applied;
898 }
899 
900 static int damos_va_stat_pmd_entry(pmd_t *pmd, unsigned long addr,
901 		unsigned long next, struct mm_walk *walk)
902 {
903 	struct damos_va_stat_private *priv = walk->private;
904 	struct damos *s = priv->scheme;
905 	unsigned long *sz_filter_passed = priv->sz_filter_passed;
906 	struct vm_area_struct *vma = walk->vma;
907 	struct folio *folio;
908 	spinlock_t *ptl;
909 	pte_t *start_pte, *pte, ptent;
910 	int nr;
911 
912 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
913 	if (pmd_trans_huge(*pmd)) {
914 		pmd_t pmde;
915 
916 		ptl = pmd_trans_huge_lock(pmd, vma);
917 		if (!ptl)
918 			return 0;
919 		pmde = pmdp_get(pmd);
920 		if (!pmd_present(pmde))
921 			goto huge_unlock;
922 
923 		folio = vm_normal_folio_pmd(vma, addr, pmde);
924 
925 		if (damos_va_invalid_folio(folio, s))
926 			goto huge_unlock;
927 
928 		if (!damos_va_filter_out(s, folio, vma, addr, NULL, pmd))
929 			*sz_filter_passed += folio_size(folio);
930 		s->last_applied = folio;
931 
932 huge_unlock:
933 		spin_unlock(ptl);
934 		return 0;
935 	}
936 #endif
937 	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
938 	if (!start_pte)
939 		return 0;
940 
941 	for (; addr < next; pte += nr, addr += nr * PAGE_SIZE) {
942 		nr = 1;
943 		ptent = ptep_get(pte);
944 
945 		if (pte_none(ptent) || !pte_present(ptent))
946 			continue;
947 
948 		folio = vm_normal_folio(vma, addr, ptent);
949 
950 		if (damos_va_invalid_folio(folio, s))
951 			continue;
952 
953 		if (!damos_va_filter_out(s, folio, vma, addr, pte, NULL))
954 			*sz_filter_passed += folio_size(folio);
955 		nr = folio_nr_pages(folio);
956 		s->last_applied = folio;
957 	}
958 	pte_unmap_unlock(start_pte, ptl);
959 	return 0;
960 }
961 
962 static unsigned long damos_va_stat(struct damon_target *target,
963 		struct damon_region *r, struct damos *s,
964 		unsigned long *sz_filter_passed)
965 {
966 	struct damos_va_stat_private priv;
967 	struct mm_struct *mm;
968 	struct mm_walk_ops walk_ops = {
969 		.pmd_entry = damos_va_stat_pmd_entry,
970 		.walk_lock = PGWALK_RDLOCK,
971 	};
972 
973 	priv.scheme = s;
974 	priv.sz_filter_passed = sz_filter_passed;
975 
976 	if (!damos_ops_has_filter(s))
977 		return 0;
978 
979 	mm = damon_get_mm(target);
980 	if (!mm)
981 		return 0;
982 
983 	mmap_read_lock(mm);
984 	walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv);
985 	mmap_read_unlock(mm);
986 	mmput(mm);
987 	return 0;
988 }
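
/*
 * Note that the DAMOS_STAT handling above modifies nothing; it only walks the
 * region to account, in '*sz_filter_passed', the amount of memory that passes
 * the installed operations-layer filters, and it does nothing at all when no
 * such filter is installed.
 */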
989 
990 static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
991 		struct damon_target *t, struct damon_region *r,
992 		struct damos *scheme, unsigned long *sz_filter_passed)
993 {
994 	int madv_action;
995 
996 	switch (scheme->action) {
997 	case DAMOS_WILLNEED:
998 		madv_action = MADV_WILLNEED;
999 		break;
1000 	case DAMOS_COLD:
1001 		madv_action = MADV_COLD;
1002 		break;
1003 	case DAMOS_PAGEOUT:
1004 		madv_action = MADV_PAGEOUT;
1005 		break;
1006 	case DAMOS_HUGEPAGE:
1007 		madv_action = MADV_HUGEPAGE;
1008 		break;
1009 	case DAMOS_NOHUGEPAGE:
1010 		madv_action = MADV_NOHUGEPAGE;
1011 		break;
1012 	case DAMOS_MIGRATE_HOT:
1013 	case DAMOS_MIGRATE_COLD:
1014 		return damos_va_migrate(t, r, scheme, sz_filter_passed);
1015 	case DAMOS_STAT:
1016 		return damos_va_stat(t, r, scheme, sz_filter_passed);
1017 	default:
1018 		/*
1019 		 * DAMOS actions that are not yet supported by 'vaddr'.
1020 		 */
1021 		return 0;
1022 	}
1023 
1024 	return damos_madvise(t, r, madv_action);
1025 }
1026 
1027 static int damon_va_scheme_score(struct damon_ctx *context,
1028 		struct damon_target *t, struct damon_region *r,
1029 		struct damos *scheme)
1030 {
1031 
1032 	switch (scheme->action) {
1033 	case DAMOS_PAGEOUT:
1034 		return damon_cold_score(context, r, scheme);
1035 	case DAMOS_MIGRATE_HOT:
1036 		return damon_hot_score(context, r, scheme);
1037 	case DAMOS_MIGRATE_COLD:
1038 		return damon_cold_score(context, r, scheme);
1039 	default:
1040 		break;
1041 	}
1042 
1043 	return DAMOS_MAX_SCORE;
1044 }
1045 
1046 static int __init damon_va_initcall(void)
1047 {
1048 	struct damon_operations ops = {
1049 		.id = DAMON_OPS_VADDR,
1050 		.init = damon_va_init,
1051 		.update = damon_va_update,
1052 		.prepare_access_checks = damon_va_prepare_access_checks,
1053 		.check_accesses = damon_va_check_accesses,
1054 		.target_valid = damon_va_target_valid,
1055 		.cleanup_target = damon_va_cleanup_target,
1056 		.cleanup = NULL,
1057 		.apply_scheme = damon_va_apply_scheme,
1058 		.get_scheme_score = damon_va_scheme_score,
1059 	};
1060 	/* ops for fixed virtual address ranges */
1061 	struct damon_operations ops_fvaddr = ops;
1062 	int err;
1063 
1064 	/* Don't set the monitoring target regions for the entire mapping */
1065 	ops_fvaddr.id = DAMON_OPS_FVADDR;
1066 	ops_fvaddr.init = NULL;
1067 	ops_fvaddr.update = NULL;
1068 
1069 	err = damon_register_ops(&ops);
1070 	if (err)
1071 		return err;
1072 	return damon_register_ops(&ops_fvaddr);
1073 };
1074 
1075 subsys_initcall(damon_va_initcall);
1076 
1077 #include "tests/vaddr-kunit.h"
1078