// SPDX-License-Identifier: GPL-2.0
/*
 * Common Code for Data Access Monitoring
 *
 * Author: SeongJae Park <sj@kernel.org>
 */

#include <linux/migrate.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#include "../internal.h"
#include "ops-common.h"

/*
 * Get an online page for a pfn if it's in the LRU list.  Otherwise, returns
 * NULL.
 *
 * The body of this function is copied from 'page_idle_get_folio()'.  We
 * duplicate rather than reuse it because the code is quite simple.
 */
struct folio *damon_get_folio(unsigned long pfn)
{
	struct page *page = pfn_to_online_page(pfn);
	struct folio *folio;

	if (!page)
		return NULL;

	folio = page_folio(page);
	if (!folio_test_lru(folio) || !folio_try_get(folio))
		return NULL;
	if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) {
		folio_put(folio);
		folio = NULL;
	}
	return folio;
}

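/*
 * Clear the accessed ("young") state of the page table entry at @addr in
 * @vma, including any MMU-notifier-tracked young state, and record the
 * result in the mapped folio: the folio is marked young if the entry was
 * young, and is marked idle so that a later access can be detected.
 */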
void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr)
{
	pte_t pteval = ptep_get(pte);
	struct folio *folio;
	bool young = false;
	unsigned long pfn;

	if (likely(pte_present(pteval)))
		pfn = pte_pfn(pteval);
	else
		pfn = swp_offset_pfn(pte_to_swp_entry(pteval));

	folio = damon_get_folio(pfn);
	if (!folio)
		return;

	/*
	 * PFN swap PTEs, such as device-exclusive ones, that actually map pages
	 * are "old" from a CPU perspective. The MMU notifier takes care of any
	 * device aspects.
	 */
	if (likely(pte_present(pteval)))
		young |= ptep_test_and_clear_young(vma, addr, pte);
	young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE);
	if (young)
		folio_set_young(folio);

	folio_set_idle(folio);
	folio_put(folio);
}

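/*
 * PMD-level counterpart of damon_ptep_mkold(): clear the young state of the
 * huge mapping at @addr in @vma and record it in the mapped folio.  Does
 * nothing when CONFIG_TRANSPARENT_HUGEPAGE is not set.
 */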
void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	struct folio *folio = damon_get_folio(pmd_pfn(pmdp_get(pmd)));

	if (!folio)
		return;

	if (pmdp_clear_young_notify(vma, addr, pmd))
		folio_set_young(folio);

	folio_set_idle(folio);
	folio_put(folio);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
}

#define DAMON_MAX_SUBSCORE	(100)
#define DAMON_MAX_AGE_IN_LOG	(32)

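/*
 * Return a hotness score of @r in [0, DAMOS_MAX_SCORE].  The score is a
 * weighted average of two subscores: the access frequency of the region
 * relative to the maximum possible nr_accesses, and the region's age on a
 * logarithmic scale (counted toward coldness when the frequency is zero).
 * The weights are taken from @s's quota settings.
 */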
int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
			struct damos *s)
{
	int freq_subscore;
	unsigned int age_in_sec;
	int age_in_log, age_subscore;
	unsigned int freq_weight = s->quota.weight_nr_accesses;
	unsigned int age_weight = s->quota.weight_age;
	int hotness;

	freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE /
		damon_max_nr_accesses(&c->attrs);

	age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000;
	for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
			age_in_log++, age_in_sec >>= 1)
		;

	/* If frequency is 0, higher age means it's colder */
	if (freq_subscore == 0)
		age_in_log *= -1;

	/*
	 * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG].
	 * Scale it to be in [0, 100] and set it as age subscore.
	 */
	age_in_log += DAMON_MAX_AGE_IN_LOG;
	age_subscore = age_in_log * DAMON_MAX_SUBSCORE /
		DAMON_MAX_AGE_IN_LOG / 2;

	hotness = (freq_weight * freq_subscore + age_weight * age_subscore);
	if (freq_weight + age_weight)
		hotness /= freq_weight + age_weight;
	/*
	 * Transform it to fit in [0, DAMOS_MAX_SCORE]
	 */
	hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE;

	return hotness;
}

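/*
 * Return the coldness score of @r: DAMOS_MAX_SCORE minus the hotness
 * computed by damon_hot_score().
 */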
int damon_cold_score(struct damon_ctx *c, struct damon_region *r,
			struct damos *s)
{
	int hotness = damon_hot_score(c, r, s);

	/* Return coldness of the region */
	return DAMOS_MAX_SCORE - hotness;
}

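/*
 * rmap_walk() callback of damon_folio_mkold(): clear the young state of
 * every page table entry mapping @folio within @vma.
 */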
static bool damon_folio_mkold_one(struct folio *folio,
		struct vm_area_struct *vma, unsigned long addr, void *arg)
{
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);

	while (page_vma_mapped_walk(&pvmw)) {
		addr = pvmw.address;
		if (pvmw.pte)
			damon_ptep_mkold(pvmw.pte, vma, addr);
		else
			damon_pmdp_mkold(pvmw.pmd, vma, addr);
	}
	return true;
}

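/*
 * Clear the young state of every page table entry mapping @folio via a
 * reverse mapping walk, marking the folio idle in the process.  Unmapped
 * folios are simply marked idle.
 */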
void damon_folio_mkold(struct folio *folio)
{
	struct rmap_walk_control rwc = {
		.rmap_one = damon_folio_mkold_one,
		.anon_lock = folio_lock_anon_vma_read,
	};
	bool need_lock;

	if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
		folio_set_idle(folio);
		return;
	}

	need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
	if (need_lock && !folio_trylock(folio))
		return;

	rmap_walk(folio, &rwc);

	if (need_lock)
		folio_unlock(folio);
}

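/*
 * rmap_walk() callback of damon_folio_young(): set *@arg to whether any page
 * table entry mapping @folio within @vma is young (or the folio is not idle,
 * or an MMU notifier reports an access), and stop the reverse mapping walk
 * once an access is found.
 */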
static bool damon_folio_young_one(struct folio *folio,
		struct vm_area_struct *vma, unsigned long addr, void *arg)
{
	bool *accessed = arg;
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
	pte_t pte;

	*accessed = false;
	while (page_vma_mapped_walk(&pvmw)) {
		addr = pvmw.address;
		if (pvmw.pte) {
			pte = ptep_get(pvmw.pte);

			/*
			 * PFN swap PTEs, such as device-exclusive ones, that
			 * actually map pages are "old" from a CPU perspective.
			 * The MMU notifier takes care of any device aspects.
			 */
			*accessed = (pte_present(pte) && pte_young(pte)) ||
				!folio_test_idle(folio) ||
				mmu_notifier_test_young(vma->vm_mm, addr);
		} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			*accessed = pmd_young(pmdp_get(pvmw.pmd)) ||
				!folio_test_idle(folio) ||
				mmu_notifier_test_young(vma->vm_mm, addr);
#else
			WARN_ON_ONCE(1);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
		}
		if (*accessed) {
			page_vma_mapped_walk_done(&pvmw);
			break;
		}
	}

	/* If accessed, stop walking */
	return *accessed == false;
}

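/*
 * Return whether @folio has been accessed since its young states were last
 * cleared, based on the young bits of its page table entries, its page_idle
 * flag, and MMU notifiers.  Unmapped folios are reported as accessed unless
 * their idle flag is set.
 */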
bool damon_folio_young(struct folio *folio)
{
	bool accessed = false;
	struct rmap_walk_control rwc = {
		.arg = &accessed,
		.rmap_one = damon_folio_young_one,
		.anon_lock = folio_lock_anon_vma_read,
	};
	bool need_lock;

	if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
		if (folio_test_idle(folio))
			return false;
		else
			return true;
	}

	need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
	if (need_lock && !folio_trylock(folio))
		return false;

	rmap_walk(folio, &rwc);

	if (need_lock)
		folio_unlock(folio);

	return accessed;
}

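/*
 * Return whether @folio matches @filter: check the folio property that the
 * filter type selects (anonymity, active LRU state, memcg, recent access,
 * folio size, or mapped-ness) and compare the result against the filter's
 * 'matching' setting.  A matched DAMOS_FILTER_TYPE_YOUNG check also clears
 * the folio's young state.
 */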
bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio)
{
	bool matched = false;
	struct mem_cgroup *memcg;
	size_t folio_sz;

	switch (filter->type) {
	case DAMOS_FILTER_TYPE_ANON:
		matched = folio_test_anon(folio);
		break;
	case DAMOS_FILTER_TYPE_ACTIVE:
		matched = folio_test_active(folio);
		break;
	case DAMOS_FILTER_TYPE_MEMCG:
		rcu_read_lock();
		memcg = folio_memcg_check(folio);
		if (!memcg)
			matched = false;
		else
			matched = filter->memcg_id == mem_cgroup_id(memcg);
		rcu_read_unlock();
		break;
	case DAMOS_FILTER_TYPE_YOUNG:
		matched = damon_folio_young(folio);
		if (matched)
			damon_folio_mkold(folio);
		break;
	case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE:
		folio_sz = folio_size(folio);
		matched = filter->sz_range.min <= folio_sz &&
			folio_sz <= filter->sz_range.max;
		break;
	case DAMOS_FILTER_TYPE_UNMAPPED:
		matched = !folio_mapped(folio) || !folio_raw_mapping(folio);
		break;
	default:
		break;
	}

	return matched == filter->matching;
}

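/*
 * Migrate the folios on @migrate_folios from @pgdat to @target_nid using
 * lightweight (no-reclaim, no-warn) target allocations, and return the
 * number of successfully migrated folios.  Returns 0 without migrating when
 * @pgdat is already the target node or @target_nid is NUMA_NO_NODE.
 */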
static unsigned int __damon_migrate_folio_list(
		struct list_head *migrate_folios, struct pglist_data *pgdat,
		int target_nid)
{
	unsigned int nr_succeeded = 0;
	struct migration_target_control mtc = {
		/*
		 * Allocate from 'node', or fail quickly and quietly.
		 * When this happens, 'page' will likely just be discarded
		 * instead of migrated.
		 */
		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
			__GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT,
		.nid = target_nid,
	};

	if (pgdat->node_id == target_nid || target_nid == NUMA_NO_NODE)
		return 0;

	if (list_empty(migrate_folios))
		return 0;

	/* Migration ignores all cpuset and mempolicy settings */
	migrate_pages(migrate_folios, alloc_migration_target, NULL,
		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DAMON,
		      &nr_succeeded);

	return nr_succeeded;
}

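/*
 * Try to migrate every folio on @folio_list from @pgdat to @target_nid.
 * Folios that could not be locked or migrated are put back on their LRU
 * lists.  Returns the number of migrated folios; @folio_list is empty on
 * return.
 */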
static unsigned int damon_migrate_folio_list(struct list_head *folio_list,
						struct pglist_data *pgdat,
						int target_nid)
{
	unsigned int nr_migrated = 0;
	struct folio *folio;
	LIST_HEAD(ret_folios);
	LIST_HEAD(migrate_folios);

	while (!list_empty(folio_list)) {
		struct folio *folio;

		cond_resched();

		folio = lru_to_folio(folio_list);
		list_del(&folio->lru);

		if (!folio_trylock(folio))
			goto keep;

		/* Relocate its contents to another node. */
		list_add(&folio->lru, &migrate_folios);
		folio_unlock(folio);
		continue;
keep:
		list_add(&folio->lru, &ret_folios);
	}
	/* 'folio_list' is always empty here */

	/* Migrate folios selected for migration */
	nr_migrated += __damon_migrate_folio_list(
			&migrate_folios, pgdat, target_nid);
	/*
	 * Folios that could not be migrated are still in @migrate_folios. Add
	 * those back on @folio_list
	 */
	if (!list_empty(&migrate_folios))
		list_splice_init(&migrate_folios, folio_list);

	try_to_unmap_flush();

	list_splice(&ret_folios, folio_list);

	while (!list_empty(folio_list)) {
		folio = lru_to_folio(folio_list);
		list_del(&folio->lru);
		folio_putback_lru(folio);
	}

	return nr_migrated;
}

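/*
 * Migrate the folios on @folio_list to @target_nid and return the number of
 * successfully migrated folios.  The list may contain folios from multiple
 * nodes; they are batched per source node and migrated batch by batch, with
 * reclaim disabled via memalloc_noreclaim_save() around the whole operation.
 */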
unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid)
{
	int nid;
	unsigned long nr_migrated = 0;
	LIST_HEAD(node_folio_list);
	unsigned int noreclaim_flag;

	if (list_empty(folio_list))
		return nr_migrated;

	if (target_nid < 0 || target_nid >= MAX_NUMNODES ||
			!node_state(target_nid, N_MEMORY))
		return nr_migrated;

	noreclaim_flag = memalloc_noreclaim_save();

	nid = folio_nid(lru_to_folio(folio_list));
	do {
		struct folio *folio = lru_to_folio(folio_list);

		if (nid == folio_nid(folio)) {
			list_move(&folio->lru, &node_folio_list);
			continue;
		}

		nr_migrated += damon_migrate_folio_list(&node_folio_list,
							NODE_DATA(nid),
							target_nid);
		nid = folio_nid(lru_to_folio(folio_list));
	} while (!list_empty(folio_list));

	nr_migrated += damon_migrate_folio_list(&node_folio_list,
						NODE_DATA(nid),
						target_nid);

	memalloc_noreclaim_restore(noreclaim_flag);

	return nr_migrated;
}