xref: /linux/mm/swap_table.h (revision bba2c3615bd6cfee7456d1130f2e6b01b3f4e9ba)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef _MM_SWAP_TABLE_H
3 #define _MM_SWAP_TABLE_H
4 
5 #include <linux/rcupdate.h>
6 #include <linux/atomic.h>
7 #include "swap.h"
8 
/*
 * Swap table of one cluster: a flat array holding one atomic long entry
 * for each of the SWAPFILE_CLUSTER swap slots in the cluster.
 */
struct swap_table {
	atomic_long_t entries[SWAPFILE_CLUSTER];
};
13 
/*
 * Per-cluster array storing the memcg private id, one unsigned short for
 * each of the SWAPFILE_CLUSTER swap slots in the cluster.
 */
struct swap_memcg_table {
	unsigned short id[SWAPFILE_CLUSTER];
};
18 
/*
 * True when one swap table occupies exactly one page.
 * NOTE(review): presumably selects a whole-page allocation path for the
 * table - confirm against the users of this macro.
 */
#define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE)
20 
21 /*
22  * A swap table entry represents the status of a swap slot on a swap
23  * (physical or virtual) device. The swap table in each cluster is a
24  * 1:1 map of the swap slots in this cluster.
25  *
26  * Swap table entry type and bits layouts:
27  *
28  * NULL:     |---------------- 0 ---------------| - Free slot
29  * Shadow:   |SWAP_COUNT|Z|---- SHADOW_VAL ---|1| - Swapped out slot
30  * PFN:      |SWAP_COUNT|Z|------ PFN -------|10| - Cached slot
31  * Pointer:  |----------- Pointer ----------|100| - (Unused)
32  * Bad:      |------------- 1 -------------|1000| - Bad slot
33  *
 * SWAP_COUNT is `SWP_TB_COUNT_BITS` wide and Z is the `SWP_TB_ZERO_FLAG`
 * bit. Together they form the `SWP_TB_FLAGS_BITS` wide flags field, which
 * occupies the topmost bits of an entry. Each entry is an atomic long.
37  *
38  * Usages:
39  *
40  * - NULL: Swap slot is unused, could be allocated.
41  *
 * - Shadow: Swap slot is used and not cached (usually swapped out). It reuses
 *   the XA_VALUE format to be compatible with workingset shadows. The
 *   SHADOW_VAL part might be all 0 if the workingset shadow info is absent.
 *   In such a case, we still keep the shadow format as a placeholder.
46  *
47  *   Memcg ID is embedded in SHADOW_VAL.
48  *
49  * - PFN: Swap slot is in use, and cached. Memcg info is recorded on the page
50  *   struct.
51  *
 * - Pointer: Not used yet. `0b100` is reserved for potential pointer usage
 *   because only the lower three bits can be used as a marker for 8-byte
 *   aligned pointers.
55  *
56  * - Bad: Swap slot is reserved, protects swap header or holes on swap devices.
57  */
58 
/* NULL entry: every bit is clear */
#define SWP_TB_NULL		0UL

/* Shadow entry (swapped out): tagged by the lowest bit */
#define SWP_TB_SHADOW_MARK	0b1UL

/* PFN entry (cached): the PFN is stored above the low mark bits */
#define SWP_TB_PFN_BITS		(SWAP_CACHE_PFN_BITS + SWAP_CACHE_PFN_MARK_BITS)
#define SWP_TB_PFN_MARK		0b10UL
#define SWP_TB_PFN_MARK_MASK	(BIT(SWAP_CACHE_PFN_MARK_BITS) - 1)

/*
 * Flags field of PFN and shadow entries, kept in the topmost bits of the
 * entry. It carries the swap count and is at most 5 bits wide; it is
 * narrower when the PFN part leaves less room in a long.
 */
#define SWP_TB_FLAGS_BITS	min(5, BITS_PER_LONG - SWP_TB_PFN_BITS)
#define SWP_TB_COUNT_BITS	(SWP_TB_FLAGS_BITS - SWAP_TABLE_HAS_ZEROFLAG)
#define SWP_TB_FLAGS_MASK	(~((~0UL) >> SWP_TB_FLAGS_BITS))
#define SWP_TB_COUNT_MASK	(~((~0UL) >> SWP_TB_COUNT_BITS))
#define SWP_TB_FLAGS_SHIFT	(BITS_PER_LONG - SWP_TB_FLAGS_BITS)
#define SWP_TB_COUNT_SHIFT	(BITS_PER_LONG - SWP_TB_COUNT_BITS)
#define SWP_TB_COUNT_MAX	((1 << SWP_TB_COUNT_BITS) - 1)
/*
 * The zero flag is the lowest bit of the flags field. It is only used as
 * a flag when SWAP_TABLE_HAS_ZEROFLAG is set.
 */
#define SWP_TB_ZERO_FLAG	BIT(SWP_TB_FLAGS_SHIFT)

/* Bad slot: ends with 0b1000 and the rest of the bits are all 1 */
#define SWP_TB_BAD		((~0UL) << 3)

/* Macro for shadow offset calculation */
#define SWAP_COUNT_SHIFT	SWP_TB_FLAGS_BITS
86 
87 /*
88  * Helpers for casting one type of info into a swap table entry.
89  */
90 static inline unsigned long null_to_swp_tb(void)
91 {
92 	BUILD_BUG_ON(sizeof(unsigned long) != sizeof(atomic_long_t));
93 	return 0;
94 }
95 
96 static inline unsigned long __count_to_swp_tb(unsigned char count)
97 {
98 	/*
99 	 * At least three values are needed to distinguish free (0),
100 	 * used (count > 0 && count < SWP_TB_COUNT_MAX), and
101 	 * overflow (count == SWP_TB_COUNT_MAX).
102 	 */
103 	BUILD_BUG_ON(SWP_TB_COUNT_BITS < SWAP_COUNT_MIN_BITS);
104 	VM_WARN_ON(count > SWP_TB_COUNT_MAX);
105 	return ((unsigned long)count) << SWP_TB_COUNT_SHIFT;
106 }
107 
108 static inline unsigned long __flags_to_swp_tb(unsigned char flags)
109 {
110 	BUILD_BUG_ON(SWP_TB_FLAGS_BITS > BITS_PER_BYTE);
111 	VM_WARN_ON(flags >> SWP_TB_FLAGS_BITS);
112 	return ((unsigned long)flags) << SWP_TB_FLAGS_SHIFT;
113 }
114 
115 static inline unsigned long pfn_to_swp_tb(unsigned long pfn, unsigned char flags)
116 {
117 	unsigned long swp_tb;
118 
119 	BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *));
120 	BUILD_BUG_ON(SWAP_CACHE_PFN_BITS >
121 		     (BITS_PER_LONG - SWAP_CACHE_PFN_MARK_BITS - SWP_TB_FLAGS_BITS));
122 
123 	swp_tb = (pfn << SWAP_CACHE_PFN_MARK_BITS) | SWP_TB_PFN_MARK;
124 	VM_WARN_ON_ONCE(swp_tb & SWP_TB_FLAGS_MASK);
125 
126 	return swp_tb | __flags_to_swp_tb(flags);
127 }
128 
/* Build a PFN type swap table entry from the PFN of @folio and @flags. */
static inline unsigned long folio_to_swp_tb(struct folio *folio, unsigned char flags)
{
	unsigned long pfn = folio_pfn(folio);

	return pfn_to_swp_tb(pfn, flags);
}
133 
134 static inline unsigned long shadow_to_swp_tb(void *shadow, unsigned char flags)
135 {
136 	BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) !=
137 		     BITS_PER_BYTE * sizeof(unsigned long));
138 	BUILD_BUG_ON((unsigned long)xa_mk_value(0) != SWP_TB_SHADOW_MARK);
139 
140 	VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow));
141 	VM_WARN_ON_ONCE(shadow && ((unsigned long)shadow & SWP_TB_FLAGS_MASK));
142 
143 	return (unsigned long)shadow | SWP_TB_SHADOW_MARK | __flags_to_swp_tb(flags);
144 }
145 
146 /*
147  * Helpers for swap table entry type checking.
148  */
/* Return true if @swp_tb is the NULL entry, i.e. all bits are zero. */
static inline bool swp_tb_is_null(unsigned long swp_tb)
{
	return swp_tb == 0UL;
}
153 
154 static inline bool swp_tb_is_folio(unsigned long swp_tb)
155 {
156 	return ((swp_tb & SWP_TB_PFN_MARK_MASK) == SWP_TB_PFN_MARK);
157 }
158 
/* Return true if @swp_tb is a shadow entry, i.e. it is an XA value. */
static inline bool swp_tb_is_shadow(unsigned long swp_tb)
{
	void *entry = (void *)swp_tb;

	return xa_is_value(entry);
}
163 
164 static inline bool swp_tb_is_bad(unsigned long swp_tb)
165 {
166 	return swp_tb == SWP_TB_BAD;
167 }
168 
/*
 * Return true if @swp_tb is one of the entry types the count and flags
 * helpers accept: shadow, PFN (folio), or NULL.
 */
static inline bool swp_tb_is_countable(unsigned long swp_tb)
{
	if (swp_tb_is_shadow(swp_tb))
		return true;
	if (swp_tb_is_folio(swp_tb))
		return true;
	return swp_tb_is_null(swp_tb);
}
174 
175 /*
176  * Helpers for retrieving info from swap table.
177  */
178 static inline struct folio *swp_tb_to_folio(unsigned long swp_tb)
179 {
180 	VM_WARN_ON(!swp_tb_is_folio(swp_tb));
181 	return pfn_folio((swp_tb & ~SWP_TB_FLAGS_MASK) >> SWAP_CACHE_PFN_MARK_BITS);
182 }
183 
184 static inline void *swp_tb_to_shadow(unsigned long swp_tb)
185 {
186 	VM_WARN_ON(!swp_tb_is_shadow(swp_tb));
187 	/* No shift needed, xa_value is stored as it is in the lower bits. */
188 	return (void *)(swp_tb & ~SWP_TB_FLAGS_MASK);
189 }
190 
191 static inline unsigned char __swp_tb_get_count(unsigned long swp_tb)
192 {
193 	VM_WARN_ON(!swp_tb_is_countable(swp_tb));
194 	return ((swp_tb & SWP_TB_COUNT_MASK) >> SWP_TB_COUNT_SHIFT);
195 }
196 
197 static inline unsigned char __swp_tb_get_flags(unsigned long swp_tb)
198 {
199 	VM_WARN_ON(!swp_tb_is_countable(swp_tb));
200 	return ((swp_tb & SWP_TB_FLAGS_MASK) >> SWP_TB_FLAGS_SHIFT);
201 }
202 
203 static inline int swp_tb_get_count(unsigned long swp_tb)
204 {
205 	if (swp_tb_is_countable(swp_tb))
206 		return __swp_tb_get_count(swp_tb);
207 	return -EINVAL;
208 }
209 
210 static inline unsigned long __swp_tb_mk_count(unsigned long swp_tb, int count)
211 {
212 	return ((swp_tb & ~SWP_TB_COUNT_MASK) | __count_to_swp_tb(count));
213 }
214 
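/*
 * Helpers for accessing or modifying the swap table of a cluster.
 * __swap_table_set() and __swap_table_xchg() require the cluster lock to
 * be held. __swap_table_get() accepts either the cluster lock or an RCU
 * read-side critical section, and swap_table_get() takes the RCU read
 * lock by itself.
 */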
219 static inline void __swap_table_set(struct swap_cluster_info *ci,
220 				    unsigned int off, unsigned long swp_tb)
221 {
222 	atomic_long_t *table = rcu_dereference_protected(ci->table, true);
223 
224 	lockdep_assert_held(&ci->lock);
225 	VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
226 	atomic_long_set(&table[off], swp_tb);
227 }
228 
229 static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci,
230 					      unsigned int off, unsigned long swp_tb)
231 {
232 	atomic_long_t *table = rcu_dereference_protected(ci->table, true);
233 
234 	lockdep_assert_held(&ci->lock);
235 	VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
236 	/* Ordering is guaranteed by cluster lock, relax */
237 	return atomic_long_xchg_relaxed(&table[off], swp_tb);
238 }
239 
240 static inline unsigned long __swap_table_get(struct swap_cluster_info *ci,
241 					     unsigned int off)
242 {
243 	atomic_long_t *table;
244 
245 	VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
246 	table = rcu_dereference_check(ci->table, lockdep_is_held(&ci->lock));
247 
248 	return atomic_long_read(&table[off]);
249 }
250 
251 static inline unsigned long swap_table_get(struct swap_cluster_info *ci,
252 					unsigned int off)
253 {
254 	atomic_long_t *table;
255 	unsigned long swp_tb;
256 
257 	VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
258 
259 	rcu_read_lock();
260 	table = rcu_dereference(ci->table);
261 	swp_tb = table ? atomic_long_read(&table[off]) : null_to_swp_tb();
262 	rcu_read_unlock();
263 
264 	return swp_tb;
265 }
266 
267 static inline void __swap_table_set_zero(struct swap_cluster_info *ci,
268 					 unsigned int ci_off)
269 {
270 #if SWAP_TABLE_HAS_ZEROFLAG
271 	unsigned long swp_tb = __swap_table_get(ci, ci_off);
272 
273 	BUILD_BUG_ON(SWP_TB_ZERO_FLAG & ~SWP_TB_FLAGS_MASK);
274 	VM_WARN_ON(!swp_tb_is_countable(swp_tb));
275 	swp_tb |= SWP_TB_ZERO_FLAG;
276 	__swap_table_set(ci, ci_off, swp_tb);
277 #else
278 	lockdep_assert_held(&ci->lock);
279 	__set_bit(ci_off, ci->zero_bitmap);
280 #endif
281 }
282 
283 static inline bool __swap_table_test_zero(struct swap_cluster_info *ci,
284 					  unsigned int ci_off)
285 {
286 #if SWAP_TABLE_HAS_ZEROFLAG
287 	unsigned long swp_tb = __swap_table_get(ci, ci_off);
288 
289 	VM_WARN_ON(!swp_tb_is_countable(swp_tb));
290 	return !!(swp_tb & SWP_TB_ZERO_FLAG);
291 #else
292 	return test_bit(ci_off, ci->zero_bitmap);
293 #endif
294 }
295 
296 static inline void __swap_table_clear_zero(struct swap_cluster_info *ci,
297 					   unsigned int ci_off)
298 {
299 #if SWAP_TABLE_HAS_ZEROFLAG
300 	unsigned long swp_tb = __swap_table_get(ci, ci_off);
301 
302 	VM_WARN_ON(!swp_tb_is_countable(swp_tb));
303 	swp_tb &= ~SWP_TB_ZERO_FLAG;
304 	__swap_table_set(ci, ci_off, swp_tb);
305 #else
306 	lockdep_assert_held(&ci->lock);
307 	__clear_bit(ci_off, ci->zero_bitmap);
308 #endif
309 }
310 
311 #ifdef CONFIG_MEMCG
312 static inline void __swap_cgroup_set(struct swap_cluster_info *ci,
313 		unsigned int ci_off, unsigned long nr, unsigned short id)
314 {
315 	lockdep_assert_held(&ci->lock);
316 	VM_WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER);
317 	if (WARN_ON_ONCE(!ci->memcg_table))
318 		return;
319 	do {
320 		ci->memcg_table->id[ci_off++] = id;
321 	} while (--nr);
322 }
323 
324 static inline unsigned short __swap_cgroup_get(struct swap_cluster_info *ci,
325 					       unsigned int ci_off)
326 {
327 	lockdep_assert_held(&ci->lock);
328 	VM_WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER);
329 	if (unlikely(!ci->memcg_table))
330 		return 0;
331 	return ci->memcg_table->id[ci_off];
332 }
333 
334 static inline unsigned short __swap_cgroup_clear(struct swap_cluster_info *ci,
335 						 unsigned int ci_off,
336 						 unsigned long nr)
337 {
338 	unsigned short old = __swap_cgroup_get(ci, ci_off);
339 
340 	if (!old)
341 		return 0;
342 	do {
343 		VM_WARN_ON_ONCE(ci->memcg_table->id[ci_off] != old);
344 		ci->memcg_table->id[ci_off++] = 0;
345 	} while (--nr);
346 
347 	return old;
348 }
349 #else
/* !CONFIG_MEMCG stub: there is no memcg table, so nothing is recorded. */
static inline void __swap_cgroup_set(struct swap_cluster_info *ci,
		unsigned int ci_off, unsigned long nr, unsigned short id)
{
}
354 
/* !CONFIG_MEMCG stub: always reports id 0. */
static inline unsigned short __swap_cgroup_get(struct swap_cluster_info *ci,
					       unsigned int ci_off)
{
	return 0;
}
360 
/* !CONFIG_MEMCG stub: nothing to clear, always returns id 0. */
static inline unsigned short __swap_cgroup_clear(struct swap_cluster_info *ci,
						 unsigned int ci_off,
						 unsigned long nr)
{
	return 0;
}
367 #endif
368 
369 #endif
370