1 /* SPDX-License-Identifier: GPL-2.0 */ 2 #ifndef _MM_SWAP_TABLE_H 3 #define _MM_SWAP_TABLE_H 4 5 #include <linux/rcupdate.h> 6 #include <linux/atomic.h> 7 #include "swap.h" 8 9 /* A typical flat array in each cluster as swap table */ 10 struct swap_table { 11 atomic_long_t entries[SWAPFILE_CLUSTER]; 12 }; 13 14 /* For storing memcg private id */ 15 struct swap_memcg_table { 16 unsigned short id[SWAPFILE_CLUSTER]; 17 }; 18 19 #define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE) 20 21 /* 22 * A swap table entry represents the status of a swap slot on a swap 23 * (physical or virtual) device. The swap table in each cluster is a 24 * 1:1 map of the swap slots in this cluster. 25 * 26 * Swap table entry type and bits layouts: 27 * 28 * NULL: |---------------- 0 ---------------| - Free slot 29 * Shadow: |SWAP_COUNT|Z|---- SHADOW_VAL ---|1| - Swapped out slot 30 * PFN: |SWAP_COUNT|Z|------ PFN -------|10| - Cached slot 31 * Pointer: |----------- Pointer ----------|100| - (Unused) 32 * Bad: |------------- 1 -------------|1000| - Bad slot 33 * 34 * COUNT is `SWP_TB_COUNT_BITS` long, Z is the `SWP_TB_ZERO_FLAG` bit, 35 * and together they form the `SWP_TB_FLAGS_BITS` wide flags field. 36 * Each entry is an atomic long. 37 * 38 * Usages: 39 * 40 * - NULL: Swap slot is unused, could be allocated. 41 * 42 * - Shadow: Swap slot is used and not cached (usually swapped out). It reuses 43 * the XA_VALUE format to be compatible with working set shadows. SHADOW_VAL 44 * part might be all 0 if the working shadow info is absent. In such a case, 45 * we still want to keep the shadow format as a placeholder. 46 * 47 * Memcg ID is embedded in SHADOW_VAL. 48 * 49 * - PFN: Swap slot is in use, and cached. Memcg info is recorded on the page 50 * struct. 51 * 52 * - Pointer: Unused yet. `0b100` is reserved for potential pointer usage 53 * because only the lower three bits can be used as a marker for 8 bytes 54 * aligned pointers. 55 * 56 * - Bad: Swap slot is reserved, protects swap header or holes on swap devices. 57 */ 58 59 /* NULL Entry, all 0 */ 60 #define SWP_TB_NULL 0UL 61 62 /* Swapped out: shadow */ 63 #define SWP_TB_SHADOW_MARK 0b1UL 64 65 /* Cached: PFN */ 66 #define SWP_TB_PFN_BITS (SWAP_CACHE_PFN_BITS + SWAP_CACHE_PFN_MARK_BITS) 67 #define SWP_TB_PFN_MARK 0b10UL 68 #define SWP_TB_PFN_MARK_MASK (BIT(SWAP_CACHE_PFN_MARK_BITS) - 1) 69 70 /* Flags: For PFN or shadow, contains SWAP_COUNT, width changes */ 71 #define SWP_TB_FLAGS_BITS min(5, BITS_PER_LONG - SWP_TB_PFN_BITS) 72 #define SWP_TB_COUNT_BITS (SWP_TB_FLAGS_BITS - SWAP_TABLE_HAS_ZEROFLAG) 73 #define SWP_TB_FLAGS_MASK (~((~0UL) >> SWP_TB_FLAGS_BITS)) 74 #define SWP_TB_COUNT_MASK (~((~0UL) >> SWP_TB_COUNT_BITS)) 75 #define SWP_TB_FLAGS_SHIFT (BITS_PER_LONG - SWP_TB_FLAGS_BITS) 76 #define SWP_TB_COUNT_SHIFT (BITS_PER_LONG - SWP_TB_COUNT_BITS) 77 #define SWP_TB_COUNT_MAX ((1 << SWP_TB_COUNT_BITS) - 1) 78 /* The first flag is zero bit (SWAP_TABLE_HAS_ZEROFLAG) */ 79 #define SWP_TB_ZERO_FLAG BIT(BITS_PER_LONG - SWP_TB_FLAGS_BITS) 80 81 /* Bad slot: ends with 0b1000 and rests of bits are all 1 */ 82 #define SWP_TB_BAD ((~0UL) << 3) 83 84 /* Macro for shadow offset calculation */ 85 #define SWAP_COUNT_SHIFT SWP_TB_FLAGS_BITS 86 87 /* 88 * Helpers for casting one type of info into a swap table entry. 89 */ 90 static inline unsigned long null_to_swp_tb(void) 91 { 92 BUILD_BUG_ON(sizeof(unsigned long) != sizeof(atomic_long_t)); 93 return 0; 94 } 95 96 static inline unsigned long __count_to_swp_tb(unsigned char count) 97 { 98 /* 99 * At least three values are needed to distinguish free (0), 100 * used (count > 0 && count < SWP_TB_COUNT_MAX), and 101 * overflow (count == SWP_TB_COUNT_MAX). 102 */ 103 BUILD_BUG_ON(SWP_TB_COUNT_BITS < SWAP_COUNT_MIN_BITS); 104 VM_WARN_ON(count > SWP_TB_COUNT_MAX); 105 return ((unsigned long)count) << SWP_TB_COUNT_SHIFT; 106 } 107 108 static inline unsigned long __flags_to_swp_tb(unsigned char flags) 109 { 110 BUILD_BUG_ON(SWP_TB_FLAGS_BITS > BITS_PER_BYTE); 111 VM_WARN_ON(flags >> SWP_TB_FLAGS_BITS); 112 return ((unsigned long)flags) << SWP_TB_FLAGS_SHIFT; 113 } 114 115 static inline unsigned long pfn_to_swp_tb(unsigned long pfn, unsigned char flags) 116 { 117 unsigned long swp_tb; 118 119 BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *)); 120 BUILD_BUG_ON(SWAP_CACHE_PFN_BITS > 121 (BITS_PER_LONG - SWAP_CACHE_PFN_MARK_BITS - SWP_TB_FLAGS_BITS)); 122 123 swp_tb = (pfn << SWAP_CACHE_PFN_MARK_BITS) | SWP_TB_PFN_MARK; 124 VM_WARN_ON_ONCE(swp_tb & SWP_TB_FLAGS_MASK); 125 126 return swp_tb | __flags_to_swp_tb(flags); 127 } 128 129 static inline unsigned long folio_to_swp_tb(struct folio *folio, unsigned char flags) 130 { 131 return pfn_to_swp_tb(folio_pfn(folio), flags); 132 } 133 134 static inline unsigned long shadow_to_swp_tb(void *shadow, unsigned char flags) 135 { 136 BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) != 137 BITS_PER_BYTE * sizeof(unsigned long)); 138 BUILD_BUG_ON((unsigned long)xa_mk_value(0) != SWP_TB_SHADOW_MARK); 139 140 VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow)); 141 VM_WARN_ON_ONCE(shadow && ((unsigned long)shadow & SWP_TB_FLAGS_MASK)); 142 143 return (unsigned long)shadow | SWP_TB_SHADOW_MARK | __flags_to_swp_tb(flags); 144 } 145 146 /* 147 * Helpers for swap table entry type checking. 148 */ 149 static inline bool swp_tb_is_null(unsigned long swp_tb) 150 { 151 return !swp_tb; 152 } 153 154 static inline bool swp_tb_is_folio(unsigned long swp_tb) 155 { 156 return ((swp_tb & SWP_TB_PFN_MARK_MASK) == SWP_TB_PFN_MARK); 157 } 158 159 static inline bool swp_tb_is_shadow(unsigned long swp_tb) 160 { 161 return xa_is_value((void *)swp_tb); 162 } 163 164 static inline bool swp_tb_is_bad(unsigned long swp_tb) 165 { 166 return swp_tb == SWP_TB_BAD; 167 } 168 169 static inline bool swp_tb_is_countable(unsigned long swp_tb) 170 { 171 return (swp_tb_is_shadow(swp_tb) || swp_tb_is_folio(swp_tb) || 172 swp_tb_is_null(swp_tb)); 173 } 174 175 /* 176 * Helpers for retrieving info from swap table. 177 */ 178 static inline struct folio *swp_tb_to_folio(unsigned long swp_tb) 179 { 180 VM_WARN_ON(!swp_tb_is_folio(swp_tb)); 181 return pfn_folio((swp_tb & ~SWP_TB_FLAGS_MASK) >> SWAP_CACHE_PFN_MARK_BITS); 182 } 183 184 static inline void *swp_tb_to_shadow(unsigned long swp_tb) 185 { 186 VM_WARN_ON(!swp_tb_is_shadow(swp_tb)); 187 /* No shift needed, xa_value is stored as it is in the lower bits. */ 188 return (void *)(swp_tb & ~SWP_TB_FLAGS_MASK); 189 } 190 191 static inline unsigned char __swp_tb_get_count(unsigned long swp_tb) 192 { 193 VM_WARN_ON(!swp_tb_is_countable(swp_tb)); 194 return ((swp_tb & SWP_TB_COUNT_MASK) >> SWP_TB_COUNT_SHIFT); 195 } 196 197 static inline unsigned char __swp_tb_get_flags(unsigned long swp_tb) 198 { 199 VM_WARN_ON(!swp_tb_is_countable(swp_tb)); 200 return ((swp_tb & SWP_TB_FLAGS_MASK) >> SWP_TB_FLAGS_SHIFT); 201 } 202 203 static inline int swp_tb_get_count(unsigned long swp_tb) 204 { 205 if (swp_tb_is_countable(swp_tb)) 206 return __swp_tb_get_count(swp_tb); 207 return -EINVAL; 208 } 209 210 static inline unsigned long __swp_tb_mk_count(unsigned long swp_tb, int count) 211 { 212 return ((swp_tb & ~SWP_TB_COUNT_MASK) | __count_to_swp_tb(count)); 213 } 214 215 /* 216 * Helpers for accessing or modifying the swap table of a cluster, 217 * the swap cluster must be locked. 218 */ 219 static inline void __swap_table_set(struct swap_cluster_info *ci, 220 unsigned int off, unsigned long swp_tb) 221 { 222 atomic_long_t *table = rcu_dereference_protected(ci->table, true); 223 224 lockdep_assert_held(&ci->lock); 225 VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); 226 atomic_long_set(&table[off], swp_tb); 227 } 228 229 static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci, 230 unsigned int off, unsigned long swp_tb) 231 { 232 atomic_long_t *table = rcu_dereference_protected(ci->table, true); 233 234 lockdep_assert_held(&ci->lock); 235 VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); 236 /* Ordering is guaranteed by cluster lock, relax */ 237 return atomic_long_xchg_relaxed(&table[off], swp_tb); 238 } 239 240 static inline unsigned long __swap_table_get(struct swap_cluster_info *ci, 241 unsigned int off) 242 { 243 atomic_long_t *table; 244 245 VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); 246 table = rcu_dereference_check(ci->table, lockdep_is_held(&ci->lock)); 247 248 return atomic_long_read(&table[off]); 249 } 250 251 static inline unsigned long swap_table_get(struct swap_cluster_info *ci, 252 unsigned int off) 253 { 254 atomic_long_t *table; 255 unsigned long swp_tb; 256 257 VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); 258 259 rcu_read_lock(); 260 table = rcu_dereference(ci->table); 261 swp_tb = table ? atomic_long_read(&table[off]) : null_to_swp_tb(); 262 rcu_read_unlock(); 263 264 return swp_tb; 265 } 266 267 static inline void __swap_table_set_zero(struct swap_cluster_info *ci, 268 unsigned int ci_off) 269 { 270 #if SWAP_TABLE_HAS_ZEROFLAG 271 unsigned long swp_tb = __swap_table_get(ci, ci_off); 272 273 BUILD_BUG_ON(SWP_TB_ZERO_FLAG & ~SWP_TB_FLAGS_MASK); 274 VM_WARN_ON(!swp_tb_is_countable(swp_tb)); 275 swp_tb |= SWP_TB_ZERO_FLAG; 276 __swap_table_set(ci, ci_off, swp_tb); 277 #else 278 lockdep_assert_held(&ci->lock); 279 __set_bit(ci_off, ci->zero_bitmap); 280 #endif 281 } 282 283 static inline bool __swap_table_test_zero(struct swap_cluster_info *ci, 284 unsigned int ci_off) 285 { 286 #if SWAP_TABLE_HAS_ZEROFLAG 287 unsigned long swp_tb = __swap_table_get(ci, ci_off); 288 289 VM_WARN_ON(!swp_tb_is_countable(swp_tb)); 290 return !!(swp_tb & SWP_TB_ZERO_FLAG); 291 #else 292 return test_bit(ci_off, ci->zero_bitmap); 293 #endif 294 } 295 296 static inline void __swap_table_clear_zero(struct swap_cluster_info *ci, 297 unsigned int ci_off) 298 { 299 #if SWAP_TABLE_HAS_ZEROFLAG 300 unsigned long swp_tb = __swap_table_get(ci, ci_off); 301 302 VM_WARN_ON(!swp_tb_is_countable(swp_tb)); 303 swp_tb &= ~SWP_TB_ZERO_FLAG; 304 __swap_table_set(ci, ci_off, swp_tb); 305 #else 306 lockdep_assert_held(&ci->lock); 307 __clear_bit(ci_off, ci->zero_bitmap); 308 #endif 309 } 310 311 #ifdef CONFIG_MEMCG 312 static inline void __swap_cgroup_set(struct swap_cluster_info *ci, 313 unsigned int ci_off, unsigned long nr, unsigned short id) 314 { 315 lockdep_assert_held(&ci->lock); 316 VM_WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER); 317 if (WARN_ON_ONCE(!ci->memcg_table)) 318 return; 319 do { 320 ci->memcg_table->id[ci_off++] = id; 321 } while (--nr); 322 } 323 324 static inline unsigned short __swap_cgroup_get(struct swap_cluster_info *ci, 325 unsigned int ci_off) 326 { 327 lockdep_assert_held(&ci->lock); 328 VM_WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER); 329 if (unlikely(!ci->memcg_table)) 330 return 0; 331 return ci->memcg_table->id[ci_off]; 332 } 333 334 static inline unsigned short __swap_cgroup_clear(struct swap_cluster_info *ci, 335 unsigned int ci_off, 336 unsigned long nr) 337 { 338 unsigned short old = __swap_cgroup_get(ci, ci_off); 339 340 if (!old) 341 return 0; 342 do { 343 VM_WARN_ON_ONCE(ci->memcg_table->id[ci_off] != old); 344 ci->memcg_table->id[ci_off++] = 0; 345 } while (--nr); 346 347 return old; 348 } 349 #else 350 static inline void __swap_cgroup_set(struct swap_cluster_info *ci, 351 unsigned int ci_off, unsigned long nr, unsigned short id) 352 { 353 } 354 355 static inline unsigned short __swap_cgroup_get(struct swap_cluster_info *ci, 356 unsigned int ci_off) 357 { 358 return 0; 359 } 360 361 static inline unsigned short __swap_cgroup_clear(struct swap_cluster_info *ci, 362 unsigned int ci_off, 363 unsigned long nr) 364 { 365 return 0; 366 } 367 #endif 368 369 #endif 370