1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2018, Joyent, Inc. 25 * Copyright (c) 2011, 2019 by Delphix. All rights reserved. 26 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. 27 * Copyright 2017 Nexenta Systems, Inc. All rights reserved. 28 */ 29 30 #include <sys/spa.h> 31 #include <sys/zio.h> 32 #include <sys/spa_impl.h> 33 #include <sys/zio_compress.h> 34 #include <sys/zio_checksum.h> 35 #include <sys/zfs_context.h> 36 #include <sys/arc.h> 37 #include <sys/zfs_refcount.h> 38 #include <sys/vdev.h> 39 #include <sys/vdev_trim.h> 40 #include <sys/vdev_impl.h> 41 #include <sys/dsl_pool.h> 42 #include <sys/multilist.h> 43 #include <sys/abd.h> 44 #include <sys/zil.h> 45 #include <sys/fm/fs/zfs.h> 46 #include <sys/shrinker.h> 47 #include <sys/vmsystm.h> 48 #include <sys/zpl.h> 49 #include <linux/page_compat.h> 50 #include <linux/notifier.h> 51 #include <linux/memory.h> 52 #include <linux/version.h> 53 #include <sys/callb.h> 54 #include <sys/kstat.h> 55 #include <sys/zthr.h> 56 #include <zfs_fletcher.h> 57 #include <sys/arc_impl.h> 58 #include <sys/trace_zfs.h> 59 #include <sys/aggsum.h> 60 61 /* 62 * This is a limit on how many pages the ARC shrinker makes available for 63 * eviction in response to one page allocation attempt. Note that in 64 * practice, the kernel's shrinker can ask us to evict up to about 4x this 65 * for one allocation attempt. 66 * 67 * For example a value of 10,000 (in practice, 160MB per allocation attempt 68 * with 4K pages) limits the amount of time spent attempting to reclaim ARC 69 * memory to less than 100ms per allocation attempt, even with a small 70 * average compressed block size of ~8KB. 71 * 72 * See also the comment in arc_shrinker_count(). 73 * Set to 0 to disable limit. 74 */ 75 static int zfs_arc_shrinker_limit = 0; 76 77 /* 78 * Relative cost of ARC eviction, AKA number of seeks needed to restore evicted 79 * page. Bigger values make ARC more precious and evictions smaller comparing 80 * to other kernel subsystems. Value of 4 means parity with page cache, 81 * according to my reading of kernel's do_shrink_slab() and other code. 82 */ 83 static int zfs_arc_shrinker_seeks = DEFAULT_SEEKS; 84 85 #ifdef CONFIG_MEMORY_HOTPLUG 86 static struct notifier_block arc_hotplug_callback_mem_nb; 87 #endif 88 89 /* 90 * Return a default max arc size based on the amount of physical memory. 91 * This may be overridden by tuning the zfs_arc_max module parameter. 92 */ 93 uint64_t 94 arc_default_max(uint64_t min, uint64_t allmem) 95 { 96 uint64_t size; 97 98 if (allmem >= 1 << 30) 99 size = allmem - (1 << 30); 100 else 101 size = min; 102 return (MAX(allmem * 5 / 8, size)); 103 } 104 105 /* 106 * Return maximum amount of memory that we could possibly use. Reduced 107 * to half of all memory in user space which is primarily used for testing. 108 */ 109 uint64_t 110 arc_all_memory(void) 111 { 112 #ifdef CONFIG_HIGHMEM 113 return (ptob(zfs_totalram_pages - zfs_totalhigh_pages)); 114 #else 115 return (ptob(zfs_totalram_pages)); 116 #endif /* CONFIG_HIGHMEM */ 117 } 118 119 /* 120 * Return the amount of memory that is considered free. In user space 121 * which is primarily used for testing we pretend that free memory ranges 122 * from 0-20% of all memory. 123 */ 124 uint64_t 125 arc_free_memory(void) 126 { 127 #ifdef CONFIG_HIGHMEM 128 struct sysinfo si; 129 si_meminfo(&si); 130 return (ptob(si.freeram - si.freehigh)); 131 #else 132 return (ptob(nr_free_pages() + 133 nr_inactive_file_pages())); 134 #endif /* CONFIG_HIGHMEM */ 135 } 136 137 /* 138 * Return the amount of memory that can be consumed before reclaim will be 139 * needed. Positive if there is sufficient free memory, negative indicates 140 * the amount of memory that needs to be freed up. 141 */ 142 int64_t 143 arc_available_memory(void) 144 { 145 return (arc_free_memory() - arc_sys_free); 146 } 147 148 static uint64_t 149 arc_evictable_memory(void) 150 { 151 int64_t asize = aggsum_value(&arc_sums.arcstat_size); 152 uint64_t arc_clean = 153 zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) + 154 zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) + 155 zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) + 156 zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); 157 uint64_t arc_dirty = MAX((int64_t)asize - (int64_t)arc_clean, 0); 158 159 /* 160 * Scale reported evictable memory in proportion to page cache, cap 161 * at specified min/max. 162 */ 163 uint64_t min = (ptob(nr_file_pages()) / 100) * zfs_arc_pc_percent; 164 min = MAX(arc_c_min, MIN(arc_c_max, min)); 165 166 if (arc_dirty >= min) 167 return (arc_clean); 168 169 return (MAX((int64_t)asize - (int64_t)min, 0)); 170 } 171 172 /* 173 * The _count() function returns the number of free-able objects. 174 * The _scan() function returns the number of objects that were freed. 175 */ 176 static unsigned long 177 arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) 178 { 179 /* 180 * The kernel's shrinker code may not understand how many pages the 181 * ARC's callback actually frees, so it may ask the ARC to shrink a 182 * lot for one page allocation. This is problematic because it may 183 * take a long time, thus delaying the page allocation, and because 184 * it may force the ARC to unnecessarily shrink very small. 185 * 186 * Therefore, we limit the amount of data that we say is evictable, 187 * which limits the amount that the shrinker will ask us to evict for 188 * one page allocation attempt. 189 * 190 * In practice, we may be asked to shrink 4x the limit to satisfy one 191 * page allocation, before the kernel's shrinker code gives up on us. 192 * When that happens, we rely on the kernel code to find the pages 193 * that we freed before invoking the OOM killer. This happens in 194 * __alloc_pages_slowpath(), which retries and finds the pages we 195 * freed when it calls get_page_from_freelist(). 196 * 197 * See also the comment above zfs_arc_shrinker_limit. 198 */ 199 int64_t can_free = btop(arc_evictable_memory()); 200 if (current_is_kswapd() && zfs_arc_shrinker_limit) 201 can_free = MIN(can_free, zfs_arc_shrinker_limit); 202 return (can_free); 203 } 204 205 static unsigned long 206 arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) 207 { 208 /* The arc is considered warm once reclaim has occurred */ 209 if (unlikely(arc_warm == B_FALSE)) 210 arc_warm = B_TRUE; 211 212 /* 213 * We are experiencing memory pressure which the arc_evict_zthr was 214 * unable to keep up with. Set arc_no_grow to briefly pause ARC 215 * growth to avoid compounding the memory pressure. 216 */ 217 arc_no_grow = B_TRUE; 218 219 /* 220 * Evict the requested number of pages by reducing arc_c and waiting 221 * for the requested amount of data to be evicted. To avoid deadlock 222 * do not wait for eviction if we may be called from ZFS itself (see 223 * kmem_flags_convert() removing __GFP_FS). It may cause excessive 224 * eviction later if many evictions are accumulated, but just skipping 225 * the eviction is not good either if most of memory is used by ARC. 226 */ 227 uint64_t to_free = arc_reduce_target_size(ptob(sc->nr_to_scan)); 228 if (sc->gfp_mask & __GFP_FS) 229 arc_wait_for_eviction(to_free, B_FALSE, B_FALSE); 230 if (current->reclaim_state != NULL) 231 #ifdef HAVE_RECLAIM_STATE_RECLAIMED 232 current->reclaim_state->reclaimed += btop(to_free); 233 #else 234 current->reclaim_state->reclaimed_slab += btop(to_free); 235 #endif 236 237 /* 238 * When direct reclaim is observed it usually indicates a rapid 239 * increase in memory pressure. This occurs because the kswapd 240 * threads were unable to asynchronously keep enough free memory 241 * available. 242 */ 243 if (current_is_kswapd()) { 244 ARCSTAT_BUMP(arcstat_memory_indirect_count); 245 } else { 246 ARCSTAT_BUMP(arcstat_memory_direct_count); 247 } 248 249 return (btop(to_free)); 250 } 251 252 static struct shrinker *arc_shrinker = NULL; 253 254 int 255 arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) 256 { 257 uint64_t free_memory = arc_free_memory(); 258 259 if (free_memory > arc_all_memory() * arc_lotsfree_percent / 100) 260 return (0); 261 262 if (txg > spa->spa_lowmem_last_txg) { 263 spa->spa_lowmem_last_txg = txg; 264 spa->spa_lowmem_page_load = 0; 265 } 266 /* 267 * If we are in pageout, we know that memory is already tight, 268 * the arc is already going to be evicting, so we just want to 269 * continue to let page writes occur as quickly as possible. 270 */ 271 if (current_is_kswapd()) { 272 if (spa->spa_lowmem_page_load > 273 MAX(arc_sys_free / 4, free_memory) / 4) { 274 DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); 275 return (SET_ERROR(ERESTART)); 276 } 277 /* Note: reserve is inflated, so we deflate */ 278 atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8); 279 return (0); 280 } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) { 281 /* memory is low, delay before restarting */ 282 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 283 DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); 284 return (SET_ERROR(EAGAIN)); 285 } 286 spa->spa_lowmem_page_load = 0; 287 return (0); 288 } 289 290 static void 291 arc_set_sys_free(uint64_t allmem) 292 { 293 /* 294 * The ARC tries to keep at least this much memory available for the 295 * system. This gives the ARC time to shrink in response to memory 296 * pressure, before running completely out of memory and invoking the 297 * direct-reclaim ARC shrinker. 298 * 299 * This should be more than twice high_wmark_pages(), so that 300 * arc_wait_for_eviction() will wait until at least the 301 * high_wmark_pages() are free (see arc_evict_state_impl()). 302 * 303 * Note: If concurrent allocations consume these pages, there may 304 * still be insufficient free pages, and the OOM killer takes action. 305 * 306 * By setting arc_sys_free large enough, and having 307 * arc_wait_for_eviction() wait until there is at least arc_sys_free/2 308 * free memory, it is much less likely that concurrent allocations can 309 * consume all the memory that was evicted before checking for 310 * OOM. 311 * 312 * It's hard to iterate the zones from a linux kernel module, which 313 * makes it difficult to determine the watermark dynamically. Instead 314 * we compute the maximum high watermark for this system, based 315 * on the amount of memory, using the same method as the kernel uses 316 * to calculate its internal `min_free_kbytes` variable. See 317 * torvalds/linux@ee8eb9a5fe86 for the change in the upper clamp value 318 * from 64M to 256M. 319 */ 320 321 /* 322 * Base wmark_low is 4 * the square root of Kbytes of RAM. 323 */ 324 long wmark = int_sqrt(allmem / 1024 * 16) * 1024; 325 326 /* 327 * Clamp to between 128K and 256/64MB. 328 */ 329 wmark = MAX(wmark, 128 * 1024); 330 #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0) 331 wmark = MIN(wmark, 256 * 1024 * 1024); 332 #else 333 wmark = MIN(wmark, 64 * 1024 * 1024); 334 #endif 335 336 /* 337 * watermark_boost can increase the wmark by up to 150%. 338 */ 339 wmark += wmark * 150 / 100; 340 341 /* 342 * arc_sys_free needs to be more than 2x the watermark, because 343 * arc_wait_for_eviction() waits for half of arc_sys_free. Bump this up 344 * to 3x to ensure we're above it. 345 */ 346 arc_sys_free = wmark * 3 + allmem / 32; 347 } 348 349 void 350 arc_lowmem_init(void) 351 { 352 uint64_t allmem = arc_all_memory(); 353 354 /* 355 * Register a shrinker to support synchronous (direct) memory 356 * reclaim from the arc. This is done to prevent kswapd from 357 * swapping out pages when it is preferable to shrink the arc. 358 */ 359 arc_shrinker = spl_register_shrinker("zfs-arc-shrinker", 360 arc_shrinker_count, arc_shrinker_scan, zfs_arc_shrinker_seeks); 361 VERIFY(arc_shrinker); 362 363 arc_set_sys_free(allmem); 364 } 365 366 void 367 arc_lowmem_fini(void) 368 { 369 spl_unregister_shrinker(arc_shrinker); 370 arc_shrinker = NULL; 371 } 372 373 int 374 param_set_arc_u64(const char *buf, zfs_kernel_param_t *kp) 375 { 376 int error; 377 378 error = spl_param_set_u64(buf, kp); 379 if (error < 0) 380 return (SET_ERROR(error)); 381 382 arc_tuning_update(B_TRUE); 383 384 return (0); 385 } 386 387 int 388 param_set_arc_min(const char *buf, zfs_kernel_param_t *kp) 389 { 390 return (param_set_arc_u64(buf, kp)); 391 } 392 393 int 394 param_set_arc_max(const char *buf, zfs_kernel_param_t *kp) 395 { 396 return (param_set_arc_u64(buf, kp)); 397 } 398 399 int 400 param_set_arc_int(const char *buf, zfs_kernel_param_t *kp) 401 { 402 int error; 403 404 error = param_set_int(buf, kp); 405 if (error < 0) 406 return (SET_ERROR(error)); 407 408 arc_tuning_update(B_TRUE); 409 410 return (0); 411 } 412 413 #ifdef CONFIG_MEMORY_HOTPLUG 414 static int 415 arc_hotplug_callback(struct notifier_block *self, unsigned long action, 416 void *arg) 417 { 418 (void) self, (void) arg; 419 uint64_t allmem = arc_all_memory(); 420 if (action != MEM_ONLINE) 421 return (NOTIFY_OK); 422 423 arc_set_limits(allmem); 424 425 #ifdef __LP64__ 426 if (zfs_dirty_data_max_max == 0) 427 zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024, 428 allmem * zfs_dirty_data_max_max_percent / 100); 429 #else 430 if (zfs_dirty_data_max_max == 0) 431 zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024, 432 allmem * zfs_dirty_data_max_max_percent / 100); 433 #endif 434 435 arc_set_sys_free(allmem); 436 return (NOTIFY_OK); 437 } 438 #endif 439 440 void 441 arc_register_hotplug(void) 442 { 443 #ifdef CONFIG_MEMORY_HOTPLUG 444 arc_hotplug_callback_mem_nb.notifier_call = arc_hotplug_callback; 445 /* There is no significance to the value 100 */ 446 arc_hotplug_callback_mem_nb.priority = 100; 447 register_memory_notifier(&arc_hotplug_callback_mem_nb); 448 #endif 449 } 450 451 void 452 arc_unregister_hotplug(void) 453 { 454 #ifdef CONFIG_MEMORY_HOTPLUG 455 unregister_memory_notifier(&arc_hotplug_callback_mem_nb); 456 #endif 457 } 458 459 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW, 460 "Limit on number of pages that ARC shrinker can reclaim at once"); 461 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_seeks, INT, ZMOD_RD, 462 "Relative cost of ARC eviction vs other kernel subsystems"); 463