// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include <linux/delay.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#include "logger.h"
#include "memory-alloc.h"
#include "permassert.h"

/*
 * UDS and VDO keep track of which threads are allowed to allocate memory freely, and which threads
 * must be careful to not do a memory allocation that does an I/O request. The 'allocating_threads'
 * thread_registry and its associated methods implement this tracking.
 */
static struct thread_registry allocating_threads;

static inline bool allocations_allowed(void)
{
	return vdo_lookup_thread(&allocating_threads) != NULL;
}

/*
 * Register the current thread as an allocating thread.
 *
 * An optional flag location can be supplied indicating whether, at any given point in time, the
 * threads associated with that flag should be allocating storage. If the flag is false, a message
 * will be logged.
 *
 * If no flag is supplied, the thread is always allowed to allocate storage without complaint.
 *
 * @new_thread: registered_thread structure to use for the current thread
 * @flag_ptr: Location of the allocation-allowed flag
 */
void vdo_register_allocating_thread(struct registered_thread *new_thread,
				    const bool *flag_ptr)
{
	if (flag_ptr == NULL) {
		static const bool allocation_always_allowed = true;

		flag_ptr = &allocation_always_allowed;
	}

	vdo_register_thread(&allocating_threads, new_thread, flag_ptr);
}

/* Unregister the current thread as an allocating thread. */
void vdo_unregister_allocating_thread(void)
{
	vdo_unregister_thread(&allocating_threads);
}
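
/*
 * Example of scoping an allocating thread (an illustrative sketch only, not
 * part of this module; the example_worker() name, the on-stack registration,
 * and the flag variable are hypothetical):
 *
 *	static const bool allocations_expected = true;
 *
 *	static void example_worker(void)
 *	{
 *		struct registered_thread thread;
 *		u8 *buffer;
 *
 *		vdo_register_allocating_thread(&thread, &allocations_expected);
 *		if (vdo_allocate(4096, u8, __func__, &buffer) == VDO_SUCCESS)
 *			vdo_free(buffer);
 *		vdo_unregister_allocating_thread();
 *	}
 *
 * Threads that never register still get their allocations, but
 * vdo_allocate_memory() below wraps them in memalloc_noio_save() and
 * memalloc_noio_restore() so that memory reclaim cannot issue I/O on their
 * behalf.
 */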

/*
 * We track how much memory has been allocated and freed. When we unload the module, we log an
 * error if we have not freed all the memory that we allocated. Nearly all memory allocation and
 * freeing is done using this module.
 *
 * We do not use kernel functions like the kvasprintf() method, which allocate memory indirectly
 * using kmalloc.
 *
 * These data structures and methods are used to track the amount of memory used.
 */

/*
 * We allocate very few large objects, and allocation/deallocation isn't done in a
 * performance-critical stage for us, so a linked list should be fine.
 */
struct vmalloc_block_info {
	void *ptr;
	size_t size;
	struct vmalloc_block_info *next;
};

static struct {
	spinlock_t lock;
	size_t kmalloc_blocks;
	size_t kmalloc_bytes;
	size_t vmalloc_blocks;
	size_t vmalloc_bytes;
	size_t peak_bytes;
	struct vmalloc_block_info *vmalloc_list;
} memory_stats __cacheline_aligned;

static void update_peak_usage(void)
{
	size_t total_bytes = memory_stats.kmalloc_bytes + memory_stats.vmalloc_bytes;

	if (total_bytes > memory_stats.peak_bytes)
		memory_stats.peak_bytes = total_bytes;
}

static void add_kmalloc_block(size_t size)
{
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	memory_stats.kmalloc_blocks++;
	memory_stats.kmalloc_bytes += size;
	update_peak_usage();
	spin_unlock_irqrestore(&memory_stats.lock, flags);
}

static void remove_kmalloc_block(size_t size)
{
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	memory_stats.kmalloc_blocks--;
	memory_stats.kmalloc_bytes -= size;
	spin_unlock_irqrestore(&memory_stats.lock, flags);
}

static void add_vmalloc_block(struct vmalloc_block_info *block)
{
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	block->next = memory_stats.vmalloc_list;
	memory_stats.vmalloc_list = block;
	memory_stats.vmalloc_blocks++;
	memory_stats.vmalloc_bytes += block->size;
	update_peak_usage();
	spin_unlock_irqrestore(&memory_stats.lock, flags);
}

static void remove_vmalloc_block(void *ptr)
{
	struct vmalloc_block_info *block;
	struct vmalloc_block_info **block_ptr;
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	for (block_ptr = &memory_stats.vmalloc_list;
	     (block = *block_ptr) != NULL;
	     block_ptr = &block->next) {
		if (block->ptr == ptr) {
			*block_ptr = block->next;
			memory_stats.vmalloc_blocks--;
			memory_stats.vmalloc_bytes -= block->size;
			break;
		}
	}

	spin_unlock_irqrestore(&memory_stats.lock, flags);
	if (block != NULL)
		vdo_free(block);
	else
		vdo_log_info("attempting to remove ptr %px not found in vmalloc list", ptr);
}

/*
 * Determine whether allocating a memory block should use kmalloc or __vmalloc.
 *
 * vmalloc can allocate any integral number of pages.
 *
 * kmalloc can allocate any number of bytes up to a configured limit, which defaults to 8 megabytes
 * on some systems. kmalloc is especially good when memory is being both allocated and freed, and
 * it does this efficiently in a multi CPU environment.
 *
 * kmalloc usually rounds the size of the block up to the next power of two, so when the requested
 * block is bigger than PAGE_SIZE / 2 bytes, kmalloc will never give you less space than the
 * corresponding vmalloc allocation. Sometimes vmalloc will use less overhead than kmalloc.
 *
 * The advantages of kmalloc do not help out UDS or VDO, because we allocate all our memory up
 * front and do not free and reallocate it. Sometimes we have problems using kmalloc, because the
 * Linux memory page map can become so fragmented that kmalloc will not give us a 32KB chunk. We
 * have used vmalloc as a backup to kmalloc in the past, and a follow-up vmalloc of 32KB will work.
 * But there is no strong case to be made for using kmalloc over vmalloc for these size chunks.
 *
 * The kmalloc/vmalloc boundary is set at 4KB, and kmalloc gets the 4KB requests. There is no
 * strong reason for favoring either kmalloc or vmalloc for 4KB requests, except that tracking
 * vmalloc statistics uses a linked list implementation. Using a simple test, this choice of
 * boundary results in 132 vmalloc calls. Using vmalloc for requests of exactly 4KB results in an
 * additional 6374 vmalloc calls, which is much less efficient for tracking.
 *
 * @size: How many bytes to allocate
 */
static inline bool use_kmalloc(size_t size)
{
	return size <= PAGE_SIZE;
}
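
/*
 * For illustration only (a sketch assuming 4KB pages and a default alignment
 * below PAGE_SIZE; it is not compiled into the module):
 *
 *	use_kmalloc(PAGE_SIZE)      -> true;  the block is tracked with
 *	                              add_kmalloc_block(ksize(p))
 *	use_kmalloc(PAGE_SIZE + 1)  -> false; the block goes to __vmalloc and is
 *	                              tracked with block->size =
 *	                              PAGE_ALIGN(PAGE_SIZE + 1), i.e. two pages
 */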

/*
 * Allocate storage based on memory size and alignment, logging an error if the allocation fails.
 * The memory will be zeroed.
 *
 * @size: The size of an object
 * @align: The required alignment
 * @what: What is being allocated (for error logging)
 * @ptr: A pointer to hold the allocated memory
 *
 * Return: VDO_SUCCESS or an error code
 */
int vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr)
{
	/*
	 * The __GFP_RETRY_MAYFAIL flag means the VM implementation will retry memory reclaim
	 * procedures that have previously failed if there is some indication that progress has
	 * been made elsewhere. It can wait for other tasks to attempt high level approaches to
	 * freeing memory such as compaction (which removes fragmentation) and page-out. There is
	 * still a definite limit to the number of retries, but it is a larger limit than with
	 * __GFP_NORETRY. Allocations with this flag may fail, but only when there is genuinely
	 * little unused memory. While these allocations do not directly trigger the OOM killer,
	 * their failure indicates that the system is likely to need to use the OOM killer soon.
	 * The caller must handle failure, but can reasonably do so by failing a higher-level
	 * request, or completing it only in a much less efficient manner.
	 */
	const gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL;
	unsigned int noio_flags;
	bool allocations_restricted = !allocations_allowed();
	unsigned long start_time;
	void *p = NULL;

	if (unlikely(ptr == NULL))
		return -EINVAL;

	if (size == 0) {
		*((void **) ptr) = NULL;
		return VDO_SUCCESS;
	}

	if (allocations_restricted)
		noio_flags = memalloc_noio_save();

	start_time = jiffies;
	if (use_kmalloc(size) && (align < PAGE_SIZE)) {
		p = kmalloc(size, gfp_flags | __GFP_NOWARN);
		if (p == NULL) {
			/*
			 * It is possible for kmalloc to fail to allocate memory because there is
			 * no page available. A short sleep may allow the page reclaimer to
			 * free a page.
			 */
			fsleep(1000);
			p = kmalloc(size, gfp_flags);
		}

		if (p != NULL)
			add_kmalloc_block(ksize(p));
	} else {
		struct vmalloc_block_info *block;

		if (vdo_allocate(1, struct vmalloc_block_info, __func__, &block) == VDO_SUCCESS) {
			/*
			 * It is possible for __vmalloc to fail to allocate memory because there
			 * are no pages available. A short sleep may allow the page reclaimer
			 * to free enough pages for a small allocation.
			 *
			 * For larger allocations, the page_alloc code is racing against the page
			 * reclaimer. If the page reclaimer can stay ahead of page_alloc, the
			 * __vmalloc will succeed. But if page_alloc overtakes the page reclaimer,
			 * the allocation fails. It is possible that more retries will succeed.
			 */
			for (;;) {
				p = __vmalloc(size, gfp_flags | __GFP_NOWARN);
				if (p != NULL)
					break;

				if (jiffies_to_msecs(jiffies - start_time) > 1000) {
					/* Try one more time, logging a failure for this call. */
					p = __vmalloc(size, gfp_flags);
					break;
				}

				fsleep(1000);
			}

			if (p == NULL) {
				vdo_free(block);
			} else {
				block->ptr = p;
				block->size = PAGE_ALIGN(size);
				add_vmalloc_block(block);
			}
		}
	}

	if (allocations_restricted)
		memalloc_noio_restore(noio_flags);

	if (unlikely(p == NULL)) {
		vdo_log_error("Could not allocate %zu bytes for %s in %u msecs",
			      size, what, jiffies_to_msecs(jiffies - start_time));
		return -ENOMEM;
	}

	*((void **) ptr) = p;
	return VDO_SUCCESS;
}
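
/*
 * Example call through the vdo_allocate() macro from memory-alloc.h, as it is
 * used elsewhere in this file (an illustrative sketch only; the
 * example_make_table() name and the element count are hypothetical):
 *
 *	static int example_make_table(u64 **table_ptr)
 *	{
 *		int result = vdo_allocate(1024, u64, __func__, table_ptr);
 *
 *		if (result != VDO_SUCCESS)
 *			return result;	// failure already logged above
 *
 *		// On a 4KB-page system, 1024 * sizeof(u64) = 8KB exceeds
 *		// PAGE_SIZE, so this request takes the __vmalloc path and is
 *		// tracked in the vmalloc list.
 *		return VDO_SUCCESS;
 *	}
 */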

/*
 * Allocate storage based on memory size, failing immediately if the required memory is not
 * available. The memory will be zeroed.
 *
 * @size: The size of an object.
 * @what: What is being allocated (for error logging)
 *
 * Return: pointer to the allocated memory, or NULL if the required space is not available.
 */
void *vdo_allocate_memory_nowait(size_t size, const char *what __maybe_unused)
{
	void *p = kmalloc(size, GFP_NOWAIT | __GFP_ZERO);

	if (p != NULL)
		add_kmalloc_block(ksize(p));

	return p;
}

void vdo_free(void *ptr)
{
	if (ptr != NULL) {
		if (is_vmalloc_addr(ptr)) {
			remove_vmalloc_block(ptr);
			vfree(ptr);
		} else {
			remove_kmalloc_block(ksize(ptr));
			kfree(ptr);
		}
	}
}
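
/*
 * Example pairing of the nowait allocator with vdo_free() (an illustrative
 * sketch only; the example_atomic_alloc() name and the 256-byte size are
 * hypothetical):
 *
 *	static u8 *example_atomic_alloc(void)
 *	{
 *		u8 *buffer = vdo_allocate_memory_nowait(256, __func__);
 *
 *		// May be NULL: GFP_NOWAIT does not wait for reclaim, and this
 *		// path does not log failures.
 *		return buffer;
 *	}
 *
 * Whichever allocator produced a pointer, vdo_free() checks it with
 * is_vmalloc_addr() and undoes the matching kmalloc or vmalloc accounting.
 */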

/*
 * Reallocate dynamically allocated memory. There are no alignment guarantees for the reallocated
 * memory. If the new memory is larger than the old memory, the new space will be zeroed.
 *
 * @ptr: The memory to reallocate.
 * @old_size: The old size of the memory
 * @size: The new size to allocate
 * @what: What is being allocated (for error logging)
 * @new_ptr: A pointer to hold the reallocated pointer
 *
 * Return: VDO_SUCCESS or an error code
 */
int vdo_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *what,
			  void *new_ptr)
{
	int result;

	if (size == 0) {
		vdo_free(ptr);
		*(void **) new_ptr = NULL;
		return VDO_SUCCESS;
	}

	result = vdo_allocate(size, char, what, new_ptr);
	if (result != VDO_SUCCESS)
		return result;

	if (ptr != NULL) {
		if (old_size < size)
			size = old_size;

		memcpy(*((void **) new_ptr), ptr, size);
		vdo_free(ptr);
	}

	return VDO_SUCCESS;
}

int vdo_duplicate_string(const char *string, const char *what, char **new_string)
{
	int result;
	u8 *dup;

	result = vdo_allocate(strlen(string) + 1, u8, what, &dup);
	if (result != VDO_SUCCESS)
		return result;

	memcpy(dup, string, strlen(string) + 1);
	*new_string = dup;
	return VDO_SUCCESS;
}

void vdo_memory_init(void)
{
	spin_lock_init(&memory_stats.lock);
	vdo_initialize_thread_registry(&allocating_threads);
}

void vdo_memory_exit(void)
{
	VDO_ASSERT_LOG_ONLY(memory_stats.kmalloc_bytes == 0,
			    "kmalloc memory used (%zd bytes in %zd blocks) is returned to the kernel",
			    memory_stats.kmalloc_bytes, memory_stats.kmalloc_blocks);
	VDO_ASSERT_LOG_ONLY(memory_stats.vmalloc_bytes == 0,
			    "vmalloc memory used (%zd bytes in %zd blocks) is returned to the kernel",
			    memory_stats.vmalloc_bytes, memory_stats.vmalloc_blocks);
	vdo_log_debug("peak usage %zd bytes", memory_stats.peak_bytes);
}

void vdo_get_memory_stats(u64 *bytes_used, u64 *peak_bytes_used)
{
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	*bytes_used = memory_stats.kmalloc_bytes + memory_stats.vmalloc_bytes;
	*peak_bytes_used = memory_stats.peak_bytes;
	spin_unlock_irqrestore(&memory_stats.lock, flags);
}

/*
 * Report stats on any allocated memory that we're tracking. Not all allocation types are
 * guaranteed to be tracked in bytes (e.g., bios).
 */
void vdo_report_memory_usage(void)
{
	unsigned long flags;
	u64 kmalloc_blocks;
	u64 kmalloc_bytes;
	u64 vmalloc_blocks;
	u64 vmalloc_bytes;
	u64 peak_usage;
	u64 total_bytes;

	spin_lock_irqsave(&memory_stats.lock, flags);
	kmalloc_blocks = memory_stats.kmalloc_blocks;
	kmalloc_bytes = memory_stats.kmalloc_bytes;
	vmalloc_blocks = memory_stats.vmalloc_blocks;
	vmalloc_bytes = memory_stats.vmalloc_bytes;
	peak_usage = memory_stats.peak_bytes;
	spin_unlock_irqrestore(&memory_stats.lock, flags);
	total_bytes = kmalloc_bytes + vmalloc_bytes;
	vdo_log_info("current module memory tracking (actual allocation sizes, not requested):");
	vdo_log_info("  %llu bytes in %llu kmalloc blocks",
		     (unsigned long long) kmalloc_bytes,
		     (unsigned long long) kmalloc_blocks);
	vdo_log_info("  %llu bytes in %llu vmalloc blocks",
		     (unsigned long long) vmalloc_bytes,
		     (unsigned long long) vmalloc_blocks);
	vdo_log_info("  total %llu bytes, peak usage %llu bytes",
		     (unsigned long long) total_bytes, (unsigned long long) peak_usage);
}
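
/*
 * Example of consuming the statistics above (an illustrative sketch only; the
 * example_log_memory() name is hypothetical):
 *
 *	static void example_log_memory(void)
 *	{
 *		u64 bytes_used, peak_bytes_used;
 *
 *		vdo_get_memory_stats(&bytes_used, &peak_bytes_used);
 *		vdo_log_info("using %llu bytes (peak %llu)",
 *			     (unsigned long long) bytes_used,
 *			     (unsigned long long) peak_bytes_used);
 *		vdo_report_memory_usage();
 *	}
 */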