1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. 4 * Copyright (C) 2007 The Regents of the University of California. 5 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 6 * Written by Brian Behlendorf <behlendorf1@llnl.gov>. 7 * UCRL-CODE-235197 8 * 9 * This file is part of the SPL, Solaris Porting Layer. 10 * 11 * The SPL is free software; you can redistribute it and/or modify it 12 * under the terms of the GNU General Public License as published by the 13 * Free Software Foundation; either version 2 of the License, or (at your 14 * option) any later version. 15 * 16 * The SPL is distributed in the hope that it will be useful, but WITHOUT 17 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 18 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 19 * for more details. 20 * 21 * You should have received a copy of the GNU General Public License along 22 * with the SPL. If not, see <http://www.gnu.org/licenses/>. 23 */ 24 25 #include <sys/debug.h> 26 #include <sys/sysmacros.h> 27 #include <sys/kmem.h> 28 #include <sys/vmem.h> 29 30 /* 31 * As a general rule kmem_alloc() allocations should be small, preferably 32 * just a few pages since they must by physically contiguous. Therefore, a 33 * rate limited warning will be printed to the console for any kmem_alloc() 34 * which exceeds a reasonable threshold. 35 * 36 * The default warning threshold is set to sixteen pages but capped at 64K to 37 * accommodate systems using large pages. This value was selected to be small 38 * enough to ensure the largest allocations are quickly noticed and fixed. 39 * But large enough to avoid logging any warnings when a allocation size is 40 * larger than optimal but not a serious concern. Since this value is tunable, 41 * developers are encouraged to set it lower when testing so any new largish 42 * allocations are quickly caught. These warnings may be disabled by setting 43 * the threshold to zero. 44 */ 45 unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024); 46 module_param(spl_kmem_alloc_warn, uint, 0644); 47 MODULE_PARM_DESC(spl_kmem_alloc_warn, 48 "Warning threshold in bytes for a kmem_alloc()"); 49 EXPORT_SYMBOL(spl_kmem_alloc_warn); 50 51 /* 52 * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE. 53 * Allocations which are marginally smaller than this limit may succeed but 54 * should still be avoided due to the expense of locating a contiguous range 55 * of free pages. Therefore, a maximum kmem size with reasonable safely 56 * margin of 4x is set. Kmem_alloc() allocations larger than this maximum 57 * will quickly fail. Vmem_alloc() allocations less than or equal to this 58 * value will use kmalloc(), but shift to vmalloc() when exceeding this value. 59 */ 60 unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2); 61 module_param(spl_kmem_alloc_max, uint, 0644); 62 MODULE_PARM_DESC(spl_kmem_alloc_max, 63 "Maximum size in bytes for a kmem_alloc()"); 64 EXPORT_SYMBOL(spl_kmem_alloc_max); 65 66 int 67 kmem_debugging(void) 68 { 69 return (0); 70 } 71 EXPORT_SYMBOL(kmem_debugging); 72 73 char * 74 kmem_vasprintf(const char *fmt, va_list ap) 75 { 76 va_list aq; 77 char *ptr; 78 79 do { 80 va_copy(aq, ap); 81 ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq); 82 va_end(aq); 83 } while (ptr == NULL); 84 85 return (ptr); 86 } 87 EXPORT_SYMBOL(kmem_vasprintf); 88 89 char * 90 kmem_asprintf(const char *fmt, ...) 91 { 92 va_list ap; 93 char *ptr; 94 95 do { 96 va_start(ap, fmt); 97 ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap); 98 va_end(ap); 99 } while (ptr == NULL); 100 101 return (ptr); 102 } 103 EXPORT_SYMBOL(kmem_asprintf); 104 105 static char * 106 __strdup(const char *str, int flags) 107 { 108 char *ptr; 109 int n; 110 111 n = strlen(str); 112 ptr = kmalloc(n + 1, kmem_flags_convert(flags)); 113 if (ptr) 114 memcpy(ptr, str, n + 1); 115 116 return (ptr); 117 } 118 119 char * 120 kmem_strdup(const char *str) 121 { 122 return (__strdup(str, KM_SLEEP)); 123 } 124 EXPORT_SYMBOL(kmem_strdup); 125 126 void 127 kmem_strfree(char *str) 128 { 129 kfree(str); 130 } 131 EXPORT_SYMBOL(kmem_strfree); 132 133 void * 134 spl_kvmalloc(size_t size, gfp_t lflags) 135 { 136 /* 137 * GFP_KERNEL allocations can safely use kvmalloc which may 138 * improve performance by avoiding a) high latency caused by 139 * vmalloc's on-access allocation, b) performance loss due to 140 * MMU memory address mapping and c) vmalloc locking overhead. 141 * This has the side-effect that the slab statistics will 142 * incorrectly report this as a vmem allocation, but that is 143 * purely cosmetic. 144 */ 145 if ((lflags & GFP_KERNEL) == GFP_KERNEL) 146 return (kvmalloc(size, lflags)); 147 148 gfp_t kmalloc_lflags = lflags; 149 150 if (size > PAGE_SIZE) { 151 /* 152 * We need to set __GFP_NOWARN here since spl_kvmalloc is not 153 * only called by spl_kmem_alloc_impl but can be called 154 * directly with custom lflags, too. In that case 155 * kmem_flags_convert does not get called, which would 156 * implicitly set __GFP_NOWARN. 157 */ 158 kmalloc_lflags |= __GFP_NOWARN; 159 160 /* 161 * N.B. __GFP_RETRY_MAYFAIL is supported only for large 162 * e (>32kB) allocations. 163 * 164 * We have to override __GFP_RETRY_MAYFAIL by __GFP_NORETRY 165 * for !costly requests because there is no other way to tell 166 * the allocator that we want to fail rather than retry 167 * endlessly. 168 */ 169 if (!(kmalloc_lflags & __GFP_RETRY_MAYFAIL) || 170 (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { 171 kmalloc_lflags |= __GFP_NORETRY; 172 } 173 } 174 175 /* 176 * We first try kmalloc - even for big sizes - and fall back to 177 * spl_vmalloc if that fails. 178 * 179 * For non-__GFP-RECLAIM allocations we always stick to 180 * kmalloc_node, and fail when kmalloc is not successful (returns 181 * NULL). 182 * We cannot fall back to spl_vmalloc in this case because spl_vmalloc 183 * internally uses GPF_KERNEL allocations. 184 */ 185 void *ptr = kmalloc_node(size, kmalloc_lflags, NUMA_NO_NODE); 186 if (ptr || size <= PAGE_SIZE || 187 (lflags & __GFP_RECLAIM) != __GFP_RECLAIM) { 188 return (ptr); 189 } 190 191 return (spl_vmalloc(size, lflags | __GFP_HIGHMEM)); 192 } 193 194 /* 195 * General purpose unified implementation of kmem_alloc(). It is an 196 * amalgamation of Linux and Illumos allocator design. It should never be 197 * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains 198 * relatively portable. Consumers may only access this function through 199 * wrappers that enforce the common flags to ensure portability. 200 */ 201 inline void * 202 spl_kmem_alloc_impl(size_t size, int flags, int node) 203 { 204 gfp_t lflags = kmem_flags_convert(flags); 205 void *ptr; 206 207 /* 208 * Log abnormally large allocations and rate limit the console output. 209 * Allocations larger than spl_kmem_alloc_warn should be performed 210 * through the vmem_alloc()/vmem_zalloc() interfaces. 211 */ 212 if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) && 213 !(flags & KM_VMEM)) { 214 printk(KERN_WARNING 215 "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n" 216 "https://github.com/openzfs/zfs/issues/new\n", 217 (unsigned long)size, flags); 218 dump_stack(); 219 } 220 221 /* 222 * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used 223 * unlike kmem_alloc() with KM_SLEEP on Illumos. 224 */ 225 do { 226 /* 227 * Calling kmalloc_node() when the size >= spl_kmem_alloc_max 228 * is unsafe. This must fail for all for kmem_alloc() and 229 * kmem_zalloc() callers. 230 * 231 * For vmem_alloc() and vmem_zalloc() callers it is permissible 232 * to use spl_vmalloc(). However, in general use of 233 * spl_vmalloc() is strongly discouraged because a global lock 234 * must be acquired. Contention on this lock can significantly 235 * impact performance so frequently manipulating the virtual 236 * address space is strongly discouraged. 237 */ 238 if (size > spl_kmem_alloc_max) { 239 if (flags & KM_VMEM) { 240 ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM); 241 } else { 242 return (NULL); 243 } 244 } else { 245 /* 246 * We use kmalloc when doing kmem_alloc(KM_NOSLEEP), 247 * because kvmalloc/vmalloc may sleep. We also use 248 * kmalloc on systems with limited kernel VA space (e.g. 249 * 32-bit), which have HIGHMEM. Otherwise we use 250 * kvmalloc, which tries to get contiguous physical 251 * memory (fast, like kmalloc) and falls back on using 252 * virtual memory to stitch together pages (slow, like 253 * vmalloc). 254 */ 255 #ifdef CONFIG_HIGHMEM 256 if (flags & KM_VMEM) { 257 #else 258 if ((flags & KM_VMEM) || !(flags & KM_NOSLEEP)) { 259 #endif 260 ptr = spl_kvmalloc(size, lflags); 261 } else { 262 ptr = kmalloc_node(size, lflags, node); 263 } 264 } 265 266 if (likely(ptr) || (flags & KM_NOSLEEP)) 267 return (ptr); 268 269 /* 270 * Try hard to satisfy the allocation. However, when progress 271 * cannot be made, the allocation is allowed to fail. 272 */ 273 if ((lflags & GFP_KERNEL) == GFP_KERNEL) 274 lflags |= __GFP_RETRY_MAYFAIL; 275 276 /* 277 * Use cond_resched() instead of congestion_wait() to avoid 278 * deadlocking systems where there are no block devices. 279 */ 280 cond_resched(); 281 } while (1); 282 283 return (NULL); 284 } 285 286 inline void 287 spl_kmem_free_impl(const void *buf, size_t size) 288 { 289 if (is_vmalloc_addr(buf)) 290 vfree(buf); 291 else 292 kfree(buf); 293 } 294 295 /* 296 * Memory allocation and accounting for kmem_* * style allocations. When 297 * DEBUG_KMEM is enabled the total memory allocated will be tracked and 298 * any memory leaked will be reported during module unload. 299 * 300 * ./configure --enable-debug-kmem 301 */ 302 #ifdef DEBUG_KMEM 303 304 /* Shim layer memory accounting */ 305 #ifdef HAVE_ATOMIC64_T 306 atomic64_t kmem_alloc_used = ATOMIC64_INIT(0); 307 unsigned long long kmem_alloc_max = 0; 308 #else /* HAVE_ATOMIC64_T */ 309 atomic_t kmem_alloc_used = ATOMIC_INIT(0); 310 unsigned long long kmem_alloc_max = 0; 311 #endif /* HAVE_ATOMIC64_T */ 312 313 EXPORT_SYMBOL(kmem_alloc_used); 314 EXPORT_SYMBOL(kmem_alloc_max); 315 316 inline void * 317 spl_kmem_alloc_debug(size_t size, int flags, int node) 318 { 319 void *ptr; 320 321 ptr = spl_kmem_alloc_impl(size, flags, node); 322 if (ptr) { 323 kmem_alloc_used_add(size); 324 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) 325 kmem_alloc_max = kmem_alloc_used_read(); 326 } 327 328 return (ptr); 329 } 330 331 inline void 332 spl_kmem_free_debug(const void *ptr, size_t size) 333 { 334 kmem_alloc_used_sub(size); 335 spl_kmem_free_impl(ptr, size); 336 } 337 338 /* 339 * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked 340 * but also the location of every alloc and free. When the SPL module is 341 * unloaded a list of all leaked addresses and where they were allocated 342 * will be dumped to the console. Enabling this feature has a significant 343 * impact on performance but it makes finding memory leaks straight forward. 344 * 345 * Not surprisingly with debugging enabled the xmem_locks are very highly 346 * contended particularly on xfree(). If we want to run with this detailed 347 * debugging enabled for anything other than debugging we need to minimize 348 * the contention by moving to a lock per xmem_table entry model. 349 * 350 * ./configure --enable-debug-kmem-tracking 351 */ 352 #ifdef DEBUG_KMEM_TRACKING 353 354 #include <linux/hash.h> 355 #include <linux/ctype.h> 356 357 #define KMEM_HASH_BITS 10 358 #define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS) 359 360 typedef struct kmem_debug { 361 struct hlist_node kd_hlist; /* Hash node linkage */ 362 struct list_head kd_list; /* List of all allocations */ 363 void *kd_addr; /* Allocation pointer */ 364 size_t kd_size; /* Allocation size */ 365 const char *kd_func; /* Allocation function */ 366 int kd_line; /* Allocation line */ 367 } kmem_debug_t; 368 369 static spinlock_t kmem_lock; 370 static struct hlist_head kmem_table[KMEM_TABLE_SIZE]; 371 static struct list_head kmem_list; 372 373 static kmem_debug_t * 374 kmem_del_init(spinlock_t *lock, struct hlist_head *table, 375 int bits, const void *addr) 376 { 377 struct hlist_head *head; 378 struct hlist_node *node = NULL; 379 struct kmem_debug *p; 380 unsigned long flags; 381 382 spin_lock_irqsave(lock, flags); 383 384 head = &table[hash_ptr((void *)addr, bits)]; 385 hlist_for_each(node, head) { 386 p = list_entry(node, struct kmem_debug, kd_hlist); 387 if (p->kd_addr == addr) { 388 hlist_del_init(&p->kd_hlist); 389 list_del_init(&p->kd_list); 390 spin_unlock_irqrestore(lock, flags); 391 return (p); 392 } 393 } 394 395 spin_unlock_irqrestore(lock, flags); 396 397 return (NULL); 398 } 399 400 inline void * 401 spl_kmem_alloc_track(size_t size, int flags, 402 const char *func, int line, int node) 403 { 404 void *ptr = NULL; 405 kmem_debug_t *dptr; 406 unsigned long irq_flags; 407 408 dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags)); 409 if (dptr == NULL) 410 return (NULL); 411 412 dptr->kd_func = __strdup(func, flags); 413 if (dptr->kd_func == NULL) { 414 kfree(dptr); 415 return (NULL); 416 } 417 418 ptr = spl_kmem_alloc_debug(size, flags, node); 419 if (ptr == NULL) { 420 kfree(dptr->kd_func); 421 kfree(dptr); 422 return (NULL); 423 } 424 425 INIT_HLIST_NODE(&dptr->kd_hlist); 426 INIT_LIST_HEAD(&dptr->kd_list); 427 428 dptr->kd_addr = ptr; 429 dptr->kd_size = size; 430 dptr->kd_line = line; 431 432 spin_lock_irqsave(&kmem_lock, irq_flags); 433 hlist_add_head(&dptr->kd_hlist, 434 &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]); 435 list_add_tail(&dptr->kd_list, &kmem_list); 436 spin_unlock_irqrestore(&kmem_lock, irq_flags); 437 438 return (ptr); 439 } 440 441 inline void 442 spl_kmem_free_track(const void *ptr, size_t size) 443 { 444 kmem_debug_t *dptr; 445 446 /* Ignore NULL pointer since we haven't tracked it at all */ 447 if (ptr == NULL) 448 return; 449 450 /* Must exist in hash due to kmem_alloc() */ 451 dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr); 452 ASSERT3P(dptr, !=, NULL); 453 ASSERT3S(dptr->kd_size, ==, size); 454 455 kfree(dptr->kd_func); 456 kfree(dptr); 457 458 spl_kmem_free_debug(ptr, size); 459 } 460 #endif /* DEBUG_KMEM_TRACKING */ 461 #endif /* DEBUG_KMEM */ 462 463 /* 464 * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces. 465 */ 466 void * 467 spl_kmem_alloc(size_t size, int flags, const char *func, int line) 468 { 469 ASSERT0(flags & ~KM_PUBLIC_MASK); 470 471 #if !defined(DEBUG_KMEM) 472 return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); 473 #elif !defined(DEBUG_KMEM_TRACKING) 474 return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); 475 #else 476 return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); 477 #endif 478 } 479 EXPORT_SYMBOL(spl_kmem_alloc); 480 481 void * 482 spl_kmem_zalloc(size_t size, int flags, const char *func, int line) 483 { 484 ASSERT0(flags & ~KM_PUBLIC_MASK); 485 486 flags |= KM_ZERO; 487 488 #if !defined(DEBUG_KMEM) 489 return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); 490 #elif !defined(DEBUG_KMEM_TRACKING) 491 return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); 492 #else 493 return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); 494 #endif 495 } 496 EXPORT_SYMBOL(spl_kmem_zalloc); 497 498 void 499 spl_kmem_free(const void *buf, size_t size) 500 { 501 #if !defined(DEBUG_KMEM) 502 return (spl_kmem_free_impl(buf, size)); 503 #elif !defined(DEBUG_KMEM_TRACKING) 504 return (spl_kmem_free_debug(buf, size)); 505 #else 506 return (spl_kmem_free_track(buf, size)); 507 #endif 508 } 509 EXPORT_SYMBOL(spl_kmem_free); 510 511 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) 512 static char * 513 spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) 514 { 515 int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size; 516 int i, flag = 1; 517 518 ASSERT(str != NULL && len >= 17); 519 memset(str, 0, len); 520 521 /* 522 * Check for a fully printable string, and while we are at 523 * it place the printable characters in the passed buffer. 524 */ 525 for (i = 0; i < size; i++) { 526 str[i] = ((char *)(kd->kd_addr))[i]; 527 if (isprint(str[i])) { 528 continue; 529 } else { 530 /* 531 * Minimum number of printable characters found 532 * to make it worthwhile to print this as ascii. 533 */ 534 if (i > min) 535 break; 536 537 flag = 0; 538 break; 539 } 540 } 541 542 if (!flag) { 543 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", 544 *((uint8_t *)kd->kd_addr), 545 *((uint8_t *)kd->kd_addr + 2), 546 *((uint8_t *)kd->kd_addr + 4), 547 *((uint8_t *)kd->kd_addr + 6), 548 *((uint8_t *)kd->kd_addr + 8), 549 *((uint8_t *)kd->kd_addr + 10), 550 *((uint8_t *)kd->kd_addr + 12), 551 *((uint8_t *)kd->kd_addr + 14)); 552 } 553 554 return (str); 555 } 556 557 static int 558 spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size) 559 { 560 int i; 561 562 spin_lock_init(lock); 563 INIT_LIST_HEAD(list); 564 565 for (i = 0; i < size; i++) 566 INIT_HLIST_HEAD(&kmem_table[i]); 567 568 return (0); 569 } 570 571 static void 572 spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) 573 { 574 unsigned long flags; 575 kmem_debug_t *kd = NULL; 576 char str[17]; 577 578 spin_lock_irqsave(lock, flags); 579 if (!list_empty(list)) 580 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", 581 "size", "data", "func", "line"); 582 583 list_for_each_entry(kd, list, kd_list) { 584 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, 585 (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), 586 kd->kd_func, kd->kd_line); 587 } 588 589 spin_unlock_irqrestore(lock, flags); 590 } 591 #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ 592 593 int 594 spl_kmem_init(void) 595 { 596 597 #ifdef DEBUG_KMEM 598 kmem_alloc_used_set(0); 599 600 601 602 #ifdef DEBUG_KMEM_TRACKING 603 spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE); 604 #endif /* DEBUG_KMEM_TRACKING */ 605 #endif /* DEBUG_KMEM */ 606 607 return (0); 608 } 609 610 void 611 spl_kmem_fini(void) 612 { 613 #ifdef DEBUG_KMEM 614 /* 615 * Display all unreclaimed memory addresses, including the 616 * allocation size and the first few bytes of what's located 617 * at that address to aid in debugging. Performance is not 618 * a serious concern here since it is module unload time. 619 */ 620 if (kmem_alloc_used_read() != 0) 621 printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n", 622 (unsigned long)kmem_alloc_used_read(), kmem_alloc_max); 623 624 #ifdef DEBUG_KMEM_TRACKING 625 spl_kmem_fini_tracking(&kmem_list, &kmem_lock); 626 #endif /* DEBUG_KMEM_TRACKING */ 627 #endif /* DEBUG_KMEM */ 628 } 629