/*
 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
 * Copyright (C) 2007 The Regents of the University of California.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
 * UCRL-CODE-235197
 *
 * This file is part of the SPL, Solaris Porting Layer.
 *
 * The SPL is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 *
 * The SPL is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with the SPL.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <sys/debug.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/vmem.h>

/*
 * As a general rule kmem_alloc() allocations should be small, preferably
 * just a few pages, since they must be physically contiguous.  Therefore, a
 * rate limited warning will be printed to the console for any kmem_alloc()
 * which exceeds a reasonable threshold.
 *
 * The default warning threshold is set to sixteen pages but capped at 64K to
 * accommodate systems using large pages.  This value was selected to be small
 * enough to ensure the largest allocations are quickly noticed and fixed.
 * But large enough to avoid logging any warnings when an allocation size is
 * larger than optimal but not a serious concern.  Since this value is tunable,
 * developers are encouraged to set it lower when testing so any new largish
 * allocations are quickly caught.  These warnings may be disabled by setting
 * the threshold to zero.
 */
unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024);
module_param(spl_kmem_alloc_warn, uint, 0644);
MODULE_PARM_DESC(spl_kmem_alloc_warn,
	"Warning threshold in bytes for a kmem_alloc()");
EXPORT_SYMBOL(spl_kmem_alloc_warn);

/*
 * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
 * Allocations which are marginally smaller than this limit may succeed but
 * should still be avoided due to the expense of locating a contiguous range
 * of free pages.  Therefore, a maximum kmem size with a reasonable safety
 * margin of 4x is set.  Kmem_alloc() allocations larger than this maximum
 * will quickly fail.  Vmem_alloc() allocations less than or equal to this
 * value will use kmalloc(), but shift to vmalloc() when exceeding this value.
 */
unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
module_param(spl_kmem_alloc_max, uint, 0644);
MODULE_PARM_DESC(spl_kmem_alloc_max,
	"Maximum size in bytes for a kmem_alloc()");
EXPORT_SYMBOL(spl_kmem_alloc_max);

int
kmem_debugging(void)
{
	return (0);
}
EXPORT_SYMBOL(kmem_debugging);

char *
kmem_vasprintf(const char *fmt, va_list ap)
{
	va_list aq;
	char *ptr;

	do {
		va_copy(aq, ap);
		ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq);
		va_end(aq);
	} while (ptr == NULL);

	return (ptr);
}
EXPORT_SYMBOL(kmem_vasprintf);

char *
kmem_asprintf(const char *fmt, ...)
{
	va_list ap;
	char *ptr;

	do {
		va_start(ap, fmt);
		ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap);
		va_end(ap);
	} while (ptr == NULL);

	return (ptr);
}
EXPORT_SYMBOL(kmem_asprintf);
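/*
 * Illustrative usage of the printf-style helpers above (a usage sketch,
 * not code from this file): strings returned by kmem_vasprintf() and
 * kmem_asprintf() are allocated with KM_SLEEP semantics, never return NULL
 * thanks to the retry loops, and are expected to be released with
 * kmem_strfree().  For example (hypothetical values):
 *
 *	char *name = kmem_asprintf("%s-%d", "tank", 3);
 *	...
 *	kmem_strfree(name);
 */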
static char *
__strdup(const char *str, int flags)
{
	char *ptr;
	int n;

	n = strlen(str);
	ptr = kmalloc(n + 1, kmem_flags_convert(flags));
	if (ptr)
		memcpy(ptr, str, n + 1);

	return (ptr);
}

char *
kmem_strdup(const char *str)
{
	return (__strdup(str, KM_SLEEP));
}
EXPORT_SYMBOL(kmem_strdup);

void
kmem_strfree(char *str)
{
	kfree(str);
}
EXPORT_SYMBOL(kmem_strfree);

void *
spl_kvmalloc(size_t size, gfp_t lflags)
{
	/*
	 * GFP_KERNEL allocations can safely use kvmalloc which may
	 * improve performance by avoiding a) high latency caused by
	 * vmalloc's on-access allocation, b) performance loss due to
	 * MMU memory address mapping and c) vmalloc locking overhead.
	 * This has the side-effect that the slab statistics will
	 * incorrectly report this as a vmem allocation, but that is
	 * purely cosmetic.
	 */
	if ((lflags & GFP_KERNEL) == GFP_KERNEL)
		return (kvmalloc(size, lflags));

	gfp_t kmalloc_lflags = lflags;

	if (size > PAGE_SIZE) {
		/*
		 * We need to set __GFP_NOWARN here since spl_kvmalloc is not
		 * only called by spl_kmem_alloc_impl but can be called
		 * directly with custom lflags, too.  In that case
		 * kmem_flags_convert does not get called, which would
		 * implicitly set __GFP_NOWARN.
		 */
		kmalloc_lflags |= __GFP_NOWARN;

		/*
		 * N.B. __GFP_RETRY_MAYFAIL is supported only for large
		 * (>32kB) allocations.
		 *
		 * We have to override __GFP_RETRY_MAYFAIL by __GFP_NORETRY
		 * for !costly requests because there is no other way to tell
		 * the allocator that we want to fail rather than retry
		 * endlessly.
		 */
		if (!(kmalloc_lflags & __GFP_RETRY_MAYFAIL) ||
		    (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
			kmalloc_lflags |= __GFP_NORETRY;
		}
	}

	/*
	 * We first try kmalloc - even for big sizes - and fall back to
	 * spl_vmalloc if that fails.
	 *
	 * For non-__GFP_RECLAIM allocations we always stick to
	 * kmalloc_node, and fail when kmalloc is not successful (returns
	 * NULL).
	 * We cannot fall back to spl_vmalloc in this case because spl_vmalloc
	 * internally uses GFP_KERNEL allocations.
	 */
	void *ptr = kmalloc_node(size, kmalloc_lflags, NUMA_NO_NODE);
	if (ptr || size <= PAGE_SIZE ||
	    (lflags & __GFP_RECLAIM) != __GFP_RECLAIM) {
		return (ptr);
	}

	return (spl_vmalloc(size, lflags | __GFP_HIGHMEM));
}
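/*
 * A sketch of the fallback behavior implemented above (illustrative only):
 * a fully GFP_KERNEL request is handed straight to kvmalloc(), while other
 * flag combinations first attempt a physically contiguous kmalloc_node()
 * and, only when the request is reclaim-capable and larger than a page,
 * fall back to spl_vmalloc().  For example (hypothetical sizes):
 *
 *	buf = spl_kvmalloc(128 * 1024, GFP_KERNEL);  - may shift to vmalloc
 *	buf = spl_kvmalloc(128 * 1024, GFP_NOWAIT);  - kmalloc only, may be NULL
 */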
/*
 * General purpose unified implementation of kmem_alloc(). It is an
 * amalgamation of Linux and Illumos allocator design. It should never be
 * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
 * relatively portable.  Consumers may only access this function through
 * wrappers that enforce the common flags to ensure portability.
 */
inline void *
spl_kmem_alloc_impl(size_t size, int flags, int node)
{
	gfp_t lflags = kmem_flags_convert(flags);
	void *ptr;

	/*
	 * Log abnormally large allocations and rate limit the console output.
	 * Allocations larger than spl_kmem_alloc_warn should be performed
	 * through the vmem_alloc()/vmem_zalloc() interfaces.
	 */
	if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
	    !(flags & KM_VMEM)) {
		printk(KERN_WARNING
		    "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
		    "https://github.com/openzfs/zfs/issues/new\n",
		    (unsigned long)size, flags);
		dump_stack();
	}

	/*
	 * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used
	 * unlike kmem_alloc() with KM_SLEEP on Illumos.
	 */
	do {
		/*
		 * Calling kmalloc_node() when the size >= spl_kmem_alloc_max
		 * is unsafe.  This must fail for all kmem_alloc() and
		 * kmem_zalloc() callers.
		 *
		 * For vmem_alloc() and vmem_zalloc() callers it is permissible
		 * to use spl_vmalloc().  However, in general use of
		 * spl_vmalloc() is strongly discouraged because a global lock
		 * must be acquired.  Contention on this lock can significantly
		 * impact performance so frequently manipulating the virtual
		 * address space is strongly discouraged.
		 */
		if (size > spl_kmem_alloc_max) {
			if (flags & KM_VMEM) {
				ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
			} else {
				return (NULL);
			}
		} else {
			/*
			 * We use kmalloc when doing kmem_alloc(KM_NOSLEEP),
			 * because kvmalloc/vmalloc may sleep.  We also use
			 * kmalloc on systems with limited kernel VA space (e.g.
			 * 32-bit), which have HIGHMEM.  Otherwise we use
			 * kvmalloc, which tries to get contiguous physical
			 * memory (fast, like kmalloc) and falls back on using
			 * virtual memory to stitch together pages (slow, like
			 * vmalloc).
			 */
#ifdef CONFIG_HIGHMEM
			if (flags & KM_VMEM) {
#else
			if ((flags & KM_VMEM) || !(flags & KM_NOSLEEP)) {
#endif
				ptr = spl_kvmalloc(size, lflags);
			} else {
				ptr = kmalloc_node(size, lflags, node);
			}
		}

		if (likely(ptr) || (flags & KM_NOSLEEP))
			return (ptr);

		/*
		 * Try hard to satisfy the allocation. However, when progress
		 * cannot be made, the allocation is allowed to fail.
		 */
		if ((lflags & GFP_KERNEL) == GFP_KERNEL)
			lflags |= __GFP_RETRY_MAYFAIL;

		/*
		 * Use cond_resched() instead of congestion_wait() to avoid
		 * deadlocking systems where there are no block devices.
		 */
		cond_resched();
	} while (1);

	return (NULL);
}

inline void
spl_kmem_free_impl(const void *buf, size_t size)
{
	if (is_vmalloc_addr(buf))
		vfree(buf);
	else
		kfree(buf);
}
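/*
 * A usage sketch of how the limits above apply to callers (illustrative
 * only; the sizes assume a typical 4K-page system, where spl_kmem_alloc_max
 * is often about 1MB): kmem_alloc()/kmem_zalloc() requests larger than
 * spl_kmem_alloc_max fail outright, while vmem_alloc()/vmem_zalloc()
 * requests of the same size carry KM_VMEM and fall back to spl_vmalloc().
 *
 *	void *a = kmem_alloc(32, KM_SLEEP);                - kmalloc path
 *	void *b = vmem_alloc(8 * 1024 * 1024, KM_SLEEP);   - vmalloc fallback
 *	...
 *	kmem_free(a, 32);
 *	vmem_free(b, 8 * 1024 * 1024);
 */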
/*
 * Memory allocation and accounting for kmem_* style allocations.  When
 * DEBUG_KMEM is enabled the total memory allocated will be tracked and
 * any memory leaked will be reported during module unload.
 *
 * ./configure --enable-debug-kmem
 */
#ifdef DEBUG_KMEM

/* Shim layer memory accounting */
#ifdef HAVE_ATOMIC64_T
atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
unsigned long long kmem_alloc_max = 0;
#else /* HAVE_ATOMIC64_T */
atomic_t kmem_alloc_used = ATOMIC_INIT(0);
unsigned long long kmem_alloc_max = 0;
#endif /* HAVE_ATOMIC64_T */

EXPORT_SYMBOL(kmem_alloc_used);
EXPORT_SYMBOL(kmem_alloc_max);

inline void *
spl_kmem_alloc_debug(size_t size, int flags, int node)
{
	void *ptr;

	ptr = spl_kmem_alloc_impl(size, flags, node);
	if (ptr) {
		kmem_alloc_used_add(size);
		if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
			kmem_alloc_max = kmem_alloc_used_read();
	}

	return (ptr);
}

inline void
spl_kmem_free_debug(const void *ptr, size_t size)
{
	kmem_alloc_used_sub(size);
	spl_kmem_free_impl(ptr, size);
}

/*
 * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
 * but also the location of every alloc and free.  When the SPL module is
 * unloaded a list of all leaked addresses and where they were allocated
 * will be dumped to the console.  Enabling this feature has a significant
 * impact on performance but it makes finding memory leaks straightforward.
 *
 * Not surprisingly, with debugging enabled the xmem_locks are very highly
 * contended, particularly on xfree().  If we want to run with this detailed
 * debugging enabled for anything other than debugging we need to minimize
 * the contention by moving to a lock per xmem_table entry model.
 *
 * ./configure --enable-debug-kmem-tracking
 */
#ifdef DEBUG_KMEM_TRACKING

#include <linux/hash.h>
#include <linux/ctype.h>

#define	KMEM_HASH_BITS		10
#define	KMEM_TABLE_SIZE		(1 << KMEM_HASH_BITS)

typedef struct kmem_debug {
	struct hlist_node kd_hlist;	/* Hash node linkage */
	struct list_head kd_list;	/* List of all allocations */
	void *kd_addr;			/* Allocation pointer */
	size_t kd_size;			/* Allocation size */
	const char *kd_func;		/* Allocation function */
	int kd_line;			/* Allocation line */
} kmem_debug_t;

static spinlock_t kmem_lock;
static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
static struct list_head kmem_list;

static kmem_debug_t *
kmem_del_init(spinlock_t *lock, struct hlist_head *table,
    int bits, const void *addr)
{
	struct hlist_head *head;
	struct hlist_node *node = NULL;
	struct kmem_debug *p;
	unsigned long flags;

	spin_lock_irqsave(lock, flags);

	head = &table[hash_ptr((void *)addr, bits)];
	hlist_for_each(node, head) {
		p = list_entry(node, struct kmem_debug, kd_hlist);
		if (p->kd_addr == addr) {
			hlist_del_init(&p->kd_hlist);
			list_del_init(&p->kd_list);
			spin_unlock_irqrestore(lock, flags);
			return (p);
		}
	}

	spin_unlock_irqrestore(lock, flags);

	return (NULL);
}
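/*
 * Tracking flow implemented by the routines below (a summary, not extra
 * functionality): each tracked allocation creates a kmem_debug_t record
 * holding the address, size, and allocating function/line, inserts it into
 * kmem_table hashed by pointer (for fast lookup on free), and appends it to
 * kmem_list (so leaks can be walked at unload time).  A free looks the
 * record up with kmem_del_init(), asserts the recorded size matches, and
 * then releases both the record and the buffer.
 */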
inline void *
spl_kmem_alloc_track(size_t size, int flags,
    const char *func, int line, int node)
{
	void *ptr = NULL;
	kmem_debug_t *dptr;
	unsigned long irq_flags;

	dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
	if (dptr == NULL)
		return (NULL);

	dptr->kd_func = __strdup(func, flags);
	if (dptr->kd_func == NULL) {
		kfree(dptr);
		return (NULL);
	}

	ptr = spl_kmem_alloc_debug(size, flags, node);
	if (ptr == NULL) {
		kfree(dptr->kd_func);
		kfree(dptr);
		return (NULL);
	}

	INIT_HLIST_NODE(&dptr->kd_hlist);
	INIT_LIST_HEAD(&dptr->kd_list);

	dptr->kd_addr = ptr;
	dptr->kd_size = size;
	dptr->kd_line = line;

	spin_lock_irqsave(&kmem_lock, irq_flags);
	hlist_add_head(&dptr->kd_hlist,
	    &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
	list_add_tail(&dptr->kd_list, &kmem_list);
	spin_unlock_irqrestore(&kmem_lock, irq_flags);

	return (ptr);
}

inline void
spl_kmem_free_track(const void *ptr, size_t size)
{
	kmem_debug_t *dptr;

	/* Ignore NULL pointer since we haven't tracked it at all */
	if (ptr == NULL)
		return;

	/* Must exist in hash due to kmem_alloc() */
	dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
	ASSERT3P(dptr, !=, NULL);
	ASSERT3S(dptr->kd_size, ==, size);

	kfree(dptr->kd_func);
	kfree(dptr);

	spl_kmem_free_debug(ptr, size);
}
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */

/*
 * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
 */
void *
spl_kmem_alloc(size_t size, int flags, const char *func, int line)
{
	ASSERT0(flags & ~KM_PUBLIC_MASK);

#if !defined(DEBUG_KMEM)
	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
#else
	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
#endif
}
EXPORT_SYMBOL(spl_kmem_alloc);

void *
spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
{
	ASSERT0(flags & ~KM_PUBLIC_MASK);

	flags |= KM_ZERO;

#if !defined(DEBUG_KMEM)
	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
#else
	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
#endif
}
EXPORT_SYMBOL(spl_kmem_zalloc);

void
spl_kmem_free(const void *buf, size_t size)
{
#if !defined(DEBUG_KMEM)
	return (spl_kmem_free_impl(buf, size));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_free_debug(buf, size));
#else
	return (spl_kmem_free_track(buf, size));
#endif
}
EXPORT_SYMBOL(spl_kmem_free);
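/*
 * The helpers below are compiled only when both DEBUG_KMEM and
 * DEBUG_KMEM_TRACKING are enabled.  At module unload they print one line
 * per leaked allocation: its address, size, the first few bytes of its
 * contents (shown as ASCII when printable, otherwise as hex), and the
 * allocating function:line recorded by spl_kmem_alloc_track().
 */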
#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
static char *
spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
{
	int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
	int i, flag = 1;

	ASSERT(str != NULL && len >= 17);
	memset(str, 0, len);

	/*
	 * Check for a fully printable string, and while we are at
	 * it place the printable characters in the passed buffer.
	 */
	for (i = 0; i < size; i++) {
		str[i] = ((char *)(kd->kd_addr))[i];
		if (isprint(str[i])) {
			continue;
		} else {
			/*
			 * Minimum number of printable characters found
			 * to make it worthwhile to print this as ASCII.
			 */
			if (i > min)
				break;

			flag = 0;
			break;
		}
	}

	if (!flag) {
		sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
		    *((uint8_t *)kd->kd_addr),
		    *((uint8_t *)kd->kd_addr + 2),
		    *((uint8_t *)kd->kd_addr + 4),
		    *((uint8_t *)kd->kd_addr + 6),
		    *((uint8_t *)kd->kd_addr + 8),
		    *((uint8_t *)kd->kd_addr + 10),
		    *((uint8_t *)kd->kd_addr + 12),
		    *((uint8_t *)kd->kd_addr + 14));
	}

	return (str);
}

static int
spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
{
	int i;

	spin_lock_init(lock);
	INIT_LIST_HEAD(list);

	for (i = 0; i < size; i++)
		INIT_HLIST_HEAD(&kmem_table[i]);

	return (0);
}

static void
spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
{
	unsigned long flags;
	kmem_debug_t *kd = NULL;
	char str[17];

	spin_lock_irqsave(lock, flags);
	if (!list_empty(list))
		printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
		    "size", "data", "func", "line");

	list_for_each_entry(kd, list, kd_list) {
		printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
		    (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
		    kd->kd_func, kd->kd_line);
	}

	spin_unlock_irqrestore(lock, flags);
}
#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */

int
spl_kmem_init(void)
{
#ifdef DEBUG_KMEM
	kmem_alloc_used_set(0);

#ifdef DEBUG_KMEM_TRACKING
	spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */

	return (0);
}

void
spl_kmem_fini(void)
{
#ifdef DEBUG_KMEM
	/*
	 * Display all unreclaimed memory addresses, including the
	 * allocation size and the first few bytes of what's located
	 * at that address to aid in debugging.  Performance is not
	 * a serious concern here since it is module unload time.
	 */
	if (kmem_alloc_used_read() != 0)
		printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
		    (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);

#ifdef DEBUG_KMEM_TRACKING
	spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */
}