/*
 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
 * Copyright (C) 2007 The Regents of the University of California.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
 * UCRL-CODE-235197
 *
 * This file is part of the SPL, Solaris Porting Layer.
 *
 * The SPL is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 *
 * The SPL is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with the SPL.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <sys/debug.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/vmem.h>

/* BEGIN CSTYLED */
/*
 * As a general rule kmem_alloc() allocations should be small, preferably
 * just a few pages, since they must be physically contiguous.  Therefore, a
 * rate limited warning will be printed to the console for any kmem_alloc()
 * which exceeds a reasonable threshold.
 *
 * The default warning threshold is set to sixteen pages but capped at 64K to
 * accommodate systems using large pages.  This value was selected to be small
 * enough to ensure the largest allocations are quickly noticed and fixed,
 * but large enough to avoid logging any warnings when an allocation size is
 * larger than optimal but not a serious concern.  Since this value is tunable,
 * developers are encouraged to set it lower when testing so any new largish
 * allocations are quickly caught.  These warnings may be disabled by setting
 * the threshold to zero.
 */
unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024);
module_param(spl_kmem_alloc_warn, uint, 0644);
MODULE_PARM_DESC(spl_kmem_alloc_warn,
	"Warning threshold in bytes for a kmem_alloc()");
EXPORT_SYMBOL(spl_kmem_alloc_warn);

/*
 * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
 * Allocations which are marginally smaller than this limit may succeed but
 * should still be avoided due to the expense of locating a contiguous range
 * of free pages.  Therefore, a maximum kmem size with a reasonable safety
 * margin of 4x is set.  kmem_alloc() allocations larger than this maximum
 * will quickly fail.  vmem_alloc() allocations less than or equal to this
 * value will use kmalloc(), but shift to vmalloc() when exceeding this value.
 */
unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
module_param(spl_kmem_alloc_max, uint, 0644);
MODULE_PARM_DESC(spl_kmem_alloc_max,
	"Maximum size in bytes for a kmem_alloc()");
EXPORT_SYMBOL(spl_kmem_alloc_max);
/* END CSTYLED */

/* Illumos compatibility shim which always reports kmem debugging disabled */
int
kmem_debugging(void)
{
	return (0);
}
EXPORT_SYMBOL(kmem_debugging);

char *
kmem_vasprintf(const char *fmt, va_list ap)
{
	va_list aq;
	char *ptr;

	do {
		va_copy(aq, ap);
		ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq);
		va_end(aq);
	} while (ptr == NULL);

	return (ptr);
}
EXPORT_SYMBOL(kmem_vasprintf);

char *
kmem_asprintf(const char *fmt, ...)
{
	va_list ap;
	char *ptr;

	do {
		va_start(ap, fmt);
		ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap);
		va_end(ap);
	} while (ptr == NULL);

	return (ptr);
}
EXPORT_SYMBOL(kmem_asprintf);

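/*
 * Illustrative usage sketch (not compiled into the module): because
 * kmem_asprintf() retries with KM_SLEEP semantics it does not return NULL,
 * and the resulting string must be released with kmem_strfree().  The
 * function and variable names below are hypothetical.
 */
#if 0
static void
spl_example_asprintf(const char *pool, int id)
{
	char *msg = kmem_asprintf("%s-%d", pool, id);
	char *copy = kmem_strdup(msg);

	/* ... use msg and copy ... */

	kmem_strfree(copy);
	kmem_strfree(msg);
}
#endif
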
/* Duplicate a NUL-terminated string using the requested kmem flags */
static char *
__strdup(const char *str, int flags)
{
	char *ptr;
	int n;

	n = strlen(str);
	ptr = kmalloc(n + 1, kmem_flags_convert(flags));
	if (ptr)
		memcpy(ptr, str, n + 1);

	return (ptr);
}

char *
kmem_strdup(const char *str)
{
	return (__strdup(str, KM_SLEEP));
}
EXPORT_SYMBOL(kmem_strdup);

void
kmem_strfree(char *str)
{
	kfree(str);
}
EXPORT_SYMBOL(kmem_strfree);

void *
spl_kvmalloc(size_t size, gfp_t lflags)
{
#ifdef HAVE_KVMALLOC
	/*
	 * GFP_KERNEL allocations can safely use kvmalloc which may
	 * improve performance by avoiding a) high latency caused by
	 * vmalloc's on-access allocation, b) performance loss due to
	 * MMU memory address mapping and c) vmalloc locking overhead.
	 * This has the side-effect that the slab statistics will
	 * incorrectly report this as a vmem allocation, but that is
	 * purely cosmetic.
	 */
	if ((lflags & GFP_KERNEL) == GFP_KERNEL)
		return (kvmalloc(size, lflags));
#endif

	gfp_t kmalloc_lflags = lflags;

	if (size > PAGE_SIZE) {
		/*
		 * We need to set __GFP_NOWARN here since spl_kvmalloc is not
		 * only called by spl_kmem_alloc_impl but can be called
		 * directly with custom lflags, too.  In that case
		 * kmem_flags_convert does not get called, which would
		 * implicitly set __GFP_NOWARN.
		 */
		kmalloc_lflags |= __GFP_NOWARN;

		/*
		 * N.B. __GFP_RETRY_MAYFAIL is supported only for large
		 * (>32kB) allocations.
		 *
		 * We have to override __GFP_RETRY_MAYFAIL with __GFP_NORETRY
		 * for !costly requests because there is no other way to tell
		 * the allocator that we want to fail rather than retry
		 * endlessly.
		 */
		if (!(kmalloc_lflags & __GFP_RETRY_MAYFAIL) ||
		    (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
			kmalloc_lflags |= __GFP_NORETRY;
		}
	}

	/*
	 * We first try kmalloc - even for big sizes - and fall back to
	 * spl_vmalloc if that fails.
	 *
	 * For non-__GFP_RECLAIM allocations we always stick to
	 * kmalloc_node, and fail when kmalloc is not successful (returns
	 * NULL).
	 * We cannot fall back to spl_vmalloc in this case because spl_vmalloc
	 * internally uses GFP_KERNEL allocations.
	 */
	void *ptr = kmalloc_node(size, kmalloc_lflags, NUMA_NO_NODE);
	if (ptr || size <= PAGE_SIZE ||
	    (lflags & __GFP_RECLAIM) != __GFP_RECLAIM) {
		return (ptr);
	}

	return (spl_vmalloc(size, lflags | __GFP_HIGHMEM));
}

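/*
 * Illustrative sketch (not compiled into the module): a GFP_KERNEL request
 * through spl_kvmalloc() may be satisfied from either the slab or the
 * vmalloc arena, so the buffer should be released with a vmalloc-aware
 * free such as kvfree().  The helper name below is hypothetical.
 */
#if 0
static void
spl_example_kvmalloc(size_t size)
{
	void *buf = spl_kvmalloc(size, GFP_KERNEL);

	if (buf != NULL) {
		/* buf may be kmalloc or vmalloc backed; kvfree() handles both */
		kvfree(buf);
	}
}
#endif
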
/*
 * General purpose unified implementation of kmem_alloc().  It is an
 * amalgamation of Linux and Illumos allocator design.  It should never be
 * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
 * relatively portable.  Consumers may only access this function through
 * wrappers that enforce the common flags to ensure portability.
 */
inline void *
spl_kmem_alloc_impl(size_t size, int flags, int node)
{
	gfp_t lflags = kmem_flags_convert(flags);
	void *ptr;

	/*
	 * Log abnormally large allocations and rate limit the console output.
	 * Allocations larger than spl_kmem_alloc_warn should be performed
	 * through the vmem_alloc()/vmem_zalloc() interfaces.
	 */
	if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
	    !(flags & KM_VMEM)) {
		printk(KERN_WARNING
		    "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
		    "https://github.com/openzfs/zfs/issues/new\n",
		    (unsigned long)size, flags);
		dump_stack();
	}

	/*
	 * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used
	 * unlike kmem_alloc() with KM_SLEEP on Illumos.
	 */
	do {
		/*
		 * Calling kmalloc_node() when the size >= spl_kmem_alloc_max
		 * is unsafe.  This must fail for all kmem_alloc() and
		 * kmem_zalloc() callers.
		 *
		 * For vmem_alloc() and vmem_zalloc() callers it is permissible
		 * to use spl_vmalloc().  However, in general use of
		 * spl_vmalloc() is strongly discouraged because a global lock
		 * must be acquired.  Contention on this lock can significantly
		 * impact performance so frequently manipulating the virtual
		 * address space is strongly discouraged.
		 */
		if (size > spl_kmem_alloc_max) {
			if (flags & KM_VMEM) {
				ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
			} else {
				return (NULL);
			}
		} else {
			/*
			 * We use kmalloc when doing kmem_alloc(KM_NOSLEEP),
			 * because kvmalloc/vmalloc may sleep.  We also use
			 * kmalloc on systems with limited kernel VA space
			 * (e.g. 32-bit), which have HIGHMEM.  Otherwise we
			 * use kvmalloc, which tries to get contiguous
			 * physical memory (fast, like kmalloc) and falls back
			 * on using virtual memory to stitch together pages
			 * (slow, like vmalloc).
			 */
#ifdef CONFIG_HIGHMEM
			if (flags & KM_VMEM) {
#else
			if ((flags & KM_VMEM) || !(flags & KM_NOSLEEP)) {
#endif
				ptr = spl_kvmalloc(size, lflags);
			} else {
				ptr = kmalloc_node(size, lflags, node);
			}
		}

		if (likely(ptr) || (flags & KM_NOSLEEP))
			return (ptr);

		/*
		 * Try hard to satisfy the allocation.  However, when progress
		 * cannot be made, the allocation is allowed to fail.
		 */
		if ((lflags & GFP_KERNEL) == GFP_KERNEL)
			lflags |= __GFP_RETRY_MAYFAIL;

		/*
		 * Use cond_resched() instead of congestion_wait() to avoid
		 * deadlocking systems where there are no block devices.
		 */
		cond_resched();
	} while (1);

	return (NULL);
}

/*
 * Free memory obtained from spl_kmem_alloc_impl().  The buffer may be
 * backed by either kmalloc() or vmalloc(), so check before releasing it.
 */
inline void
spl_kmem_free_impl(const void *buf, size_t size)
{
	if (is_vmalloc_addr(buf))
		vfree(buf);
	else
		kfree(buf);
}

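/*
 * Illustrative consumer sketch (not compiled into the module): the portable
 * pattern is to pair kmem_alloc()/kmem_zalloc() with a kmem_free() passing
 * the same size, and to use vmem_alloc()/vmem_free() for buffers which may
 * exceed spl_kmem_alloc_max.  The structure and helper names below are
 * hypothetical.
 */
#if 0
typedef struct spl_example {
	int	ex_count;
	char	ex_name[32];
} spl_example_t;

static spl_example_t *
spl_example_create(void)
{
	/* Small, physically contiguous, zero-filled allocation */
	return (kmem_zalloc(sizeof (spl_example_t), KM_SLEEP));
}

static void
spl_example_destroy(spl_example_t *ex)
{
	/* The original allocation size must be passed back to kmem_free() */
	kmem_free(ex, sizeof (spl_example_t));
}
#endif
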
/*
 * Memory allocation and accounting for kmem_* style allocations.  When
 * DEBUG_KMEM is enabled the total memory allocated will be tracked and
 * any memory leaked will be reported during module unload.
 *
 * ./configure --enable-debug-kmem
 */
#ifdef DEBUG_KMEM

/* Shim layer memory accounting */
#ifdef HAVE_ATOMIC64_T
atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
unsigned long long kmem_alloc_max = 0;
#else  /* HAVE_ATOMIC64_T */
atomic_t kmem_alloc_used = ATOMIC_INIT(0);
unsigned long long kmem_alloc_max = 0;
#endif /* HAVE_ATOMIC64_T */

EXPORT_SYMBOL(kmem_alloc_used);
EXPORT_SYMBOL(kmem_alloc_max);

inline void *
spl_kmem_alloc_debug(size_t size, int flags, int node)
{
	void *ptr;

	ptr = spl_kmem_alloc_impl(size, flags, node);
	if (ptr) {
		kmem_alloc_used_add(size);
		if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
			kmem_alloc_max = kmem_alloc_used_read();
	}

	return (ptr);
}

inline void
spl_kmem_free_debug(const void *ptr, size_t size)
{
	kmem_alloc_used_sub(size);
	spl_kmem_free_impl(ptr, size);
}

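/*
 * Illustrative sketch (not compiled into the module): with DEBUG_KMEM
 * enabled the running byte count and high-water mark can be sampled, for
 * example to verify that a test path released everything it allocated.
 * The helper name below is hypothetical.
 */
#if 0
static void
spl_example_usage_check(uint64_t before)
{
	/* Outstanding bytes should return to their prior level */
	ASSERT3U(kmem_alloc_used_read(), ==, before);
	ASSERT3U(kmem_alloc_max, >=, kmem_alloc_used_read());
}
#endif
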
/*
 * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
 * but also the location of every alloc and free.  When the SPL module is
 * unloaded a list of all leaked addresses and where they were allocated
 * will be dumped to the console.  Enabling this feature has a significant
 * impact on performance but it makes finding memory leaks straightforward.
 *
 * Not surprisingly with debugging enabled the xmem_locks are very highly
 * contended particularly on xfree().  If we want to run with this detailed
 * debugging enabled for anything other than debugging we need to minimize
 * the contention by moving to a lock per xmem_table entry model.
 *
 * ./configure --enable-debug-kmem-tracking
 */
#ifdef DEBUG_KMEM_TRACKING

#include <linux/hash.h>
#include <linux/ctype.h>

#define	KMEM_HASH_BITS		10
#define	KMEM_TABLE_SIZE		(1 << KMEM_HASH_BITS)

typedef struct kmem_debug {
	struct hlist_node kd_hlist;	/* Hash node linkage */
	struct list_head kd_list;	/* List of all allocations */
	void *kd_addr;			/* Allocation pointer */
	size_t kd_size;			/* Allocation size */
	const char *kd_func;		/* Allocation function */
	int kd_line;			/* Allocation line */
} kmem_debug_t;

static spinlock_t kmem_lock;
static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
static struct list_head kmem_list;

/*
 * Remove the tracking record for 'addr' from the hash table and the global
 * allocation list, returning it to the caller for release.
 */
static kmem_debug_t *
kmem_del_init(spinlock_t *lock, struct hlist_head *table,
    int bits, const void *addr)
{
	struct hlist_head *head;
	struct hlist_node *node = NULL;
	struct kmem_debug *p;
	unsigned long flags;

	spin_lock_irqsave(lock, flags);

	head = &table[hash_ptr((void *)addr, bits)];
	hlist_for_each(node, head) {
		p = list_entry(node, struct kmem_debug, kd_hlist);
		if (p->kd_addr == addr) {
			hlist_del_init(&p->kd_hlist);
			list_del_init(&p->kd_list);
			spin_unlock_irqrestore(lock, flags);
			return (p);
		}
	}

	spin_unlock_irqrestore(lock, flags);

	return (NULL);
}

inline void *
spl_kmem_alloc_track(size_t size, int flags,
    const char *func, int line, int node)
{
	void *ptr = NULL;
	kmem_debug_t *dptr;
	unsigned long irq_flags;

	dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
	if (dptr == NULL)
		return (NULL);

	dptr->kd_func = __strdup(func, flags);
	if (dptr->kd_func == NULL) {
		kfree(dptr);
		return (NULL);
	}

	ptr = spl_kmem_alloc_debug(size, flags, node);
	if (ptr == NULL) {
		kfree(dptr->kd_func);
		kfree(dptr);
		return (NULL);
	}

	INIT_HLIST_NODE(&dptr->kd_hlist);
	INIT_LIST_HEAD(&dptr->kd_list);

	dptr->kd_addr = ptr;
	dptr->kd_size = size;
	dptr->kd_line = line;

	spin_lock_irqsave(&kmem_lock, irq_flags);
	hlist_add_head(&dptr->kd_hlist,
	    &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
	list_add_tail(&dptr->kd_list, &kmem_list);
	spin_unlock_irqrestore(&kmem_lock, irq_flags);

	return (ptr);
}

inline void
spl_kmem_free_track(const void *ptr, size_t size)
{
	kmem_debug_t *dptr;

	/* Ignore NULL pointer since we haven't tracked it at all */
	if (ptr == NULL)
		return;

	/* Must exist in hash due to kmem_alloc() */
	dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
	ASSERT3P(dptr, !=, NULL);
	ASSERT3S(dptr->kd_size, ==, size);

	kfree(dptr->kd_func);
	kfree(dptr);

	spl_kmem_free_debug(ptr, size);
}
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */

/*
 * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
 */
void *
spl_kmem_alloc(size_t size, int flags, const char *func, int line)
{
	ASSERT0(flags & ~KM_PUBLIC_MASK);

#if !defined(DEBUG_KMEM)
	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
#else
	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
#endif
}
EXPORT_SYMBOL(spl_kmem_alloc);

void *
spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
{
	ASSERT0(flags & ~KM_PUBLIC_MASK);

	flags |= KM_ZERO;

#if !defined(DEBUG_KMEM)
	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
#else
	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
#endif
}
EXPORT_SYMBOL(spl_kmem_zalloc);

void
spl_kmem_free(const void *buf, size_t size)
{
#if !defined(DEBUG_KMEM)
	return (spl_kmem_free_impl(buf, size));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_free_debug(buf, size));
#else
	return (spl_kmem_free_track(buf, size));
#endif
}
EXPORT_SYMBOL(spl_kmem_free);

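/*
 * Illustrative sketch (not compiled into the module): KM_SLEEP callers may
 * assume success for reasonably sized requests, while KM_NOSLEEP callers
 * must handle NULL because the allocation is attempted only once.  The
 * helper name and error handling below are hypothetical.
 */
#if 0
static int
spl_example_nosleep(size_t size, void **bufp)
{
	void *buf = kmem_alloc(size, KM_NOSLEEP);

	if (buf == NULL)
		return (ENOMEM);

	*bufp = buf;
	return (0);
}
#endif
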
#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
static char *
spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
{
	int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
	int i, flag = 1;

	ASSERT(str != NULL && len >= 17);
	memset(str, 0, len);

	/*
	 * Check for a fully printable string, and while we are at
	 * it place the printable characters in the passed buffer.
	 */
	for (i = 0; i < size; i++) {
		str[i] = ((char *)(kd->kd_addr))[i];
		if (isprint(str[i])) {
			continue;
		} else {
			/*
			 * Minimum number of printable characters found
			 * to make it worthwhile to print this as ascii.
			 */
			if (i > min)
				break;

			flag = 0;
			break;
		}
	}

	if (!flag) {
		sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
		    *((uint8_t *)kd->kd_addr),
		    *((uint8_t *)kd->kd_addr + 2),
		    *((uint8_t *)kd->kd_addr + 4),
		    *((uint8_t *)kd->kd_addr + 6),
		    *((uint8_t *)kd->kd_addr + 8),
		    *((uint8_t *)kd->kd_addr + 10),
		    *((uint8_t *)kd->kd_addr + 12),
		    *((uint8_t *)kd->kd_addr + 14));
	}

	return (str);
}

static int
spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
{
	int i;

	spin_lock_init(lock);
	INIT_LIST_HEAD(list);

	for (i = 0; i < size; i++)
		INIT_HLIST_HEAD(&kmem_table[i]);

	return (0);
}

static void
spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
{
	unsigned long flags;
	kmem_debug_t *kd = NULL;
	char str[17];

	spin_lock_irqsave(lock, flags);
	if (!list_empty(list))
		printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
		    "size", "data", "func", "line");

	list_for_each_entry(kd, list, kd_list) {
		printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
		    (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
		    kd->kd_func, kd->kd_line);
	}

	spin_unlock_irqrestore(lock, flags);
}
#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */

int
spl_kmem_init(void)
{
#ifdef DEBUG_KMEM
	kmem_alloc_used_set(0);

#ifdef DEBUG_KMEM_TRACKING
	spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */

	return (0);
}

void
spl_kmem_fini(void)
{
#ifdef DEBUG_KMEM
	/*
	 * Display all unreclaimed memory addresses, including the
	 * allocation size and the first few bytes of what's located
	 * at that address to aid in debugging.  Performance is not
	 * a serious concern here since it is module unload time.
	 */
	if (kmem_alloc_used_read() != 0)
		printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
		    (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);

#ifdef DEBUG_KMEM_TRACKING
	spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */
}