/*
 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
 * Copyright (C) 2007 The Regents of the University of California.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
 * UCRL-CODE-235197
 *
 * This file is part of the SPL, Solaris Porting Layer.
 *
 * The SPL is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 *
 * The SPL is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with the SPL. If not, see <http://www.gnu.org/licenses/>.
 */

#include <sys/debug.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/vmem.h>

/* BEGIN CSTYLED */
/*
 * As a general rule kmem_alloc() allocations should be small, preferably
 * just a few pages, since they must be physically contiguous. Therefore,
 * a rate-limited warning will be printed to the console for any
 * kmem_alloc() which exceeds a reasonable threshold.
 *
 * The default warning threshold is set to sixteen pages but capped at 64K
 * to accommodate systems using large pages. This value was selected to be
 * small enough to ensure the largest allocations are quickly noticed and
 * fixed, but large enough to avoid logging any warnings when an allocation
 * size is larger than optimal but not a serious concern. Since this value
 * is tunable, developers are encouraged to set it lower when testing so
 * any new largish allocations are quickly caught. These warnings may be
 * disabled by setting the threshold to zero.
 */
unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024);
module_param(spl_kmem_alloc_warn, uint, 0644);
MODULE_PARM_DESC(spl_kmem_alloc_warn,
	"Warning threshold in bytes for a kmem_alloc()");
EXPORT_SYMBOL(spl_kmem_alloc_warn);

/*
 * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
 * Allocations which are marginally smaller than this limit may succeed but
 * should still be avoided due to the expense of locating a contiguous range
 * of free pages. Therefore, a maximum kmem size with a reasonable safety
 * margin of 4x is set. kmem_alloc() allocations larger than this maximum
 * will quickly fail. vmem_alloc() allocations less than or equal to this
 * value will use kmalloc(), but shift to vmalloc() when exceeding this value.
 */
unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
module_param(spl_kmem_alloc_max, uint, 0644);
MODULE_PARM_DESC(spl_kmem_alloc_max,
	"Maximum size in bytes for a kmem_alloc()");
EXPORT_SYMBOL(spl_kmem_alloc_max);
/* END CSTYLED */

int
kmem_debugging(void)
{
	return (0);
}
EXPORT_SYMBOL(kmem_debugging);

char *
kmem_vasprintf(const char *fmt, va_list ap)
{
	va_list aq;
	char *ptr;

	do {
		va_copy(aq, ap);
		ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq);
		va_end(aq);
	} while (ptr == NULL);

	return (ptr);
}
EXPORT_SYMBOL(kmem_vasprintf);

char *
kmem_asprintf(const char *fmt, ...)
{
	va_list ap;
	char *ptr;

	do {
		va_start(ap, fmt);
		ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap);
		va_end(ap);
	} while (ptr == NULL);

	return (ptr);
}
EXPORT_SYMBOL(kmem_asprintf);

static char *
__strdup(const char *str, int flags)
{
	char *ptr;
	int n;

	n = strlen(str);
	ptr = kmalloc(n + 1, kmem_flags_convert(flags));
	if (ptr)
		memcpy(ptr, str, n + 1);

	return (ptr);
}

char *
kmem_strdup(const char *str)
{
	return (__strdup(str, KM_SLEEP));
}
EXPORT_SYMBOL(kmem_strdup);

void
kmem_strfree(char *str)
{
	kfree(str);
}
EXPORT_SYMBOL(kmem_strfree);

void *
spl_kvmalloc(size_t size, gfp_t lflags)
{
	/*
	 * GFP_KERNEL allocations can safely use kvmalloc which may
	 * improve performance by avoiding a) high latency caused by
	 * vmalloc's on-access allocation, b) performance loss due to
	 * MMU memory address mapping and c) vmalloc locking overhead.
	 * This has the side-effect that the slab statistics will
	 * incorrectly report this as a vmem allocation, but that is
	 * purely cosmetic.
	 */
	if ((lflags & GFP_KERNEL) == GFP_KERNEL)
		return (kvmalloc(size, lflags));

	gfp_t kmalloc_lflags = lflags;

	if (size > PAGE_SIZE) {
		/*
		 * We need to set __GFP_NOWARN here since spl_kvmalloc is not
		 * only called by spl_kmem_alloc_impl but can be called
		 * directly with custom lflags, too. In that case
		 * kmem_flags_convert does not get called, which would
		 * implicitly set __GFP_NOWARN.
		 */
		kmalloc_lflags |= __GFP_NOWARN;

		/*
		 * N.B. __GFP_RETRY_MAYFAIL is supported only for large
		 * (>32kB) allocations.
		 *
		 * We have to override __GFP_RETRY_MAYFAIL with __GFP_NORETRY
		 * for !costly requests because there is no other way to tell
		 * the allocator that we want to fail rather than retry
		 * endlessly.
		 */
		if (!(kmalloc_lflags & __GFP_RETRY_MAYFAIL) ||
		    (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
			kmalloc_lflags |= __GFP_NORETRY;
		}
	}

	/*
	 * We first try kmalloc - even for big sizes - and fall back to
	 * spl_vmalloc if that fails.
	 *
	 * For non-__GFP_RECLAIM allocations we always stick to
	 * kmalloc_node, and fail when kmalloc is not successful (returns
	 * NULL).
	 * We cannot fall back to spl_vmalloc in this case because spl_vmalloc
	 * internally uses GFP_KERNEL allocations.
	 */
	void *ptr = kmalloc_node(size, kmalloc_lflags, NUMA_NO_NODE);
	if (ptr || size <= PAGE_SIZE ||
	    (lflags & __GFP_RECLAIM) != __GFP_RECLAIM) {
		return (ptr);
	}

	return (spl_vmalloc(size, lflags | __GFP_HIGHMEM));
}

/*
 * General purpose unified implementation of kmem_alloc(). It is an
 * amalgamation of Linux and Illumos allocator design. It should never be
 * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
 * relatively portable. Consumers may only access this function through
 * wrappers that enforce the common flags to ensure portability.
 */
inline void *
spl_kmem_alloc_impl(size_t size, int flags, int node)
{
	gfp_t lflags = kmem_flags_convert(flags);
	void *ptr;

	/*
	 * Log abnormally large allocations and rate limit the console output.
	 * Allocations larger than spl_kmem_alloc_warn should be performed
	 * through the vmem_alloc()/vmem_zalloc() interfaces.
	 */
	if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
	    !(flags & KM_VMEM)) {
		printk(KERN_WARNING
		    "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
		    "https://github.com/openzfs/zfs/issues/new\n",
		    (unsigned long)size, flags);
		dump_stack();
	}

	/*
	 * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used
	 * unlike kmem_alloc() with KM_SLEEP on Illumos.
	 */
	do {
		/*
		 * Calling kmalloc_node() when the size >= spl_kmem_alloc_max
		 * is unsafe. This must fail for all kmem_alloc() and
		 * kmem_zalloc() callers.
		 *
		 * For vmem_alloc() and vmem_zalloc() callers it is permissible
		 * to use spl_vmalloc(). However, in general use of
		 * spl_vmalloc() is strongly discouraged because a global lock
		 * must be acquired. Contention on this lock can significantly
		 * impact performance so frequently manipulating the virtual
		 * address space is strongly discouraged.
		 */
		if (size > spl_kmem_alloc_max) {
			if (flags & KM_VMEM) {
				ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
			} else {
				return (NULL);
			}
		} else {
			/*
			 * We use kmalloc when doing kmem_alloc(KM_NOSLEEP),
			 * because kvmalloc/vmalloc may sleep. We also use
			 * kmalloc on systems with limited kernel VA space
			 * (e.g. 32-bit), which have HIGHMEM. Otherwise we use
			 * kvmalloc, which tries to get contiguous physical
			 * memory (fast, like kmalloc) and falls back on using
			 * virtual memory to stitch together pages (slow, like
			 * vmalloc).
			 */
#ifdef CONFIG_HIGHMEM
			if (flags & KM_VMEM) {
#else
			if ((flags & KM_VMEM) || !(flags & KM_NOSLEEP)) {
#endif
				ptr = spl_kvmalloc(size, lflags);
			} else {
				ptr = kmalloc_node(size, lflags, node);
			}
		}

		if (likely(ptr) || (flags & KM_NOSLEEP))
			return (ptr);

		/*
		 * Try hard to satisfy the allocation. However, when progress
		 * cannot be made, the allocation is allowed to fail.
		 */
		if ((lflags & GFP_KERNEL) == GFP_KERNEL)
			lflags |= __GFP_RETRY_MAYFAIL;

		/*
		 * Use cond_resched() instead of congestion_wait() to avoid
		 * deadlocking systems where there are no block devices.
		 */
		cond_resched();
	} while (1);

	return (NULL);
}

inline void
spl_kmem_free_impl(const void *buf, size_t size)
{
	if (is_vmalloc_addr(buf))
		vfree(buf);
	else
		kfree(buf);
}

/*
 * Memory allocation and accounting for kmem_* style allocations. When
 * DEBUG_KMEM is enabled the total memory allocated will be tracked and
 * any memory leaked will be reported during module unload.
 *
 * ./configure --enable-debug-kmem
 */
#ifdef DEBUG_KMEM

/* Shim layer memory accounting */
#ifdef HAVE_ATOMIC64_T
atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
unsigned long long kmem_alloc_max = 0;
#else /* HAVE_ATOMIC64_T */
atomic_t kmem_alloc_used = ATOMIC_INIT(0);
unsigned long long kmem_alloc_max = 0;
#endif /* HAVE_ATOMIC64_T */

EXPORT_SYMBOL(kmem_alloc_used);
EXPORT_SYMBOL(kmem_alloc_max);

inline void *
spl_kmem_alloc_debug(size_t size, int flags, int node)
{
	void *ptr;

	ptr = spl_kmem_alloc_impl(size, flags, node);
	if (ptr) {
		kmem_alloc_used_add(size);
		if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
			kmem_alloc_max = kmem_alloc_used_read();
	}

	return (ptr);
}

inline void
spl_kmem_free_debug(const void *ptr, size_t size)
{
	kmem_alloc_used_sub(size);
	spl_kmem_free_impl(ptr, size);
}

/*
 * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
 * but also the location of every alloc and free. When the SPL module is
 * unloaded a list of all leaked addresses and where they were allocated
 * will be dumped to the console. Enabling this feature has a significant
 * impact on performance but it makes finding memory leaks straightforward.
 *
 * Not surprisingly, with debugging enabled the xmem_locks are very highly
 * contended, particularly on xfree(). If we want to run with this detailed
 * debugging enabled for anything other than debugging we need to minimize
 * the contention by moving to a lock per xmem_table entry model.
 *
 * ./configure --enable-debug-kmem-tracking
 */
#ifdef DEBUG_KMEM_TRACKING

#include <linux/hash.h>
#include <linux/ctype.h>

#define	KMEM_HASH_BITS		10
#define	KMEM_TABLE_SIZE		(1 << KMEM_HASH_BITS)

typedef struct kmem_debug {
	struct hlist_node kd_hlist;	/* Hash node linkage */
	struct list_head kd_list;	/* List of all allocations */
	void *kd_addr;			/* Allocation pointer */
	size_t kd_size;			/* Allocation size */
	const char *kd_func;		/* Allocation function */
	int kd_line;			/* Allocation line */
} kmem_debug_t;

static spinlock_t kmem_lock;
static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
static struct list_head kmem_list;

static kmem_debug_t *
kmem_del_init(spinlock_t *lock, struct hlist_head *table,
    int bits, const void *addr)
{
	struct hlist_head *head;
	struct hlist_node *node = NULL;
	struct kmem_debug *p;
	unsigned long flags;

	spin_lock_irqsave(lock, flags);

	head = &table[hash_ptr((void *)addr, bits)];
	hlist_for_each(node, head) {
		p = list_entry(node, struct kmem_debug, kd_hlist);
		if (p->kd_addr == addr) {
			hlist_del_init(&p->kd_hlist);
			list_del_init(&p->kd_list);
			spin_unlock_irqrestore(lock, flags);
			return (p);
		}
	}

	spin_unlock_irqrestore(lock, flags);

	return (NULL);
}

inline void *
spl_kmem_alloc_track(size_t size, int flags,
    const char *func, int line, int node)
{
	void *ptr = NULL;
	kmem_debug_t *dptr;
	unsigned long irq_flags;

	dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
	if (dptr == NULL)
		return (NULL);

	dptr->kd_func = __strdup(func, flags);
	if (dptr->kd_func == NULL) {
		kfree(dptr);
		return (NULL);
	}

	ptr = spl_kmem_alloc_debug(size, flags, node);
	if (ptr == NULL) {
		kfree(dptr->kd_func);
		kfree(dptr);
		return (NULL);
	}

	INIT_HLIST_NODE(&dptr->kd_hlist);
	INIT_LIST_HEAD(&dptr->kd_list);

	dptr->kd_addr = ptr;
	dptr->kd_size = size;
	dptr->kd_line = line;

	spin_lock_irqsave(&kmem_lock, irq_flags);
	hlist_add_head(&dptr->kd_hlist,
	    &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
	list_add_tail(&dptr->kd_list, &kmem_list);
	spin_unlock_irqrestore(&kmem_lock, irq_flags);

	return (ptr);
}

inline void
spl_kmem_free_track(const void *ptr, size_t size)
{
	kmem_debug_t *dptr;

	/* Ignore NULL pointer since we haven't tracked it at all */
	if (ptr == NULL)
		return;

	/* Must exist in hash due to kmem_alloc() */
	dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
	ASSERT3P(dptr, !=, NULL);
	ASSERT3S(dptr->kd_size, ==, size);

	kfree(dptr->kd_func);
	kfree(dptr);

	spl_kmem_free_debug(ptr, size);
}
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */

/*
 * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
 */
void *
spl_kmem_alloc(size_t size, int flags, const char *func, int line)
{
	ASSERT0(flags & ~KM_PUBLIC_MASK);

#if !defined(DEBUG_KMEM)
	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
#else
	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
#endif
}
EXPORT_SYMBOL(spl_kmem_alloc);

void *
spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
{
	ASSERT0(flags & ~KM_PUBLIC_MASK);

	flags |= KM_ZERO;

#if !defined(DEBUG_KMEM)
	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
#else
	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
#endif
}
EXPORT_SYMBOL(spl_kmem_zalloc);

void
spl_kmem_free(const void *buf, size_t size)
{
#if !defined(DEBUG_KMEM)
	return (spl_kmem_free_impl(buf, size));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_free_debug(buf, size));
#else
	return (spl_kmem_free_track(buf, size));
#endif
}
EXPORT_SYMBOL(spl_kmem_free);

#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
static char *
spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
{
	int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
	int i, flag = 1;

	ASSERT(str != NULL && len >= 17);
	memset(str, 0, len);

	/*
	 * Check for a fully printable string, and while we are at
	 * it place the printable characters in the passed buffer.
	 */
	for (i = 0; i < size; i++) {
		str[i] = ((char *)(kd->kd_addr))[i];
		if (isprint(str[i])) {
			continue;
		} else {
			/*
			 * Minimum number of printable characters found
			 * to make it worthwhile to print this as ascii.
			 */
			if (i > min)
				break;

			flag = 0;
			break;
		}
	}

	if (!flag) {
		sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
		    *((uint8_t *)kd->kd_addr),
		    *((uint8_t *)kd->kd_addr + 2),
		    *((uint8_t *)kd->kd_addr + 4),
		    *((uint8_t *)kd->kd_addr + 6),
		    *((uint8_t *)kd->kd_addr + 8),
		    *((uint8_t *)kd->kd_addr + 10),
		    *((uint8_t *)kd->kd_addr + 12),
		    *((uint8_t *)kd->kd_addr + 14));
	}

	return (str);
}

static int
spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
{
	int i;

	spin_lock_init(lock);
	INIT_LIST_HEAD(list);

	for (i = 0; i < size; i++)
		INIT_HLIST_HEAD(&kmem_table[i]);

	return (0);
}

static void
spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
{
	unsigned long flags;
	kmem_debug_t *kd = NULL;
	char str[17];

	spin_lock_irqsave(lock, flags);
	if (!list_empty(list))
		printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
		    "size", "data", "func", "line");

	list_for_each_entry(kd, list, kd_list) {
		printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
		    (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
		    kd->kd_func, kd->kd_line);
	}

	spin_unlock_irqrestore(lock, flags);
}
#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */

int
spl_kmem_init(void)
{
#ifdef DEBUG_KMEM
	kmem_alloc_used_set(0);

#ifdef DEBUG_KMEM_TRACKING
	spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */

	return (0);
}

void
spl_kmem_fini(void)
{
#ifdef DEBUG_KMEM
	/*
	 * Display all unreclaimed memory addresses, including the
	 * allocation size and the first few bytes of what's located
	 * at that address to aid in debugging. Performance is not
	 * a serious concern here since it is module unload time.
	 */
	if (kmem_alloc_used_read() != 0)
		printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
		    (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);

#ifdef DEBUG_KMEM_TRACKING
	spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */
}
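
/*
 * Illustrative usage sketch (comment only, not additional exports).
 * Consumers are expected to reach the allocators above through the
 * kmem_alloc()/kmem_zalloc()/kmem_free() wrappers (see sys/kmem.h),
 * which supply the caller's function and line for the tracking builds.
 * The my_ctx_t type and the 'id' variable below are hypothetical.
 *
 *	typedef struct my_ctx { int mc_refs; char *mc_name; } my_ctx_t;
 *
 *	my_ctx_t *mc = kmem_zalloc(sizeof (my_ctx_t), KM_SLEEP);
 *	mc->mc_name = kmem_asprintf("ctx-%d", id);
 *
 * With KM_SLEEP both calls retry until they succeed, so neither returns
 * NULL. Teardown must pass the original size back to kmem_free() and
 * release strings with kmem_strfree():
 *
 *	kmem_strfree(mc->mc_name);
 *	kmem_free(mc, sizeof (my_ctx_t));
 */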