/*
 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
 * Copyright (C) 2007 The Regents of the University of California.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
 * UCRL-CODE-235197
 *
 * This file is part of the SPL, Solaris Porting Layer.
 * For details, see <http://zfsonlinux.org/>.
 *
 * The SPL is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 *
 * The SPL is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with the SPL.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <sys/debug.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/vmem.h>

/*
 * As a general rule kmem_alloc() allocations should be small, preferably
 * just a few pages, since they must be physically contiguous.  Therefore,
 * a rate limited warning will be printed to the console for any kmem_alloc()
 * which exceeds a reasonable threshold.
 *
 * The default warning threshold is set to sixteen pages but capped at 64K to
 * accommodate systems using large pages.  This value was selected to be small
 * enough to ensure the largest allocations are quickly noticed and fixed.
 * But large enough to avoid logging any warnings when an allocation size is
 * larger than optimal but not a serious concern.  Since this value is tunable,
 * developers are encouraged to set it lower when testing so any new largish
 * allocations are quickly caught.  These warnings may be disabled by setting
 * the threshold to zero.
 */
/* BEGIN CSTYLED */
unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024);
module_param(spl_kmem_alloc_warn, uint, 0644);
MODULE_PARM_DESC(spl_kmem_alloc_warn,
	"Warning threshold in bytes for a kmem_alloc()");
EXPORT_SYMBOL(spl_kmem_alloc_warn);

/*
 * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
 * Allocations which are marginally smaller than this limit may succeed but
 * should still be avoided due to the expense of locating a contiguous range
 * of free pages.  Therefore, a maximum kmem size with reasonable safety
 * margin of 4x is set.  Kmem_alloc() allocations larger than this maximum
 * will quickly fail.  Vmem_alloc() allocations less than or equal to this
 * value will use kmalloc(), but shift to vmalloc() when exceeding this value.
 */
unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
module_param(spl_kmem_alloc_max, uint, 0644);
MODULE_PARM_DESC(spl_kmem_alloc_max,
	"Maximum size in bytes for a kmem_alloc()");
EXPORT_SYMBOL(spl_kmem_alloc_max);
/* END CSTYLED */

/*
 * Report whether kmem debugging is active.  This shim layer always
 * reports "no" regardless of the DEBUG_KMEM configure options.
 */
int
kmem_debugging(void)
{
	return (0);
}
EXPORT_SYMBOL(kmem_debugging);

/*
 * Sleeping vasprintf() equivalent.  Loops until kvasprintf() succeeds,
 * so on return the result is never NULL.  va_copy() is required because
 * each retry must re-consume the caller's argument list from the start.
 */
char *
kmem_vasprintf(const char *fmt, va_list ap)
{
	va_list aq;
	char *ptr;

	do {
		va_copy(aq, ap);
		ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq);
		va_end(aq);
	} while (ptr == NULL);

	return (ptr);
}
EXPORT_SYMBOL(kmem_vasprintf);

/*
 * Sleeping asprintf() equivalent.  Retries until the internal
 * allocation succeeds, so on return the result is never NULL.
 */
char *
kmem_asprintf(const char *fmt, ...)
{
	va_list ap;
	char *ptr;

	do {
		va_start(ap, fmt);
		ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap);
		va_end(ap);
	} while (ptr == NULL);

	return (ptr);
}
EXPORT_SYMBOL(kmem_asprintf);

/*
 * Duplicate a NUL-terminated string using kmalloc() with the supplied
 * KM_* flags.  Returns NULL if the allocation fails.
 */
static char *
__strdup(const char *str, int flags)
{
	char *ptr;
	int n;

	n = strlen(str);
	ptr = kmalloc(n + 1, kmem_flags_convert(flags));
	if (ptr)
		memcpy(ptr, str, n + 1);

	return (ptr);
}

/*
 * Duplicate a string with KM_SLEEP semantics; free with kmem_strfree().
 */
char *
kmem_strdup(const char *str)
{
	return (__strdup(str, KM_SLEEP));
}
EXPORT_SYMBOL(kmem_strdup);

/*
 * Free a string allocated by kmem_strdup() or kmem_asprintf().
 */
void
kmem_strfree(char *str)
{
	kfree(str);
}
EXPORT_SYMBOL(kmem_strfree);

/*
 * kmalloc() with a transparent fallback to spl_vmalloc().  May be called
 * directly with raw gfp_t lflags (not only via spl_kmem_alloc_impl), so
 * it must not assume kmem_flags_convert() preprocessing has occurred.
 */
void *
spl_kvmalloc(size_t size, gfp_t lflags)
{
#ifdef HAVE_KVMALLOC
	/*
	 * GFP_KERNEL allocations can safely use kvmalloc which may
	 * improve performance by avoiding a) high latency caused by
	 * vmalloc's on-access allocation, b) performance loss due to
	 * MMU memory address mapping and c) vmalloc locking overhead.
	 * This has the side-effect that the slab statistics will
	 * incorrectly report this as a vmem allocation, but that is
	 * purely cosmetic.
	 */
	if ((lflags & GFP_KERNEL) == GFP_KERNEL)
		return (kvmalloc(size, lflags));
#endif

	gfp_t kmalloc_lflags = lflags;

	if (size > PAGE_SIZE) {
		/*
		 * We need to set __GFP_NOWARN here since spl_kvmalloc is not
		 * only called by spl_kmem_alloc_impl but can be called
		 * directly with custom lflags, too.  In that case
		 * kmem_flags_convert does not get called, which would
		 * implicitly set __GFP_NOWARN.
		 */
		kmalloc_lflags |= __GFP_NOWARN;

		/*
		 * N.B. __GFP_RETRY_MAYFAIL is supported only for large
		 * (>32kB) allocations.
		 *
		 * We have to override __GFP_RETRY_MAYFAIL by __GFP_NORETRY
		 * for !costly requests because there is no other way to tell
		 * the allocator that we want to fail rather than retry
		 * endlessly.
		 */
		if (!(kmalloc_lflags & __GFP_RETRY_MAYFAIL) ||
		    (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
			kmalloc_lflags |= __GFP_NORETRY;
		}
	}

	/*
	 * We first try kmalloc - even for big sizes - and fall back to
	 * spl_vmalloc if that fails.
	 *
	 * For non-__GFP_RECLAIM allocations we always stick to
	 * kmalloc_node, and fail when kmalloc is not successful (returns
	 * NULL).
	 * We cannot fall back to spl_vmalloc in this case because spl_vmalloc
	 * internally uses GFP_KERNEL allocations.
	 */
	void *ptr = kmalloc_node(size, kmalloc_lflags, NUMA_NO_NODE);
	if (ptr || size <= PAGE_SIZE ||
	    (lflags & __GFP_RECLAIM) != __GFP_RECLAIM) {
		return (ptr);
	}

	return (spl_vmalloc(size, lflags | __GFP_HIGHMEM));
}

/*
 * General purpose unified implementation of kmem_alloc().  It is an
 * amalgamation of Linux and Illumos allocator design.  It should never be
 * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
 * relatively portable.  Consumers may only access this function through
 * wrappers that enforce the common flags to ensure portability.
 */
inline void *
spl_kmem_alloc_impl(size_t size, int flags, int node)
{
	gfp_t lflags = kmem_flags_convert(flags);
	void *ptr;

	/*
	 * Log abnormally large allocations and rate limit the console output.
	 * Allocations larger than spl_kmem_alloc_warn should be performed
	 * through the vmem_alloc()/vmem_zalloc() interfaces.
	 */
	if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
	    !(flags & KM_VMEM)) {
		printk(KERN_WARNING
		    "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
		    "https://github.com/zfsonlinux/zfs/issues/new\n",
		    (unsigned long)size, flags);
		dump_stack();
	}

	/*
	 * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used
	 * unlike kmem_alloc() with KM_SLEEP on Illumos.
	 */
	do {
		/*
		 * Calling kmalloc_node() when the size >= spl_kmem_alloc_max
		 * is unsafe.  This must fail for all kmem_alloc() and
		 * kmem_zalloc() callers.
		 *
		 * For vmem_alloc() and vmem_zalloc() callers it is permissible
		 * to use spl_vmalloc().  However, in general use of
		 * spl_vmalloc() is strongly discouraged because a global lock
		 * must be acquired.  Contention on this lock can significantly
		 * impact performance so frequently manipulating the virtual
		 * address space is strongly discouraged.
		 */
		if (size > spl_kmem_alloc_max) {
			if (flags & KM_VMEM) {
				ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
			} else {
				return (NULL);
			}
		} else {
			if (flags & KM_VMEM) {
				ptr = spl_kvmalloc(size, lflags);
			} else {
				ptr = kmalloc_node(size, lflags, node);
			}
		}

		/* Success, or the caller asked not to block on failure. */
		if (likely(ptr) || (flags & KM_NOSLEEP))
			return (ptr);

		/*
		 * Try hard to satisfy the allocation.  However, when progress
		 * cannot be made, the allocation is allowed to fail.
		 */
		if ((lflags & GFP_KERNEL) == GFP_KERNEL)
			lflags |= __GFP_RETRY_MAYFAIL;

		/*
		 * Use cond_resched() instead of congestion_wait() to avoid
		 * deadlocking systems where there are no block devices.
		 */
		cond_resched();
	} while (1);

	/* Unreachable: the loop above only exits via return. */
	return (NULL);
}

/*
 * Free memory obtained from spl_kmem_alloc_impl(), routing the pointer
 * to vfree() or kfree() based on how it was actually allocated.
 */
inline void
spl_kmem_free_impl(const void *buf, size_t size)
{
	if (is_vmalloc_addr(buf))
		vfree(buf);
	else
		kfree(buf);
}

/*
 * Memory allocation and accounting for kmem_* style allocations.  When
 * DEBUG_KMEM is enabled the total memory allocated will be tracked and
 * any memory leaked will be reported during module unload.
 *
 * ./configure --enable-debug-kmem
 */
#ifdef DEBUG_KMEM

/* Shim layer memory accounting */
#ifdef HAVE_ATOMIC64_T
atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
unsigned long long kmem_alloc_max = 0;
#else /* HAVE_ATOMIC64_T */
atomic_t kmem_alloc_used = ATOMIC_INIT(0);
unsigned long long kmem_alloc_max = 0;
#endif /* HAVE_ATOMIC64_T */

EXPORT_SYMBOL(kmem_alloc_used);
EXPORT_SYMBOL(kmem_alloc_max);

/*
 * Accounting wrapper around spl_kmem_alloc_impl(): on success, add the
 * allocation size to kmem_alloc_used and track the high-water mark.
 * NOTE(review): the read-compare-store of kmem_alloc_max is not atomic,
 * so the recorded maximum can be slightly stale under concurrent
 * allocations; tolerable for debug statistics — confirm if ever relied on.
 */
inline void *
spl_kmem_alloc_debug(size_t size, int flags, int node)
{
	void *ptr;

	ptr = spl_kmem_alloc_impl(size, flags, node);
	if (ptr) {
		kmem_alloc_used_add(size);
		if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
			kmem_alloc_max = kmem_alloc_used_read();
	}

	return (ptr);
}

/*
 * Accounting wrapper around spl_kmem_free_impl(): subtract the size
 * from the running total before releasing the memory.
 */
inline void
spl_kmem_free_debug(const void *ptr, size_t size)
{
	kmem_alloc_used_sub(size);
	spl_kmem_free_impl(ptr, size);
}

/*
 * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
 * but also the location of every alloc and free.  When the SPL module is
 * unloaded a list of all leaked addresses and where they were allocated
 * will be dumped to the console.  Enabling this feature has a significant
 * impact on performance but it makes finding memory leaks straight forward.
 *
 * Not surprisingly with debugging enabled the xmem_locks are very highly
 * contended particularly on xfree().
 * If we want to run with this detailed
 * debugging enabled for anything other than debugging we need to minimize
 * the contention by moving to a lock per xmem_table entry model.
 *
 * ./configure --enable-debug-kmem-tracking
 */
#ifdef DEBUG_KMEM_TRACKING

#include <linux/hash.h>
#include <linux/ctype.h>

#define	KMEM_HASH_BITS		10
#define	KMEM_TABLE_SIZE		(1 << KMEM_HASH_BITS)

/* Per-allocation tracking record, hashed by address and chained in order. */
typedef struct kmem_debug {
	struct hlist_node kd_hlist;	/* Hash node linkage */
	struct list_head kd_list;	/* List of all allocations */
	void *kd_addr;			/* Allocation pointer */
	size_t kd_size;			/* Allocation size */
	const char *kd_func;		/* Allocation function */
	int kd_line;			/* Allocation line */
} kmem_debug_t;

static spinlock_t kmem_lock;
static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
static struct list_head kmem_list;

/*
 * Look up the tracking record for 'addr' in the hash table and, if found,
 * unlink it from both the hash chain and the global allocation list.
 * Returns the unlinked record (caller owns/frees it) or NULL when 'addr'
 * was never tracked.  The spinlock is held only for the search/unlink.
 */
static kmem_debug_t *
kmem_del_init(spinlock_t *lock, struct hlist_head *table,
    int bits, const void *addr)
{
	struct hlist_head *head;
	struct hlist_node *node = NULL;
	struct kmem_debug *p;
	unsigned long flags;

	spin_lock_irqsave(lock, flags);

	head = &table[hash_ptr((void *)addr, bits)];
	hlist_for_each(node, head) {
		p = list_entry(node, struct kmem_debug, kd_hlist);
		if (p->kd_addr == addr) {
			hlist_del_init(&p->kd_hlist);
			list_del_init(&p->kd_list);
			spin_unlock_irqrestore(lock, flags);
			return (p);
		}
	}

	spin_unlock_irqrestore(lock, flags);

	return (NULL);
}

/*
 * Tracked variant of spl_kmem_alloc_debug(): records the caller's
 * function name (duplicated) and line number alongside the allocation
 * so leaks can be attributed at module unload.  All bookkeeping is
 * allocated first; on any failure everything is unwound and NULL is
 * returned without leaking the partial state.
 */
inline void *
spl_kmem_alloc_track(size_t size, int flags,
    const char *func, int line, int node)
{
	void *ptr = NULL;
	kmem_debug_t *dptr;
	unsigned long irq_flags;

	dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
	if (dptr == NULL)
		return (NULL);

	dptr->kd_func = __strdup(func, flags);
	if (dptr->kd_func == NULL) {
		kfree(dptr);
		return (NULL);
	}

	ptr = spl_kmem_alloc_debug(size, flags, node);
	if (ptr == NULL) {
		kfree(dptr->kd_func);
		kfree(dptr);
		return (NULL);
	}

	INIT_HLIST_NODE(&dptr->kd_hlist);
	INIT_LIST_HEAD(&dptr->kd_list);

	dptr->kd_addr = ptr;
	dptr->kd_size = size;
	dptr->kd_line = line;

	/* Publish the record under the lock: hash by address, list in order. */
	spin_lock_irqsave(&kmem_lock, irq_flags);
	hlist_add_head(&dptr->kd_hlist,
	    &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
	list_add_tail(&dptr->kd_list, &kmem_list);
	spin_unlock_irqrestore(&kmem_lock, irq_flags);

	return (ptr);
}

/*
 * Tracked variant of spl_kmem_free_debug(): removes and frees the
 * tracking record before releasing the allocation itself.  Asserts
 * that the pointer was tracked and that the freed size matches the
 * size recorded at allocation time.
 */
inline void
spl_kmem_free_track(const void *ptr, size_t size)
{
	kmem_debug_t *dptr;

	/* Ignore NULL pointer since we haven't tracked it at all */
	if (ptr == NULL)
		return;

	/* Must exist in hash due to kmem_alloc() */
	dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
	ASSERT3P(dptr, !=, NULL);
	ASSERT3S(dptr->kd_size, ==, size);

	kfree(dptr->kd_func);
	kfree(dptr);

	spl_kmem_free_debug(ptr, size);
}
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */

/*
 * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
 */

/*
 * Public kmem_alloc() entry point.  Dispatches to the plain, accounting,
 * or tracking implementation depending on the DEBUG_KMEM configure
 * options; func/line are only consumed by the tracking build.
 */
void *
spl_kmem_alloc(size_t size, int flags, const char *func, int line)
{
	ASSERT0(flags & ~KM_PUBLIC_MASK);

#if !defined(DEBUG_KMEM)
	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
#else
	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
#endif
}
EXPORT_SYMBOL(spl_kmem_alloc);

/*
 * Public kmem_zalloc() entry point: identical to spl_kmem_alloc() but
 * forces KM_ZERO so the returned memory is zero-filled.
 */
void *
spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
{
	ASSERT0(flags & ~KM_PUBLIC_MASK);

	flags |= KM_ZERO;

#if !defined(DEBUG_KMEM)
	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
#else
	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
#endif
}
EXPORT_SYMBOL(spl_kmem_zalloc);

/*
 * Public kmem_free() entry point; must be passed the same size that was
 * requested at allocation time (checked in the tracking build).
 */
void
spl_kmem_free(const void *buf, size_t size)
{
#if !defined(DEBUG_KMEM)
	return (spl_kmem_free_impl(buf, size));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_free_debug(buf, size));
#else
	return (spl_kmem_free_track(buf, size));
#endif
}
EXPORT_SYMBOL(spl_kmem_free);

#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
/*
 * Render a leaked allocation's leading bytes into 'str' for the leak
 * report: as ASCII when the data is printable, otherwise as a hex dump.
 * NOTE(review): the hex branch reads fixed offsets 0,2,4,...,14 from
 * kd_addr regardless of kd_size, so allocations smaller than 15 bytes
 * may be over-read here — debug-only path, but worth confirming.
 */
static char *
spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
{
	int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
	int i, flag = 1;

	ASSERT(str != NULL && len >= 17);
	memset(str, 0, len);

	/*
	 * Check for a fully printable string, and while we are at
	 * it place the printable characters in the passed buffer.
	 */
	for (i = 0; i < size; i++) {
		str[i] = ((char *)(kd->kd_addr))[i];
		if (isprint(str[i])) {
			continue;
		} else {
			/*
			 * Minimum number of printable characters found
			 * to make it worthwhile to print this as ascii.
			 */
			if (i > min)
				break;

			flag = 0;
			break;
		}
	}

	if (!flag) {
		sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
		    *((uint8_t *)kd->kd_addr),
		    *((uint8_t *)kd->kd_addr + 2),
		    *((uint8_t *)kd->kd_addr + 4),
		    *((uint8_t *)kd->kd_addr + 6),
		    *((uint8_t *)kd->kd_addr + 8),
		    *((uint8_t *)kd->kd_addr + 10),
		    *((uint8_t *)kd->kd_addr + 12),
		    *((uint8_t *)kd->kd_addr + 14));
	}

	return (str);
}

/*
 * Initialize the allocation-tracking state: the spinlock, the global
 * allocation list, and every bucket of the address hash table.
 */
static int
spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
{
	int i;

	spin_lock_init(lock);
	INIT_LIST_HEAD(list);

	for (i = 0; i < size; i++)
		INIT_HLIST_HEAD(&kmem_table[i]);

	return (0);
}

/*
 * Dump every allocation still on the tracking list (i.e. every leak)
 * to the console: address, size, leading data, and alloc site.
 */
static void
spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
{
	unsigned long flags;
	kmem_debug_t *kd = NULL;
	char str[17];

	spin_lock_irqsave(lock, flags);
	if (!list_empty(list))
		printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
		    "size", "data", "func", "line");

	list_for_each_entry(kd, list, kd_list) {
		printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
		    (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
		    kd->kd_func, kd->kd_line);
	}

	spin_unlock_irqrestore(lock, flags);
}
#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */

/*
 * Module-load initialization: reset the accounting counter and, in the
 * tracking build, set up the tracking structures.  Always succeeds.
 */
int
spl_kmem_init(void)
{
#ifdef DEBUG_KMEM
	kmem_alloc_used_set(0);

#ifdef DEBUG_KMEM_TRACKING
	spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */

	return (0);
}

/*
 * Module-unload teardown: report any outstanding (leaked) bytes and,
 * in the tracking build, dump each leaked allocation in detail.
 */
void
spl_kmem_fini(void)
{
#ifdef DEBUG_KMEM
	/*
	 * Display all unreclaimed memory addresses, including the
	 * allocation size and the first few bytes of what's located
	 * at that address to aid in debugging.  Performance is not
	 * a serious concern here since it is module unload time.
	 */
	if (kmem_alloc_used_read() != 0)
		printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
		    (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);

#ifdef DEBUG_KMEM_TRACKING
	spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */
}