/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2004, 2005,
 *	Bosko Milekic <bmilekic@FreeBSD.org>.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_param.h"

#include <sys/param.h>
#include <sys/conf.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/uma.h>
#include <vm/uma_dbg.h>

/*
 * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
 * Zones.
 *
 * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
 * Zone.  The Zone can be capped at kern.ipc.nmbclusters, if the
 * administrator so desires.
 *
 * Mbufs are allocated from a UMA Master Zone called the Mbuf
 * Zone.
 *
 * Additionally, FreeBSD provides a Packet Zone, which it
 * configures as a Secondary Zone to the Mbuf Master Zone,
 * thus sharing backend Slab kegs with the Mbuf Master Zone.
 *
 * Thus common-case allocations and locking are simplified:
 *
 *    m_clget()                m_getcl()
 *     |                         |
 *     |   .------------>[(Packet Cache)]    m_get(), m_gethdr()
 *     |   |             [     Packet   ]            |
 *  [(Cluster Cache)]    [    Secondary ]   [ (Mbuf Cache)     ]
 *  [ Cluster Zone  ]    [     Zone     ]   [ Mbuf Master Zone ]
 *         |                      \________          |
 *  [ Cluster Keg   ]                      \        /
 *         |                            [ Mbuf Keg   ]
 *  [ Cluster Slabs ]                        |
 *         |                            [ Mbuf Slabs ]
 *          \____________(VM)_________________/
 *
 *
 * Whenever an object is allocated with uma_zalloc() out of
 * one of the Zones its _ctor_ function is executed.  Likewise,
 * on any deallocation through uma_zfree() the _dtor_ function
 * is executed.
 *
 * Caches are per-CPU and are filled from the Master Zone.
 *
 * Whenever an object is allocated from the underlying global
 * memory pool it gets pre-initialized with the _zinit_ functions.
 * When the Kegs are overfull, objects get decommissioned with the
 * _zfini_ functions and freed back to the global memory pool.
 *
 */
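/*
 * Illustrative sketch (hypothetical consumer code, not part of this file's
 * allocator): the two common ways a consumer reaches the zones drawn above.
 * m_getcl() takes the Packet (secondary) zone fast path and returns an mbuf
 * with a 2K cluster already attached, while m_get()/m_gethdr() followed by
 * m_clget() touches the Mbuf and Cluster zones separately.
 */
#if 0
static void
example_alloc_paths(void)
{
	struct mbuf *m1, *m2;

	/* One allocation from the Packet zone: mbuf + cluster together. */
	m1 = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);

	/* Two allocations: Mbuf zone first, then Cluster zone via m_clget(). */
	m2 = m_gethdr(M_NOWAIT, MT_DATA);
	if (m2 != NULL && m_clget(m2, M_NOWAIT) == 0) {
		m_freem(m2);		/* no cluster available */
		m2 = NULL;
	}

	m_freem(m1);
	m_freem(m2);
}
#endif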
int nmbufs;			/* limits number of mbufs */
int nmbclusters;		/* limits number of mbuf clusters */
int nmbjumbop;			/* limits number of page size jumbo clusters */
int nmbjumbo9;			/* limits number of 9k jumbo clusters */
int nmbjumbo16;			/* limits number of 16k jumbo clusters */

static quad_t maxmbufmem;	/* overall real memory limit for all mbufs */

SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0,
    "Maximum real memory allocatable to various mbuf types");

/*
 * tunable_mbinit() has to be run before any mbuf allocations are done.
 */
static void
tunable_mbinit(void *dummy)
{
	quad_t realmem;

	/*
	 * The default limit for all mbuf related memory is 1/2 of all
	 * available kernel memory (physical or kmem).
	 * At most it can be 3/4 of available kernel memory.
	 */
	realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size);
	maxmbufmem = realmem / 2;
	TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem);
	if (maxmbufmem > realmem / 4 * 3)
		maxmbufmem = realmem / 4 * 3;

	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
	if (nmbclusters == 0)
		nmbclusters = maxmbufmem / MCLBYTES / 4;

	TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop);
	if (nmbjumbop == 0)
		nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4;

	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9);
	if (nmbjumbo9 == 0)
		nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6;

	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16);
	if (nmbjumbo16 == 0)
		nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6;

	/*
	 * We need at least as many mbufs as we have clusters of
	 * the various types added together.
	 */
	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
	if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16)
		nmbufs = lmax(maxmbufmem / MSIZE / 5,
		    nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16);
}
SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL);
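/*
 * Worked example of the defaults above (hypothetical numbers, assuming the
 * kmem limit is not the constraint): with 16 GB of usable kernel memory,
 * maxmbufmem defaults to 8 GB, which yields roughly
 *	8 GB / 2048  / 4 ~= 1M   nmbclusters,
 *	8 GB / 4096  / 4 ~= 512K nmbjumbop (with 4 KB pages),
 *	8 GB / 9216  / 6 ~= 155K nmbjumbo9,
 *	8 GB / 16384 / 6 ~= 87K  nmbjumbo16,
 * and nmbufs = max(8 GB / MSIZE / 5, sum of the cluster limits) ~= 6.7M
 * assuming a 256-byte MSIZE.
 */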
static int
sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbclusters;

	newnmbclusters = nmbclusters;
	error = sysctl_handle_int(oidp, &newnmbclusters, 0, req);
	if (error == 0 && req->newptr && newnmbclusters != nmbclusters) {
		if (newnmbclusters > nmbclusters &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbclusters = newnmbclusters;
			nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
			EVENTHANDLER_INVOKE(nmbclusters_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT|CTLFLAG_RW,
    &nmbclusters, 0, sysctl_nmbclusters, "IU",
    "Maximum number of mbuf clusters allowed");

static int
sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbjumbop;

	newnmbjumbop = nmbjumbop;
	error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req);
	if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) {
		if (newnmbjumbop > nmbjumbop &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbjumbop = newnmbjumbop;
			nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT|CTLFLAG_RW,
    &nmbjumbop, 0, sysctl_nmbjumbop, "IU",
    "Maximum number of mbuf page size jumbo clusters allowed");

static int
sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbjumbo9;

	newnmbjumbo9 = nmbjumbo9;
	error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req);
	if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) {
		if (newnmbjumbo9 > nmbjumbo9 &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbjumbo9 = newnmbjumbo9;
			nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT|CTLFLAG_RW,
    &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU",
    "Maximum number of mbuf 9k jumbo clusters allowed");

static int
sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbjumbo16;

	newnmbjumbo16 = nmbjumbo16;
	error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req);
	if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) {
		if (newnmbjumbo16 > nmbjumbo16 &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbjumbo16 = newnmbjumbo16;
			nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT|CTLFLAG_RW,
    &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU",
    "Maximum number of mbuf 16k jumbo clusters allowed");

static int
sysctl_nmbufs(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbufs;

	newnmbufs = nmbufs;
	error = sysctl_handle_int(oidp, &newnmbufs, 0, req);
	if (error == 0 && req->newptr && newnmbufs != nmbufs) {
		if (newnmbufs > nmbufs) {
			nmbufs = newnmbufs;
			nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
			EVENTHANDLER_INVOKE(nmbufs_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT|CTLFLAG_RW,
    &nmbufs, 0, sysctl_nmbufs, "IU",
    "Maximum number of mbufs allowed");
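/*
 * Note that the handlers above only allow a limit to be raised at runtime,
 * never lowered: a request such as the (hypothetical) example
 * "sysctl kern.ipc.nmbclusters=2000000" succeeds only if the new value
 * exceeds the current one and nmbufs still covers the sum of the cluster
 * limits; anything else fails with EINVAL.
 */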
/*
 * Zones from which we allocate.
 */
uma_zone_t	zone_mbuf;
uma_zone_t	zone_clust;
uma_zone_t	zone_pack;
uma_zone_t	zone_jumbop;
uma_zone_t	zone_jumbo9;
uma_zone_t	zone_jumbo16;

/*
 * Local prototypes.
 */
static int	mb_ctor_mbuf(void *, int, void *, int);
static int	mb_ctor_clust(void *, int, void *, int);
static int	mb_ctor_pack(void *, int, void *, int);
static void	mb_dtor_mbuf(void *, int, void *);
static void	mb_dtor_pack(void *, int, void *);
static int	mb_zinit_pack(void *, int, int);
static void	mb_zfini_pack(void *, int);
static void	mb_reclaim(uma_zone_t, int);
static void    *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);

/* Ensure that MSIZE is a power of 2. */
CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);

/*
 * Initialize FreeBSD Network buffer allocation.
 */
static void
mbuf_init(void *dummy)
{

	/*
	 * Configure UMA zones for Mbufs, Clusters, and Packets.
	 */
	zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE,
	    mb_ctor_mbuf, mb_dtor_mbuf,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    MSIZE - 1, UMA_ZONE_MAXBUCKET);
	if (nmbufs > 0)
		nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
	uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached");
	uma_zone_set_maxaction(zone_mbuf, mb_reclaim);

	zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
	    mb_ctor_clust,
#ifdef INVARIANTS
	    trash_dtor, trash_init, trash_fini,
#else
	    NULL, NULL, NULL,
#endif
	    UMA_ALIGN_PTR, 0);
	if (nmbclusters > 0)
		nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
	uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached");
	uma_zone_set_maxaction(zone_clust, mb_reclaim);

	zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
	    mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf);

	/* Make jumbo frame zone too.  Page size, 9k and 16k. */
	zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE,
	    mb_ctor_clust,
#ifdef INVARIANTS
	    trash_dtor, trash_init, trash_fini,
#else
	    NULL, NULL, NULL,
#endif
	    UMA_ALIGN_PTR, 0);
	if (nmbjumbop > 0)
		nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
	uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached");
	uma_zone_set_maxaction(zone_jumbop, mb_reclaim);

	zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES,
	    mb_ctor_clust,
#ifdef INVARIANTS
	    trash_dtor, trash_init, trash_fini,
#else
	    NULL, NULL, NULL,
#endif
	    UMA_ALIGN_PTR, 0);
	uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc);
	if (nmbjumbo9 > 0)
		nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
	uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached");
	uma_zone_set_maxaction(zone_jumbo9, mb_reclaim);

	zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES,
	    mb_ctor_clust,
#ifdef INVARIANTS
	    trash_dtor, trash_init, trash_fini,
#else
	    NULL, NULL, NULL,
#endif
	    UMA_ALIGN_PTR, 0);
	uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc);
	if (nmbjumbo16 > 0)
		nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
	uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
	uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);

	/*
	 * Hook event handler for low-memory situation, used to
	 * drain protocols and push data back to the caches (UMA
	 * later pushes it back to VM).
	 */
	EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
	    EVENTHANDLER_PRI_FIRST);
}
SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL);

#ifdef NETDUMP
/*
 * netdump makes use of a pre-allocated pool of mbufs and clusters.  When
 * netdump is configured, we initialize a set of UMA cache zones which return
 * items from this pool.  At panic-time, the regular UMA zone pointers are
 * overwritten with those of the cache zones so that drivers may allocate and
 * free mbufs and clusters without attempting to allocate physical memory.
 *
 * We keep mbufs and clusters in a pair of mbuf queues.  In particular, for
 * the purpose of caching clusters, we treat them as mbufs.
 */
static struct mbufq nd_mbufq =
    { STAILQ_HEAD_INITIALIZER(nd_mbufq.mq_head), 0, INT_MAX };
static struct mbufq nd_clustq =
    { STAILQ_HEAD_INITIALIZER(nd_clustq.mq_head), 0, INT_MAX };

static int nd_clsize;
static uma_zone_t nd_zone_mbuf;
static uma_zone_t nd_zone_clust;
static uma_zone_t nd_zone_pack;

static int
nd_buf_import(void *arg, void **store, int count, int domain __unused,
    int flags)
{
	struct mbufq *q;
	struct mbuf *m;
	int i;

	KASSERT(!dumping, ("%s: ran out of pre-allocated mbufs", __func__));

	q = arg;

	for (i = 0; i < count; i++) {
		m = mbufq_dequeue(q);
		if (m == NULL)
			break;
		trash_init(m, q == &nd_mbufq ? MSIZE : nd_clsize, flags);
		store[i] = m;
	}
	return (i);
}

static void
nd_buf_release(void *arg, void **store, int count)
{
	struct mbufq *q;
	struct mbuf *m;
	int i;

	q = arg;

	for (i = 0; i < count; i++) {
		m = store[i];
		(void)mbufq_enqueue(q, m);
	}
}

static int
nd_pack_import(void *arg __unused, void **store, int count, int domain __unused,
    int flags __unused)
{
	struct mbuf *m;
	void *clust;
	int i;

	KASSERT(!dumping, ("%s: ran out of pre-allocated mbufs", __func__));

	for (i = 0; i < count; i++) {
		m = m_get(MT_DATA, M_NOWAIT);
		if (m == NULL)
			break;
		clust = uma_zalloc(nd_zone_clust, M_NOWAIT);
		if (clust == NULL) {
			m_free(m);
			break;
		}
		mb_ctor_clust(clust, nd_clsize, m, 0);
		store[i] = m;
	}
	return (i);
}

static void
nd_pack_release(void *arg __unused, void **store, int count)
{
	struct mbuf *m;
	void *clust;
	int i;

	for (i = 0; i < count; i++) {
		m = store[i];
		clust = m->m_ext.ext_buf;
		uma_zfree(nd_zone_clust, clust);
		uma_zfree(nd_zone_mbuf, m);
	}
}

/*
 * Free the pre-allocated mbufs and clusters reserved for netdump, and destroy
 * the corresponding UMA cache zones.
 */
void
netdump_mbuf_drain(void)
{
	struct mbuf *m;
	void *item;

	if (nd_zone_mbuf != NULL) {
		uma_zdestroy(nd_zone_mbuf);
		nd_zone_mbuf = NULL;
	}
	if (nd_zone_clust != NULL) {
		uma_zdestroy(nd_zone_clust);
		nd_zone_clust = NULL;
	}
	if (nd_zone_pack != NULL) {
		uma_zdestroy(nd_zone_pack);
		nd_zone_pack = NULL;
	}

	while ((m = mbufq_dequeue(&nd_mbufq)) != NULL)
		m_free(m);
	while ((item = mbufq_dequeue(&nd_clustq)) != NULL)
		uma_zfree(m_getzone(nd_clsize), item);
}

/*
 * Callback invoked immediately prior to starting a netdump.
 */
void
netdump_mbuf_dump(void)
{

	/*
	 * All cluster zones return buffers of the size requested by the
	 * drivers.  It's up to the driver to reinitialize the zones if the
	 * MTU of a netdump-enabled interface changes.
	 */
	printf("netdump: overwriting mbuf zone pointers\n");
	zone_mbuf = nd_zone_mbuf;
	zone_clust = nd_zone_clust;
	zone_pack = nd_zone_pack;
	zone_jumbop = nd_zone_clust;
	zone_jumbo9 = nd_zone_clust;
	zone_jumbo16 = nd_zone_clust;
}

/*
 * Reinitialize the netdump mbuf+cluster pool and cache zones.
 */
void
netdump_mbuf_reinit(int nmbuf, int nclust, int clsize)
{
	struct mbuf *m;
	void *item;

	netdump_mbuf_drain();

	nd_clsize = clsize;

	nd_zone_mbuf = uma_zcache_create("netdump_" MBUF_MEM_NAME,
	    MSIZE, mb_ctor_mbuf, mb_dtor_mbuf,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    nd_buf_import, nd_buf_release,
	    &nd_mbufq, UMA_ZONE_NOBUCKET);

	nd_zone_clust = uma_zcache_create("netdump_" MBUF_CLUSTER_MEM_NAME,
	    clsize, mb_ctor_clust,
#ifdef INVARIANTS
	    trash_dtor, trash_init, trash_fini,
#else
	    NULL, NULL, NULL,
#endif
	    nd_buf_import, nd_buf_release,
	    &nd_clustq, UMA_ZONE_NOBUCKET);

	nd_zone_pack = uma_zcache_create("netdump_" MBUF_PACKET_MEM_NAME,
	    MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL,
	    nd_pack_import, nd_pack_release,
	    NULL, UMA_ZONE_NOBUCKET);

	while (nmbuf-- > 0) {
		m = m_get(MT_DATA, M_WAITOK);
		uma_zfree(nd_zone_mbuf, m);
	}
	while (nclust-- > 0) {
		item = uma_zalloc(m_getzone(nd_clsize), M_WAITOK);
		uma_zfree(nd_zone_clust, item);
	}
}
#endif /* NETDUMP */

/*
 * UMA backend page allocator for the jumbo frame zones.
 *
 * Allocates kernel virtual memory that is backed by contiguous physical
 * pages.
 */
static void *
mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
    int wait)
{

	/* Inform UMA that this allocator uses kernel_map/object. */
	*flags = UMA_SLAB_KERNEL;
	return ((void *)kmem_alloc_contig_domain(domain, bytes, wait,
	    (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT));
}

/*
 * Constructor for Mbuf master zone.
 *
 * The 'arg' pointer points to a mb_args structure which
 * contains call-specific information required to support the
 * mbuf allocation API.  See mbuf.h.
 */
static int
mb_ctor_mbuf(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;
	struct mb_args *args;
	int error;
	int flags;
	short type;

#ifdef INVARIANTS
	trash_ctor(mem, size, arg, how);
#endif
	args = (struct mb_args *)arg;
	type = args->type;

	/*
	 * The mbuf is initialized later.  The caller has the
	 * responsibility to set up any MAC labels too.
	 */
	if (type == MT_NOINIT)
		return (0);

	m = (struct mbuf *)mem;
	flags = args->flags;
	MPASS((flags & M_NOFREE) == 0);

	error = m_init(m, how, type, flags);

	return (error);
}

/*
 * The Mbuf master zone destructor.
 */
static void
mb_dtor_mbuf(void *mem, int size, void *arg)
{
	struct mbuf *m;
	unsigned long flags;

	m = (struct mbuf *)mem;
	flags = (unsigned long)arg;

	KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
	if (!(flags & MB_DTOR_SKIP) && (m->m_flags & M_PKTHDR) &&
	    !SLIST_EMPTY(&m->m_pkthdr.tags))
		m_tag_delete_chain(m, NULL);
#ifdef INVARIANTS
	trash_dtor(mem, size, arg);
#endif
}

/*
 * The Mbuf Packet zone destructor.
 */
static void
mb_dtor_pack(void *mem, int size, void *arg)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;
	if ((m->m_flags & M_PKTHDR) != 0)
		m_tag_delete_chain(m, NULL);

	/* Make sure we've got a clean cluster back. */
	KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
	KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__));
	KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__));
	KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__));
	KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__));
	KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__));
	KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__));
#ifdef INVARIANTS
	trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg);
#endif
	/*
	 * If there are processes blocked on zone_clust, waiting for pages
	 * to be freed up, cause them to be woken up by draining the
	 * packet zone.  We are exposed to a race here (in the check for
	 * the UMA_ZFLAG_FULL) where we might miss the flag set, but that
	 * is deliberate.  We don't want to acquire the zone lock for every
	 * mbuf free.
	 */
	if (uma_zone_exhausted_nolock(zone_clust))
		zone_drain(zone_pack);
}

/*
 * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor.
 *
 * Here the 'arg' pointer points to the Mbuf which we
 * are configuring cluster storage for.  If 'arg' is
 * empty we allocate just the cluster without setting
 * the mbuf to it.  See mbuf.h.
 */
static int
mb_ctor_clust(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;

#ifdef INVARIANTS
	trash_ctor(mem, size, arg, how);
#endif
	m = (struct mbuf *)arg;
	if (m != NULL) {
		m->m_ext.ext_buf = (char *)mem;
		m->m_data = m->m_ext.ext_buf;
		m->m_flags |= M_EXT;
		m->m_ext.ext_free = NULL;
		m->m_ext.ext_arg1 = NULL;
		m->m_ext.ext_arg2 = NULL;
		m->m_ext.ext_size = size;
		m->m_ext.ext_type = m_gettype(size);
		m->m_ext.ext_flags = EXT_FLAG_EMBREF;
		m->m_ext.ext_count = 1;
	}

	return (0);
}

/*
 * The Packet secondary zone's init routine, executed on the
 * object's transition from mbuf keg slab to zone cache.
 */
static int
mb_zinit_pack(void *mem, int size, int how)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;		/* m is virgin. */
	if (uma_zalloc_arg(zone_clust, m, how) == NULL ||
	    m->m_ext.ext_buf == NULL)
		return (ENOMEM);
	m->m_ext.ext_type = EXT_PACKET;	/* Override. */
#ifdef INVARIANTS
	trash_init(m->m_ext.ext_buf, MCLBYTES, how);
#endif
	return (0);
}

/*
 * The Packet secondary zone's fini routine, executed on the
 * object's transition from zone cache to keg slab.
 */
static void
mb_zfini_pack(void *mem, int size)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;
#ifdef INVARIANTS
	trash_fini(m->m_ext.ext_buf, MCLBYTES);
#endif
	uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
#ifdef INVARIANTS
	trash_dtor(mem, size, NULL);
#endif
}

/*
 * The "packet" keg constructor.
 */
static int
mb_ctor_pack(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;
	struct mb_args *args;
	int error, flags;
	short type;

	m = (struct mbuf *)mem;
	args = (struct mb_args *)arg;
	flags = args->flags;
	type = args->type;
	MPASS((flags & M_NOFREE) == 0);

#ifdef INVARIANTS
	trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how);
#endif

	error = m_init(m, how, type, flags);

	/* m_ext is already initialized. */
	m->m_data = m->m_ext.ext_buf;
	m->m_flags = (flags | M_EXT);

	return (error);
}

/*
 * This is the protocol drain routine.  Called by UMA whenever any of the
 * mbuf zones is close to its limit.
 *
 * No locks should be held when this is called.  The drain routines have to
 * presently acquire some locks which raises the possibility of lock order
 * reversal.
 */
static void
mb_reclaim(uma_zone_t zone __unused, int pending __unused)
{
	struct domain *dp;
	struct protosw *pr;

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__);

	for (dp = domains; dp != NULL; dp = dp->dom_next)
		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
			if (pr->pr_drain != NULL)
				(*pr->pr_drain)();
}

/*
 * Clean up after mbufs with M_EXT storage attached to them if the
 * reference count hits 1.
 */
void
mb_free_ext(struct mbuf *m)
{
	volatile u_int *refcnt;
	struct mbuf *mref;
	int freembuf;

	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));

	/* See if this is the mbuf that holds the embedded refcount. */
	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
		refcnt = &m->m_ext.ext_count;
		mref = m;
	} else {
		KASSERT(m->m_ext.ext_cnt != NULL,
		    ("%s: no refcounting pointer on %p", __func__, m));
		refcnt = m->m_ext.ext_cnt;
		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
	}

	/*
	 * Check if the header is embedded in the cluster.  It is
	 * important that we can't touch any of the mbuf fields
	 * after we have freed the external storage, since mbuf
	 * could have been embedded in it.  For now, the mbufs
	 * embedded into the cluster are always of type EXT_EXTREF,
	 * and for this type we won't free the mref.
	 */
	if (m->m_flags & M_NOFREE) {
		freembuf = 0;
		KASSERT(m->m_ext.ext_type == EXT_EXTREF,
		    ("%s: no-free mbuf %p has wrong type", __func__, m));
	} else
		freembuf = 1;

	/* Free attached storage if this mbuf is the only reference to it. */
	if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) {
		switch (m->m_ext.ext_type) {
		case EXT_PACKET:
			/* The packet zone is special. */
			if (*refcnt == 0)
				*refcnt = 1;
			uma_zfree(zone_pack, mref);
			break;
		case EXT_CLUSTER:
			uma_zfree(zone_clust, m->m_ext.ext_buf);
			uma_zfree(zone_mbuf, mref);
			break;
		case EXT_JUMBOP:
			uma_zfree(zone_jumbop, m->m_ext.ext_buf);
			uma_zfree(zone_mbuf, mref);
			break;
		case EXT_JUMBO9:
			uma_zfree(zone_jumbo9, m->m_ext.ext_buf);
			uma_zfree(zone_mbuf, mref);
			break;
		case EXT_JUMBO16:
			uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
			uma_zfree(zone_mbuf, mref);
			break;
		case EXT_SFBUF:
		case EXT_NET_DRV:
		case EXT_MOD_TYPE:
		case EXT_DISPOSABLE:
			KASSERT(mref->m_ext.ext_free != NULL,
			    ("%s: ext_free not set", __func__));
			mref->m_ext.ext_free(mref);
			uma_zfree(zone_mbuf, mref);
			break;
		case EXT_EXTREF:
			KASSERT(m->m_ext.ext_free != NULL,
			    ("%s: ext_free not set", __func__));
			m->m_ext.ext_free(m);
			break;
		default:
			KASSERT(m->m_ext.ext_type == 0,
			    ("%s: unknown ext_type", __func__));
		}
	}

	if (freembuf && m != mref)
		uma_zfree(zone_mbuf, m);
}
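/*
 * Illustrative sketch (hypothetical, not part of this file): external
 * storage is only returned to its zone by mb_free_ext() once the last
 * reference goes away, so a read-only copy made with m_copym() keeps the
 * underlying cluster alive after the original chain is freed.
 */
#if 0
static void
example_shared_cluster(void)
{
	struct mbuf *m, *n;

	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
	m->m_len = m->m_pkthdr.len = 42;	/* pretend data was written */

	n = m_copym(m, 0, M_COPYALL, M_WAITOK);	/* shares the cluster */

	m_freem(m);	/* refcount drops; cluster is not freed yet */
	m_freem(n);	/* last reference: cluster returns to its zone */
}
#endif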
/*
 * Official mbuf(9) allocation KPI for stack and drivers:
 *
 * m_get()	- a single mbuf without any attachments, sys/mbuf.h.
 * m_gethdr()	- a single mbuf initialized as M_PKTHDR, sys/mbuf.h.
 * m_getcl()	- an mbuf + 2k cluster, sys/mbuf.h.
 * m_clget()	- attach cluster to already allocated mbuf.
 * m_cljget()	- attach jumbo cluster to already allocated mbuf.
 * m_get2()	- allocate minimum mbuf that would fit size argument.
 * m_getm2()	- allocate a chain of mbufs/clusters.
 * m_extadd()	- attach external cluster to mbuf.
 *
 * m_free()	- free single mbuf with its tags and ext, sys/mbuf.h.
 * m_freem()	- free chain of mbufs.
 */

int
m_clget(struct mbuf *m, int how)
{

	KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
	    __func__, m));
	m->m_ext.ext_buf = (char *)NULL;
	uma_zalloc_arg(zone_clust, m, how);
	/*
	 * On a cluster allocation failure, drain the packet zone and retry,
	 * we might be able to loosen a few clusters up on the drain.
	 */
	if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) {
		zone_drain(zone_pack);
		uma_zalloc_arg(zone_clust, m, how);
	}
	MBUF_PROBE2(m__clget, m, how);
	return (m->m_flags & M_EXT);
}

/*
 * m_cljget() is different from m_clget() as it can allocate clusters without
 * attaching them to an mbuf.  In that case the return value is the pointer
 * to the cluster of the requested size.  If an mbuf was specified, it gets
 * the cluster attached to it and the return value can be safely ignored.
 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
 */
void *
m_cljget(struct mbuf *m, int how, int size)
{
	uma_zone_t zone;
	void *retval;

	if (m != NULL) {
		KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
		    __func__, m));
		m->m_ext.ext_buf = NULL;
	}

	zone = m_getzone(size);
	retval = uma_zalloc_arg(zone, m, how);

	MBUF_PROBE4(m__cljget, m, how, size, retval);

	return (retval);
}

/*
 * m_get2() allocates minimum mbuf that would fit "size" argument.
 */
struct mbuf *
m_get2(int size, int how, short type, int flags)
{
	struct mb_args args;
	struct mbuf *m, *n;

	args.flags = flags;
	args.type = type;

	if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0))
		return (uma_zalloc_arg(zone_mbuf, &args, how));
	if (size <= MCLBYTES)
		return (uma_zalloc_arg(zone_pack, &args, how));

	if (size > MJUMPAGESIZE)
		return (NULL);

	m = uma_zalloc_arg(zone_mbuf, &args, how);
	if (m == NULL)
		return (NULL);

	n = uma_zalloc_arg(zone_jumbop, m, how);
	if (n == NULL) {
		uma_zfree(zone_mbuf, m);
		return (NULL);
	}

	return (m);
}
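/*
 * For example (hypothetical sizes, assuming the usual 256-byte MSIZE and
 * 4 KB pages): m_get2(100, ...) returns a plain mbuf, m_get2(1500, ...)
 * returns a packet-zone mbuf with a 2K cluster attached, m_get2(3000, ...)
 * returns an mbuf with a page-size jumbo cluster, and m_get2(9000, ...)
 * returns NULL since only sizes up to MJUMPAGESIZE are handled here; larger
 * buffers must be requested explicitly via m_getjcl().
 */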
/*
 * m_getjcl() returns an mbuf with a cluster of the specified size attached.
 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
 */
struct mbuf *
m_getjcl(int how, short type, int flags, int size)
{
	struct mb_args args;
	struct mbuf *m, *n;
	uma_zone_t zone;

	if (size == MCLBYTES)
		return m_getcl(how, type, flags);

	args.flags = flags;
	args.type = type;

	m = uma_zalloc_arg(zone_mbuf, &args, how);
	if (m == NULL)
		return (NULL);

	zone = m_getzone(size);
	n = uma_zalloc_arg(zone, m, how);
	if (n == NULL) {
		uma_zfree(zone_mbuf, m);
		return (NULL);
	}
	return (m);
}

/*
 * Allocate a given length worth of mbufs and/or clusters (whatever fits
 * best) and return a pointer to the top of the allocated chain.  If an
 * existing mbuf chain is provided, then we will append the new chain
 * to the existing one but still return the top of the newly allocated
 * chain.
 */
struct mbuf *
m_getm2(struct mbuf *m, int len, int how, short type, int flags)
{
	struct mbuf *mb, *nm = NULL, *mtail = NULL;

	KASSERT(len >= 0, ("%s: len is < 0", __func__));

	/* Validate flags. */
	flags &= (M_PKTHDR | M_EOR);

	/* Packet header mbuf must be first in chain. */
	if ((flags & M_PKTHDR) && m != NULL)
		flags &= ~M_PKTHDR;

	/* Loop and append maximum sized mbufs to the chain tail. */
	while (len > 0) {
		if (len > MCLBYTES)
			mb = m_getjcl(how, type, (flags & M_PKTHDR),
			    MJUMPAGESIZE);
		else if (len >= MINCLSIZE)
			mb = m_getcl(how, type, (flags & M_PKTHDR));
		else if (flags & M_PKTHDR)
			mb = m_gethdr(how, type);
		else
			mb = m_get(how, type);

		/* Fail the whole operation if one mbuf can't be allocated. */
		if (mb == NULL) {
			if (nm != NULL)
				m_freem(nm);
			return (NULL);
		}

		/* Book keeping. */
		len -= M_SIZE(mb);
		if (mtail != NULL)
			mtail->m_next = mb;
		else
			nm = mb;
		mtail = mb;
		flags &= ~M_PKTHDR;	/* Only valid on the first mbuf. */
	}
	if (flags & M_EOR)
		mtail->m_flags |= M_EOR;	/* Only valid on the last mbuf. */

	/* If mbuf was supplied, append new chain to the end of it. */
	if (m != NULL) {
		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
			;
		mtail->m_next = nm;
		mtail->m_flags &= ~M_EOR;
	} else
		m = nm;

	return (m);
}

/*-
 * Configure a provided mbuf to refer to the provided external storage
 * buffer and set up a reference count for said buffer.
 *
 * Arguments:
 *    mb     The existing mbuf to which to attach the provided buffer.
 *    buf    The address of the provided external storage buffer.
 *    size   The size of the provided buffer.
 *    freef  A pointer to a routine that is responsible for freeing the
 *           provided external storage buffer.
 *    args   A pointer to an argument structure (of any type) to be passed
 *           to the provided freef routine (may be NULL).
 *    flags  Any other flags to be passed to the provided mbuf.
 *    type   The type that the external storage buffer should be
 *           labeled with.
 *
 * Returns:
 *    Nothing.
 */
void
m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef,
    void *arg1, void *arg2, int flags, int type)
{

	KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__));

	mb->m_flags |= (M_EXT | flags);
	mb->m_ext.ext_buf = buf;
	mb->m_data = mb->m_ext.ext_buf;
	mb->m_ext.ext_size = size;
	mb->m_ext.ext_free = freef;
	mb->m_ext.ext_arg1 = arg1;
	mb->m_ext.ext_arg2 = arg2;
	mb->m_ext.ext_type = type;

	if (type != EXT_EXTREF) {
		mb->m_ext.ext_count = 1;
		mb->m_ext.ext_flags = EXT_FLAG_EMBREF;
	} else
		mb->m_ext.ext_flags = 0;
}
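/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * attaching driver-owned storage to an mbuf with m_extadd().  The free
 * routine runs when the last reference to the buffer is dropped, after
 * which mb_free_ext() returns the mbuf itself to the mbuf zone.
 */
#if 0
static void
example_ext_free(struct mbuf *m)
{

	/* arg1/arg2 as passed to m_extadd() are available here. */
	free(m->m_ext.ext_arg1, M_DEVBUF);
}

static struct mbuf *
example_extadd(void)
{
	struct mbuf *m;
	char *buf;

	m = m_gethdr(M_NOWAIT, MT_DATA);
	buf = malloc(4096, M_DEVBUF, M_NOWAIT);
	if (m == NULL || buf == NULL) {
		m_freem(m);
		free(buf, M_DEVBUF);
		return (NULL);
	}
	m_extadd(m, buf, 4096, example_ext_free, buf, NULL, 0, EXT_NET_DRV);
	return (m);
}
#endif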
/*
 * Free an entire chain of mbufs and associated external buffers, if
 * applicable.
 */
void
m_freem(struct mbuf *mb)
{

	MBUF_PROBE1(m__freem, mb);
	while (mb != NULL)
		mb = m_free(mb);
}