/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2004, 2005,
 *      Bosko Milekic <bmilekic@FreeBSD.org>.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_param.h"

#include <sys/param.h>
#include <sys/conf.h>
#include <sys/domainset.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_var.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/uma.h>
#include <vm/uma_dbg.h>

/*
 * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
 * Zones.
 *
 * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
 * Zone.  The Zone can be capped at kern.ipc.nmbclusters, if the
 * administrator so desires.
 *
 * Mbufs are allocated from a UMA Master Zone called the Mbuf
 * Zone.
 *
 * Additionally, FreeBSD provides a Packet Zone, which it
 * configures as a Secondary Zone to the Mbuf Master Zone,
 * thus sharing backend Slab kegs with the Mbuf Master Zone.
 *
 * Thus common-case allocations and locking are simplified:
 *
 *  m_clget()                m_getcl()
 *   |                         |
 *   |   .------------>[(Packet Cache)]    m_get(), m_gethdr()
 *   |   |             [     Packet   ]            |
 * [(Cluster Cache)]   [    Secondary ]   [ (Mbuf Cache)     ]
 * [ Cluster Zone  ]   [     Zone     ]   [ Mbuf Master Zone ]
 *        |                       \________         |
 *  [ Cluster Keg   ]                      \        /
 *        |                            [ Mbuf Keg   ]
 *  [ Cluster Slabs ]                        |
 *        |                            [ Mbuf Slabs ]
 *         \____________(VM)_________________/
 *
 *
 * Whenever an object is allocated with uma_zalloc() out of
 * one of the Zones its _ctor_ function is executed.  The same
 * for any deallocation through uma_zfree() the _dtor_ function
 * is executed.
 *
 * Caches are per-CPU and are filled from the Master Zone.
 *
 * Whenever an object is allocated from the underlying global
 * memory pool it gets pre-initialized with the _zinit_ functions.
 * When the Keg's are overfull objects get decommissioned with
 * _zfini_ functions and free'd back to the global memory pool.
 *
 */
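
/*
 * Illustrative sketch (not part of the allocator itself): the two common
 * allocation paths shown in the diagram above, with minimal error
 * handling.
 *
 *      struct mbuf *m;
 *
 *      // One-shot mbuf + 2k cluster taken from the Packet Zone.
 *      m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 *
 *      // Equivalent two-step path: mbuf from the Mbuf Zone, then a
 *      // cluster from the Cluster Zone attached with m_clget().
 *      m = m_gethdr(M_NOWAIT, MT_DATA);
 *      if (m != NULL && m_clget(m, M_NOWAIT) == 0) {
 *              m_free(m);
 *              m = NULL;
 *      }
 */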

int nmbufs;                     /* limits number of mbufs */
int nmbclusters;                /* limits number of mbuf clusters */
int nmbjumbop;                  /* limits number of page size jumbo clusters */
int nmbjumbo9;                  /* limits number of 9k jumbo clusters */
int nmbjumbo16;                 /* limits number of 16k jumbo clusters */

static quad_t maxmbufmem;       /* overall real memory limit for all mbufs */

SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0,
    "Maximum real memory allocatable to various mbuf types");

static counter_u64_t snd_tag_count;
SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW,
    &snd_tag_count, "# of active mbuf send tags");

/*
 * tunable_mbinit() has to be run before any mbuf allocations are done.
 */
static void
tunable_mbinit(void *dummy)
{
        quad_t realmem;

        /*
         * The default limit for all mbuf related memory is 1/2 of all
         * available kernel memory (physical or kmem).
         * At most it can be 3/4 of available kernel memory.
         */
        realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size);
        maxmbufmem = realmem / 2;
        TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem);
        if (maxmbufmem > realmem / 4 * 3)
                maxmbufmem = realmem / 4 * 3;

        TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
        if (nmbclusters == 0)
                nmbclusters = maxmbufmem / MCLBYTES / 4;

        TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop);
        if (nmbjumbop == 0)
                nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4;

        TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9);
        if (nmbjumbo9 == 0)
                nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6;

        TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16);
        if (nmbjumbo16 == 0)
                nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6;
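
        /*
         * Illustrative example (assuming 4 KB pages, MCLBYTES of 2048 and
         * a hypothetical machine where realmem works out to 8 GB):
         * maxmbufmem defaults to 4 GB, giving roughly
         * nmbclusters = 4G / 2048 / 4 = 524288,
         * nmbjumbop = 4G / 4096 / 4 = 262144,
         * nmbjumbo9 = 4G / 9216 / 6 = 77672 and
         * nmbjumbo16 = 4G / 16384 / 6 = 43690.
         */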

        /*
         * We need at least as many mbufs as we have clusters of
         * the various types added together.
         */
        TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
        if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16)
                nmbufs = lmax(maxmbufmem / MSIZE / 5,
                    nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16);
}
SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL);

static int
sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
{
        int error, newnmbclusters;

        newnmbclusters = nmbclusters;
        error = sysctl_handle_int(oidp, &newnmbclusters, 0, req);
        if (error == 0 && req->newptr && newnmbclusters != nmbclusters) {
                if (newnmbclusters > nmbclusters &&
                    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
                        nmbclusters = newnmbclusters;
                        nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
                        EVENTHANDLER_INVOKE(nmbclusters_change);
                } else
                        error = EINVAL;
        }
        return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT|CTLFLAG_RW,
    &nmbclusters, 0, sysctl_nmbclusters, "IU",
    "Maximum number of mbuf clusters allowed");

static int
sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
{
        int error, newnmbjumbop;

        newnmbjumbop = nmbjumbop;
        error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req);
        if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) {
                if (newnmbjumbop > nmbjumbop &&
                    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
                        nmbjumbop = newnmbjumbop;
                        nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
                } else
                        error = EINVAL;
        }
        return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT|CTLFLAG_RW,
    &nmbjumbop, 0, sysctl_nmbjumbop, "IU",
    "Maximum number of mbuf page size jumbo clusters allowed");

static int
sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
{
        int error, newnmbjumbo9;

        newnmbjumbo9 = nmbjumbo9;
        error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req);
        if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) {
                if (newnmbjumbo9 > nmbjumbo9 &&
                    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
                        nmbjumbo9 = newnmbjumbo9;
                        nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
                } else
                        error = EINVAL;
        }
        return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT|CTLFLAG_RW,
    &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU",
    "Maximum number of mbuf 9k jumbo clusters allowed");

static int
sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS)
{
        int error, newnmbjumbo16;

        newnmbjumbo16 = nmbjumbo16;
        error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req);
        if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) {
                if (newnmbjumbo16 > nmbjumbo16 &&
                    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
                        nmbjumbo16 = newnmbjumbo16;
                        nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
                } else
                        error = EINVAL;
        }
        return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT|CTLFLAG_RW,
    &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU",
    "Maximum number of mbuf 16k jumbo clusters allowed");

static int
sysctl_nmbufs(SYSCTL_HANDLER_ARGS)
{
        int error, newnmbufs;

        newnmbufs = nmbufs;
        error = sysctl_handle_int(oidp, &newnmbufs, 0, req);
        if (error == 0 && req->newptr && newnmbufs != nmbufs) {
                if (newnmbufs > nmbufs) {
                        nmbufs = newnmbufs;
                        nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
                        EVENTHANDLER_INVOKE(nmbufs_change);
                } else
                        error = EINVAL;
        }
        return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT|CTLFLAG_RW,
    &nmbufs, 0, sysctl_nmbufs, "IU",
    "Maximum number of mbufs allowed");
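
/*
 * Illustrative note: the handlers above only allow these limits to be
 * raised at run time, e.g. via sysctl(8) (hypothetical value):
 *
 *      # sysctl kern.ipc.nmbclusters=1048576
 *
 * Attempts to lower a limit return EINVAL, as do increases of a cluster
 * limit while kern.ipc.nmbufs does not cover the sum of the current
 * cluster limits.
 */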

/*
 * Zones from which we allocate.
 */
uma_zone_t      zone_mbuf;
uma_zone_t      zone_clust;
uma_zone_t      zone_pack;
uma_zone_t      zone_jumbop;
uma_zone_t      zone_jumbo9;
uma_zone_t      zone_jumbo16;

/*
 * Local prototypes.
 */
static int      mb_ctor_mbuf(void *, int, void *, int);
static int      mb_ctor_clust(void *, int, void *, int);
static int      mb_ctor_pack(void *, int, void *, int);
static void     mb_dtor_mbuf(void *, int, void *);
static void     mb_dtor_pack(void *, int, void *);
static int      mb_zinit_pack(void *, int, int);
static void     mb_zfini_pack(void *, int);
static void     mb_reclaim(uma_zone_t, int);
static void    *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);

/* Ensure that MSIZE is a power of 2. */
CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);

/*
 * Initialize FreeBSD Network buffer allocation.
 */
static void
mbuf_init(void *dummy)
{

        /*
         * Configure UMA zones for Mbufs, Clusters, and Packets.
         */
        zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE,
            mb_ctor_mbuf, mb_dtor_mbuf,
#ifdef INVARIANTS
            trash_init, trash_fini,
#else
            NULL, NULL,
#endif
            MSIZE - 1, UMA_ZONE_MAXBUCKET);
        if (nmbufs > 0)
                nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
        uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached");
        uma_zone_set_maxaction(zone_mbuf, mb_reclaim);

        zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
            mb_ctor_clust,
#ifdef INVARIANTS
            trash_dtor, trash_init, trash_fini,
#else
            NULL, NULL, NULL,
#endif
            UMA_ALIGN_PTR, 0);
        if (nmbclusters > 0)
                nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
        uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached");
        uma_zone_set_maxaction(zone_clust, mb_reclaim);

        zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
            mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf);

        /* Make jumbo frame zone too.  Page size, 9k and 16k. */
        zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE,
            mb_ctor_clust,
#ifdef INVARIANTS
            trash_dtor, trash_init, trash_fini,
#else
            NULL, NULL, NULL,
#endif
            UMA_ALIGN_PTR, 0);
        if (nmbjumbop > 0)
                nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
        uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached");
        uma_zone_set_maxaction(zone_jumbop, mb_reclaim);

        zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES,
            mb_ctor_clust,
#ifdef INVARIANTS
            trash_dtor, trash_init, trash_fini,
#else
            NULL, NULL, NULL,
#endif
            UMA_ALIGN_PTR, 0);
        uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc);
        if (nmbjumbo9 > 0)
                nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
        uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached");
        uma_zone_set_maxaction(zone_jumbo9, mb_reclaim);

        zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES,
            mb_ctor_clust,
#ifdef INVARIANTS
            trash_dtor, trash_init, trash_fini,
#else
            NULL, NULL, NULL,
#endif
            UMA_ALIGN_PTR, 0);
        uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc);
        if (nmbjumbo16 > 0)
                nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
        uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
        uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);

        /*
         * Hook event handler for low-memory situation, used to
         * drain protocols and push data back to the caches (UMA
         * later pushes it back to VM).
         */
        EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
            EVENTHANDLER_PRI_FIRST);

        snd_tag_count = counter_u64_alloc(M_WAITOK);
}
SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL);

#ifdef NETDUMP
/*
 * netdump makes use of a pre-allocated pool of mbufs and clusters.  When
 * netdump is configured, we initialize a set of UMA cache zones which return
 * items from this pool.  At panic-time, the regular UMA zone pointers are
 * overwritten with those of the cache zones so that drivers may allocate and
 * free mbufs and clusters without attempting to allocate physical memory.
 *
 * We keep mbufs and clusters in a pair of mbuf queues.  In particular, for
 * the purpose of caching clusters, we treat them as mbufs.
 */
static struct mbufq nd_mbufq =
    { STAILQ_HEAD_INITIALIZER(nd_mbufq.mq_head), 0, INT_MAX };
static struct mbufq nd_clustq =
    { STAILQ_HEAD_INITIALIZER(nd_clustq.mq_head), 0, INT_MAX };

static int nd_clsize;
static uma_zone_t nd_zone_mbuf;
static uma_zone_t nd_zone_clust;
static uma_zone_t nd_zone_pack;

static int
nd_buf_import(void *arg, void **store, int count, int domain __unused,
    int flags)
{
        struct mbufq *q;
        struct mbuf *m;
        int i;

        q = arg;

        for (i = 0; i < count; i++) {
                m = mbufq_dequeue(q);
                if (m == NULL)
                        break;
                trash_init(m, q == &nd_mbufq ? MSIZE : nd_clsize, flags);
                store[i] = m;
        }
        KASSERT((flags & M_WAITOK) == 0 || i == count,
            ("%s: ran out of pre-allocated mbufs", __func__));
        return (i);
}

static void
nd_buf_release(void *arg, void **store, int count)
{
        struct mbufq *q;
        struct mbuf *m;
        int i;

        q = arg;

        for (i = 0; i < count; i++) {
                m = store[i];
                (void)mbufq_enqueue(q, m);
        }
}

static int
nd_pack_import(void *arg __unused, void **store, int count, int domain __unused,
    int flags __unused)
{
        struct mbuf *m;
        void *clust;
        int i;

        for (i = 0; i < count; i++) {
                m = m_get(MT_DATA, M_NOWAIT);
                if (m == NULL)
                        break;
                clust = uma_zalloc(nd_zone_clust, M_NOWAIT);
                if (clust == NULL) {
                        m_free(m);
                        break;
                }
                mb_ctor_clust(clust, nd_clsize, m, 0);
                store[i] = m;
        }
        KASSERT((flags & M_WAITOK) == 0 || i == count,
            ("%s: ran out of pre-allocated mbufs", __func__));
        return (i);
}

static void
nd_pack_release(void *arg __unused, void **store, int count)
{
        struct mbuf *m;
        void *clust;
        int i;

        for (i = 0; i < count; i++) {
                m = store[i];
                clust = m->m_ext.ext_buf;
                uma_zfree(nd_zone_clust, clust);
                uma_zfree(nd_zone_mbuf, m);
        }
}

/*
 * Free the pre-allocated mbufs and clusters reserved for netdump, and destroy
 * the corresponding UMA cache zones.
 */
void
netdump_mbuf_drain(void)
{
        struct mbuf *m;
        void *item;

        if (nd_zone_mbuf != NULL) {
                uma_zdestroy(nd_zone_mbuf);
                nd_zone_mbuf = NULL;
        }
        if (nd_zone_clust != NULL) {
                uma_zdestroy(nd_zone_clust);
                nd_zone_clust = NULL;
        }
        if (nd_zone_pack != NULL) {
                uma_zdestroy(nd_zone_pack);
                nd_zone_pack = NULL;
        }

        while ((m = mbufq_dequeue(&nd_mbufq)) != NULL)
                m_free(m);
        while ((item = mbufq_dequeue(&nd_clustq)) != NULL)
                uma_zfree(m_getzone(nd_clsize), item);
}

/*
 * Callback invoked immediately prior to starting a netdump.
 */
void
netdump_mbuf_dump(void)
{

        /*
         * All cluster zones return buffers of the size requested by the
         * drivers.  It's up to the driver to reinitialize the zones if the
         * MTU of a netdump-enabled interface changes.
         */
        printf("netdump: overwriting mbuf zone pointers\n");
        zone_mbuf = nd_zone_mbuf;
        zone_clust = nd_zone_clust;
        zone_pack = nd_zone_pack;
        zone_jumbop = nd_zone_clust;
        zone_jumbo9 = nd_zone_clust;
        zone_jumbo16 = nd_zone_clust;
}
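
/*
 * Illustrative sketch (hypothetical counts): a netdump client sizes the
 * pool for its receive path and re-creates it when the MTU of the
 * netdump-enabled interface changes, for example:
 *
 *      netdump_mbuf_reinit(1024, 1024, MJUMPAGESIZE);
 */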

/*
 * Reinitialize the netdump mbuf+cluster pool and cache zones.
 */
void
netdump_mbuf_reinit(int nmbuf, int nclust, int clsize)
{
        struct mbuf *m;
        void *item;

        netdump_mbuf_drain();

        nd_clsize = clsize;

        nd_zone_mbuf = uma_zcache_create("netdump_" MBUF_MEM_NAME,
            MSIZE, mb_ctor_mbuf, mb_dtor_mbuf,
#ifdef INVARIANTS
            trash_init, trash_fini,
#else
            NULL, NULL,
#endif
            nd_buf_import, nd_buf_release,
            &nd_mbufq, UMA_ZONE_NOBUCKET);

        nd_zone_clust = uma_zcache_create("netdump_" MBUF_CLUSTER_MEM_NAME,
            clsize, mb_ctor_clust,
#ifdef INVARIANTS
            trash_dtor, trash_init, trash_fini,
#else
            NULL, NULL, NULL,
#endif
            nd_buf_import, nd_buf_release,
            &nd_clustq, UMA_ZONE_NOBUCKET);

        nd_zone_pack = uma_zcache_create("netdump_" MBUF_PACKET_MEM_NAME,
            MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL,
            nd_pack_import, nd_pack_release,
            NULL, UMA_ZONE_NOBUCKET);

        while (nmbuf-- > 0) {
                m = m_get(MT_DATA, M_WAITOK);
                uma_zfree(nd_zone_mbuf, m);
        }
        while (nclust-- > 0) {
                item = uma_zalloc(m_getzone(nd_clsize), M_WAITOK);
                uma_zfree(nd_zone_clust, item);
        }
}
#endif /* NETDUMP */

/*
 * UMA backend page allocator for the jumbo frame zones.
 *
 * Allocates kernel virtual memory that is backed by contiguous physical
 * pages.
 */
static void *
mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
    int wait)
{

        /* Inform UMA that this allocator uses kernel_map/object. */
        *flags = UMA_SLAB_KERNEL;
        return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain),
            bytes, wait, (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0,
            VM_MEMATTR_DEFAULT));
}

/*
 * Constructor for Mbuf master zone.
 *
 * The 'arg' pointer points to a mb_args structure which
 * contains call-specific information required to support the
 * mbuf allocation API.  See mbuf.h.
 */
static int
mb_ctor_mbuf(void *mem, int size, void *arg, int how)
{
        struct mbuf *m;
        struct mb_args *args;
        int error;
        int flags;
        short type;

#ifdef INVARIANTS
        trash_ctor(mem, size, arg, how);
#endif
        args = (struct mb_args *)arg;
        type = args->type;

        /*
         * The mbuf is initialized later.  The caller has the
         * responsibility to set up any MAC labels too.
         */
        if (type == MT_NOINIT)
                return (0);

        m = (struct mbuf *)mem;
        flags = args->flags;
        MPASS((flags & M_NOFREE) == 0);

        error = m_init(m, how, type, flags);

        return (error);
}

/*
 * The Mbuf master zone destructor.
 */
static void
mb_dtor_mbuf(void *mem, int size, void *arg)
{
        struct mbuf *m;
        unsigned long flags;

        m = (struct mbuf *)mem;
        flags = (unsigned long)arg;

        KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
        if (!(flags & MB_DTOR_SKIP) && (m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags))
                m_tag_delete_chain(m, NULL);
#ifdef INVARIANTS
        trash_dtor(mem, size, arg);
#endif
}

/*
 * The Mbuf Packet zone destructor.
 */
static void
mb_dtor_pack(void *mem, int size, void *arg)
{
        struct mbuf *m;

        m = (struct mbuf *)mem;
        if ((m->m_flags & M_PKTHDR) != 0)
                m_tag_delete_chain(m, NULL);

        /* Make sure we've got a clean cluster back. */
        KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
        KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__));
        KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__));
        KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__));
        KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__));
        KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__));
        KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__));
#ifdef INVARIANTS
        trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg);
#endif
        /*
         * If there are processes blocked on zone_clust, waiting for pages
         * to be freed up, cause them to be woken up by draining the
         * packet zone.  We are exposed to a race here (in the check for
         * the UMA_ZFLAG_FULL) where we might miss the flag set, but that
         * is deliberate. We don't want to acquire the zone lock for every
         * mbuf free.
         */
        if (uma_zone_exhausted_nolock(zone_clust))
                zone_drain(zone_pack);
}

/*
 * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor.
 *
 * Here the 'arg' pointer points to the Mbuf which we
 * are configuring cluster storage for.  If 'arg' is
 * empty we allocate just the cluster without setting
 * the mbuf to it.  See mbuf.h.
 */
static int
mb_ctor_clust(void *mem, int size, void *arg, int how)
{
        struct mbuf *m;

#ifdef INVARIANTS
        trash_ctor(mem, size, arg, how);
#endif
        m = (struct mbuf *)arg;
        if (m != NULL) {
                m->m_ext.ext_buf = (char *)mem;
                m->m_data = m->m_ext.ext_buf;
                m->m_flags |= M_EXT;
                m->m_ext.ext_free = NULL;
                m->m_ext.ext_arg1 = NULL;
                m->m_ext.ext_arg2 = NULL;
                m->m_ext.ext_size = size;
                m->m_ext.ext_type = m_gettype(size);
                m->m_ext.ext_flags = EXT_FLAG_EMBREF;
                m->m_ext.ext_count = 1;
        }

        return (0);
}

/*
 * The Packet secondary zone's init routine, executed on the
 * object's transition from mbuf keg slab to zone cache.
 */
static int
mb_zinit_pack(void *mem, int size, int how)
{
        struct mbuf *m;

        m = (struct mbuf *)mem;         /* m is virgin. */
        if (uma_zalloc_arg(zone_clust, m, how) == NULL ||
            m->m_ext.ext_buf == NULL)
                return (ENOMEM);
        m->m_ext.ext_type = EXT_PACKET; /* Override. */
#ifdef INVARIANTS
        trash_init(m->m_ext.ext_buf, MCLBYTES, how);
#endif
        return (0);
}

/*
 * The Packet secondary zone's fini routine, executed on the
 * object's transition from zone cache to keg slab.
 */
static void
mb_zfini_pack(void *mem, int size)
{
        struct mbuf *m;

        m = (struct mbuf *)mem;
#ifdef INVARIANTS
        trash_fini(m->m_ext.ext_buf, MCLBYTES);
#endif
        uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
#ifdef INVARIANTS
        trash_dtor(mem, size, NULL);
#endif
}

/*
 * The "packet" keg constructor.
 */
static int
mb_ctor_pack(void *mem, int size, void *arg, int how)
{
        struct mbuf *m;
        struct mb_args *args;
        int error, flags;
        short type;

        m = (struct mbuf *)mem;
        args = (struct mb_args *)arg;
        flags = args->flags;
        type = args->type;
        MPASS((flags & M_NOFREE) == 0);

#ifdef INVARIANTS
        trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how);
#endif

        error = m_init(m, how, type, flags);

        /* m_ext is already initialized. */
        m->m_data = m->m_ext.ext_buf;
        m->m_flags = (flags | M_EXT);

        return (error);
}

/*
 * This is the protocol drain routine.  Called by UMA whenever any of the
 * mbuf zones is close to its limit.
 *
 * No locks should be held when this is called.  The drain routines have to
 * presently acquire some locks which raises the possibility of lock order
 * reversal.
 */
static void
mb_reclaim(uma_zone_t zone __unused, int pending __unused)
{
        struct domain *dp;
        struct protosw *pr;

        WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__);

        for (dp = domains; dp != NULL; dp = dp->dom_next)
                for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
                        if (pr->pr_drain != NULL)
                                (*pr->pr_drain)();
}

/*
 * Clean up after mbufs with M_EXT storage attached to them if the
 * reference count hits 1.
 */
void
mb_free_ext(struct mbuf *m)
{
        volatile u_int *refcnt;
        struct mbuf *mref;
        int freembuf;

        KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));

        /* See if this is the mbuf that holds the embedded refcount. */
        if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
                refcnt = &m->m_ext.ext_count;
                mref = m;
        } else {
                KASSERT(m->m_ext.ext_cnt != NULL,
                    ("%s: no refcounting pointer on %p", __func__, m));
                refcnt = m->m_ext.ext_cnt;
                mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
        }

        /*
         * Check if the header is embedded in the cluster.  It is
         * important that we can't touch any of the mbuf fields
         * after we have freed the external storage, since mbuf
         * could have been embedded in it.  For now, the mbufs
         * embedded into the cluster are always of type EXT_EXTREF,
         * and for this type we won't free the mref.
         */
        if (m->m_flags & M_NOFREE) {
                freembuf = 0;
                KASSERT(m->m_ext.ext_type == EXT_EXTREF ||
                    m->m_ext.ext_type == EXT_RXRING,
                    ("%s: no-free mbuf %p has wrong type", __func__, m));
        } else
                freembuf = 1;

        /* Free attached storage if this mbuf is the only reference to it. */
        if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) {
                switch (m->m_ext.ext_type) {
                case EXT_PACKET:
                        /* The packet zone is special. */
                        if (*refcnt == 0)
                                *refcnt = 1;
                        uma_zfree(zone_pack, mref);
                        break;
                case EXT_CLUSTER:
                        uma_zfree(zone_clust, m->m_ext.ext_buf);
                        uma_zfree(zone_mbuf, mref);
                        break;
                case EXT_JUMBOP:
                        uma_zfree(zone_jumbop, m->m_ext.ext_buf);
                        uma_zfree(zone_mbuf, mref);
                        break;
                case EXT_JUMBO9:
                        uma_zfree(zone_jumbo9, m->m_ext.ext_buf);
                        uma_zfree(zone_mbuf, mref);
                        break;
                case EXT_JUMBO16:
                        uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
                        uma_zfree(zone_mbuf, mref);
                        break;
                case EXT_SFBUF:
                case EXT_NET_DRV:
                case EXT_MOD_TYPE:
                case EXT_DISPOSABLE:
                        KASSERT(mref->m_ext.ext_free != NULL,
                            ("%s: ext_free not set", __func__));
                        mref->m_ext.ext_free(mref);
                        uma_zfree(zone_mbuf, mref);
                        break;
                case EXT_EXTREF:
                        KASSERT(m->m_ext.ext_free != NULL,
                            ("%s: ext_free not set", __func__));
                        m->m_ext.ext_free(m);
                        break;
                case EXT_RXRING:
                        KASSERT(m->m_ext.ext_free == NULL,
                            ("%s: ext_free is set", __func__));
                        break;
                default:
                        KASSERT(m->m_ext.ext_type == 0,
                            ("%s: unknown ext_type", __func__));
                }
        }

        if (freembuf && m != mref)
                uma_zfree(zone_mbuf, m);
}

/*
 * Official mbuf(9) allocation KPI for stack and drivers:
 *
 * m_get()      - a single mbuf without any attachments, sys/mbuf.h.
 * m_gethdr()   - a single mbuf initialized as M_PKTHDR, sys/mbuf.h.
 * m_getcl()    - an mbuf + 2k cluster, sys/mbuf.h.
 * m_clget()    - attach cluster to already allocated mbuf.
 * m_cljget()   - attach jumbo cluster to already allocated mbuf.
 * m_get2()     - allocate minimum mbuf that would fit size argument.
 * m_getm2()    - allocate a chain of mbufs/clusters.
 * m_extadd()   - attach external cluster to mbuf.
 *
 * m_free()     - free single mbuf with its tags and ext, sys/mbuf.h.
 * m_freem()    - free chain of mbufs.
 */

int
m_clget(struct mbuf *m, int how)
{

        KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
            __func__, m));
        m->m_ext.ext_buf = (char *)NULL;
        uma_zalloc_arg(zone_clust, m, how);
        /*
         * On a cluster allocation failure, drain the packet zone and retry,
         * we might be able to loosen a few clusters up on the drain.
         */
        if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) {
                zone_drain(zone_pack);
                uma_zalloc_arg(zone_clust, m, how);
        }
        MBUF_PROBE2(m__clget, m, how);
        return (m->m_flags & M_EXT);
}

/*
 * m_cljget() is different from m_clget() as it can allocate clusters without
 * attaching them to an mbuf.  In that case the return value is the pointer
 * to the cluster of the requested size.  If an mbuf was specified, it gets
 * the cluster attached to it and the return value can be safely ignored.
 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
 */
void *
m_cljget(struct mbuf *m, int how, int size)
{
        uma_zone_t zone;
        void *retval;

        if (m != NULL) {
                KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
                    __func__, m));
                m->m_ext.ext_buf = NULL;
        }

        zone = m_getzone(size);
        retval = uma_zalloc_arg(zone, m, how);

        MBUF_PROBE4(m__cljget, m, how, size, retval);

        return (retval);
}
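
/*
 * Illustrative sketch of the sizes m_get2() below maps to the different
 * zones (assuming 4 KB pages, so MJUMPAGESIZE == 4096, and the usual
 * MHLEN/MCLBYTES values):
 *
 *      m = m_get2(100, M_NOWAIT, MT_DATA, M_PKTHDR);    // plain mbuf
 *      m = m_get2(1500, M_NOWAIT, MT_DATA, M_PKTHDR);   // mbuf + 2k cluster
 *      m = m_get2(3000, M_NOWAIT, MT_DATA, M_PKTHDR);   // mbuf + page-size cluster
 *      m = m_get2(10000, M_NOWAIT, MT_DATA, M_PKTHDR);  // NULL, too large
 */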

/*
 * m_get2() allocates minimum mbuf that would fit "size" argument.
 */
struct mbuf *
m_get2(int size, int how, short type, int flags)
{
        struct mb_args args;
        struct mbuf *m, *n;

        args.flags = flags;
        args.type = type;

        if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0))
                return (uma_zalloc_arg(zone_mbuf, &args, how));
        if (size <= MCLBYTES)
                return (uma_zalloc_arg(zone_pack, &args, how));

        if (size > MJUMPAGESIZE)
                return (NULL);

        m = uma_zalloc_arg(zone_mbuf, &args, how);
        if (m == NULL)
                return (NULL);

        n = uma_zalloc_arg(zone_jumbop, m, how);
        if (n == NULL) {
                uma_zfree(zone_mbuf, m);
                return (NULL);
        }

        return (m);
}

/*
 * m_getjcl() returns an mbuf with a cluster of the specified size attached.
 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
 */
struct mbuf *
m_getjcl(int how, short type, int flags, int size)
{
        struct mb_args args;
        struct mbuf *m, *n;
        uma_zone_t zone;

        if (size == MCLBYTES)
                return m_getcl(how, type, flags);

        args.flags = flags;
        args.type = type;

        m = uma_zalloc_arg(zone_mbuf, &args, how);
        if (m == NULL)
                return (NULL);

        zone = m_getzone(size);
        n = uma_zalloc_arg(zone, m, how);
        if (n == NULL) {
                uma_zfree(zone_mbuf, m);
                return (NULL);
        }
        return (m);
}

/*
 * Allocate a given length worth of mbufs and/or clusters (whatever fits
 * best) and return a pointer to the top of the allocated chain.  If an
 * existing mbuf chain is provided, then we will append the new chain
 * to the existing one and return a pointer to the provided mbuf.
 */
struct mbuf *
m_getm2(struct mbuf *m, int len, int how, short type, int flags)
{
        struct mbuf *mb, *nm = NULL, *mtail = NULL;

        KASSERT(len >= 0, ("%s: len is < 0", __func__));

        /* Validate flags. */
        flags &= (M_PKTHDR | M_EOR);

        /* Packet header mbuf must be first in chain. */
        if ((flags & M_PKTHDR) && m != NULL)
                flags &= ~M_PKTHDR;

        /* Loop and append maximum sized mbufs to the chain tail. */
        while (len > 0) {
                if (len > MCLBYTES)
                        mb = m_getjcl(how, type, (flags & M_PKTHDR),
                            MJUMPAGESIZE);
                else if (len >= MINCLSIZE)
                        mb = m_getcl(how, type, (flags & M_PKTHDR));
                else if (flags & M_PKTHDR)
                        mb = m_gethdr(how, type);
                else
                        mb = m_get(how, type);

                /* Fail the whole operation if one mbuf can't be allocated. */
                if (mb == NULL) {
                        if (nm != NULL)
                                m_freem(nm);
                        return (NULL);
                }

                /* Book keeping. */
                len -= M_SIZE(mb);
                if (mtail != NULL)
                        mtail->m_next = mb;
                else
                        nm = mb;
                mtail = mb;
                flags &= ~M_PKTHDR;     /* Only valid on the first mbuf. */
        }
        if (flags & M_EOR)
                mtail->m_flags |= M_EOR;        /* Only valid on the last mbuf. */

        /* If mbuf was supplied, append new chain to the end of it. */
        if (m != NULL) {
                for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
                        ;
                mtail->m_next = nm;
                mtail->m_flags &= ~M_EOR;
        } else
                m = nm;

        return (m);
}
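
/*
 * Illustrative sketch (hypothetical length): building a fresh chain with
 * room for a 64k payload and a packet header on the first mbuf:
 *
 *      struct mbuf *chain;
 *
 *      chain = m_getm2(NULL, 65536, M_WAITOK, MT_DATA, M_PKTHDR);
 *      // The chain is built mostly from page-size jumbo clusters and is
 *      // released again with m_freem(chain).
 */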

/*-
 * Configure a provided mbuf to refer to the provided external storage
 * buffer and setup a reference count for said buffer.
 *
 * Arguments:
 *    mb     The existing mbuf to which to attach the provided buffer.
 *    buf    The address of the provided external storage buffer.
 *    size   The size of the provided buffer.
 *    freef  A pointer to a routine that is responsible for freeing the
 *           provided external storage buffer.
 *    args   A pointer to an argument structure (of any type) to be passed
 *           to the provided freef routine (may be NULL).
 *    flags  Any other flags to be passed to the provided mbuf.
 *    type   The type that the external storage buffer should be
 *           labeled with.
 *
 * Returns:
 *    Nothing.
 */
void
m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef,
    void *arg1, void *arg2, int flags, int type)
{

        KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__));

        mb->m_flags |= (M_EXT | flags);
        mb->m_ext.ext_buf = buf;
        mb->m_data = mb->m_ext.ext_buf;
        mb->m_ext.ext_size = size;
        mb->m_ext.ext_free = freef;
        mb->m_ext.ext_arg1 = arg1;
        mb->m_ext.ext_arg2 = arg2;
        mb->m_ext.ext_type = type;

        if (type != EXT_EXTREF) {
                mb->m_ext.ext_count = 1;
                mb->m_ext.ext_flags = EXT_FLAG_EMBREF;
        } else
                mb->m_ext.ext_flags = 0;
}

/*
 * Free an entire chain of mbufs and associated external buffers, if
 * applicable.
 */
void
m_freem(struct mbuf *mb)
{

        MBUF_PROBE1(m__freem, mb);
        while (mb != NULL)
                mb = m_free(mb);
}

void
m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp)
{

        if_ref(ifp);
        mst->ifp = ifp;
        refcount_init(&mst->refcount, 1);
        counter_u64_add(snd_tag_count, 1);
}

void
m_snd_tag_destroy(struct m_snd_tag *mst)
{
        struct ifnet *ifp;

        ifp = mst->ifp;
        ifp->if_snd_tag_free(mst);
        if_rele(ifp);
        counter_u64_add(snd_tag_count, -1);
}