/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2004, 2005,
 *	Bosko Milekic <bmilekic@FreeBSD.org>.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_param.h"

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/uma.h>
#include <vm/uma_dbg.h>

/*
 * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
 * Zones.
 *
 * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
 * Zone.  The Zone can be capped at kern.ipc.nmbclusters, if the
 * administrator so desires.
 *
 * Mbufs are allocated from a UMA Master Zone called the Mbuf
 * Zone.
 *
 * Additionally, FreeBSD provides a Packet Zone, which it
 * configures as a Secondary Zone to the Mbuf Master Zone,
 * thus sharing backend Slab kegs with the Mbuf Master Zone.
 *
 * Thus common-case allocations and locking are simplified:
 *
 *  m_clget()                m_getcl()
 *   |                         |
 *   |   .------------>[(Packet Cache)]    m_get(), m_gethdr()
 *   |   |             [     Packet   ]            |
 * [(Cluster Cache)]   [   Secondary  ]    [ (Mbuf Cache)     ]
 * [ Cluster Zone  ]   [     Zone     ]    [ Mbuf Master Zone ]
 *        |                   \________          |
 * [ Cluster Keg   ]                   \        /
 *        |                         [ Mbuf Keg   ]
 * [ Cluster Slabs ]                     |
 *        |                         [ Mbuf Slabs ]
 *         \____________(VM)_________________/
 *
 *
 * Whenever an object is allocated with uma_zalloc() out of
 * one of the Zones its _ctor_ function is executed.  Likewise,
 * for any deallocation through uma_zfree() the _dtor_ function
 * is executed.
 *
 * Caches are per-CPU and are filled from the Master Zone.
 *
 * Whenever an object is allocated from the underlying global
 * memory pool it gets pre-initialized with the _zinit_ functions.
 * When the Kegs are overfull, objects get decommissioned with the
 * _zfini_ functions and freed back to the global memory pool.
 *
 */
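
/*
 * For illustration only: m_getcl() reduces to uma_zalloc_arg(zone_pack,
 * &args, how), which runs mb_ctor_pack() on an item whose cluster was
 * attached by mb_zinit_pack() when the mbuf first moved from the keg into
 * the Packet Zone cache; freeing that mbuf ends in uma_zfree(zone_pack, m),
 * which runs mb_dtor_pack() and leaves the cluster attached for reuse.
 */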

int nmbufs;			/* limits number of mbufs */
int nmbclusters;		/* limits number of mbuf clusters */
int nmbjumbop;			/* limits number of page size jumbo clusters */
int nmbjumbo9;			/* limits number of 9k jumbo clusters */
int nmbjumbo16;			/* limits number of 16k jumbo clusters */

static quad_t maxmbufmem;	/* overall real memory limit for all mbufs */

SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &maxmbufmem, 0, "Maximum real memory allocatable to various mbuf types");

/*
 * tunable_mbinit() has to be run before any mbuf allocations are done.
 */
static void
tunable_mbinit(void *dummy)
{
	quad_t realmem;

	/*
	 * The default limit for all mbuf related memory is 1/2 of all
	 * available kernel memory (physical or kmem).
	 * At most it can be 3/4 of available kernel memory.
	 */
	realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size);
	maxmbufmem = realmem / 2;
	TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem);
	if (maxmbufmem > realmem / 4 * 3)
		maxmbufmem = realmem / 4 * 3;

	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
	if (nmbclusters == 0)
		nmbclusters = maxmbufmem / MCLBYTES / 4;

	TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop);
	if (nmbjumbop == 0)
		nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4;

	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9);
	if (nmbjumbo9 == 0)
		nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6;

	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16);
	if (nmbjumbo16 == 0)
		nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6;

	/*
	 * We need at least as many mbufs as we have clusters of
	 * the various types added together.
	 */
	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
	if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16)
		nmbufs = lmax(maxmbufmem / MSIZE / 5,
		    nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16);
}
SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL);
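
/*
 * Sysctl handlers for the run-time mbuf and cluster limits.  The checks
 * below only allow a limit to be raised, and a cluster limit may only be
 * raised while kern.ipc.nmbufs still covers the sum of all cluster limits.
 */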

static int
sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbclusters;

	newnmbclusters = nmbclusters;
	error = sysctl_handle_int(oidp, &newnmbclusters, 0, req);
	if (error == 0 && req->newptr && newnmbclusters != nmbclusters) {
		if (newnmbclusters > nmbclusters &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbclusters = newnmbclusters;
			nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
			EVENTHANDLER_INVOKE(nmbclusters_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT|CTLFLAG_RW,
    &nmbclusters, 0, sysctl_nmbclusters, "IU",
    "Maximum number of mbuf clusters allowed");

static int
sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbjumbop;

	newnmbjumbop = nmbjumbop;
	error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req);
	if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) {
		if (newnmbjumbop > nmbjumbop &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbjumbop = newnmbjumbop;
			nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT|CTLFLAG_RW,
    &nmbjumbop, 0, sysctl_nmbjumbop, "IU",
    "Maximum number of mbuf page size jumbo clusters allowed");

static int
sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbjumbo9;

	newnmbjumbo9 = nmbjumbo9;
	error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req);
	if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) {
		if (newnmbjumbo9 > nmbjumbo9 &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbjumbo9 = newnmbjumbo9;
			nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT|CTLFLAG_RW,
    &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU",
    "Maximum number of mbuf 9k jumbo clusters allowed");

static int
sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbjumbo16;

	newnmbjumbo16 = nmbjumbo16;
	error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req);
	if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) {
		if (newnmbjumbo16 > nmbjumbo16 &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbjumbo16 = newnmbjumbo16;
			nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT|CTLFLAG_RW,
    &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU",
    "Maximum number of mbuf 16k jumbo clusters allowed");

static int
sysctl_nmbufs(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbufs;

	newnmbufs = nmbufs;
	error = sysctl_handle_int(oidp, &newnmbufs, 0, req);
	if (error == 0 && req->newptr && newnmbufs != nmbufs) {
		if (newnmbufs > nmbufs) {
			nmbufs = newnmbufs;
			nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
			EVENTHANDLER_INVOKE(nmbufs_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT|CTLFLAG_RW,
    &nmbufs, 0, sysctl_nmbufs, "IU",
    "Maximum number of mbufs allowed");

/*
 * Zones from which we allocate.
 */
uma_zone_t	zone_mbuf;
uma_zone_t	zone_clust;
uma_zone_t	zone_pack;
uma_zone_t	zone_jumbop;
uma_zone_t	zone_jumbo9;
uma_zone_t	zone_jumbo16;

/*
 * Local prototypes.
 */
static int	mb_ctor_mbuf(void *, int, void *, int);
static int	mb_ctor_clust(void *, int, void *, int);
static int	mb_ctor_pack(void *, int, void *, int);
static void	mb_dtor_mbuf(void *, int, void *);
static void	mb_dtor_pack(void *, int, void *);
static int	mb_zinit_pack(void *, int, int);
static void	mb_zfini_pack(void *, int);
static void	mb_reclaim(uma_zone_t, int);
static void    *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);

/* Ensure that MSIZE is a power of 2. */
CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
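/*
 * (The assertion holds because, for a power of two such as MSIZE == 256
 * (0x100), (MSIZE - 1) ^ MSIZE sets every bit up to and including the
 * highest bit of MSIZE (0x1ff); adding one and shifting right recovers
 * MSIZE exactly, which is not the case for any value with more than one
 * bit set.)
 */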

/*
 * Initialize FreeBSD Network buffer allocation.
 */
static void
mbuf_init(void *dummy)
{

	/*
	 * Configure UMA zones for Mbufs, Clusters, and Packets.
	 */
	zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE,
	    mb_ctor_mbuf, mb_dtor_mbuf,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    MSIZE - 1, UMA_ZONE_MAXBUCKET);
	if (nmbufs > 0)
		nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
	uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached");
	uma_zone_set_maxaction(zone_mbuf, mb_reclaim);

	zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
	    mb_ctor_clust,
#ifdef INVARIANTS
	    trash_dtor, trash_init, trash_fini,
#else
	    NULL, NULL, NULL,
#endif
	    UMA_ALIGN_PTR, 0);
	if (nmbclusters > 0)
		nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
	uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached");
	uma_zone_set_maxaction(zone_clust, mb_reclaim);

	zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
	    mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf);

	/* Make jumbo frame zone too.  Page size, 9k and 16k. */
	zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE,
	    mb_ctor_clust,
#ifdef INVARIANTS
	    trash_dtor, trash_init, trash_fini,
#else
	    NULL, NULL, NULL,
#endif
	    UMA_ALIGN_PTR, 0);
	if (nmbjumbop > 0)
		nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
	uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached");
	uma_zone_set_maxaction(zone_jumbop, mb_reclaim);

	zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES,
	    mb_ctor_clust,
#ifdef INVARIANTS
	    trash_dtor, trash_init, trash_fini,
#else
	    NULL, NULL, NULL,
#endif
	    UMA_ALIGN_PTR, 0);
	uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc);
	if (nmbjumbo9 > 0)
		nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
	uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached");
	uma_zone_set_maxaction(zone_jumbo9, mb_reclaim);

	zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES,
	    mb_ctor_clust,
#ifdef INVARIANTS
	    trash_dtor, trash_init, trash_fini,
#else
	    NULL, NULL, NULL,
#endif
	    UMA_ALIGN_PTR, 0);
	uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc);
	if (nmbjumbo16 > 0)
		nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
	uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
	uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);

	/*
	 * Hook event handler for low-memory situation, used to
	 * drain protocols and push data back to the caches (UMA
	 * later pushes it back to VM).
	 */
	EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
	    EVENTHANDLER_PRI_FIRST);
}
SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL);

#ifdef NETDUMP
/*
 * netdump makes use of a pre-allocated pool of mbufs and clusters.  When
 * netdump is configured, we initialize a set of UMA cache zones which return
 * items from this pool.  At panic-time, the regular UMA zone pointers are
 * overwritten with those of the cache zones so that drivers may allocate and
 * free mbufs and clusters without attempting to allocate physical memory.
 *
 * We keep mbufs and clusters in a pair of mbuf queues.  In particular, for
 * the purpose of caching clusters, we treat them as mbufs.
 */
static struct mbufq nd_mbufq =
    { STAILQ_HEAD_INITIALIZER(nd_mbufq.mq_head), 0, INT_MAX };
static struct mbufq nd_clustq =
    { STAILQ_HEAD_INITIALIZER(nd_clustq.mq_head), 0, INT_MAX };

static int nd_clsize;
static uma_zone_t nd_zone_mbuf;
static uma_zone_t nd_zone_clust;
static uma_zone_t nd_zone_pack;
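
/*
 * Import and release callbacks for the netdump mbuf and cluster cache
 * zones: items are dequeued from and re-enqueued onto the pre-allocated
 * queue passed as the zone argument, so importing items never allocates
 * new memory.
 */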
static int
nd_buf_import(void *arg, void **store, int count, int domain __unused,
    int flags)
{
	struct mbufq *q;
	struct mbuf *m;
	int i;

	q = arg;

	for (i = 0; i < count; i++) {
		m = mbufq_dequeue(q);
		if (m == NULL)
			break;
		trash_init(m, q == &nd_mbufq ? MSIZE : nd_clsize, flags);
		store[i] = m;
	}
	return (i);
}

static void
nd_buf_release(void *arg, void **store, int count)
{
	struct mbufq *q;
	struct mbuf *m;
	int i;

	q = arg;

	for (i = 0; i < count; i++) {
		m = store[i];
		(void)mbufq_enqueue(q, m);
	}
}

static int
nd_pack_import(void *arg __unused, void **store, int count, int domain __unused,
    int flags __unused)
{
	struct mbuf *m;
	void *clust;
	int i;

	for (i = 0; i < count; i++) {
		m = m_get(MT_DATA, M_NOWAIT);
		if (m == NULL)
			break;
		clust = uma_zalloc(nd_zone_clust, M_NOWAIT);
		if (clust == NULL) {
			m_free(m);
			break;
		}
		mb_ctor_clust(clust, nd_clsize, m, 0);
		store[i] = m;
	}
	return (i);
}

static void
nd_pack_release(void *arg __unused, void **store, int count)
{
	struct mbuf *m;
	void *clust;
	int i;

	for (i = 0; i < count; i++) {
		m = store[i];
		clust = m->m_ext.ext_buf;
		uma_zfree(nd_zone_clust, clust);
		uma_zfree(nd_zone_mbuf, m);
	}
}

/*
 * Free the pre-allocated mbufs and clusters reserved for netdump, and destroy
 * the corresponding UMA cache zones.
 */
void
netdump_mbuf_drain(void)
{
	struct mbuf *m;
	void *item;

	if (nd_zone_mbuf != NULL) {
		uma_zdestroy(nd_zone_mbuf);
		nd_zone_mbuf = NULL;
	}
	if (nd_zone_clust != NULL) {
		uma_zdestroy(nd_zone_clust);
		nd_zone_clust = NULL;
	}
	if (nd_zone_pack != NULL) {
		uma_zdestroy(nd_zone_pack);
		nd_zone_pack = NULL;
	}

	while ((m = mbufq_dequeue(&nd_mbufq)) != NULL)
		m_free(m);
	while ((item = mbufq_dequeue(&nd_clustq)) != NULL)
		uma_zfree(m_getzone(nd_clsize), item);
}

/*
 * Callback invoked immediately prior to starting a netdump.
 */
void
netdump_mbuf_dump(void)
{

	/*
	 * All cluster zones return buffers of the size requested by the
	 * drivers.  It's up to the driver to reinitialize the zones if the
	 * MTU of a netdump-enabled interface changes.
	 */
	printf("netdump: overwriting mbuf zone pointers\n");
	zone_mbuf = nd_zone_mbuf;
	zone_clust = nd_zone_clust;
	zone_pack = nd_zone_pack;
	zone_jumbop = nd_zone_clust;
	zone_jumbo9 = nd_zone_clust;
	zone_jumbo16 = nd_zone_clust;
}

/*
 * Reinitialize the netdump mbuf+cluster pool and cache zones.
 */
void
netdump_mbuf_reinit(int nmbuf, int nclust, int clsize)
{
	struct mbuf *m;
	void *item;

	netdump_mbuf_drain();

	nd_clsize = clsize;

	nd_zone_mbuf = uma_zcache_create("netdump_" MBUF_MEM_NAME,
	    MSIZE, mb_ctor_mbuf, mb_dtor_mbuf,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    nd_buf_import, nd_buf_release,
	    &nd_mbufq, UMA_ZONE_NOBUCKET);

	nd_zone_clust = uma_zcache_create("netdump_" MBUF_CLUSTER_MEM_NAME,
	    clsize, mb_ctor_clust,
#ifdef INVARIANTS
	    trash_dtor, trash_init, trash_fini,
#else
	    NULL, NULL, NULL,
#endif
	    nd_buf_import, nd_buf_release,
	    &nd_clustq, UMA_ZONE_NOBUCKET);

	nd_zone_pack = uma_zcache_create("netdump_" MBUF_PACKET_MEM_NAME,
	    MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL,
	    nd_pack_import, nd_pack_release,
	    NULL, UMA_ZONE_NOBUCKET);

	while (nmbuf-- > 0) {
		m = m_get(MT_DATA, M_WAITOK);
		uma_zfree(nd_zone_mbuf, m);
	}
	while (nclust-- > 0) {
		item = uma_zalloc(m_getzone(nd_clsize), M_WAITOK);
		uma_zfree(nd_zone_clust, item);
	}
}
#endif /* NETDUMP */

/*
 * UMA backend page allocator for the jumbo frame zones.
 *
 * Allocates kernel virtual memory that is backed by contiguous physical
 * pages.
 */
static void *
mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
    int wait)
{

	/* Inform UMA that this allocator uses kernel_map/object. */
	*flags = UMA_SLAB_KERNEL;
	return ((void *)kmem_alloc_contig_domain(domain, bytes, wait,
	    (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT));
}

/*
 * Constructor for Mbuf master zone.
 *
 * The 'arg' pointer points to a mb_args structure which
 * contains call-specific information required to support the
 * mbuf allocation API.  See mbuf.h.
 */
static int
mb_ctor_mbuf(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;
	struct mb_args *args;
	int error;
	int flags;
	short type;

#ifdef INVARIANTS
	trash_ctor(mem, size, arg, how);
#endif
	args = (struct mb_args *)arg;
	type = args->type;

	/*
	 * The mbuf is initialized later.  The caller has the
	 * responsibility to set up any MAC labels too.
	 */
	if (type == MT_NOINIT)
		return (0);

	m = (struct mbuf *)mem;
	flags = args->flags;
	MPASS((flags & M_NOFREE) == 0);

	error = m_init(m, how, type, flags);

	return (error);
}

/*
 * The Mbuf master zone destructor.
 */
static void
mb_dtor_mbuf(void *mem, int size, void *arg)
{
	struct mbuf *m;
	unsigned long flags;

	m = (struct mbuf *)mem;
	flags = (unsigned long)arg;

	KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
	if (!(flags & MB_DTOR_SKIP) && (m->m_flags & M_PKTHDR) &&
	    !SLIST_EMPTY(&m->m_pkthdr.tags))
		m_tag_delete_chain(m, NULL);
#ifdef INVARIANTS
	trash_dtor(mem, size, arg);
#endif
}

/*
 * The Mbuf Packet zone destructor.
 */
static void
mb_dtor_pack(void *mem, int size, void *arg)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;
	if ((m->m_flags & M_PKTHDR) != 0)
		m_tag_delete_chain(m, NULL);

	/* Make sure we've got a clean cluster back. */
	KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
	KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__));
	KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__));
	KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__));
	KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__));
	KASSERT(m->m_ext.ext_size == MCLBYTES,
	    ("%s: ext_size != MCLBYTES", __func__));
	KASSERT(m->m_ext.ext_type == EXT_PACKET,
	    ("%s: ext_type != EXT_PACKET", __func__));
#ifdef INVARIANTS
	trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg);
#endif
	/*
	 * If there are processes blocked on zone_clust, waiting for pages
	 * to be freed up, cause them to be woken up by draining the
	 * packet zone.  We are exposed to a race here (in the check for
	 * the UMA_ZFLAG_FULL) where we might miss the flag set, but that
	 * is deliberate.  We don't want to acquire the zone lock for every
	 * mbuf free.
	 */
	if (uma_zone_exhausted_nolock(zone_clust))
		zone_drain(zone_pack);
}

/*
 * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor.
 *
 * Here the 'arg' pointer points to the Mbuf which we
 * are configuring cluster storage for.  If 'arg' is
 * empty we allocate just the cluster without setting
 * the mbuf to it.  See mbuf.h.
 */
static int
mb_ctor_clust(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;

#ifdef INVARIANTS
	trash_ctor(mem, size, arg, how);
#endif
	m = (struct mbuf *)arg;
	if (m != NULL) {
		m->m_ext.ext_buf = (char *)mem;
		m->m_data = m->m_ext.ext_buf;
		m->m_flags |= M_EXT;
		m->m_ext.ext_free = NULL;
		m->m_ext.ext_arg1 = NULL;
		m->m_ext.ext_arg2 = NULL;
		m->m_ext.ext_size = size;
		m->m_ext.ext_type = m_gettype(size);
		m->m_ext.ext_flags = EXT_FLAG_EMBREF;
		m->m_ext.ext_count = 1;
	}

	return (0);
}

/*
 * The Packet secondary zone's init routine, executed on the
 * object's transition from mbuf keg slab to zone cache.
 */
static int
mb_zinit_pack(void *mem, int size, int how)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;		/* m is virgin. */
	if (uma_zalloc_arg(zone_clust, m, how) == NULL ||
	    m->m_ext.ext_buf == NULL)
		return (ENOMEM);
	m->m_ext.ext_type = EXT_PACKET;	/* Override. */
#ifdef INVARIANTS
	trash_init(m->m_ext.ext_buf, MCLBYTES, how);
#endif
	return (0);
}

/*
 * The Packet secondary zone's fini routine, executed on the
 * object's transition from zone cache to keg slab.
 */
static void
mb_zfini_pack(void *mem, int size)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;
#ifdef INVARIANTS
	trash_fini(m->m_ext.ext_buf, MCLBYTES);
#endif
	uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
#ifdef INVARIANTS
	trash_dtor(mem, size, NULL);
#endif
}

/*
 * The "packet" keg constructor.
 */
static int
mb_ctor_pack(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;
	struct mb_args *args;
	int error, flags;
	short type;

	m = (struct mbuf *)mem;
	args = (struct mb_args *)arg;
	flags = args->flags;
	type = args->type;
	MPASS((flags & M_NOFREE) == 0);

#ifdef INVARIANTS
	trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how);
#endif

	error = m_init(m, how, type, flags);

	/* m_ext is already initialized. */
	m->m_data = m->m_ext.ext_buf;
	m->m_flags = (flags | M_EXT);

	return (error);
}

/*
 * This is the protocol drain routine.  Called by UMA whenever any of the
 * mbuf zones is close to its limit.
 *
 * No locks should be held when this is called.  The drain routines have to
 * presently acquire some locks which raises the possibility of lock order
 * reversal.
 */
static void
mb_reclaim(uma_zone_t zone __unused, int pending __unused)
{
	struct domain *dp;
	struct protosw *pr;

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__);

	for (dp = domains; dp != NULL; dp = dp->dom_next)
		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
			if (pr->pr_drain != NULL)
				(*pr->pr_drain)();
}

/*
 * Clean up after mbufs with M_EXT storage attached to them if the
 * reference count hits 1.
 */
void
mb_free_ext(struct mbuf *m)
{
	volatile u_int *refcnt;
	struct mbuf *mref;
	int freembuf;

	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));

	/* See if this is the mbuf that holds the embedded refcount. */
	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
		refcnt = &m->m_ext.ext_count;
		mref = m;
	} else {
		KASSERT(m->m_ext.ext_cnt != NULL,
		    ("%s: no refcounting pointer on %p", __func__, m));
		refcnt = m->m_ext.ext_cnt;
		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
	}

	/*
	 * Check if the header is embedded in the cluster.  It is
	 * important that we do not touch any of the mbuf fields
	 * after we have freed the external storage, since the mbuf
	 * could have been embedded in it.  For now, the mbufs
	 * embedded into the cluster are always of type EXT_EXTREF,
	 * and for this type we won't free the mref.
	 */
	if (m->m_flags & M_NOFREE) {
		freembuf = 0;
		KASSERT(m->m_ext.ext_type == EXT_EXTREF,
		    ("%s: no-free mbuf %p has wrong type", __func__, m));
	} else
		freembuf = 1;

	/* Free attached storage if this mbuf is the only reference to it. */
	if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) {
		switch (m->m_ext.ext_type) {
		case EXT_PACKET:
			/* The packet zone is special. */
			if (*refcnt == 0)
				*refcnt = 1;
			uma_zfree(zone_pack, mref);
			break;
		case EXT_CLUSTER:
			uma_zfree(zone_clust, m->m_ext.ext_buf);
			uma_zfree(zone_mbuf, mref);
			break;
		case EXT_JUMBOP:
			uma_zfree(zone_jumbop, m->m_ext.ext_buf);
			uma_zfree(zone_mbuf, mref);
			break;
		case EXT_JUMBO9:
			uma_zfree(zone_jumbo9, m->m_ext.ext_buf);
			uma_zfree(zone_mbuf, mref);
			break;
		case EXT_JUMBO16:
			uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
			uma_zfree(zone_mbuf, mref);
			break;
		case EXT_SFBUF:
		case EXT_NET_DRV:
		case EXT_MOD_TYPE:
		case EXT_DISPOSABLE:
			KASSERT(mref->m_ext.ext_free != NULL,
			    ("%s: ext_free not set", __func__));
			mref->m_ext.ext_free(mref);
			uma_zfree(zone_mbuf, mref);
			break;
		case EXT_EXTREF:
			KASSERT(m->m_ext.ext_free != NULL,
			    ("%s: ext_free not set", __func__));
			m->m_ext.ext_free(m);
			break;
		default:
			KASSERT(m->m_ext.ext_type == 0,
			    ("%s: unknown ext_type", __func__));
		}
	}

	if (freembuf && m != mref)
		uma_zfree(zone_mbuf, m);
}

/*
 * Official mbuf(9) allocation KPI for stack and drivers:
 *
 * m_get()	- a single mbuf without any attachments, sys/mbuf.h.
 * m_gethdr()	- a single mbuf initialized as M_PKTHDR, sys/mbuf.h.
 * m_getcl()	- an mbuf + 2k cluster, sys/mbuf.h.
 * m_clget()	- attach cluster to already allocated mbuf.
 * m_cljget()	- attach jumbo cluster to already allocated mbuf.
 * m_get2()	- allocate minimum mbuf that would fit size argument.
 * m_getm2()	- allocate a chain of mbufs/clusters.
 * m_extadd()	- attach external cluster to mbuf.
 *
 * m_free()	- free single mbuf with its tags and ext, sys/mbuf.h.
 * m_freem()	- free chain of mbufs.
 */
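
/*
 * Illustrative example only (mirroring typical mbuf(9) usage, not code in
 * this file): a driver needing a single packet-header mbuf with a 2k
 * cluster attached would do
 *
 *	struct mbuf *m;
 *
 *	m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	...
 *	m_freem(m);
 *
 * with the zone and reference-count handling below hidden behind the KPI.
 */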

int
m_clget(struct mbuf *m, int how)
{

	KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
	    __func__, m));
	m->m_ext.ext_buf = (char *)NULL;
	uma_zalloc_arg(zone_clust, m, how);
	/*
	 * On a cluster allocation failure, drain the packet zone and retry;
	 * we might be able to loosen a few clusters up on the drain.
	 */
	if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) {
		zone_drain(zone_pack);
		uma_zalloc_arg(zone_clust, m, how);
	}
	MBUF_PROBE2(m__clget, m, how);
	return (m->m_flags & M_EXT);
}

/*
 * m_cljget() is different from m_clget() as it can allocate clusters without
 * attaching them to an mbuf.  In that case the return value is the pointer
 * to the cluster of the requested size.  If an mbuf was specified, it gets
 * the cluster attached to it and the return value can be safely ignored.
 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
 */
void *
m_cljget(struct mbuf *m, int how, int size)
{
	uma_zone_t zone;
	void *retval;

	if (m != NULL) {
		KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
		    __func__, m));
		m->m_ext.ext_buf = NULL;
	}

	zone = m_getzone(size);
	retval = uma_zalloc_arg(zone, m, how);

	MBUF_PROBE4(m__cljget, m, how, size, retval);

	return (retval);
}

/*
 * m_get2() allocates minimum mbuf that would fit "size" argument.
 */
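/*
 * Size selection, matching the checks below: a request that fits in the
 * mbuf itself (MHLEN with a packet header, MLEN without) gets a plain
 * mbuf; up to MCLBYTES, an mbuf + 2k cluster from the packet zone; up to
 * MJUMPAGESIZE, an mbuf with a page size jumbo cluster attached; anything
 * larger fails and returns NULL.
 */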
struct mbuf *
m_get2(int size, int how, short type, int flags)
{
	struct mb_args args;
	struct mbuf *m, *n;

	args.flags = flags;
	args.type = type;

	if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0))
		return (uma_zalloc_arg(zone_mbuf, &args, how));
	if (size <= MCLBYTES)
		return (uma_zalloc_arg(zone_pack, &args, how));

	if (size > MJUMPAGESIZE)
		return (NULL);

	m = uma_zalloc_arg(zone_mbuf, &args, how);
	if (m == NULL)
		return (NULL);

	n = uma_zalloc_arg(zone_jumbop, m, how);
	if (n == NULL) {
		uma_zfree(zone_mbuf, m);
		return (NULL);
	}

	return (m);
}

/*
 * m_getjcl() returns an mbuf with a cluster of the specified size attached.
 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
 */
struct mbuf *
m_getjcl(int how, short type, int flags, int size)
{
	struct mb_args args;
	struct mbuf *m, *n;
	uma_zone_t zone;

	if (size == MCLBYTES)
		return m_getcl(how, type, flags);

	args.flags = flags;
	args.type = type;

	m = uma_zalloc_arg(zone_mbuf, &args, how);
	if (m == NULL)
		return (NULL);

	zone = m_getzone(size);
	n = uma_zalloc_arg(zone, m, how);
	if (n == NULL) {
		uma_zfree(zone_mbuf, m);
		return (NULL);
	}
	return (m);
}

/*
 * Allocate a given length worth of mbufs and/or clusters (whatever fits
 * best) and return a pointer to the top of the allocated chain.  If an
 * existing mbuf chain is provided, then we will append the new chain
 * to the existing one but still return the top of the newly allocated
 * chain.
 */
struct mbuf *
m_getm2(struct mbuf *m, int len, int how, short type, int flags)
{
	struct mbuf *mb, *nm = NULL, *mtail = NULL;

	KASSERT(len >= 0, ("%s: len is < 0", __func__));

	/* Validate flags. */
	flags &= (M_PKTHDR | M_EOR);

	/* Packet header mbuf must be first in chain. */
	if ((flags & M_PKTHDR) && m != NULL)
		flags &= ~M_PKTHDR;

	/* Loop and append maximum sized mbufs to the chain tail. */
	while (len > 0) {
		if (len > MCLBYTES)
			mb = m_getjcl(how, type, (flags & M_PKTHDR),
			    MJUMPAGESIZE);
		else if (len >= MINCLSIZE)
			mb = m_getcl(how, type, (flags & M_PKTHDR));
		else if (flags & M_PKTHDR)
			mb = m_gethdr(how, type);
		else
			mb = m_get(how, type);

		/* Fail the whole operation if one mbuf can't be allocated. */
		if (mb == NULL) {
			if (nm != NULL)
				m_freem(nm);
			return (NULL);
		}

		/* Book keeping. */
		len -= M_SIZE(mb);
		if (mtail != NULL)
			mtail->m_next = mb;
		else
			nm = mb;
		mtail = mb;
		flags &= ~M_PKTHDR;	/* Only valid on the first mbuf. */
	}
	if (flags & M_EOR)
		mtail->m_flags |= M_EOR;	/* Only valid on the last mbuf. */

	/* If mbuf was supplied, append new chain to the end of it. */
	if (m != NULL) {
		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
			;
		mtail->m_next = nm;
		mtail->m_flags &= ~M_EOR;
	} else
		m = nm;

	return (m);
}

/*-
 * Configure a provided mbuf to refer to the provided external storage
 * buffer and setup a reference count for said buffer.
 *
 * Arguments:
 *    mb     The existing mbuf to which to attach the provided buffer.
 *    buf    The address of the provided external storage buffer.
 *    size   The size of the provided buffer.
 *    freef  A pointer to a routine that is responsible for freeing the
 *           provided external storage buffer.
 *    args   A pointer to an argument structure (of any type) to be passed
 *           to the provided freef routine (may be NULL).
 *    flags  Any other flags to be passed to the provided mbuf.
 *    type   The type that the external storage buffer should be
 *           labeled with.
 *
 * Returns:
 *    Nothing.
 */
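/*
 * Illustrative only: a driver exporting a private buffer "buf" of "size"
 * bytes with a hypothetical release callback my_ext_free() might call
 *
 *	m_extadd(m, buf, size, my_ext_free, arg1, NULL, 0, EXT_NET_DRV);
 *
 * after which my_ext_free() runs from mb_free_ext() once the last
 * reference to the buffer is dropped.
 */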
void
m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef,
    void *arg1, void *arg2, int flags, int type)
{

	KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__));

	mb->m_flags |= (M_EXT | flags);
	mb->m_ext.ext_buf = buf;
	mb->m_data = mb->m_ext.ext_buf;
	mb->m_ext.ext_size = size;
	mb->m_ext.ext_free = freef;
	mb->m_ext.ext_arg1 = arg1;
	mb->m_ext.ext_arg2 = arg2;
	mb->m_ext.ext_type = type;

	if (type != EXT_EXTREF) {
		mb->m_ext.ext_count = 1;
		mb->m_ext.ext_flags = EXT_FLAG_EMBREF;
	} else
		mb->m_ext.ext_flags = 0;
}

/*
 * Free an entire chain of mbufs and associated external buffers, if
 * applicable.
 */
void
m_freem(struct mbuf *mb)
{

	MBUF_PROBE1(m__freem, mb);
	while (mb != NULL)
		mb = m_free(mb);
}