1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2004, 2005, 5 * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_param.h" 34 #include "opt_kern_tls.h" 35 36 #include <sys/param.h> 37 #include <sys/conf.h> 38 #include <sys/domainset.h> 39 #include <sys/malloc.h> 40 #include <sys/systm.h> 41 #include <sys/mbuf.h> 42 #include <sys/domain.h> 43 #include <sys/eventhandler.h> 44 #include <sys/kernel.h> 45 #include <sys/ktls.h> 46 #include <sys/limits.h> 47 #include <sys/lock.h> 48 #include <sys/mutex.h> 49 #include <sys/protosw.h> 50 #include <sys/refcount.h> 51 #include <sys/sf_buf.h> 52 #include <sys/smp.h> 53 #include <sys/socket.h> 54 #include <sys/sysctl.h> 55 56 #include <net/if.h> 57 #include <net/if_var.h> 58 59 #include <vm/vm.h> 60 #include <vm/vm_extern.h> 61 #include <vm/vm_kern.h> 62 #include <vm/vm_page.h> 63 #include <vm/vm_pageout.h> 64 #include <vm/vm_map.h> 65 #include <vm/uma.h> 66 #include <vm/uma_dbg.h> 67 68 /* 69 * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA 70 * Zones. 71 * 72 * Mbuf Clusters (2K, contiguous) are allocated from the Cluster 73 * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the 74 * administrator so desires. 75 * 76 * Mbufs are allocated from a UMA Primary Zone called the Mbuf 77 * Zone. 78 * 79 * Additionally, FreeBSD provides a Packet Zone, which it 80 * configures as a Secondary Zone to the Mbuf Primary Zone, 81 * thus sharing backend Slab kegs with the Mbuf Primary Zone. 82 * 83 * Thus common-case allocations and locking are simplified: 84 * 85 * m_clget() m_getcl() 86 * | | 87 * | .------------>[(Packet Cache)] m_get(), m_gethdr() 88 * | | [ Packet ] | 89 * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ] 90 * [ Cluster Zone ] [ Zone ] [ Mbuf Primary Zone ] 91 * | \________ | 92 * [ Cluster Keg ] \ / 93 * | [ Mbuf Keg ] 94 * [ Cluster Slabs ] | 95 * | [ Mbuf Slabs ] 96 * \____________(VM)_________________/ 97 * 98 * 99 * Whenever an object is allocated with uma_zalloc() out of 100 * one of the Zones its _ctor_ function is executed. The same 101 * for any deallocation through uma_zfree() the _dtor_ function 102 * is executed. 103 * 104 * Caches are per-CPU and are filled from the Primary Zone. 105 * 106 * Whenever an object is allocated from the underlying global 107 * memory pool it gets pre-initialized with the _zinit_ functions. 108 * When the Keg's are overfull objects get decommissioned with 109 * _zfini_ functions and free'd back to the global memory pool. 110 * 111 */ 112 113 int nmbufs; /* limits number of mbufs */ 114 int nmbclusters; /* limits number of mbuf clusters */ 115 int nmbjumbop; /* limits number of page size jumbo clusters */ 116 int nmbjumbo9; /* limits number of 9k jumbo clusters */ 117 int nmbjumbo16; /* limits number of 16k jumbo clusters */ 118 119 bool mb_use_ext_pgs = true; /* use M_EXTPG mbufs for sendfile & TLS */ 120 SYSCTL_BOOL(_kern_ipc, OID_AUTO, mb_use_ext_pgs, CTLFLAG_RWTUN, 121 &mb_use_ext_pgs, 0, 122 "Use unmapped mbufs for sendfile(2) and TLS offload"); 123 124 static quad_t maxmbufmem; /* overall real memory limit for all mbufs */ 125 126 SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0, 127 "Maximum real memory allocatable to various mbuf types"); 128 129 static counter_u64_t snd_tag_count; 130 SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW, 131 &snd_tag_count, "# of active mbuf send tags"); 132 133 /* 134 * tunable_mbinit() has to be run before any mbuf allocations are done. 135 */ 136 static void 137 tunable_mbinit(void *dummy) 138 { 139 quad_t realmem; 140 141 /* 142 * The default limit for all mbuf related memory is 1/2 of all 143 * available kernel memory (physical or kmem). 144 * At most it can be 3/4 of available kernel memory. 145 */ 146 realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size); 147 maxmbufmem = realmem / 2; 148 TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem); 149 if (maxmbufmem > realmem / 4 * 3) 150 maxmbufmem = realmem / 4 * 3; 151 152 TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); 153 if (nmbclusters == 0) 154 nmbclusters = maxmbufmem / MCLBYTES / 4; 155 156 TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop); 157 if (nmbjumbop == 0) 158 nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4; 159 160 TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9); 161 if (nmbjumbo9 == 0) 162 nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6; 163 164 TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16); 165 if (nmbjumbo16 == 0) 166 nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6; 167 168 /* 169 * We need at least as many mbufs as we have clusters of 170 * the various types added together. 171 */ 172 TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); 173 if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) 174 nmbufs = lmax(maxmbufmem / MSIZE / 5, 175 nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16); 176 } 177 SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL); 178 179 static int 180 sysctl_nmbclusters(SYSCTL_HANDLER_ARGS) 181 { 182 int error, newnmbclusters; 183 184 newnmbclusters = nmbclusters; 185 error = sysctl_handle_int(oidp, &newnmbclusters, 0, req); 186 if (error == 0 && req->newptr && newnmbclusters != nmbclusters) { 187 if (newnmbclusters > nmbclusters && 188 nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { 189 nmbclusters = newnmbclusters; 190 nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); 191 EVENTHANDLER_INVOKE(nmbclusters_change); 192 } else 193 error = EINVAL; 194 } 195 return (error); 196 } 197 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, 198 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbclusters, 0, 199 sysctl_nmbclusters, "IU", 200 "Maximum number of mbuf clusters allowed"); 201 202 static int 203 sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS) 204 { 205 int error, newnmbjumbop; 206 207 newnmbjumbop = nmbjumbop; 208 error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req); 209 if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) { 210 if (newnmbjumbop > nmbjumbop && 211 nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { 212 nmbjumbop = newnmbjumbop; 213 nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); 214 } else 215 error = EINVAL; 216 } 217 return (error); 218 } 219 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, 220 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbop, 0, 221 sysctl_nmbjumbop, "IU", 222 "Maximum number of mbuf page size jumbo clusters allowed"); 223 224 static int 225 sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS) 226 { 227 int error, newnmbjumbo9; 228 229 newnmbjumbo9 = nmbjumbo9; 230 error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req); 231 if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) { 232 if (newnmbjumbo9 > nmbjumbo9 && 233 nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { 234 nmbjumbo9 = newnmbjumbo9; 235 nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); 236 } else 237 error = EINVAL; 238 } 239 return (error); 240 } 241 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, 242 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbo9, 0, 243 sysctl_nmbjumbo9, "IU", 244 "Maximum number of mbuf 9k jumbo clusters allowed"); 245 246 static int 247 sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS) 248 { 249 int error, newnmbjumbo16; 250 251 newnmbjumbo16 = nmbjumbo16; 252 error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); 253 if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) { 254 if (newnmbjumbo16 > nmbjumbo16 && 255 nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { 256 nmbjumbo16 = newnmbjumbo16; 257 nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); 258 } else 259 error = EINVAL; 260 } 261 return (error); 262 } 263 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, 264 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbo16, 0, 265 sysctl_nmbjumbo16, "IU", 266 "Maximum number of mbuf 16k jumbo clusters allowed"); 267 268 static int 269 sysctl_nmbufs(SYSCTL_HANDLER_ARGS) 270 { 271 int error, newnmbufs; 272 273 newnmbufs = nmbufs; 274 error = sysctl_handle_int(oidp, &newnmbufs, 0, req); 275 if (error == 0 && req->newptr && newnmbufs != nmbufs) { 276 if (newnmbufs > nmbufs) { 277 nmbufs = newnmbufs; 278 nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); 279 EVENTHANDLER_INVOKE(nmbufs_change); 280 } else 281 error = EINVAL; 282 } 283 return (error); 284 } 285 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, 286 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 287 &nmbufs, 0, sysctl_nmbufs, "IU", 288 "Maximum number of mbufs allowed"); 289 290 /* 291 * Zones from which we allocate. 292 */ 293 uma_zone_t zone_mbuf; 294 uma_zone_t zone_clust; 295 uma_zone_t zone_pack; 296 uma_zone_t zone_jumbop; 297 uma_zone_t zone_jumbo9; 298 uma_zone_t zone_jumbo16; 299 300 /* 301 * Local prototypes. 302 */ 303 static int mb_ctor_mbuf(void *, int, void *, int); 304 static int mb_ctor_clust(void *, int, void *, int); 305 static int mb_ctor_pack(void *, int, void *, int); 306 static void mb_dtor_mbuf(void *, int, void *); 307 static void mb_dtor_pack(void *, int, void *); 308 static int mb_zinit_pack(void *, int, int); 309 static void mb_zfini_pack(void *, int); 310 static void mb_reclaim(uma_zone_t, int); 311 312 /* Ensure that MSIZE is a power of 2. */ 313 CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); 314 315 _Static_assert(sizeof(struct mbuf) <= MSIZE, 316 "size of mbuf exceeds MSIZE"); 317 /* 318 * Initialize FreeBSD Network buffer allocation. 319 */ 320 static void 321 mbuf_init(void *dummy) 322 { 323 324 /* 325 * Configure UMA zones for Mbufs, Clusters, and Packets. 326 */ 327 zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE, 328 mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL, 329 MSIZE - 1, UMA_ZONE_CONTIG | UMA_ZONE_MAXBUCKET); 330 if (nmbufs > 0) 331 nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); 332 uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached"); 333 uma_zone_set_maxaction(zone_mbuf, mb_reclaim); 334 335 zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES, 336 mb_ctor_clust, NULL, NULL, NULL, 337 UMA_ALIGN_PTR, UMA_ZONE_CONTIG); 338 if (nmbclusters > 0) 339 nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); 340 uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached"); 341 uma_zone_set_maxaction(zone_clust, mb_reclaim); 342 343 zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack, 344 mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf); 345 346 /* Make jumbo frame zone too. Page size, 9k and 16k. */ 347 zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE, 348 mb_ctor_clust, NULL, NULL, NULL, 349 UMA_ALIGN_PTR, UMA_ZONE_CONTIG); 350 if (nmbjumbop > 0) 351 nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); 352 uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached"); 353 uma_zone_set_maxaction(zone_jumbop, mb_reclaim); 354 355 zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES, 356 mb_ctor_clust, NULL, NULL, NULL, 357 UMA_ALIGN_PTR, UMA_ZONE_CONTIG); 358 if (nmbjumbo9 > 0) 359 nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); 360 uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached"); 361 uma_zone_set_maxaction(zone_jumbo9, mb_reclaim); 362 363 zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES, 364 mb_ctor_clust, NULL, NULL, NULL, 365 UMA_ALIGN_PTR, UMA_ZONE_CONTIG); 366 if (nmbjumbo16 > 0) 367 nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); 368 uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached"); 369 uma_zone_set_maxaction(zone_jumbo16, mb_reclaim); 370 371 /* 372 * Hook event handler for low-memory situation, used to 373 * drain protocols and push data back to the caches (UMA 374 * later pushes it back to VM). 375 */ 376 EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL, 377 EVENTHANDLER_PRI_FIRST); 378 379 snd_tag_count = counter_u64_alloc(M_WAITOK); 380 } 381 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); 382 383 #ifdef DEBUGNET 384 /* 385 * debugnet makes use of a pre-allocated pool of mbufs and clusters. When 386 * debugnet is configured, we initialize a set of UMA cache zones which return 387 * items from this pool. At panic-time, the regular UMA zone pointers are 388 * overwritten with those of the cache zones so that drivers may allocate and 389 * free mbufs and clusters without attempting to allocate physical memory. 390 * 391 * We keep mbufs and clusters in a pair of mbuf queues. In particular, for 392 * the purpose of caching clusters, we treat them as mbufs. 393 */ 394 static struct mbufq dn_mbufq = 395 { STAILQ_HEAD_INITIALIZER(dn_mbufq.mq_head), 0, INT_MAX }; 396 static struct mbufq dn_clustq = 397 { STAILQ_HEAD_INITIALIZER(dn_clustq.mq_head), 0, INT_MAX }; 398 399 static int dn_clsize; 400 static uma_zone_t dn_zone_mbuf; 401 static uma_zone_t dn_zone_clust; 402 static uma_zone_t dn_zone_pack; 403 404 static struct debugnet_saved_zones { 405 uma_zone_t dsz_mbuf; 406 uma_zone_t dsz_clust; 407 uma_zone_t dsz_pack; 408 uma_zone_t dsz_jumbop; 409 uma_zone_t dsz_jumbo9; 410 uma_zone_t dsz_jumbo16; 411 bool dsz_debugnet_zones_enabled; 412 } dn_saved_zones; 413 414 static int 415 dn_buf_import(void *arg, void **store, int count, int domain __unused, 416 int flags) 417 { 418 struct mbufq *q; 419 struct mbuf *m; 420 int i; 421 422 q = arg; 423 424 for (i = 0; i < count; i++) { 425 m = mbufq_dequeue(q); 426 if (m == NULL) 427 break; 428 trash_init(m, q == &dn_mbufq ? MSIZE : dn_clsize, flags); 429 store[i] = m; 430 } 431 KASSERT((flags & M_WAITOK) == 0 || i == count, 432 ("%s: ran out of pre-allocated mbufs", __func__)); 433 return (i); 434 } 435 436 static void 437 dn_buf_release(void *arg, void **store, int count) 438 { 439 struct mbufq *q; 440 struct mbuf *m; 441 int i; 442 443 q = arg; 444 445 for (i = 0; i < count; i++) { 446 m = store[i]; 447 (void)mbufq_enqueue(q, m); 448 } 449 } 450 451 static int 452 dn_pack_import(void *arg __unused, void **store, int count, int domain __unused, 453 int flags __unused) 454 { 455 struct mbuf *m; 456 void *clust; 457 int i; 458 459 for (i = 0; i < count; i++) { 460 m = m_get(MT_DATA, M_NOWAIT); 461 if (m == NULL) 462 break; 463 clust = uma_zalloc(dn_zone_clust, M_NOWAIT); 464 if (clust == NULL) { 465 m_free(m); 466 break; 467 } 468 mb_ctor_clust(clust, dn_clsize, m, 0); 469 store[i] = m; 470 } 471 KASSERT((flags & M_WAITOK) == 0 || i == count, 472 ("%s: ran out of pre-allocated mbufs", __func__)); 473 return (i); 474 } 475 476 static void 477 dn_pack_release(void *arg __unused, void **store, int count) 478 { 479 struct mbuf *m; 480 void *clust; 481 int i; 482 483 for (i = 0; i < count; i++) { 484 m = store[i]; 485 clust = m->m_ext.ext_buf; 486 uma_zfree(dn_zone_clust, clust); 487 uma_zfree(dn_zone_mbuf, m); 488 } 489 } 490 491 /* 492 * Free the pre-allocated mbufs and clusters reserved for debugnet, and destroy 493 * the corresponding UMA cache zones. 494 */ 495 void 496 debugnet_mbuf_drain(void) 497 { 498 struct mbuf *m; 499 void *item; 500 501 if (dn_zone_mbuf != NULL) { 502 uma_zdestroy(dn_zone_mbuf); 503 dn_zone_mbuf = NULL; 504 } 505 if (dn_zone_clust != NULL) { 506 uma_zdestroy(dn_zone_clust); 507 dn_zone_clust = NULL; 508 } 509 if (dn_zone_pack != NULL) { 510 uma_zdestroy(dn_zone_pack); 511 dn_zone_pack = NULL; 512 } 513 514 while ((m = mbufq_dequeue(&dn_mbufq)) != NULL) 515 m_free(m); 516 while ((item = mbufq_dequeue(&dn_clustq)) != NULL) 517 uma_zfree(m_getzone(dn_clsize), item); 518 } 519 520 /* 521 * Callback invoked immediately prior to starting a debugnet connection. 522 */ 523 void 524 debugnet_mbuf_start(void) 525 { 526 527 MPASS(!dn_saved_zones.dsz_debugnet_zones_enabled); 528 529 /* Save the old zone pointers to restore when debugnet is closed. */ 530 dn_saved_zones = (struct debugnet_saved_zones) { 531 .dsz_debugnet_zones_enabled = true, 532 .dsz_mbuf = zone_mbuf, 533 .dsz_clust = zone_clust, 534 .dsz_pack = zone_pack, 535 .dsz_jumbop = zone_jumbop, 536 .dsz_jumbo9 = zone_jumbo9, 537 .dsz_jumbo16 = zone_jumbo16, 538 }; 539 540 /* 541 * All cluster zones return buffers of the size requested by the 542 * drivers. It's up to the driver to reinitialize the zones if the 543 * MTU of a debugnet-enabled interface changes. 544 */ 545 printf("debugnet: overwriting mbuf zone pointers\n"); 546 zone_mbuf = dn_zone_mbuf; 547 zone_clust = dn_zone_clust; 548 zone_pack = dn_zone_pack; 549 zone_jumbop = dn_zone_clust; 550 zone_jumbo9 = dn_zone_clust; 551 zone_jumbo16 = dn_zone_clust; 552 } 553 554 /* 555 * Callback invoked when a debugnet connection is closed/finished. 556 */ 557 void 558 debugnet_mbuf_finish(void) 559 { 560 561 MPASS(dn_saved_zones.dsz_debugnet_zones_enabled); 562 563 printf("debugnet: restoring mbuf zone pointers\n"); 564 zone_mbuf = dn_saved_zones.dsz_mbuf; 565 zone_clust = dn_saved_zones.dsz_clust; 566 zone_pack = dn_saved_zones.dsz_pack; 567 zone_jumbop = dn_saved_zones.dsz_jumbop; 568 zone_jumbo9 = dn_saved_zones.dsz_jumbo9; 569 zone_jumbo16 = dn_saved_zones.dsz_jumbo16; 570 571 memset(&dn_saved_zones, 0, sizeof(dn_saved_zones)); 572 } 573 574 /* 575 * Reinitialize the debugnet mbuf+cluster pool and cache zones. 576 */ 577 void 578 debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize) 579 { 580 struct mbuf *m; 581 void *item; 582 583 debugnet_mbuf_drain(); 584 585 dn_clsize = clsize; 586 587 dn_zone_mbuf = uma_zcache_create("debugnet_" MBUF_MEM_NAME, 588 MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL, 589 dn_buf_import, dn_buf_release, 590 &dn_mbufq, UMA_ZONE_NOBUCKET); 591 592 dn_zone_clust = uma_zcache_create("debugnet_" MBUF_CLUSTER_MEM_NAME, 593 clsize, mb_ctor_clust, NULL, NULL, NULL, 594 dn_buf_import, dn_buf_release, 595 &dn_clustq, UMA_ZONE_NOBUCKET); 596 597 dn_zone_pack = uma_zcache_create("debugnet_" MBUF_PACKET_MEM_NAME, 598 MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL, 599 dn_pack_import, dn_pack_release, 600 NULL, UMA_ZONE_NOBUCKET); 601 602 while (nmbuf-- > 0) { 603 m = m_get(MT_DATA, M_WAITOK); 604 uma_zfree(dn_zone_mbuf, m); 605 } 606 while (nclust-- > 0) { 607 item = uma_zalloc(m_getzone(dn_clsize), M_WAITOK); 608 uma_zfree(dn_zone_clust, item); 609 } 610 } 611 #endif /* DEBUGNET */ 612 613 /* 614 * Constructor for Mbuf primary zone. 615 * 616 * The 'arg' pointer points to a mb_args structure which 617 * contains call-specific information required to support the 618 * mbuf allocation API. See mbuf.h. 619 */ 620 static int 621 mb_ctor_mbuf(void *mem, int size, void *arg, int how) 622 { 623 struct mbuf *m; 624 struct mb_args *args; 625 int error; 626 int flags; 627 short type; 628 629 args = (struct mb_args *)arg; 630 type = args->type; 631 632 /* 633 * The mbuf is initialized later. The caller has the 634 * responsibility to set up any MAC labels too. 635 */ 636 if (type == MT_NOINIT) 637 return (0); 638 639 m = (struct mbuf *)mem; 640 flags = args->flags; 641 MPASS((flags & M_NOFREE) == 0); 642 643 error = m_init(m, how, type, flags); 644 645 return (error); 646 } 647 648 /* 649 * The Mbuf primary zone destructor. 650 */ 651 static void 652 mb_dtor_mbuf(void *mem, int size, void *arg) 653 { 654 struct mbuf *m; 655 unsigned long flags; 656 657 m = (struct mbuf *)mem; 658 flags = (unsigned long)arg; 659 660 KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__)); 661 KASSERT((flags & 0x1) == 0, ("%s: obsolete MB_DTOR_SKIP passed", __func__)); 662 if ((m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags)) 663 m_tag_delete_chain(m, NULL); 664 } 665 666 /* 667 * The Mbuf Packet zone destructor. 668 */ 669 static void 670 mb_dtor_pack(void *mem, int size, void *arg) 671 { 672 struct mbuf *m; 673 674 m = (struct mbuf *)mem; 675 if ((m->m_flags & M_PKTHDR) != 0) 676 m_tag_delete_chain(m, NULL); 677 678 /* Make sure we've got a clean cluster back. */ 679 KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__)); 680 KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__)); 681 KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__)); 682 KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__)); 683 KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__)); 684 KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__)); 685 KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__)); 686 #if defined(INVARIANTS) && !defined(KMSAN) 687 trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg); 688 #endif 689 /* 690 * If there are processes blocked on zone_clust, waiting for pages 691 * to be freed up, cause them to be woken up by draining the 692 * packet zone. We are exposed to a race here (in the check for 693 * the UMA_ZFLAG_FULL) where we might miss the flag set, but that 694 * is deliberate. We don't want to acquire the zone lock for every 695 * mbuf free. 696 */ 697 if (uma_zone_exhausted(zone_clust)) 698 uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN); 699 } 700 701 /* 702 * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor. 703 * 704 * Here the 'arg' pointer points to the Mbuf which we 705 * are configuring cluster storage for. If 'arg' is 706 * empty we allocate just the cluster without setting 707 * the mbuf to it. See mbuf.h. 708 */ 709 static int 710 mb_ctor_clust(void *mem, int size, void *arg, int how) 711 { 712 struct mbuf *m; 713 714 m = (struct mbuf *)arg; 715 if (m != NULL) { 716 m->m_ext.ext_buf = (char *)mem; 717 m->m_data = m->m_ext.ext_buf; 718 m->m_flags |= M_EXT; 719 m->m_ext.ext_free = NULL; 720 m->m_ext.ext_arg1 = NULL; 721 m->m_ext.ext_arg2 = NULL; 722 m->m_ext.ext_size = size; 723 m->m_ext.ext_type = m_gettype(size); 724 m->m_ext.ext_flags = EXT_FLAG_EMBREF; 725 m->m_ext.ext_count = 1; 726 } 727 728 return (0); 729 } 730 731 /* 732 * The Packet secondary zone's init routine, executed on the 733 * object's transition from mbuf keg slab to zone cache. 734 */ 735 static int 736 mb_zinit_pack(void *mem, int size, int how) 737 { 738 struct mbuf *m; 739 740 m = (struct mbuf *)mem; /* m is virgin. */ 741 if (uma_zalloc_arg(zone_clust, m, how) == NULL || 742 m->m_ext.ext_buf == NULL) 743 return (ENOMEM); 744 m->m_ext.ext_type = EXT_PACKET; /* Override. */ 745 #if defined(INVARIANTS) && !defined(KMSAN) 746 trash_init(m->m_ext.ext_buf, MCLBYTES, how); 747 #endif 748 return (0); 749 } 750 751 /* 752 * The Packet secondary zone's fini routine, executed on the 753 * object's transition from zone cache to keg slab. 754 */ 755 static void 756 mb_zfini_pack(void *mem, int size) 757 { 758 struct mbuf *m; 759 760 m = (struct mbuf *)mem; 761 #if defined(INVARIANTS) && !defined(KMSAN) 762 trash_fini(m->m_ext.ext_buf, MCLBYTES); 763 #endif 764 uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL); 765 #if defined(INVARIANTS) && !defined(KMSAN) 766 trash_dtor(mem, size, NULL); 767 #endif 768 } 769 770 /* 771 * The "packet" keg constructor. 772 */ 773 static int 774 mb_ctor_pack(void *mem, int size, void *arg, int how) 775 { 776 struct mbuf *m; 777 struct mb_args *args; 778 int error, flags; 779 short type; 780 781 m = (struct mbuf *)mem; 782 args = (struct mb_args *)arg; 783 flags = args->flags; 784 type = args->type; 785 MPASS((flags & M_NOFREE) == 0); 786 787 #if defined(INVARIANTS) && !defined(KMSAN) 788 trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how); 789 #endif 790 791 error = m_init(m, how, type, flags); 792 793 /* m_ext is already initialized. */ 794 m->m_data = m->m_ext.ext_buf; 795 m->m_flags = (flags | M_EXT); 796 797 return (error); 798 } 799 800 /* 801 * This is the protocol drain routine. Called by UMA whenever any of the 802 * mbuf zones is closed to its limit. 803 * 804 * No locks should be held when this is called. The drain routines have to 805 * presently acquire some locks which raises the possibility of lock order 806 * reversal. 807 */ 808 static void 809 mb_reclaim(uma_zone_t zone __unused, int pending __unused) 810 { 811 struct epoch_tracker et; 812 struct domain *dp; 813 struct protosw *pr; 814 815 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__); 816 817 NET_EPOCH_ENTER(et); 818 for (dp = domains; dp != NULL; dp = dp->dom_next) 819 for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) 820 if (pr->pr_drain != NULL) 821 (*pr->pr_drain)(); 822 NET_EPOCH_EXIT(et); 823 } 824 825 /* 826 * Free "count" units of I/O from an mbuf chain. They could be held 827 * in M_EXTPG or just as a normal mbuf. This code is intended to be 828 * called in an error path (I/O error, closed connection, etc). 829 */ 830 void 831 mb_free_notready(struct mbuf *m, int count) 832 { 833 int i; 834 835 for (i = 0; i < count && m != NULL; i++) { 836 if ((m->m_flags & M_EXTPG) != 0) { 837 m->m_epg_nrdy--; 838 if (m->m_epg_nrdy != 0) 839 continue; 840 } 841 m = m_free(m); 842 } 843 KASSERT(i == count, ("Removed only %d items from %p", i, m)); 844 } 845 846 /* 847 * Compress an unmapped mbuf into a simple mbuf when it holds a small 848 * amount of data. This is used as a DOS defense to avoid having 849 * small packets tie up wired pages, an ext_pgs structure, and an 850 * mbuf. Since this converts the existing mbuf in place, it can only 851 * be used if there are no other references to 'm'. 852 */ 853 int 854 mb_unmapped_compress(struct mbuf *m) 855 { 856 volatile u_int *refcnt; 857 char buf[MLEN]; 858 859 /* 860 * Assert that 'm' does not have a packet header. If 'm' had 861 * a packet header, it would only be able to hold MHLEN bytes 862 * and m_data would have to be initialized differently. 863 */ 864 KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXTPG), 865 ("%s: m %p !M_EXTPG or M_PKTHDR", __func__, m)); 866 KASSERT(m->m_len <= MLEN, ("m_len too large %p", m)); 867 868 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 869 refcnt = &m->m_ext.ext_count; 870 } else { 871 KASSERT(m->m_ext.ext_cnt != NULL, 872 ("%s: no refcounting pointer on %p", __func__, m)); 873 refcnt = m->m_ext.ext_cnt; 874 } 875 876 if (*refcnt != 1) 877 return (EBUSY); 878 879 m_copydata(m, 0, m->m_len, buf); 880 881 /* Free the backing pages. */ 882 m->m_ext.ext_free(m); 883 884 /* Turn 'm' into a "normal" mbuf. */ 885 m->m_flags &= ~(M_EXT | M_RDONLY | M_EXTPG); 886 m->m_data = m->m_dat; 887 888 /* Copy data back into m. */ 889 bcopy(buf, mtod(m, char *), m->m_len); 890 891 return (0); 892 } 893 894 /* 895 * These next few routines are used to permit downgrading an unmapped 896 * mbuf to a chain of mapped mbufs. This is used when an interface 897 * doesn't supported unmapped mbufs or if checksums need to be 898 * computed in software. 899 * 900 * Each unmapped mbuf is converted to a chain of mbufs. First, any 901 * TLS header data is stored in a regular mbuf. Second, each page of 902 * unmapped data is stored in an mbuf with an EXT_SFBUF external 903 * cluster. These mbufs use an sf_buf to provide a valid KVA for the 904 * associated physical page. They also hold a reference on the 905 * original M_EXTPG mbuf to ensure the physical page doesn't go away. 906 * Finally, any TLS trailer data is stored in a regular mbuf. 907 * 908 * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF 909 * mbufs. It frees the associated sf_buf and releases its reference 910 * on the original M_EXTPG mbuf. 911 * 912 * _mb_unmapped_to_ext() is a helper function that converts a single 913 * unmapped mbuf into a chain of mbufs. 914 * 915 * mb_unmapped_to_ext() is the public function that walks an mbuf 916 * chain converting any unmapped mbufs to mapped mbufs. It returns 917 * the new chain of unmapped mbufs on success. On failure it frees 918 * the original mbuf chain and returns NULL. 919 */ 920 static void 921 mb_unmapped_free_mext(struct mbuf *m) 922 { 923 struct sf_buf *sf; 924 struct mbuf *old_m; 925 926 sf = m->m_ext.ext_arg1; 927 sf_buf_free(sf); 928 929 /* Drop the reference on the backing M_EXTPG mbuf. */ 930 old_m = m->m_ext.ext_arg2; 931 mb_free_extpg(old_m); 932 } 933 934 static struct mbuf * 935 _mb_unmapped_to_ext(struct mbuf *m) 936 { 937 struct mbuf *m_new, *top, *prev, *mref; 938 struct sf_buf *sf; 939 vm_page_t pg; 940 int i, len, off, pglen, pgoff, seglen, segoff; 941 volatile u_int *refcnt; 942 u_int ref_inc = 0; 943 944 M_ASSERTEXTPG(m); 945 len = m->m_len; 946 KASSERT(m->m_epg_tls == NULL, ("%s: can't convert TLS mbuf %p", 947 __func__, m)); 948 949 /* See if this is the mbuf that holds the embedded refcount. */ 950 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 951 refcnt = &m->m_ext.ext_count; 952 mref = m; 953 } else { 954 KASSERT(m->m_ext.ext_cnt != NULL, 955 ("%s: no refcounting pointer on %p", __func__, m)); 956 refcnt = m->m_ext.ext_cnt; 957 mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); 958 } 959 960 /* Skip over any data removed from the front. */ 961 off = mtod(m, vm_offset_t); 962 963 top = NULL; 964 if (m->m_epg_hdrlen != 0) { 965 if (off >= m->m_epg_hdrlen) { 966 off -= m->m_epg_hdrlen; 967 } else { 968 seglen = m->m_epg_hdrlen - off; 969 segoff = off; 970 seglen = min(seglen, len); 971 off = 0; 972 len -= seglen; 973 m_new = m_get(M_NOWAIT, MT_DATA); 974 if (m_new == NULL) 975 goto fail; 976 m_new->m_len = seglen; 977 prev = top = m_new; 978 memcpy(mtod(m_new, void *), &m->m_epg_hdr[segoff], 979 seglen); 980 } 981 } 982 pgoff = m->m_epg_1st_off; 983 for (i = 0; i < m->m_epg_npgs && len > 0; i++) { 984 pglen = m_epg_pagelen(m, i, pgoff); 985 if (off >= pglen) { 986 off -= pglen; 987 pgoff = 0; 988 continue; 989 } 990 seglen = pglen - off; 991 segoff = pgoff + off; 992 off = 0; 993 seglen = min(seglen, len); 994 len -= seglen; 995 996 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); 997 m_new = m_get(M_NOWAIT, MT_DATA); 998 if (m_new == NULL) 999 goto fail; 1000 if (top == NULL) { 1001 top = prev = m_new; 1002 } else { 1003 prev->m_next = m_new; 1004 prev = m_new; 1005 } 1006 sf = sf_buf_alloc(pg, SFB_NOWAIT); 1007 if (sf == NULL) 1008 goto fail; 1009 1010 ref_inc++; 1011 m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE, 1012 mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF); 1013 m_new->m_data += segoff; 1014 m_new->m_len = seglen; 1015 1016 pgoff = 0; 1017 }; 1018 if (len != 0) { 1019 KASSERT((off + len) <= m->m_epg_trllen, 1020 ("off + len > trail (%d + %d > %d)", off, len, 1021 m->m_epg_trllen)); 1022 m_new = m_get(M_NOWAIT, MT_DATA); 1023 if (m_new == NULL) 1024 goto fail; 1025 if (top == NULL) 1026 top = m_new; 1027 else 1028 prev->m_next = m_new; 1029 m_new->m_len = len; 1030 memcpy(mtod(m_new, void *), &m->m_epg_trail[off], len); 1031 } 1032 1033 if (ref_inc != 0) { 1034 /* 1035 * Obtain an additional reference on the old mbuf for 1036 * each created EXT_SFBUF mbuf. They will be dropped 1037 * in mb_unmapped_free_mext(). 1038 */ 1039 if (*refcnt == 1) 1040 *refcnt += ref_inc; 1041 else 1042 atomic_add_int(refcnt, ref_inc); 1043 } 1044 m_free(m); 1045 return (top); 1046 1047 fail: 1048 if (ref_inc != 0) { 1049 /* 1050 * Obtain an additional reference on the old mbuf for 1051 * each created EXT_SFBUF mbuf. They will be 1052 * immediately dropped when these mbufs are freed 1053 * below. 1054 */ 1055 if (*refcnt == 1) 1056 *refcnt += ref_inc; 1057 else 1058 atomic_add_int(refcnt, ref_inc); 1059 } 1060 m_free(m); 1061 m_freem(top); 1062 return (NULL); 1063 } 1064 1065 struct mbuf * 1066 mb_unmapped_to_ext(struct mbuf *top) 1067 { 1068 struct mbuf *m, *next, *prev = NULL; 1069 1070 prev = NULL; 1071 for (m = top; m != NULL; m = next) { 1072 /* m might be freed, so cache the next pointer. */ 1073 next = m->m_next; 1074 if (m->m_flags & M_EXTPG) { 1075 if (prev != NULL) { 1076 /* 1077 * Remove 'm' from the new chain so 1078 * that the 'top' chain terminates 1079 * before 'm' in case 'top' is freed 1080 * due to an error. 1081 */ 1082 prev->m_next = NULL; 1083 } 1084 m = _mb_unmapped_to_ext(m); 1085 if (m == NULL) { 1086 m_freem(top); 1087 m_freem(next); 1088 return (NULL); 1089 } 1090 if (prev == NULL) { 1091 top = m; 1092 } else { 1093 prev->m_next = m; 1094 } 1095 1096 /* 1097 * Replaced one mbuf with a chain, so we must 1098 * find the end of chain. 1099 */ 1100 prev = m_last(m); 1101 } else { 1102 if (prev != NULL) { 1103 prev->m_next = m; 1104 } 1105 prev = m; 1106 } 1107 } 1108 return (top); 1109 } 1110 1111 /* 1112 * Allocate an empty M_EXTPG mbuf. The ext_free routine is 1113 * responsible for freeing any pages backing this mbuf when it is 1114 * freed. 1115 */ 1116 struct mbuf * 1117 mb_alloc_ext_pgs(int how, m_ext_free_t ext_free) 1118 { 1119 struct mbuf *m; 1120 1121 m = m_get(how, MT_DATA); 1122 if (m == NULL) 1123 return (NULL); 1124 1125 m->m_epg_npgs = 0; 1126 m->m_epg_nrdy = 0; 1127 m->m_epg_1st_off = 0; 1128 m->m_epg_last_len = 0; 1129 m->m_epg_flags = 0; 1130 m->m_epg_hdrlen = 0; 1131 m->m_epg_trllen = 0; 1132 m->m_epg_tls = NULL; 1133 m->m_epg_so = NULL; 1134 m->m_data = NULL; 1135 m->m_flags |= (M_EXT | M_RDONLY | M_EXTPG); 1136 m->m_ext.ext_flags = EXT_FLAG_EMBREF; 1137 m->m_ext.ext_count = 1; 1138 m->m_ext.ext_size = 0; 1139 m->m_ext.ext_free = ext_free; 1140 return (m); 1141 } 1142 1143 /* 1144 * Clean up after mbufs with M_EXT storage attached to them if the 1145 * reference count hits 1. 1146 */ 1147 void 1148 mb_free_ext(struct mbuf *m) 1149 { 1150 volatile u_int *refcnt; 1151 struct mbuf *mref; 1152 int freembuf; 1153 1154 KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m)); 1155 1156 /* See if this is the mbuf that holds the embedded refcount. */ 1157 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 1158 refcnt = &m->m_ext.ext_count; 1159 mref = m; 1160 } else { 1161 KASSERT(m->m_ext.ext_cnt != NULL, 1162 ("%s: no refcounting pointer on %p", __func__, m)); 1163 refcnt = m->m_ext.ext_cnt; 1164 mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); 1165 } 1166 1167 /* 1168 * Check if the header is embedded in the cluster. It is 1169 * important that we can't touch any of the mbuf fields 1170 * after we have freed the external storage, since mbuf 1171 * could have been embedded in it. For now, the mbufs 1172 * embedded into the cluster are always of type EXT_EXTREF, 1173 * and for this type we won't free the mref. 1174 */ 1175 if (m->m_flags & M_NOFREE) { 1176 freembuf = 0; 1177 KASSERT(m->m_ext.ext_type == EXT_EXTREF || 1178 m->m_ext.ext_type == EXT_RXRING, 1179 ("%s: no-free mbuf %p has wrong type", __func__, m)); 1180 } else 1181 freembuf = 1; 1182 1183 /* Free attached storage if this mbuf is the only reference to it. */ 1184 if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) { 1185 switch (m->m_ext.ext_type) { 1186 case EXT_PACKET: 1187 /* The packet zone is special. */ 1188 if (*refcnt == 0) 1189 *refcnt = 1; 1190 uma_zfree(zone_pack, mref); 1191 break; 1192 case EXT_CLUSTER: 1193 uma_zfree(zone_clust, m->m_ext.ext_buf); 1194 m_free_raw(mref); 1195 break; 1196 case EXT_JUMBOP: 1197 uma_zfree(zone_jumbop, m->m_ext.ext_buf); 1198 m_free_raw(mref); 1199 break; 1200 case EXT_JUMBO9: 1201 uma_zfree(zone_jumbo9, m->m_ext.ext_buf); 1202 m_free_raw(mref); 1203 break; 1204 case EXT_JUMBO16: 1205 uma_zfree(zone_jumbo16, m->m_ext.ext_buf); 1206 m_free_raw(mref); 1207 break; 1208 case EXT_SFBUF: 1209 case EXT_NET_DRV: 1210 case EXT_MOD_TYPE: 1211 case EXT_DISPOSABLE: 1212 KASSERT(mref->m_ext.ext_free != NULL, 1213 ("%s: ext_free not set", __func__)); 1214 mref->m_ext.ext_free(mref); 1215 m_free_raw(mref); 1216 break; 1217 case EXT_EXTREF: 1218 KASSERT(m->m_ext.ext_free != NULL, 1219 ("%s: ext_free not set", __func__)); 1220 m->m_ext.ext_free(m); 1221 break; 1222 case EXT_RXRING: 1223 KASSERT(m->m_ext.ext_free == NULL, 1224 ("%s: ext_free is set", __func__)); 1225 break; 1226 default: 1227 KASSERT(m->m_ext.ext_type == 0, 1228 ("%s: unknown ext_type", __func__)); 1229 } 1230 } 1231 1232 if (freembuf && m != mref) 1233 m_free_raw(m); 1234 } 1235 1236 /* 1237 * Clean up after mbufs with M_EXTPG storage attached to them if the 1238 * reference count hits 1. 1239 */ 1240 void 1241 mb_free_extpg(struct mbuf *m) 1242 { 1243 volatile u_int *refcnt; 1244 struct mbuf *mref; 1245 1246 M_ASSERTEXTPG(m); 1247 1248 /* See if this is the mbuf that holds the embedded refcount. */ 1249 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 1250 refcnt = &m->m_ext.ext_count; 1251 mref = m; 1252 } else { 1253 KASSERT(m->m_ext.ext_cnt != NULL, 1254 ("%s: no refcounting pointer on %p", __func__, m)); 1255 refcnt = m->m_ext.ext_cnt; 1256 mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); 1257 } 1258 1259 /* Free attached storage if this mbuf is the only reference to it. */ 1260 if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) { 1261 KASSERT(mref->m_ext.ext_free != NULL, 1262 ("%s: ext_free not set", __func__)); 1263 1264 mref->m_ext.ext_free(mref); 1265 #ifdef KERN_TLS 1266 if (mref->m_epg_tls != NULL && 1267 !refcount_release_if_not_last(&mref->m_epg_tls->refcount)) 1268 ktls_enqueue_to_free(mref); 1269 else 1270 #endif 1271 m_free_raw(mref); 1272 } 1273 1274 if (m != mref) 1275 m_free_raw(m); 1276 } 1277 1278 /* 1279 * Official mbuf(9) allocation KPI for stack and drivers: 1280 * 1281 * m_get() - a single mbuf without any attachments, sys/mbuf.h. 1282 * m_gethdr() - a single mbuf initialized as M_PKTHDR, sys/mbuf.h. 1283 * m_getcl() - an mbuf + 2k cluster, sys/mbuf.h. 1284 * m_clget() - attach cluster to already allocated mbuf. 1285 * m_cljget() - attach jumbo cluster to already allocated mbuf. 1286 * m_get2() - allocate minimum mbuf that would fit size argument. 1287 * m_getm2() - allocate a chain of mbufs/clusters. 1288 * m_extadd() - attach external cluster to mbuf. 1289 * 1290 * m_free() - free single mbuf with its tags and ext, sys/mbuf.h. 1291 * m_freem() - free chain of mbufs. 1292 */ 1293 1294 int 1295 m_clget(struct mbuf *m, int how) 1296 { 1297 1298 KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", 1299 __func__, m)); 1300 m->m_ext.ext_buf = (char *)NULL; 1301 uma_zalloc_arg(zone_clust, m, how); 1302 /* 1303 * On a cluster allocation failure, drain the packet zone and retry, 1304 * we might be able to loosen a few clusters up on the drain. 1305 */ 1306 if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) { 1307 uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN); 1308 uma_zalloc_arg(zone_clust, m, how); 1309 } 1310 MBUF_PROBE2(m__clget, m, how); 1311 return (m->m_flags & M_EXT); 1312 } 1313 1314 /* 1315 * m_cljget() is different from m_clget() as it can allocate clusters without 1316 * attaching them to an mbuf. In that case the return value is the pointer 1317 * to the cluster of the requested size. If an mbuf was specified, it gets 1318 * the cluster attached to it and the return value can be safely ignored. 1319 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. 1320 */ 1321 void * 1322 m_cljget(struct mbuf *m, int how, int size) 1323 { 1324 uma_zone_t zone; 1325 void *retval; 1326 1327 if (m != NULL) { 1328 KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", 1329 __func__, m)); 1330 m->m_ext.ext_buf = NULL; 1331 } 1332 1333 zone = m_getzone(size); 1334 retval = uma_zalloc_arg(zone, m, how); 1335 1336 MBUF_PROBE4(m__cljget, m, how, size, retval); 1337 1338 return (retval); 1339 } 1340 1341 /* 1342 * m_get2() allocates minimum mbuf that would fit "size" argument. 1343 */ 1344 struct mbuf * 1345 m_get2(int size, int how, short type, int flags) 1346 { 1347 struct mb_args args; 1348 struct mbuf *m, *n; 1349 1350 args.flags = flags; 1351 args.type = type; 1352 1353 if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0)) 1354 return (uma_zalloc_arg(zone_mbuf, &args, how)); 1355 if (size <= MCLBYTES) 1356 return (uma_zalloc_arg(zone_pack, &args, how)); 1357 1358 if (size > MJUMPAGESIZE) 1359 return (NULL); 1360 1361 m = uma_zalloc_arg(zone_mbuf, &args, how); 1362 if (m == NULL) 1363 return (NULL); 1364 1365 n = uma_zalloc_arg(zone_jumbop, m, how); 1366 if (n == NULL) { 1367 m_free_raw(m); 1368 return (NULL); 1369 } 1370 1371 return (m); 1372 } 1373 1374 /* 1375 * m_get3() allocates minimum mbuf that would fit "size" argument. 1376 * Unlike m_get2() it can allocate clusters up to MJUM16BYTES. 1377 */ 1378 struct mbuf * 1379 m_get3(int size, int how, short type, int flags) 1380 { 1381 struct mb_args args; 1382 struct mbuf *m, *n; 1383 uma_zone_t zone; 1384 1385 if (size <= MJUMPAGESIZE) 1386 return (m_get2(size, how, type, flags)); 1387 1388 if (size > MJUM16BYTES) 1389 return (NULL); 1390 1391 args.flags = flags; 1392 args.type = type; 1393 1394 m = uma_zalloc_arg(zone_mbuf, &args, how); 1395 if (m == NULL) 1396 return (NULL); 1397 1398 if (size <= MJUM9BYTES) 1399 zone = zone_jumbo9; 1400 else 1401 zone = zone_jumbo16; 1402 1403 n = uma_zalloc_arg(zone_jumbop, m, how); 1404 if (n == NULL) { 1405 m_free_raw(m); 1406 return (NULL); 1407 } 1408 1409 return (m); 1410 } 1411 1412 /* 1413 * m_getjcl() returns an mbuf with a cluster of the specified size attached. 1414 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. 1415 */ 1416 struct mbuf * 1417 m_getjcl(int how, short type, int flags, int size) 1418 { 1419 struct mb_args args; 1420 struct mbuf *m, *n; 1421 uma_zone_t zone; 1422 1423 if (size == MCLBYTES) 1424 return m_getcl(how, type, flags); 1425 1426 args.flags = flags; 1427 args.type = type; 1428 1429 m = uma_zalloc_arg(zone_mbuf, &args, how); 1430 if (m == NULL) 1431 return (NULL); 1432 1433 zone = m_getzone(size); 1434 n = uma_zalloc_arg(zone, m, how); 1435 if (n == NULL) { 1436 m_free_raw(m); 1437 return (NULL); 1438 } 1439 MBUF_PROBE5(m__getjcl, how, type, flags, size, m); 1440 return (m); 1441 } 1442 1443 /* 1444 * Allocate a given length worth of mbufs and/or clusters (whatever fits 1445 * best) and return a pointer to the top of the allocated chain. If an 1446 * existing mbuf chain is provided, then we will append the new chain 1447 * to the existing one and return a pointer to the provided mbuf. 1448 */ 1449 struct mbuf * 1450 m_getm2(struct mbuf *m, int len, int how, short type, int flags) 1451 { 1452 struct mbuf *mb, *nm = NULL, *mtail = NULL; 1453 1454 KASSERT(len >= 0, ("%s: len is < 0", __func__)); 1455 1456 /* Validate flags. */ 1457 flags &= (M_PKTHDR | M_EOR); 1458 1459 /* Packet header mbuf must be first in chain. */ 1460 if ((flags & M_PKTHDR) && m != NULL) 1461 flags &= ~M_PKTHDR; 1462 1463 /* Loop and append maximum sized mbufs to the chain tail. */ 1464 while (len > 0) { 1465 mb = NULL; 1466 if (len > MCLBYTES) { 1467 mb = m_getjcl(M_NOWAIT, type, (flags & M_PKTHDR), 1468 MJUMPAGESIZE); 1469 } 1470 if (mb == NULL) { 1471 if (len >= MINCLSIZE) 1472 mb = m_getcl(how, type, (flags & M_PKTHDR)); 1473 else if (flags & M_PKTHDR) 1474 mb = m_gethdr(how, type); 1475 else 1476 mb = m_get(how, type); 1477 1478 /* 1479 * Fail the whole operation if one mbuf can't be 1480 * allocated. 1481 */ 1482 if (mb == NULL) { 1483 m_freem(nm); 1484 return (NULL); 1485 } 1486 } 1487 1488 /* Book keeping. */ 1489 len -= M_SIZE(mb); 1490 if (mtail != NULL) 1491 mtail->m_next = mb; 1492 else 1493 nm = mb; 1494 mtail = mb; 1495 flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */ 1496 } 1497 if (flags & M_EOR) 1498 mtail->m_flags |= M_EOR; /* Only valid on the last mbuf. */ 1499 1500 /* If mbuf was supplied, append new chain to the end of it. */ 1501 if (m != NULL) { 1502 for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next) 1503 ; 1504 mtail->m_next = nm; 1505 mtail->m_flags &= ~M_EOR; 1506 } else 1507 m = nm; 1508 1509 return (m); 1510 } 1511 1512 /*- 1513 * Configure a provided mbuf to refer to the provided external storage 1514 * buffer and setup a reference count for said buffer. 1515 * 1516 * Arguments: 1517 * mb The existing mbuf to which to attach the provided buffer. 1518 * buf The address of the provided external storage buffer. 1519 * size The size of the provided buffer. 1520 * freef A pointer to a routine that is responsible for freeing the 1521 * provided external storage buffer. 1522 * args A pointer to an argument structure (of any type) to be passed 1523 * to the provided freef routine (may be NULL). 1524 * flags Any other flags to be passed to the provided mbuf. 1525 * type The type that the external storage buffer should be 1526 * labeled with. 1527 * 1528 * Returns: 1529 * Nothing. 1530 */ 1531 void 1532 m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef, 1533 void *arg1, void *arg2, int flags, int type) 1534 { 1535 1536 KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__)); 1537 1538 mb->m_flags |= (M_EXT | flags); 1539 mb->m_ext.ext_buf = buf; 1540 mb->m_data = mb->m_ext.ext_buf; 1541 mb->m_ext.ext_size = size; 1542 mb->m_ext.ext_free = freef; 1543 mb->m_ext.ext_arg1 = arg1; 1544 mb->m_ext.ext_arg2 = arg2; 1545 mb->m_ext.ext_type = type; 1546 1547 if (type != EXT_EXTREF) { 1548 mb->m_ext.ext_count = 1; 1549 mb->m_ext.ext_flags = EXT_FLAG_EMBREF; 1550 } else 1551 mb->m_ext.ext_flags = 0; 1552 } 1553 1554 /* 1555 * Free an entire chain of mbufs and associated external buffers, if 1556 * applicable. 1557 */ 1558 void 1559 m_freem(struct mbuf *mb) 1560 { 1561 1562 MBUF_PROBE1(m__freem, mb); 1563 while (mb != NULL) 1564 mb = m_free(mb); 1565 } 1566 1567 /* 1568 * Temporary primitive to allow freeing without going through m_free. 1569 */ 1570 void 1571 m_free_raw(struct mbuf *mb) 1572 { 1573 1574 uma_zfree(zone_mbuf, mb); 1575 } 1576 1577 int 1578 m_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, 1579 struct m_snd_tag **mstp) 1580 { 1581 1582 if (ifp->if_snd_tag_alloc == NULL) 1583 return (EOPNOTSUPP); 1584 return (ifp->if_snd_tag_alloc(ifp, params, mstp)); 1585 } 1586 1587 void 1588 m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp, 1589 const struct if_snd_tag_sw *sw) 1590 { 1591 1592 if_ref(ifp); 1593 mst->ifp = ifp; 1594 refcount_init(&mst->refcount, 1); 1595 mst->sw = sw; 1596 counter_u64_add(snd_tag_count, 1); 1597 } 1598 1599 void 1600 m_snd_tag_destroy(struct m_snd_tag *mst) 1601 { 1602 struct ifnet *ifp; 1603 1604 ifp = mst->ifp; 1605 mst->sw->snd_tag_free(mst); 1606 if_rele(ifp); 1607 counter_u64_add(snd_tag_count, -1); 1608 } 1609 1610 /* 1611 * Allocate an mbuf with anonymous external pages. 1612 */ 1613 struct mbuf * 1614 mb_alloc_ext_plus_pages(int len, int how) 1615 { 1616 struct mbuf *m; 1617 vm_page_t pg; 1618 int i, npgs; 1619 1620 m = mb_alloc_ext_pgs(how, mb_free_mext_pgs); 1621 if (m == NULL) 1622 return (NULL); 1623 m->m_epg_flags |= EPG_FLAG_ANON; 1624 npgs = howmany(len, PAGE_SIZE); 1625 for (i = 0; i < npgs; i++) { 1626 do { 1627 pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 1628 VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | VM_ALLOC_WIRED); 1629 if (pg == NULL) { 1630 if (how == M_NOWAIT) { 1631 m->m_epg_npgs = i; 1632 m_free(m); 1633 return (NULL); 1634 } 1635 vm_wait(NULL); 1636 } 1637 } while (pg == NULL); 1638 m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg); 1639 } 1640 m->m_epg_npgs = npgs; 1641 return (m); 1642 } 1643 1644 /* 1645 * Copy the data in the mbuf chain to a chain of mbufs with anonymous external 1646 * unmapped pages. 1647 * len is the length of data in the input mbuf chain. 1648 * mlen is the maximum number of bytes put into each ext_page mbuf. 1649 */ 1650 struct mbuf * 1651 mb_mapped_to_unmapped(struct mbuf *mp, int len, int mlen, int how, 1652 struct mbuf **mlast) 1653 { 1654 struct mbuf *m, *mout; 1655 char *pgpos, *mbpos; 1656 int i, mblen, mbufsiz, pglen, xfer; 1657 1658 if (len == 0) 1659 return (NULL); 1660 mbufsiz = min(mlen, len); 1661 m = mout = mb_alloc_ext_plus_pages(mbufsiz, how); 1662 if (m == NULL) 1663 return (m); 1664 pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[0]); 1665 pglen = PAGE_SIZE; 1666 mblen = 0; 1667 i = 0; 1668 do { 1669 if (pglen == 0) { 1670 if (++i == m->m_epg_npgs) { 1671 m->m_epg_last_len = PAGE_SIZE; 1672 mbufsiz = min(mlen, len); 1673 m->m_next = mb_alloc_ext_plus_pages(mbufsiz, 1674 how); 1675 m = m->m_next; 1676 if (m == NULL) { 1677 m_freem(mout); 1678 return (m); 1679 } 1680 i = 0; 1681 } 1682 pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]); 1683 pglen = PAGE_SIZE; 1684 } 1685 while (mblen == 0) { 1686 if (mp == NULL) { 1687 m_freem(mout); 1688 return (NULL); 1689 } 1690 KASSERT((mp->m_flags & M_EXTPG) == 0, 1691 ("mb_copym_ext_pgs: ext_pgs input mbuf")); 1692 mbpos = mtod(mp, char *); 1693 mblen = mp->m_len; 1694 mp = mp->m_next; 1695 } 1696 xfer = min(mblen, pglen); 1697 memcpy(pgpos, mbpos, xfer); 1698 pgpos += xfer; 1699 mbpos += xfer; 1700 pglen -= xfer; 1701 mblen -= xfer; 1702 len -= xfer; 1703 m->m_len += xfer; 1704 } while (len > 0); 1705 m->m_epg_last_len = PAGE_SIZE - pglen; 1706 if (mlast != NULL) 1707 *mlast = m; 1708 return (mout); 1709 } 1710