1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2004, 2005, 5 * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_param.h" 34 #include "opt_kern_tls.h" 35 36 #include <sys/param.h> 37 #include <sys/conf.h> 38 #include <sys/domainset.h> 39 #include <sys/malloc.h> 40 #include <sys/systm.h> 41 #include <sys/mbuf.h> 42 #include <sys/domain.h> 43 #include <sys/eventhandler.h> 44 #include <sys/kernel.h> 45 #include <sys/ktls.h> 46 #include <sys/limits.h> 47 #include <sys/lock.h> 48 #include <sys/mutex.h> 49 #include <sys/protosw.h> 50 #include <sys/refcount.h> 51 #include <sys/sf_buf.h> 52 #include <sys/smp.h> 53 #include <sys/socket.h> 54 #include <sys/sysctl.h> 55 56 #include <net/if.h> 57 #include <net/if_var.h> 58 59 #include <vm/vm.h> 60 #include <vm/vm_extern.h> 61 #include <vm/vm_kern.h> 62 #include <vm/vm_page.h> 63 #include <vm/vm_pageout.h> 64 #include <vm/vm_map.h> 65 #include <vm/uma.h> 66 #include <vm/uma_dbg.h> 67 68 /* 69 * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA 70 * Zones. 71 * 72 * Mbuf Clusters (2K, contiguous) are allocated from the Cluster 73 * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the 74 * administrator so desires. 75 * 76 * Mbufs are allocated from a UMA Primary Zone called the Mbuf 77 * Zone. 78 * 79 * Additionally, FreeBSD provides a Packet Zone, which it 80 * configures as a Secondary Zone to the Mbuf Primary Zone, 81 * thus sharing backend Slab kegs with the Mbuf Primary Zone. 82 * 83 * Thus common-case allocations and locking are simplified: 84 * 85 * m_clget() m_getcl() 86 * | | 87 * | .------------>[(Packet Cache)] m_get(), m_gethdr() 88 * | | [ Packet ] | 89 * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ] 90 * [ Cluster Zone ] [ Zone ] [ Mbuf Primary Zone ] 91 * | \________ | 92 * [ Cluster Keg ] \ / 93 * | [ Mbuf Keg ] 94 * [ Cluster Slabs ] | 95 * | [ Mbuf Slabs ] 96 * \____________(VM)_________________/ 97 * 98 * 99 * Whenever an object is allocated with uma_zalloc() out of 100 * one of the Zones its _ctor_ function is executed. The same 101 * for any deallocation through uma_zfree() the _dtor_ function 102 * is executed. 103 * 104 * Caches are per-CPU and are filled from the Primary Zone. 105 * 106 * Whenever an object is allocated from the underlying global 107 * memory pool it gets pre-initialized with the _zinit_ functions. 108 * When the Keg's are overfull objects get decommissioned with 109 * _zfini_ functions and free'd back to the global memory pool. 110 * 111 */ 112 113 int nmbufs; /* limits number of mbufs */ 114 int nmbclusters; /* limits number of mbuf clusters */ 115 int nmbjumbop; /* limits number of page size jumbo clusters */ 116 int nmbjumbo9; /* limits number of 9k jumbo clusters */ 117 int nmbjumbo16; /* limits number of 16k jumbo clusters */ 118 119 bool mb_use_ext_pgs = false; /* use M_EXTPG mbufs for sendfile & TLS */ 120 121 static int 122 sysctl_mb_use_ext_pgs(SYSCTL_HANDLER_ARGS) 123 { 124 int error, extpg; 125 126 extpg = mb_use_ext_pgs; 127 error = sysctl_handle_int(oidp, &extpg, 0, req); 128 if (error == 0 && req->newptr != NULL) { 129 if (extpg != 0 && !PMAP_HAS_DMAP) 130 error = EOPNOTSUPP; 131 else 132 mb_use_ext_pgs = extpg != 0; 133 } 134 return (error); 135 } 136 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_use_ext_pgs, CTLTYPE_INT | CTLFLAG_RW, 137 &mb_use_ext_pgs, 0, 138 sysctl_mb_use_ext_pgs, "IU", 139 "Use unmapped mbufs for sendfile(2) and TLS offload"); 140 141 static quad_t maxmbufmem; /* overall real memory limit for all mbufs */ 142 143 SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0, 144 "Maximum real memory allocatable to various mbuf types"); 145 146 static counter_u64_t snd_tag_count; 147 SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW, 148 &snd_tag_count, "# of active mbuf send tags"); 149 150 /* 151 * tunable_mbinit() has to be run before any mbuf allocations are done. 152 */ 153 static void 154 tunable_mbinit(void *dummy) 155 { 156 quad_t realmem; 157 int extpg; 158 159 /* 160 * The default limit for all mbuf related memory is 1/2 of all 161 * available kernel memory (physical or kmem). 162 * At most it can be 3/4 of available kernel memory. 163 */ 164 realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size); 165 maxmbufmem = realmem / 2; 166 TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem); 167 if (maxmbufmem > realmem / 4 * 3) 168 maxmbufmem = realmem / 4 * 3; 169 170 TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); 171 if (nmbclusters == 0) 172 nmbclusters = maxmbufmem / MCLBYTES / 4; 173 174 TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop); 175 if (nmbjumbop == 0) 176 nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4; 177 178 TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9); 179 if (nmbjumbo9 == 0) 180 nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6; 181 182 TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16); 183 if (nmbjumbo16 == 0) 184 nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6; 185 186 /* 187 * We need at least as many mbufs as we have clusters of 188 * the various types added together. 189 */ 190 TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); 191 if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) 192 nmbufs = lmax(maxmbufmem / MSIZE / 5, 193 nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16); 194 195 /* 196 * Unmapped mbufs can only safely be used on platforms with a direct 197 * map. 198 */ 199 if (PMAP_HAS_DMAP) { 200 extpg = 1; 201 TUNABLE_INT_FETCH("kern.ipc.mb_use_ext_pgs", &extpg); 202 mb_use_ext_pgs = extpg != 0; 203 } 204 } 205 SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL); 206 207 static int 208 sysctl_nmbclusters(SYSCTL_HANDLER_ARGS) 209 { 210 int error, newnmbclusters; 211 212 newnmbclusters = nmbclusters; 213 error = sysctl_handle_int(oidp, &newnmbclusters, 0, req); 214 if (error == 0 && req->newptr && newnmbclusters != nmbclusters) { 215 if (newnmbclusters > nmbclusters && 216 nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { 217 nmbclusters = newnmbclusters; 218 nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); 219 EVENTHANDLER_INVOKE(nmbclusters_change); 220 } else 221 error = EINVAL; 222 } 223 return (error); 224 } 225 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, 226 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &nmbclusters, 0, 227 sysctl_nmbclusters, "IU", 228 "Maximum number of mbuf clusters allowed"); 229 230 static int 231 sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS) 232 { 233 int error, newnmbjumbop; 234 235 newnmbjumbop = nmbjumbop; 236 error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req); 237 if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) { 238 if (newnmbjumbop > nmbjumbop && 239 nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { 240 nmbjumbop = newnmbjumbop; 241 nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); 242 } else 243 error = EINVAL; 244 } 245 return (error); 246 } 247 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, 248 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &nmbjumbop, 0, 249 sysctl_nmbjumbop, "IU", 250 "Maximum number of mbuf page size jumbo clusters allowed"); 251 252 static int 253 sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS) 254 { 255 int error, newnmbjumbo9; 256 257 newnmbjumbo9 = nmbjumbo9; 258 error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req); 259 if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) { 260 if (newnmbjumbo9 > nmbjumbo9 && 261 nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { 262 nmbjumbo9 = newnmbjumbo9; 263 nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); 264 } else 265 error = EINVAL; 266 } 267 return (error); 268 } 269 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, 270 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &nmbjumbo9, 0, 271 sysctl_nmbjumbo9, "IU", 272 "Maximum number of mbuf 9k jumbo clusters allowed"); 273 274 static int 275 sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS) 276 { 277 int error, newnmbjumbo16; 278 279 newnmbjumbo16 = nmbjumbo16; 280 error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); 281 if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) { 282 if (newnmbjumbo16 > nmbjumbo16 && 283 nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { 284 nmbjumbo16 = newnmbjumbo16; 285 nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); 286 } else 287 error = EINVAL; 288 } 289 return (error); 290 } 291 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, 292 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &nmbjumbo16, 0, 293 sysctl_nmbjumbo16, "IU", 294 "Maximum number of mbuf 16k jumbo clusters allowed"); 295 296 static int 297 sysctl_nmbufs(SYSCTL_HANDLER_ARGS) 298 { 299 int error, newnmbufs; 300 301 newnmbufs = nmbufs; 302 error = sysctl_handle_int(oidp, &newnmbufs, 0, req); 303 if (error == 0 && req->newptr && newnmbufs != nmbufs) { 304 if (newnmbufs > nmbufs) { 305 nmbufs = newnmbufs; 306 nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); 307 EVENTHANDLER_INVOKE(nmbufs_change); 308 } else 309 error = EINVAL; 310 } 311 return (error); 312 } 313 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, 314 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 315 &nmbufs, 0, sysctl_nmbufs, "IU", 316 "Maximum number of mbufs allowed"); 317 318 /* 319 * Zones from which we allocate. 320 */ 321 uma_zone_t zone_mbuf; 322 uma_zone_t zone_clust; 323 uma_zone_t zone_pack; 324 uma_zone_t zone_jumbop; 325 uma_zone_t zone_jumbo9; 326 uma_zone_t zone_jumbo16; 327 328 /* 329 * Local prototypes. 330 */ 331 static int mb_ctor_mbuf(void *, int, void *, int); 332 static int mb_ctor_clust(void *, int, void *, int); 333 static int mb_ctor_pack(void *, int, void *, int); 334 static void mb_dtor_mbuf(void *, int, void *); 335 static void mb_dtor_pack(void *, int, void *); 336 static int mb_zinit_pack(void *, int, int); 337 static void mb_zfini_pack(void *, int); 338 static void mb_reclaim(uma_zone_t, int); 339 340 /* Ensure that MSIZE is a power of 2. */ 341 CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); 342 343 _Static_assert(sizeof(struct mbuf) <= MSIZE, 344 "size of mbuf exceeds MSIZE"); 345 /* 346 * Initialize FreeBSD Network buffer allocation. 347 */ 348 static void 349 mbuf_init(void *dummy) 350 { 351 352 /* 353 * Configure UMA zones for Mbufs, Clusters, and Packets. 354 */ 355 zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE, 356 mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL, 357 MSIZE - 1, UMA_ZONE_CONTIG | UMA_ZONE_MAXBUCKET); 358 if (nmbufs > 0) 359 nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); 360 uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached"); 361 uma_zone_set_maxaction(zone_mbuf, mb_reclaim); 362 363 zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES, 364 mb_ctor_clust, NULL, NULL, NULL, 365 UMA_ALIGN_PTR, UMA_ZONE_CONTIG); 366 if (nmbclusters > 0) 367 nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); 368 uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached"); 369 uma_zone_set_maxaction(zone_clust, mb_reclaim); 370 371 zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack, 372 mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf); 373 374 /* Make jumbo frame zone too. Page size, 9k and 16k. */ 375 zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE, 376 mb_ctor_clust, NULL, NULL, NULL, 377 UMA_ALIGN_PTR, UMA_ZONE_CONTIG); 378 if (nmbjumbop > 0) 379 nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); 380 uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached"); 381 uma_zone_set_maxaction(zone_jumbop, mb_reclaim); 382 383 zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES, 384 mb_ctor_clust, NULL, NULL, NULL, 385 UMA_ALIGN_PTR, UMA_ZONE_CONTIG); 386 if (nmbjumbo9 > 0) 387 nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); 388 uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached"); 389 uma_zone_set_maxaction(zone_jumbo9, mb_reclaim); 390 391 zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES, 392 mb_ctor_clust, NULL, NULL, NULL, 393 UMA_ALIGN_PTR, UMA_ZONE_CONTIG); 394 if (nmbjumbo16 > 0) 395 nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); 396 uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached"); 397 uma_zone_set_maxaction(zone_jumbo16, mb_reclaim); 398 399 /* 400 * Hook event handler for low-memory situation, used to 401 * drain protocols and push data back to the caches (UMA 402 * later pushes it back to VM). 403 */ 404 EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL, 405 EVENTHANDLER_PRI_FIRST); 406 407 snd_tag_count = counter_u64_alloc(M_WAITOK); 408 } 409 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); 410 411 #ifdef DEBUGNET 412 /* 413 * debugnet makes use of a pre-allocated pool of mbufs and clusters. When 414 * debugnet is configured, we initialize a set of UMA cache zones which return 415 * items from this pool. At panic-time, the regular UMA zone pointers are 416 * overwritten with those of the cache zones so that drivers may allocate and 417 * free mbufs and clusters without attempting to allocate physical memory. 418 * 419 * We keep mbufs and clusters in a pair of mbuf queues. In particular, for 420 * the purpose of caching clusters, we treat them as mbufs. 421 */ 422 static struct mbufq dn_mbufq = 423 { STAILQ_HEAD_INITIALIZER(dn_mbufq.mq_head), 0, INT_MAX }; 424 static struct mbufq dn_clustq = 425 { STAILQ_HEAD_INITIALIZER(dn_clustq.mq_head), 0, INT_MAX }; 426 427 static int dn_clsize; 428 static uma_zone_t dn_zone_mbuf; 429 static uma_zone_t dn_zone_clust; 430 static uma_zone_t dn_zone_pack; 431 432 static struct debugnet_saved_zones { 433 uma_zone_t dsz_mbuf; 434 uma_zone_t dsz_clust; 435 uma_zone_t dsz_pack; 436 uma_zone_t dsz_jumbop; 437 uma_zone_t dsz_jumbo9; 438 uma_zone_t dsz_jumbo16; 439 bool dsz_debugnet_zones_enabled; 440 } dn_saved_zones; 441 442 static int 443 dn_buf_import(void *arg, void **store, int count, int domain __unused, 444 int flags) 445 { 446 struct mbufq *q; 447 struct mbuf *m; 448 int i; 449 450 q = arg; 451 452 for (i = 0; i < count; i++) { 453 m = mbufq_dequeue(q); 454 if (m == NULL) 455 break; 456 trash_init(m, q == &dn_mbufq ? MSIZE : dn_clsize, flags); 457 store[i] = m; 458 } 459 KASSERT((flags & M_WAITOK) == 0 || i == count, 460 ("%s: ran out of pre-allocated mbufs", __func__)); 461 return (i); 462 } 463 464 static void 465 dn_buf_release(void *arg, void **store, int count) 466 { 467 struct mbufq *q; 468 struct mbuf *m; 469 int i; 470 471 q = arg; 472 473 for (i = 0; i < count; i++) { 474 m = store[i]; 475 (void)mbufq_enqueue(q, m); 476 } 477 } 478 479 static int 480 dn_pack_import(void *arg __unused, void **store, int count, int domain __unused, 481 int flags __unused) 482 { 483 struct mbuf *m; 484 void *clust; 485 int i; 486 487 for (i = 0; i < count; i++) { 488 m = m_get(MT_DATA, M_NOWAIT); 489 if (m == NULL) 490 break; 491 clust = uma_zalloc(dn_zone_clust, M_NOWAIT); 492 if (clust == NULL) { 493 m_free(m); 494 break; 495 } 496 mb_ctor_clust(clust, dn_clsize, m, 0); 497 store[i] = m; 498 } 499 KASSERT((flags & M_WAITOK) == 0 || i == count, 500 ("%s: ran out of pre-allocated mbufs", __func__)); 501 return (i); 502 } 503 504 static void 505 dn_pack_release(void *arg __unused, void **store, int count) 506 { 507 struct mbuf *m; 508 void *clust; 509 int i; 510 511 for (i = 0; i < count; i++) { 512 m = store[i]; 513 clust = m->m_ext.ext_buf; 514 uma_zfree(dn_zone_clust, clust); 515 uma_zfree(dn_zone_mbuf, m); 516 } 517 } 518 519 /* 520 * Free the pre-allocated mbufs and clusters reserved for debugnet, and destroy 521 * the corresponding UMA cache zones. 522 */ 523 void 524 debugnet_mbuf_drain(void) 525 { 526 struct mbuf *m; 527 void *item; 528 529 if (dn_zone_mbuf != NULL) { 530 uma_zdestroy(dn_zone_mbuf); 531 dn_zone_mbuf = NULL; 532 } 533 if (dn_zone_clust != NULL) { 534 uma_zdestroy(dn_zone_clust); 535 dn_zone_clust = NULL; 536 } 537 if (dn_zone_pack != NULL) { 538 uma_zdestroy(dn_zone_pack); 539 dn_zone_pack = NULL; 540 } 541 542 while ((m = mbufq_dequeue(&dn_mbufq)) != NULL) 543 m_free(m); 544 while ((item = mbufq_dequeue(&dn_clustq)) != NULL) 545 uma_zfree(m_getzone(dn_clsize), item); 546 } 547 548 /* 549 * Callback invoked immediately prior to starting a debugnet connection. 550 */ 551 void 552 debugnet_mbuf_start(void) 553 { 554 555 MPASS(!dn_saved_zones.dsz_debugnet_zones_enabled); 556 557 /* Save the old zone pointers to restore when debugnet is closed. */ 558 dn_saved_zones = (struct debugnet_saved_zones) { 559 .dsz_debugnet_zones_enabled = true, 560 .dsz_mbuf = zone_mbuf, 561 .dsz_clust = zone_clust, 562 .dsz_pack = zone_pack, 563 .dsz_jumbop = zone_jumbop, 564 .dsz_jumbo9 = zone_jumbo9, 565 .dsz_jumbo16 = zone_jumbo16, 566 }; 567 568 /* 569 * All cluster zones return buffers of the size requested by the 570 * drivers. It's up to the driver to reinitialize the zones if the 571 * MTU of a debugnet-enabled interface changes. 572 */ 573 printf("debugnet: overwriting mbuf zone pointers\n"); 574 zone_mbuf = dn_zone_mbuf; 575 zone_clust = dn_zone_clust; 576 zone_pack = dn_zone_pack; 577 zone_jumbop = dn_zone_clust; 578 zone_jumbo9 = dn_zone_clust; 579 zone_jumbo16 = dn_zone_clust; 580 } 581 582 /* 583 * Callback invoked when a debugnet connection is closed/finished. 584 */ 585 void 586 debugnet_mbuf_finish(void) 587 { 588 589 MPASS(dn_saved_zones.dsz_debugnet_zones_enabled); 590 591 printf("debugnet: restoring mbuf zone pointers\n"); 592 zone_mbuf = dn_saved_zones.dsz_mbuf; 593 zone_clust = dn_saved_zones.dsz_clust; 594 zone_pack = dn_saved_zones.dsz_pack; 595 zone_jumbop = dn_saved_zones.dsz_jumbop; 596 zone_jumbo9 = dn_saved_zones.dsz_jumbo9; 597 zone_jumbo16 = dn_saved_zones.dsz_jumbo16; 598 599 memset(&dn_saved_zones, 0, sizeof(dn_saved_zones)); 600 } 601 602 /* 603 * Reinitialize the debugnet mbuf+cluster pool and cache zones. 604 */ 605 void 606 debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize) 607 { 608 struct mbuf *m; 609 void *item; 610 611 debugnet_mbuf_drain(); 612 613 dn_clsize = clsize; 614 615 dn_zone_mbuf = uma_zcache_create("debugnet_" MBUF_MEM_NAME, 616 MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL, 617 dn_buf_import, dn_buf_release, 618 &dn_mbufq, UMA_ZONE_NOBUCKET); 619 620 dn_zone_clust = uma_zcache_create("debugnet_" MBUF_CLUSTER_MEM_NAME, 621 clsize, mb_ctor_clust, NULL, NULL, NULL, 622 dn_buf_import, dn_buf_release, 623 &dn_clustq, UMA_ZONE_NOBUCKET); 624 625 dn_zone_pack = uma_zcache_create("debugnet_" MBUF_PACKET_MEM_NAME, 626 MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL, 627 dn_pack_import, dn_pack_release, 628 NULL, UMA_ZONE_NOBUCKET); 629 630 while (nmbuf-- > 0) { 631 m = m_get(MT_DATA, M_WAITOK); 632 uma_zfree(dn_zone_mbuf, m); 633 } 634 while (nclust-- > 0) { 635 item = uma_zalloc(m_getzone(dn_clsize), M_WAITOK); 636 uma_zfree(dn_zone_clust, item); 637 } 638 } 639 #endif /* DEBUGNET */ 640 641 /* 642 * Constructor for Mbuf primary zone. 643 * 644 * The 'arg' pointer points to a mb_args structure which 645 * contains call-specific information required to support the 646 * mbuf allocation API. See mbuf.h. 647 */ 648 static int 649 mb_ctor_mbuf(void *mem, int size, void *arg, int how) 650 { 651 struct mbuf *m; 652 struct mb_args *args; 653 int error; 654 int flags; 655 short type; 656 657 args = (struct mb_args *)arg; 658 type = args->type; 659 660 /* 661 * The mbuf is initialized later. The caller has the 662 * responsibility to set up any MAC labels too. 663 */ 664 if (type == MT_NOINIT) 665 return (0); 666 667 m = (struct mbuf *)mem; 668 flags = args->flags; 669 MPASS((flags & M_NOFREE) == 0); 670 671 error = m_init(m, how, type, flags); 672 673 return (error); 674 } 675 676 /* 677 * The Mbuf primary zone destructor. 678 */ 679 static void 680 mb_dtor_mbuf(void *mem, int size, void *arg) 681 { 682 struct mbuf *m; 683 unsigned long flags __diagused; 684 685 m = (struct mbuf *)mem; 686 flags = (unsigned long)arg; 687 688 KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__)); 689 KASSERT((flags & 0x1) == 0, ("%s: obsolete MB_DTOR_SKIP passed", __func__)); 690 if ((m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags)) 691 m_tag_delete_chain(m, NULL); 692 } 693 694 /* 695 * The Mbuf Packet zone destructor. 696 */ 697 static void 698 mb_dtor_pack(void *mem, int size, void *arg) 699 { 700 struct mbuf *m; 701 702 m = (struct mbuf *)mem; 703 if ((m->m_flags & M_PKTHDR) != 0) 704 m_tag_delete_chain(m, NULL); 705 706 /* Make sure we've got a clean cluster back. */ 707 KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__)); 708 KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__)); 709 KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__)); 710 KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__)); 711 KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__)); 712 KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__)); 713 KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__)); 714 #if defined(INVARIANTS) && !defined(KMSAN) 715 trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg); 716 #endif 717 /* 718 * If there are processes blocked on zone_clust, waiting for pages 719 * to be freed up, cause them to be woken up by draining the 720 * packet zone. We are exposed to a race here (in the check for 721 * the UMA_ZFLAG_FULL) where we might miss the flag set, but that 722 * is deliberate. We don't want to acquire the zone lock for every 723 * mbuf free. 724 */ 725 if (uma_zone_exhausted(zone_clust)) 726 uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN); 727 } 728 729 /* 730 * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor. 731 * 732 * Here the 'arg' pointer points to the Mbuf which we 733 * are configuring cluster storage for. If 'arg' is 734 * empty we allocate just the cluster without setting 735 * the mbuf to it. See mbuf.h. 736 */ 737 static int 738 mb_ctor_clust(void *mem, int size, void *arg, int how) 739 { 740 struct mbuf *m; 741 742 m = (struct mbuf *)arg; 743 if (m != NULL) { 744 m->m_ext.ext_buf = (char *)mem; 745 m->m_data = m->m_ext.ext_buf; 746 m->m_flags |= M_EXT; 747 m->m_ext.ext_free = NULL; 748 m->m_ext.ext_arg1 = NULL; 749 m->m_ext.ext_arg2 = NULL; 750 m->m_ext.ext_size = size; 751 m->m_ext.ext_type = m_gettype(size); 752 m->m_ext.ext_flags = EXT_FLAG_EMBREF; 753 m->m_ext.ext_count = 1; 754 } 755 756 return (0); 757 } 758 759 /* 760 * The Packet secondary zone's init routine, executed on the 761 * object's transition from mbuf keg slab to zone cache. 762 */ 763 static int 764 mb_zinit_pack(void *mem, int size, int how) 765 { 766 struct mbuf *m; 767 768 m = (struct mbuf *)mem; /* m is virgin. */ 769 if (uma_zalloc_arg(zone_clust, m, how) == NULL || 770 m->m_ext.ext_buf == NULL) 771 return (ENOMEM); 772 m->m_ext.ext_type = EXT_PACKET; /* Override. */ 773 #if defined(INVARIANTS) && !defined(KMSAN) 774 trash_init(m->m_ext.ext_buf, MCLBYTES, how); 775 #endif 776 return (0); 777 } 778 779 /* 780 * The Packet secondary zone's fini routine, executed on the 781 * object's transition from zone cache to keg slab. 782 */ 783 static void 784 mb_zfini_pack(void *mem, int size) 785 { 786 struct mbuf *m; 787 788 m = (struct mbuf *)mem; 789 #if defined(INVARIANTS) && !defined(KMSAN) 790 trash_fini(m->m_ext.ext_buf, MCLBYTES); 791 #endif 792 uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL); 793 #if defined(INVARIANTS) && !defined(KMSAN) 794 trash_dtor(mem, size, NULL); 795 #endif 796 } 797 798 /* 799 * The "packet" keg constructor. 800 */ 801 static int 802 mb_ctor_pack(void *mem, int size, void *arg, int how) 803 { 804 struct mbuf *m; 805 struct mb_args *args; 806 int error, flags; 807 short type; 808 809 m = (struct mbuf *)mem; 810 args = (struct mb_args *)arg; 811 flags = args->flags; 812 type = args->type; 813 MPASS((flags & M_NOFREE) == 0); 814 815 #if defined(INVARIANTS) && !defined(KMSAN) 816 trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how); 817 #endif 818 819 error = m_init(m, how, type, flags); 820 821 /* m_ext is already initialized. */ 822 m->m_data = m->m_ext.ext_buf; 823 m->m_flags = (flags | M_EXT); 824 825 return (error); 826 } 827 828 /* 829 * This is the protocol drain routine. Called by UMA whenever any of the 830 * mbuf zones is closed to its limit. 831 * 832 * No locks should be held when this is called. The drain routines have to 833 * presently acquire some locks which raises the possibility of lock order 834 * reversal. 835 */ 836 static void 837 mb_reclaim(uma_zone_t zone __unused, int pending __unused) 838 { 839 struct epoch_tracker et; 840 struct domain *dp; 841 struct protosw *pr; 842 843 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__); 844 845 NET_EPOCH_ENTER(et); 846 for (dp = domains; dp != NULL; dp = dp->dom_next) 847 for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) 848 if (pr->pr_drain != NULL) 849 (*pr->pr_drain)(); 850 NET_EPOCH_EXIT(et); 851 } 852 853 /* 854 * Free "count" units of I/O from an mbuf chain. They could be held 855 * in M_EXTPG or just as a normal mbuf. This code is intended to be 856 * called in an error path (I/O error, closed connection, etc). 857 */ 858 void 859 mb_free_notready(struct mbuf *m, int count) 860 { 861 int i; 862 863 for (i = 0; i < count && m != NULL; i++) { 864 if ((m->m_flags & M_EXTPG) != 0) { 865 m->m_epg_nrdy--; 866 if (m->m_epg_nrdy != 0) 867 continue; 868 } 869 m = m_free(m); 870 } 871 KASSERT(i == count, ("Removed only %d items from %p", i, m)); 872 } 873 874 /* 875 * Compress an unmapped mbuf into a simple mbuf when it holds a small 876 * amount of data. This is used as a DOS defense to avoid having 877 * small packets tie up wired pages, an ext_pgs structure, and an 878 * mbuf. Since this converts the existing mbuf in place, it can only 879 * be used if there are no other references to 'm'. 880 */ 881 int 882 mb_unmapped_compress(struct mbuf *m) 883 { 884 volatile u_int *refcnt; 885 char buf[MLEN]; 886 887 /* 888 * Assert that 'm' does not have a packet header. If 'm' had 889 * a packet header, it would only be able to hold MHLEN bytes 890 * and m_data would have to be initialized differently. 891 */ 892 KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXTPG), 893 ("%s: m %p !M_EXTPG or M_PKTHDR", __func__, m)); 894 KASSERT(m->m_len <= MLEN, ("m_len too large %p", m)); 895 896 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 897 refcnt = &m->m_ext.ext_count; 898 } else { 899 KASSERT(m->m_ext.ext_cnt != NULL, 900 ("%s: no refcounting pointer on %p", __func__, m)); 901 refcnt = m->m_ext.ext_cnt; 902 } 903 904 if (*refcnt != 1) 905 return (EBUSY); 906 907 m_copydata(m, 0, m->m_len, buf); 908 909 /* Free the backing pages. */ 910 m->m_ext.ext_free(m); 911 912 /* Turn 'm' into a "normal" mbuf. */ 913 m->m_flags &= ~(M_EXT | M_RDONLY | M_EXTPG); 914 m->m_data = m->m_dat; 915 916 /* Copy data back into m. */ 917 bcopy(buf, mtod(m, char *), m->m_len); 918 919 return (0); 920 } 921 922 /* 923 * These next few routines are used to permit downgrading an unmapped 924 * mbuf to a chain of mapped mbufs. This is used when an interface 925 * doesn't supported unmapped mbufs or if checksums need to be 926 * computed in software. 927 * 928 * Each unmapped mbuf is converted to a chain of mbufs. First, any 929 * TLS header data is stored in a regular mbuf. Second, each page of 930 * unmapped data is stored in an mbuf with an EXT_SFBUF external 931 * cluster. These mbufs use an sf_buf to provide a valid KVA for the 932 * associated physical page. They also hold a reference on the 933 * original M_EXTPG mbuf to ensure the physical page doesn't go away. 934 * Finally, any TLS trailer data is stored in a regular mbuf. 935 * 936 * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF 937 * mbufs. It frees the associated sf_buf and releases its reference 938 * on the original M_EXTPG mbuf. 939 * 940 * _mb_unmapped_to_ext() is a helper function that converts a single 941 * unmapped mbuf into a chain of mbufs. 942 * 943 * mb_unmapped_to_ext() is the public function that walks an mbuf 944 * chain converting any unmapped mbufs to mapped mbufs. It returns 945 * the new chain of unmapped mbufs on success. On failure it frees 946 * the original mbuf chain and returns NULL. 947 */ 948 static void 949 mb_unmapped_free_mext(struct mbuf *m) 950 { 951 struct sf_buf *sf; 952 struct mbuf *old_m; 953 954 sf = m->m_ext.ext_arg1; 955 sf_buf_free(sf); 956 957 /* Drop the reference on the backing M_EXTPG mbuf. */ 958 old_m = m->m_ext.ext_arg2; 959 mb_free_extpg(old_m); 960 } 961 962 static struct mbuf * 963 _mb_unmapped_to_ext(struct mbuf *m) 964 { 965 struct mbuf *m_new, *top, *prev, *mref; 966 struct sf_buf *sf; 967 vm_page_t pg; 968 int i, len, off, pglen, pgoff, seglen, segoff; 969 volatile u_int *refcnt; 970 u_int ref_inc = 0; 971 972 M_ASSERTEXTPG(m); 973 len = m->m_len; 974 KASSERT(m->m_epg_tls == NULL, ("%s: can't convert TLS mbuf %p", 975 __func__, m)); 976 977 /* See if this is the mbuf that holds the embedded refcount. */ 978 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 979 refcnt = &m->m_ext.ext_count; 980 mref = m; 981 } else { 982 KASSERT(m->m_ext.ext_cnt != NULL, 983 ("%s: no refcounting pointer on %p", __func__, m)); 984 refcnt = m->m_ext.ext_cnt; 985 mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); 986 } 987 988 /* Skip over any data removed from the front. */ 989 off = mtod(m, vm_offset_t); 990 991 top = NULL; 992 if (m->m_epg_hdrlen != 0) { 993 if (off >= m->m_epg_hdrlen) { 994 off -= m->m_epg_hdrlen; 995 } else { 996 seglen = m->m_epg_hdrlen - off; 997 segoff = off; 998 seglen = min(seglen, len); 999 off = 0; 1000 len -= seglen; 1001 m_new = m_get(M_NOWAIT, MT_DATA); 1002 if (m_new == NULL) 1003 goto fail; 1004 m_new->m_len = seglen; 1005 prev = top = m_new; 1006 memcpy(mtod(m_new, void *), &m->m_epg_hdr[segoff], 1007 seglen); 1008 } 1009 } 1010 pgoff = m->m_epg_1st_off; 1011 for (i = 0; i < m->m_epg_npgs && len > 0; i++) { 1012 pglen = m_epg_pagelen(m, i, pgoff); 1013 if (off >= pglen) { 1014 off -= pglen; 1015 pgoff = 0; 1016 continue; 1017 } 1018 seglen = pglen - off; 1019 segoff = pgoff + off; 1020 off = 0; 1021 seglen = min(seglen, len); 1022 len -= seglen; 1023 1024 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); 1025 m_new = m_get(M_NOWAIT, MT_DATA); 1026 if (m_new == NULL) 1027 goto fail; 1028 if (top == NULL) { 1029 top = prev = m_new; 1030 } else { 1031 prev->m_next = m_new; 1032 prev = m_new; 1033 } 1034 sf = sf_buf_alloc(pg, SFB_NOWAIT); 1035 if (sf == NULL) 1036 goto fail; 1037 1038 ref_inc++; 1039 m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE, 1040 mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF); 1041 m_new->m_data += segoff; 1042 m_new->m_len = seglen; 1043 1044 pgoff = 0; 1045 }; 1046 if (len != 0) { 1047 KASSERT((off + len) <= m->m_epg_trllen, 1048 ("off + len > trail (%d + %d > %d)", off, len, 1049 m->m_epg_trllen)); 1050 m_new = m_get(M_NOWAIT, MT_DATA); 1051 if (m_new == NULL) 1052 goto fail; 1053 if (top == NULL) 1054 top = m_new; 1055 else 1056 prev->m_next = m_new; 1057 m_new->m_len = len; 1058 memcpy(mtod(m_new, void *), &m->m_epg_trail[off], len); 1059 } 1060 1061 if (ref_inc != 0) { 1062 /* 1063 * Obtain an additional reference on the old mbuf for 1064 * each created EXT_SFBUF mbuf. They will be dropped 1065 * in mb_unmapped_free_mext(). 1066 */ 1067 if (*refcnt == 1) 1068 *refcnt += ref_inc; 1069 else 1070 atomic_add_int(refcnt, ref_inc); 1071 } 1072 m_free(m); 1073 return (top); 1074 1075 fail: 1076 if (ref_inc != 0) { 1077 /* 1078 * Obtain an additional reference on the old mbuf for 1079 * each created EXT_SFBUF mbuf. They will be 1080 * immediately dropped when these mbufs are freed 1081 * below. 1082 */ 1083 if (*refcnt == 1) 1084 *refcnt += ref_inc; 1085 else 1086 atomic_add_int(refcnt, ref_inc); 1087 } 1088 m_free(m); 1089 m_freem(top); 1090 return (NULL); 1091 } 1092 1093 struct mbuf * 1094 mb_unmapped_to_ext(struct mbuf *top) 1095 { 1096 struct mbuf *m, *next, *prev = NULL; 1097 1098 prev = NULL; 1099 for (m = top; m != NULL; m = next) { 1100 /* m might be freed, so cache the next pointer. */ 1101 next = m->m_next; 1102 if (m->m_flags & M_EXTPG) { 1103 if (prev != NULL) { 1104 /* 1105 * Remove 'm' from the new chain so 1106 * that the 'top' chain terminates 1107 * before 'm' in case 'top' is freed 1108 * due to an error. 1109 */ 1110 prev->m_next = NULL; 1111 } 1112 m = _mb_unmapped_to_ext(m); 1113 if (m == NULL) { 1114 m_freem(top); 1115 m_freem(next); 1116 return (NULL); 1117 } 1118 if (prev == NULL) { 1119 top = m; 1120 } else { 1121 prev->m_next = m; 1122 } 1123 1124 /* 1125 * Replaced one mbuf with a chain, so we must 1126 * find the end of chain. 1127 */ 1128 prev = m_last(m); 1129 } else { 1130 if (prev != NULL) { 1131 prev->m_next = m; 1132 } 1133 prev = m; 1134 } 1135 } 1136 return (top); 1137 } 1138 1139 /* 1140 * Allocate an empty M_EXTPG mbuf. The ext_free routine is 1141 * responsible for freeing any pages backing this mbuf when it is 1142 * freed. 1143 */ 1144 struct mbuf * 1145 mb_alloc_ext_pgs(int how, m_ext_free_t ext_free) 1146 { 1147 struct mbuf *m; 1148 1149 m = m_get(how, MT_DATA); 1150 if (m == NULL) 1151 return (NULL); 1152 1153 m->m_epg_npgs = 0; 1154 m->m_epg_nrdy = 0; 1155 m->m_epg_1st_off = 0; 1156 m->m_epg_last_len = 0; 1157 m->m_epg_flags = 0; 1158 m->m_epg_hdrlen = 0; 1159 m->m_epg_trllen = 0; 1160 m->m_epg_tls = NULL; 1161 m->m_epg_so = NULL; 1162 m->m_data = NULL; 1163 m->m_flags |= (M_EXT | M_RDONLY | M_EXTPG); 1164 m->m_ext.ext_flags = EXT_FLAG_EMBREF; 1165 m->m_ext.ext_count = 1; 1166 m->m_ext.ext_size = 0; 1167 m->m_ext.ext_free = ext_free; 1168 return (m); 1169 } 1170 1171 /* 1172 * Clean up after mbufs with M_EXT storage attached to them if the 1173 * reference count hits 1. 1174 */ 1175 void 1176 mb_free_ext(struct mbuf *m) 1177 { 1178 volatile u_int *refcnt; 1179 struct mbuf *mref; 1180 int freembuf; 1181 1182 KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m)); 1183 1184 /* See if this is the mbuf that holds the embedded refcount. */ 1185 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 1186 refcnt = &m->m_ext.ext_count; 1187 mref = m; 1188 } else { 1189 KASSERT(m->m_ext.ext_cnt != NULL, 1190 ("%s: no refcounting pointer on %p", __func__, m)); 1191 refcnt = m->m_ext.ext_cnt; 1192 mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); 1193 } 1194 1195 /* 1196 * Check if the header is embedded in the cluster. It is 1197 * important that we can't touch any of the mbuf fields 1198 * after we have freed the external storage, since mbuf 1199 * could have been embedded in it. For now, the mbufs 1200 * embedded into the cluster are always of type EXT_EXTREF, 1201 * and for this type we won't free the mref. 1202 */ 1203 if (m->m_flags & M_NOFREE) { 1204 freembuf = 0; 1205 KASSERT(m->m_ext.ext_type == EXT_EXTREF || 1206 m->m_ext.ext_type == EXT_RXRING, 1207 ("%s: no-free mbuf %p has wrong type", __func__, m)); 1208 } else 1209 freembuf = 1; 1210 1211 /* Free attached storage if this mbuf is the only reference to it. */ 1212 if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) { 1213 switch (m->m_ext.ext_type) { 1214 case EXT_PACKET: 1215 /* The packet zone is special. */ 1216 if (*refcnt == 0) 1217 *refcnt = 1; 1218 uma_zfree(zone_pack, mref); 1219 break; 1220 case EXT_CLUSTER: 1221 uma_zfree(zone_clust, m->m_ext.ext_buf); 1222 m_free_raw(mref); 1223 break; 1224 case EXT_JUMBOP: 1225 uma_zfree(zone_jumbop, m->m_ext.ext_buf); 1226 m_free_raw(mref); 1227 break; 1228 case EXT_JUMBO9: 1229 uma_zfree(zone_jumbo9, m->m_ext.ext_buf); 1230 m_free_raw(mref); 1231 break; 1232 case EXT_JUMBO16: 1233 uma_zfree(zone_jumbo16, m->m_ext.ext_buf); 1234 m_free_raw(mref); 1235 break; 1236 case EXT_SFBUF: 1237 case EXT_NET_DRV: 1238 case EXT_MOD_TYPE: 1239 case EXT_DISPOSABLE: 1240 KASSERT(mref->m_ext.ext_free != NULL, 1241 ("%s: ext_free not set", __func__)); 1242 mref->m_ext.ext_free(mref); 1243 m_free_raw(mref); 1244 break; 1245 case EXT_EXTREF: 1246 KASSERT(m->m_ext.ext_free != NULL, 1247 ("%s: ext_free not set", __func__)); 1248 m->m_ext.ext_free(m); 1249 break; 1250 case EXT_RXRING: 1251 KASSERT(m->m_ext.ext_free == NULL, 1252 ("%s: ext_free is set", __func__)); 1253 break; 1254 default: 1255 KASSERT(m->m_ext.ext_type == 0, 1256 ("%s: unknown ext_type", __func__)); 1257 } 1258 } 1259 1260 if (freembuf && m != mref) 1261 m_free_raw(m); 1262 } 1263 1264 /* 1265 * Clean up after mbufs with M_EXTPG storage attached to them if the 1266 * reference count hits 1. 1267 */ 1268 void 1269 mb_free_extpg(struct mbuf *m) 1270 { 1271 volatile u_int *refcnt; 1272 struct mbuf *mref; 1273 1274 M_ASSERTEXTPG(m); 1275 1276 /* See if this is the mbuf that holds the embedded refcount. */ 1277 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 1278 refcnt = &m->m_ext.ext_count; 1279 mref = m; 1280 } else { 1281 KASSERT(m->m_ext.ext_cnt != NULL, 1282 ("%s: no refcounting pointer on %p", __func__, m)); 1283 refcnt = m->m_ext.ext_cnt; 1284 mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); 1285 } 1286 1287 /* Free attached storage if this mbuf is the only reference to it. */ 1288 if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) { 1289 KASSERT(mref->m_ext.ext_free != NULL, 1290 ("%s: ext_free not set", __func__)); 1291 1292 mref->m_ext.ext_free(mref); 1293 #ifdef KERN_TLS 1294 if (mref->m_epg_tls != NULL && 1295 !refcount_release_if_not_last(&mref->m_epg_tls->refcount)) 1296 ktls_enqueue_to_free(mref); 1297 else 1298 #endif 1299 m_free_raw(mref); 1300 } 1301 1302 if (m != mref) 1303 m_free_raw(m); 1304 } 1305 1306 /* 1307 * Official mbuf(9) allocation KPI for stack and drivers: 1308 * 1309 * m_get() - a single mbuf without any attachments, sys/mbuf.h. 1310 * m_gethdr() - a single mbuf initialized as M_PKTHDR, sys/mbuf.h. 1311 * m_getcl() - an mbuf + 2k cluster, sys/mbuf.h. 1312 * m_clget() - attach cluster to already allocated mbuf. 1313 * m_cljget() - attach jumbo cluster to already allocated mbuf. 1314 * m_get2() - allocate minimum mbuf that would fit size argument. 1315 * m_getm2() - allocate a chain of mbufs/clusters. 1316 * m_extadd() - attach external cluster to mbuf. 1317 * 1318 * m_free() - free single mbuf with its tags and ext, sys/mbuf.h. 1319 * m_freem() - free chain of mbufs. 1320 */ 1321 1322 int 1323 m_clget(struct mbuf *m, int how) 1324 { 1325 1326 KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", 1327 __func__, m)); 1328 m->m_ext.ext_buf = (char *)NULL; 1329 uma_zalloc_arg(zone_clust, m, how); 1330 /* 1331 * On a cluster allocation failure, drain the packet zone and retry, 1332 * we might be able to loosen a few clusters up on the drain. 1333 */ 1334 if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) { 1335 uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN); 1336 uma_zalloc_arg(zone_clust, m, how); 1337 } 1338 MBUF_PROBE2(m__clget, m, how); 1339 return (m->m_flags & M_EXT); 1340 } 1341 1342 /* 1343 * m_cljget() is different from m_clget() as it can allocate clusters without 1344 * attaching them to an mbuf. In that case the return value is the pointer 1345 * to the cluster of the requested size. If an mbuf was specified, it gets 1346 * the cluster attached to it and the return value can be safely ignored. 1347 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. 1348 */ 1349 void * 1350 m_cljget(struct mbuf *m, int how, int size) 1351 { 1352 uma_zone_t zone; 1353 void *retval; 1354 1355 if (m != NULL) { 1356 KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", 1357 __func__, m)); 1358 m->m_ext.ext_buf = NULL; 1359 } 1360 1361 zone = m_getzone(size); 1362 retval = uma_zalloc_arg(zone, m, how); 1363 1364 MBUF_PROBE4(m__cljget, m, how, size, retval); 1365 1366 return (retval); 1367 } 1368 1369 /* 1370 * m_get2() allocates minimum mbuf that would fit "size" argument. 1371 */ 1372 struct mbuf * 1373 m_get2(int size, int how, short type, int flags) 1374 { 1375 struct mb_args args; 1376 struct mbuf *m, *n; 1377 1378 args.flags = flags; 1379 args.type = type; 1380 1381 if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0)) 1382 return (uma_zalloc_arg(zone_mbuf, &args, how)); 1383 if (size <= MCLBYTES) 1384 return (uma_zalloc_arg(zone_pack, &args, how)); 1385 1386 if (size > MJUMPAGESIZE) 1387 return (NULL); 1388 1389 m = uma_zalloc_arg(zone_mbuf, &args, how); 1390 if (m == NULL) 1391 return (NULL); 1392 1393 n = uma_zalloc_arg(zone_jumbop, m, how); 1394 if (n == NULL) { 1395 m_free_raw(m); 1396 return (NULL); 1397 } 1398 1399 return (m); 1400 } 1401 1402 /* 1403 * m_get3() allocates minimum mbuf that would fit "size" argument. 1404 * Unlike m_get2() it can allocate clusters up to MJUM16BYTES. 1405 */ 1406 struct mbuf * 1407 m_get3(int size, int how, short type, int flags) 1408 { 1409 struct mb_args args; 1410 struct mbuf *m, *n; 1411 uma_zone_t zone; 1412 1413 if (size <= MJUMPAGESIZE) 1414 return (m_get2(size, how, type, flags)); 1415 1416 if (size > MJUM16BYTES) 1417 return (NULL); 1418 1419 args.flags = flags; 1420 args.type = type; 1421 1422 m = uma_zalloc_arg(zone_mbuf, &args, how); 1423 if (m == NULL) 1424 return (NULL); 1425 1426 if (size <= MJUM9BYTES) 1427 zone = zone_jumbo9; 1428 else 1429 zone = zone_jumbo16; 1430 1431 n = uma_zalloc_arg(zone, m, how); 1432 if (n == NULL) { 1433 m_free_raw(m); 1434 return (NULL); 1435 } 1436 1437 return (m); 1438 } 1439 1440 /* 1441 * m_getjcl() returns an mbuf with a cluster of the specified size attached. 1442 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. 1443 */ 1444 struct mbuf * 1445 m_getjcl(int how, short type, int flags, int size) 1446 { 1447 struct mb_args args; 1448 struct mbuf *m, *n; 1449 uma_zone_t zone; 1450 1451 if (size == MCLBYTES) 1452 return m_getcl(how, type, flags); 1453 1454 args.flags = flags; 1455 args.type = type; 1456 1457 m = uma_zalloc_arg(zone_mbuf, &args, how); 1458 if (m == NULL) 1459 return (NULL); 1460 1461 zone = m_getzone(size); 1462 n = uma_zalloc_arg(zone, m, how); 1463 if (n == NULL) { 1464 m_free_raw(m); 1465 return (NULL); 1466 } 1467 MBUF_PROBE5(m__getjcl, how, type, flags, size, m); 1468 return (m); 1469 } 1470 1471 /* 1472 * Allocate a given length worth of mbufs and/or clusters (whatever fits 1473 * best) and return a pointer to the top of the allocated chain. If an 1474 * existing mbuf chain is provided, then we will append the new chain 1475 * to the existing one and return a pointer to the provided mbuf. 1476 */ 1477 struct mbuf * 1478 m_getm2(struct mbuf *m, int len, int how, short type, int flags) 1479 { 1480 struct mbuf *mb, *nm = NULL, *mtail = NULL; 1481 1482 KASSERT(len >= 0, ("%s: len is < 0", __func__)); 1483 1484 /* Validate flags. */ 1485 flags &= (M_PKTHDR | M_EOR); 1486 1487 /* Packet header mbuf must be first in chain. */ 1488 if ((flags & M_PKTHDR) && m != NULL) 1489 flags &= ~M_PKTHDR; 1490 1491 /* Loop and append maximum sized mbufs to the chain tail. */ 1492 while (len > 0) { 1493 mb = NULL; 1494 if (len > MCLBYTES) { 1495 mb = m_getjcl(M_NOWAIT, type, (flags & M_PKTHDR), 1496 MJUMPAGESIZE); 1497 } 1498 if (mb == NULL) { 1499 if (len >= MINCLSIZE) 1500 mb = m_getcl(how, type, (flags & M_PKTHDR)); 1501 else if (flags & M_PKTHDR) 1502 mb = m_gethdr(how, type); 1503 else 1504 mb = m_get(how, type); 1505 1506 /* 1507 * Fail the whole operation if one mbuf can't be 1508 * allocated. 1509 */ 1510 if (mb == NULL) { 1511 m_freem(nm); 1512 return (NULL); 1513 } 1514 } 1515 1516 /* Book keeping. */ 1517 len -= M_SIZE(mb); 1518 if (mtail != NULL) 1519 mtail->m_next = mb; 1520 else 1521 nm = mb; 1522 mtail = mb; 1523 flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */ 1524 } 1525 if (flags & M_EOR) 1526 mtail->m_flags |= M_EOR; /* Only valid on the last mbuf. */ 1527 1528 /* If mbuf was supplied, append new chain to the end of it. */ 1529 if (m != NULL) { 1530 for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next) 1531 ; 1532 mtail->m_next = nm; 1533 mtail->m_flags &= ~M_EOR; 1534 } else 1535 m = nm; 1536 1537 return (m); 1538 } 1539 1540 /*- 1541 * Configure a provided mbuf to refer to the provided external storage 1542 * buffer and setup a reference count for said buffer. 1543 * 1544 * Arguments: 1545 * mb The existing mbuf to which to attach the provided buffer. 1546 * buf The address of the provided external storage buffer. 1547 * size The size of the provided buffer. 1548 * freef A pointer to a routine that is responsible for freeing the 1549 * provided external storage buffer. 1550 * args A pointer to an argument structure (of any type) to be passed 1551 * to the provided freef routine (may be NULL). 1552 * flags Any other flags to be passed to the provided mbuf. 1553 * type The type that the external storage buffer should be 1554 * labeled with. 1555 * 1556 * Returns: 1557 * Nothing. 1558 */ 1559 void 1560 m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef, 1561 void *arg1, void *arg2, int flags, int type) 1562 { 1563 1564 KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__)); 1565 1566 mb->m_flags |= (M_EXT | flags); 1567 mb->m_ext.ext_buf = buf; 1568 mb->m_data = mb->m_ext.ext_buf; 1569 mb->m_ext.ext_size = size; 1570 mb->m_ext.ext_free = freef; 1571 mb->m_ext.ext_arg1 = arg1; 1572 mb->m_ext.ext_arg2 = arg2; 1573 mb->m_ext.ext_type = type; 1574 1575 if (type != EXT_EXTREF) { 1576 mb->m_ext.ext_count = 1; 1577 mb->m_ext.ext_flags = EXT_FLAG_EMBREF; 1578 } else 1579 mb->m_ext.ext_flags = 0; 1580 } 1581 1582 /* 1583 * Free an entire chain of mbufs and associated external buffers, if 1584 * applicable. 1585 */ 1586 void 1587 m_freem(struct mbuf *mb) 1588 { 1589 1590 MBUF_PROBE1(m__freem, mb); 1591 while (mb != NULL) 1592 mb = m_free(mb); 1593 } 1594 1595 /* 1596 * Temporary primitive to allow freeing without going through m_free. 1597 */ 1598 void 1599 m_free_raw(struct mbuf *mb) 1600 { 1601 1602 uma_zfree(zone_mbuf, mb); 1603 } 1604 1605 int 1606 m_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, 1607 struct m_snd_tag **mstp) 1608 { 1609 1610 if (ifp->if_snd_tag_alloc == NULL) 1611 return (EOPNOTSUPP); 1612 return (ifp->if_snd_tag_alloc(ifp, params, mstp)); 1613 } 1614 1615 void 1616 m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp, 1617 const struct if_snd_tag_sw *sw) 1618 { 1619 1620 if_ref(ifp); 1621 mst->ifp = ifp; 1622 refcount_init(&mst->refcount, 1); 1623 mst->sw = sw; 1624 counter_u64_add(snd_tag_count, 1); 1625 } 1626 1627 void 1628 m_snd_tag_destroy(struct m_snd_tag *mst) 1629 { 1630 struct ifnet *ifp; 1631 1632 ifp = mst->ifp; 1633 mst->sw->snd_tag_free(mst); 1634 if_rele(ifp); 1635 counter_u64_add(snd_tag_count, -1); 1636 } 1637 1638 void 1639 m_rcvif_serialize(struct mbuf *m) 1640 { 1641 u_short idx, gen; 1642 1643 M_ASSERTPKTHDR(m); 1644 idx = m->m_pkthdr.rcvif->if_index; 1645 gen = m->m_pkthdr.rcvif->if_idxgen; 1646 m->m_pkthdr.rcvidx = idx; 1647 m->m_pkthdr.rcvgen = gen; 1648 if (__predict_false(m->m_pkthdr.leaf_rcvif != NULL)) { 1649 idx = m->m_pkthdr.leaf_rcvif->if_index; 1650 gen = m->m_pkthdr.leaf_rcvif->if_idxgen; 1651 } else { 1652 idx = -1; 1653 gen = 0; 1654 } 1655 m->m_pkthdr.leaf_rcvidx = idx; 1656 m->m_pkthdr.leaf_rcvgen = gen; 1657 } 1658 1659 struct ifnet * 1660 m_rcvif_restore(struct mbuf *m) 1661 { 1662 struct ifnet *ifp, *leaf_ifp; 1663 1664 M_ASSERTPKTHDR(m); 1665 NET_EPOCH_ASSERT(); 1666 1667 ifp = ifnet_byindexgen(m->m_pkthdr.rcvidx, m->m_pkthdr.rcvgen); 1668 if (ifp == NULL || (ifp->if_flags & IFF_DYING)) 1669 return (NULL); 1670 1671 if (__predict_true(m->m_pkthdr.leaf_rcvidx == (u_short)-1)) { 1672 leaf_ifp = NULL; 1673 } else { 1674 leaf_ifp = ifnet_byindexgen(m->m_pkthdr.leaf_rcvidx, 1675 m->m_pkthdr.leaf_rcvgen); 1676 if (__predict_false(leaf_ifp != NULL && (leaf_ifp->if_flags & IFF_DYING))) 1677 leaf_ifp = NULL; 1678 } 1679 1680 m->m_pkthdr.leaf_rcvif = leaf_ifp; 1681 m->m_pkthdr.rcvif = ifp; 1682 1683 return (ifp); 1684 } 1685 1686 /* 1687 * Allocate an mbuf with anonymous external pages. 1688 */ 1689 struct mbuf * 1690 mb_alloc_ext_plus_pages(int len, int how) 1691 { 1692 struct mbuf *m; 1693 vm_page_t pg; 1694 int i, npgs; 1695 1696 m = mb_alloc_ext_pgs(how, mb_free_mext_pgs); 1697 if (m == NULL) 1698 return (NULL); 1699 m->m_epg_flags |= EPG_FLAG_ANON; 1700 npgs = howmany(len, PAGE_SIZE); 1701 for (i = 0; i < npgs; i++) { 1702 do { 1703 pg = vm_page_alloc_noobj(VM_ALLOC_NODUMP | 1704 VM_ALLOC_WIRED); 1705 if (pg == NULL) { 1706 if (how == M_NOWAIT) { 1707 m->m_epg_npgs = i; 1708 m_free(m); 1709 return (NULL); 1710 } 1711 vm_wait(NULL); 1712 } 1713 } while (pg == NULL); 1714 m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg); 1715 } 1716 m->m_epg_npgs = npgs; 1717 return (m); 1718 } 1719 1720 /* 1721 * Copy the data in the mbuf chain to a chain of mbufs with anonymous external 1722 * unmapped pages. 1723 * len is the length of data in the input mbuf chain. 1724 * mlen is the maximum number of bytes put into each ext_page mbuf. 1725 */ 1726 struct mbuf * 1727 mb_mapped_to_unmapped(struct mbuf *mp, int len, int mlen, int how, 1728 struct mbuf **mlast) 1729 { 1730 struct mbuf *m, *mout; 1731 char *pgpos, *mbpos; 1732 int i, mblen, mbufsiz, pglen, xfer; 1733 1734 if (len == 0) 1735 return (NULL); 1736 mbufsiz = min(mlen, len); 1737 m = mout = mb_alloc_ext_plus_pages(mbufsiz, how); 1738 if (m == NULL) 1739 return (m); 1740 pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[0]); 1741 pglen = PAGE_SIZE; 1742 mblen = 0; 1743 i = 0; 1744 do { 1745 if (pglen == 0) { 1746 if (++i == m->m_epg_npgs) { 1747 m->m_epg_last_len = PAGE_SIZE; 1748 mbufsiz = min(mlen, len); 1749 m->m_next = mb_alloc_ext_plus_pages(mbufsiz, 1750 how); 1751 m = m->m_next; 1752 if (m == NULL) { 1753 m_freem(mout); 1754 return (m); 1755 } 1756 i = 0; 1757 } 1758 pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]); 1759 pglen = PAGE_SIZE; 1760 } 1761 while (mblen == 0) { 1762 if (mp == NULL) { 1763 m_freem(mout); 1764 return (NULL); 1765 } 1766 KASSERT((mp->m_flags & M_EXTPG) == 0, 1767 ("mb_copym_ext_pgs: ext_pgs input mbuf")); 1768 mbpos = mtod(mp, char *); 1769 mblen = mp->m_len; 1770 mp = mp->m_next; 1771 } 1772 xfer = min(mblen, pglen); 1773 memcpy(pgpos, mbpos, xfer); 1774 pgpos += xfer; 1775 mbpos += xfer; 1776 pglen -= xfer; 1777 mblen -= xfer; 1778 len -= xfer; 1779 m->m_len += xfer; 1780 } while (len > 0); 1781 m->m_epg_last_len = PAGE_SIZE - pglen; 1782 if (mlast != NULL) 1783 *mlast = m; 1784 return (mout); 1785 } 1786