/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2004, 2005,
 *	Bosko Milekic <bmilekic@FreeBSD.org>.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_param.h"
#include "opt_kern_tls.h"

#include <sys/param.h>
#include <sys/conf.h>
#include <sys/domainset.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/ktls.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/refcount.h>
#include <sys/sf_buf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_var.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_map.h>
#include <vm/uma.h>
#include <vm/uma_dbg.h>

_Static_assert(MJUMPAGESIZE > MCLBYTES,
    "Cluster must be smaller than a jumbo page");

/*
 * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
 * Zones.
 *
 * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
 * Zone.  The Zone can be capped at kern.ipc.nmbclusters, if the
 * administrator so desires.
 *
 * Mbufs are allocated from a UMA Primary Zone called the Mbuf
 * Zone.
 *
 * Additionally, FreeBSD provides a Packet Zone, which it
 * configures as a Secondary Zone to the Mbuf Primary Zone,
 * thus sharing backend Slab kegs with the Mbuf Primary Zone.
 *
 * Thus common-case allocations and locking are simplified:
 *
 *  m_clget()                m_getcl()
 *      |                       |
 *      |   .------------>[(Packet Cache)]    m_get(), m_gethdr()
 *      |   |             [    Packet    ]            |
 *  [(Cluster Cache)]     [   Secondary  ]     [ (Mbuf Cache)      ]
 *  [ Cluster Zone  ]     [     Zone     ]     [ Mbuf Primary Zone ]
 *        |                       \________          |
 *  [ Cluster Keg   ]                      \        /
 *        |                             [ Mbuf Keg  ]
 *  [ Cluster Slabs ]                         |
 *        |                             [ Mbuf Slabs ]
 *         \____________(VM)_________________/
 *
 *
 * Whenever an object is allocated with uma_zalloc() out of
 * one of the Zones its _ctor_ function is executed.  Likewise,
 * whenever an object is freed with uma_zfree() its _dtor_
 * function is executed.
 *
 * Caches are per-CPU and are filled from the Primary Zone.
 *
 * Whenever an object is allocated from the underlying global
 * memory pool it gets pre-initialized with the _zinit_ functions.
 * When the Kegs are overfull, objects get decommissioned with
 * _zfini_ functions and freed back to the global memory pool.
 *
 */
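
/*
 * Usage sketch of the common-case paths in the diagram above (an
 * illustrative example, not part of the allocator itself): m_getcl()
 * takes a combined mbuf + 2k cluster from the Packet Zone in a single
 * allocation, while m_get()/m_gethdr() touch only the Mbuf Zone.
 *
 *	struct mbuf *m;
 *
 *	m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	...
 *	m_freem(m);
 */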

int nmbufs;			/* limits number of mbufs */
int nmbclusters;		/* limits number of mbuf clusters */
int nmbjumbop;			/* limits number of page size jumbo clusters */
int nmbjumbo9;			/* limits number of 9k jumbo clusters */
int nmbjumbo16;			/* limits number of 16k jumbo clusters */

bool mb_use_ext_pgs = false;	/* use M_EXTPG mbufs for sendfile & TLS */

static int
sysctl_mb_use_ext_pgs(SYSCTL_HANDLER_ARGS)
{
	int error, extpg;

	extpg = mb_use_ext_pgs;
	error = sysctl_handle_int(oidp, &extpg, 0, req);
	if (error == 0 && req->newptr != NULL) {
		if (extpg != 0 && !PMAP_HAS_DMAP)
			error = EOPNOTSUPP;
		else
			mb_use_ext_pgs = extpg != 0;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_use_ext_pgs,
    CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
    &mb_use_ext_pgs, 0, sysctl_mb_use_ext_pgs, "IU",
    "Use unmapped mbufs for sendfile(2) and TLS offload");

static quad_t maxmbufmem;	/* overall real memory limit for all mbufs */

SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &maxmbufmem, 0,
    "Maximum real memory allocatable to various mbuf types");

static counter_u64_t snd_tag_count;
SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW,
    &snd_tag_count, "# of active mbuf send tags");

/*
 * tunable_mbinit() has to be run before any mbuf allocations are done.
 */
static void
tunable_mbinit(void *dummy)
{
	quad_t realmem;
	int extpg;

	/*
	 * The default limit for all mbuf related memory is 1/2 of all
	 * available kernel memory (physical or kmem).
	 * At most it can be 3/4 of available kernel memory.
	 */
	realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size);
	maxmbufmem = realmem / 2;
	TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem);
	if (maxmbufmem > realmem / 4 * 3)
		maxmbufmem = realmem / 4 * 3;

	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
	if (nmbclusters == 0)
		nmbclusters = maxmbufmem / MCLBYTES / 4;

	TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop);
	if (nmbjumbop == 0)
		nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4;

	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9);
	if (nmbjumbo9 == 0)
		nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6;

	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16);
	if (nmbjumbo16 == 0)
		nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6;

	/*
	 * We need at least as many mbufs as we have clusters of
	 * the various types added together.
	 */
	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
	if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16)
		nmbufs = lmax(maxmbufmem / MSIZE / 5,
		    nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16);

	/*
	 * Unmapped mbufs can only safely be used on platforms with a direct
	 * map.
	 */
	if (PMAP_HAS_DMAP) {
		extpg = 1;
		TUNABLE_INT_FETCH("kern.ipc.mb_use_ext_pgs", &extpg);
		mb_use_ext_pgs = extpg != 0;
	}
}
SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL);
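
/*
 * Worked example of the defaults above (numbers are only illustrative):
 * with 16 GB of usable kernel memory, maxmbufmem defaults to 8 GB.  The
 * 2k cluster limit then comes out to nmbclusters = 8 GB / 2048 / 4,
 * i.e. about one million clusters; nmbjumbop gets the same 1/4 share of
 * page size clusters, and the 9k/16k zones get the smaller 1/6 share.
 * nmbufs is at least the sum of all cluster limits so that every
 * cluster can have an mbuf attached to it.  All of these can be
 * overridden from loader.conf(5) through the kern.ipc.* tunables
 * fetched above.
 */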

static int
sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbclusters;

	newnmbclusters = nmbclusters;
	error = sysctl_handle_int(oidp, &newnmbclusters, 0, req);
	if (error == 0 && req->newptr && newnmbclusters != nmbclusters) {
		if (newnmbclusters > nmbclusters &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbclusters = newnmbclusters;
			nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
			EVENTHANDLER_INVOKE(nmbclusters_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters,
    CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
    &nmbclusters, 0, sysctl_nmbclusters, "IU",
    "Maximum number of mbuf clusters allowed");

static int
sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbjumbop;

	newnmbjumbop = nmbjumbop;
	error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req);
	if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) {
		if (newnmbjumbop > nmbjumbop &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbjumbop = newnmbjumbop;
			nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop,
    CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
    &nmbjumbop, 0, sysctl_nmbjumbop, "IU",
    "Maximum number of mbuf page size jumbo clusters allowed");

static int
sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbjumbo9;

	newnmbjumbo9 = nmbjumbo9;
	error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req);
	if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) {
		if (newnmbjumbo9 > nmbjumbo9 &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbjumbo9 = newnmbjumbo9;
			nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9,
    CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
    &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU",
    "Maximum number of mbuf 9k jumbo clusters allowed");

static int
sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbjumbo16;

	newnmbjumbo16 = nmbjumbo16;
	error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req);
	if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) {
		if (newnmbjumbo16 > nmbjumbo16 &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbjumbo16 = newnmbjumbo16;
			nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16,
    CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
    &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU",
    "Maximum number of mbuf 16k jumbo clusters allowed");

static int
sysctl_nmbufs(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbufs;

	newnmbufs = nmbufs;
	error = sysctl_handle_int(oidp, &newnmbufs, 0, req);
	if (error == 0 && req->newptr && newnmbufs != nmbufs) {
		if (newnmbufs > nmbufs) {
			nmbufs = newnmbufs;
			nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
			EVENTHANDLER_INVOKE(nmbufs_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs,
    CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
    &nmbufs, 0, sysctl_nmbufs, "IU",
    "Maximum number of mbufs allowed");

/*
 * Zones from which we allocate.
 */
uma_zone_t	zone_mbuf;
uma_zone_t	zone_clust;
uma_zone_t	zone_pack;
uma_zone_t	zone_jumbop;
uma_zone_t	zone_jumbo9;
uma_zone_t	zone_jumbo16;

/*
 * Local prototypes.
 */
static int	mb_ctor_mbuf(void *, int, void *, int);
static int	mb_ctor_clust(void *, int, void *, int);
static int	mb_ctor_pack(void *, int, void *, int);
static void	mb_dtor_mbuf(void *, int, void *);
static void	mb_dtor_pack(void *, int, void *);
static int	mb_zinit_pack(void *, int, int);
static void	mb_zfini_pack(void *, int);
static void	mb_reclaim(uma_zone_t, int);

/* Ensure that MSIZE is a power of 2. */
CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);

_Static_assert(sizeof(struct mbuf) <= MSIZE,
    "size of mbuf exceeds MSIZE");

/*
 * Initialize FreeBSD Network buffer allocation.
 */
static void
mbuf_init(void *dummy)
{

	/*
	 * Configure UMA zones for Mbufs, Clusters, and Packets.
	 */
	zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE,
	    mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL,
	    MSIZE - 1, UMA_ZONE_CONTIG | UMA_ZONE_MAXBUCKET);
	if (nmbufs > 0)
		nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
	uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached");
	uma_zone_set_maxaction(zone_mbuf, mb_reclaim);

	zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
	    mb_ctor_clust, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
	if (nmbclusters > 0)
		nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
	uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached");
	uma_zone_set_maxaction(zone_clust, mb_reclaim);

	zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
	    mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf);

	/* Make the jumbo frame zones too: page size, 9k and 16k. */
	zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE,
	    mb_ctor_clust, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
	if (nmbjumbop > 0)
		nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
	uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached");
	uma_zone_set_maxaction(zone_jumbop, mb_reclaim);

	zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES,
	    mb_ctor_clust, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
	if (nmbjumbo9 > 0)
		nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
	uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached");
	uma_zone_set_maxaction(zone_jumbo9, mb_reclaim);

	zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES,
	    mb_ctor_clust, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
	if (nmbjumbo16 > 0)
		nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
	uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
	uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);

	snd_tag_count = counter_u64_alloc(M_WAITOK);
}
SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL);

#ifdef DEBUGNET
/*
 * debugnet makes use of a pre-allocated pool of mbufs and clusters.
 * When debugnet is configured, we initialize a set of UMA cache zones
 * which return items from this pool.  At panic-time, the regular UMA
 * zone pointers are overwritten with those of the cache zones so that
 * drivers may allocate and free mbufs and clusters without attempting
 * to allocate physical memory.
 *
 * We keep mbufs and clusters in a pair of mbuf queues.  In particular, for
 * the purpose of caching clusters, we treat them as mbufs.
 */
static struct mbufq dn_mbufq =
    { STAILQ_HEAD_INITIALIZER(dn_mbufq.mq_head), 0, INT_MAX };
static struct mbufq dn_clustq =
    { STAILQ_HEAD_INITIALIZER(dn_clustq.mq_head), 0, INT_MAX };

static int dn_clsize;
static uma_zone_t dn_zone_mbuf;
static uma_zone_t dn_zone_clust;
static uma_zone_t dn_zone_pack;

static struct debugnet_saved_zones {
	uma_zone_t dsz_mbuf;
	uma_zone_t dsz_clust;
	uma_zone_t dsz_pack;
	uma_zone_t dsz_jumbop;
	uma_zone_t dsz_jumbo9;
	uma_zone_t dsz_jumbo16;
	bool dsz_debugnet_zones_enabled;
} dn_saved_zones;

static int
dn_buf_import(void *arg, void **store, int count, int domain __unused,
    int flags)
{
	struct mbufq *q;
	struct mbuf *m;
	int i;

	q = arg;

	for (i = 0; i < count; i++) {
		m = mbufq_dequeue(q);
		if (m == NULL)
			break;
		trash_init(m, q == &dn_mbufq ? MSIZE : dn_clsize, flags);
		store[i] = m;
	}
	KASSERT((flags & M_WAITOK) == 0 || i == count,
	    ("%s: ran out of pre-allocated mbufs", __func__));
	return (i);
}

static void
dn_buf_release(void *arg, void **store, int count)
{
	struct mbufq *q;
	struct mbuf *m;
	int i;

	q = arg;

	for (i = 0; i < count; i++) {
		m = store[i];
		(void)mbufq_enqueue(q, m);
	}
}

static int
dn_pack_import(void *arg __unused, void **store, int count, int domain __unused,
    int flags __unused)
{
	struct mbuf *m;
	void *clust;
	int i;

	for (i = 0; i < count; i++) {
		m = m_get(M_NOWAIT, MT_DATA);
		if (m == NULL)
			break;
		clust = uma_zalloc(dn_zone_clust, M_NOWAIT);
		if (clust == NULL) {
			m_free(m);
			break;
		}
		mb_ctor_clust(clust, dn_clsize, m, 0);
		store[i] = m;
	}
	KASSERT((flags & M_WAITOK) == 0 || i == count,
	    ("%s: ran out of pre-allocated mbufs", __func__));
	return (i);
}

static void
dn_pack_release(void *arg __unused, void **store, int count)
{
	struct mbuf *m;
	void *clust;
	int i;

	for (i = 0; i < count; i++) {
		m = store[i];
		clust = m->m_ext.ext_buf;
		uma_zfree(dn_zone_clust, clust);
		uma_zfree(dn_zone_mbuf, m);
	}
}

/*
 * Free the pre-allocated mbufs and clusters reserved for debugnet, and destroy
 * the corresponding UMA cache zones.
 */
void
debugnet_mbuf_drain(void)
{
	struct mbuf *m;
	void *item;

	if (dn_zone_mbuf != NULL) {
		uma_zdestroy(dn_zone_mbuf);
		dn_zone_mbuf = NULL;
	}
	if (dn_zone_clust != NULL) {
		uma_zdestroy(dn_zone_clust);
		dn_zone_clust = NULL;
	}
	if (dn_zone_pack != NULL) {
		uma_zdestroy(dn_zone_pack);
		dn_zone_pack = NULL;
	}

	while ((m = mbufq_dequeue(&dn_mbufq)) != NULL)
		m_free(m);
	while ((item = mbufq_dequeue(&dn_clustq)) != NULL)
		uma_zfree(m_getzone(dn_clsize), item);
}

/*
 * Callback invoked immediately prior to starting a debugnet connection.
 */
void
debugnet_mbuf_start(void)
{

	MPASS(!dn_saved_zones.dsz_debugnet_zones_enabled);

	/* Save the old zone pointers to restore when debugnet is closed. */
	dn_saved_zones = (struct debugnet_saved_zones) {
		.dsz_debugnet_zones_enabled = true,
		.dsz_mbuf = zone_mbuf,
		.dsz_clust = zone_clust,
		.dsz_pack = zone_pack,
		.dsz_jumbop = zone_jumbop,
		.dsz_jumbo9 = zone_jumbo9,
		.dsz_jumbo16 = zone_jumbo16,
	};

	/*
	 * All cluster zones return buffers of the size requested by the
	 * drivers.  It's up to the driver to reinitialize the zones if the
	 * MTU of a debugnet-enabled interface changes.
	 */
	printf("debugnet: overwriting mbuf zone pointers\n");
	zone_mbuf = dn_zone_mbuf;
	zone_clust = dn_zone_clust;
	zone_pack = dn_zone_pack;
	zone_jumbop = dn_zone_clust;
	zone_jumbo9 = dn_zone_clust;
	zone_jumbo16 = dn_zone_clust;
}

/*
 * Callback invoked when a debugnet connection is closed/finished.
 */
void
debugnet_mbuf_finish(void)
{

	MPASS(dn_saved_zones.dsz_debugnet_zones_enabled);

	printf("debugnet: restoring mbuf zone pointers\n");
	zone_mbuf = dn_saved_zones.dsz_mbuf;
	zone_clust = dn_saved_zones.dsz_clust;
	zone_pack = dn_saved_zones.dsz_pack;
	zone_jumbop = dn_saved_zones.dsz_jumbop;
	zone_jumbo9 = dn_saved_zones.dsz_jumbo9;
	zone_jumbo16 = dn_saved_zones.dsz_jumbo16;

	memset(&dn_saved_zones, 0, sizeof(dn_saved_zones));
}

/*
 * Reinitialize the debugnet mbuf+cluster pool and cache zones.
 */
void
debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize)
{
	struct mbuf *m;
	void *item;

	debugnet_mbuf_drain();

	dn_clsize = clsize;

	dn_zone_mbuf = uma_zcache_create("debugnet_" MBUF_MEM_NAME,
	    MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL,
	    dn_buf_import, dn_buf_release,
	    &dn_mbufq, UMA_ZONE_NOBUCKET);

	dn_zone_clust = uma_zcache_create("debugnet_" MBUF_CLUSTER_MEM_NAME,
	    clsize, mb_ctor_clust, NULL, NULL, NULL,
	    dn_buf_import, dn_buf_release,
	    &dn_clustq, UMA_ZONE_NOBUCKET);

	dn_zone_pack = uma_zcache_create("debugnet_" MBUF_PACKET_MEM_NAME,
	    MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL,
	    dn_pack_import, dn_pack_release,
	    NULL, UMA_ZONE_NOBUCKET);

	while (nmbuf-- > 0) {
		m = m_get(M_WAITOK, MT_DATA);
		uma_zfree(dn_zone_mbuf, m);
	}
	while (nclust-- > 0) {
		item = uma_zalloc(m_getzone(dn_clsize), M_WAITOK);
		uma_zfree(dn_zone_clust, item);
	}
}
#endif /* DEBUGNET */

/*
 * Constructor for Mbuf primary zone.
 *
 * The 'arg' pointer points to a mb_args structure which
 * contains call-specific information required to support the
 * mbuf allocation API.  See mbuf.h.
 */
static int
mb_ctor_mbuf(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;
	struct mb_args *args;
	int error;
	int flags;
	short type;

	args = (struct mb_args *)arg;
	type = args->type;

	/*
	 * The mbuf is initialized later.  The caller has the
	 * responsibility to set up any MAC labels too.
	 */
	if (type == MT_NOINIT)
		return (0);

	m = (struct mbuf *)mem;
	flags = args->flags;
	MPASS((flags & M_NOFREE) == 0);

	error = m_init(m, how, type, flags);

	return (error);
}

/*
 * The Mbuf primary zone destructor.
 */
static void
mb_dtor_mbuf(void *mem, int size, void *arg)
{
	struct mbuf *m;
	unsigned long flags __diagused;

	m = (struct mbuf *)mem;
	flags = (unsigned long)arg;

	KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
	KASSERT((flags & 0x1) == 0,
	    ("%s: obsolete MB_DTOR_SKIP passed", __func__));
	if ((m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags))
		m_tag_delete_chain(m, NULL);
}

/*
 * The Mbuf Packet zone destructor.
 */
static void
mb_dtor_pack(void *mem, int size, void *arg)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;
	if ((m->m_flags & M_PKTHDR) != 0)
		m_tag_delete_chain(m, NULL);

	/* Make sure we've got a clean cluster back. */
	KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
	KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__));
	KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__));
	KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__));
	KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__));
	KASSERT(m->m_ext.ext_size == MCLBYTES,
	    ("%s: ext_size != MCLBYTES", __func__));
	KASSERT(m->m_ext.ext_type == EXT_PACKET,
	    ("%s: ext_type != EXT_PACKET", __func__));
#if defined(INVARIANTS) && !defined(KMSAN)
	trash_dtor(m->m_ext.ext_buf, MCLBYTES, zone_clust);
#endif
	/*
	 * If there are processes blocked on zone_clust, waiting for pages
	 * to be freed up, cause them to be woken up by draining the
	 * packet zone.  We are exposed to a race here (in the check for
	 * the UMA_ZFLAG_FULL) where we might miss the flag set, but that
	 * is deliberate.  We don't want to acquire the zone lock for every
	 * mbuf free.
	 */
	if (uma_zone_exhausted(zone_clust))
		uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN);
}

/*
 * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor.
 *
 * Here the 'arg' pointer points to the Mbuf which we
 * are configuring cluster storage for.  If 'arg' is
 * NULL we allocate just the cluster without attaching
 * it to an mbuf.  See mbuf.h.
 */
static int
mb_ctor_clust(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;

	m = (struct mbuf *)arg;
	if (m != NULL) {
		m->m_ext.ext_buf = (char *)mem;
		m->m_data = m->m_ext.ext_buf;
		m->m_flags |= M_EXT;
		m->m_ext.ext_free = NULL;
		m->m_ext.ext_arg1 = NULL;
		m->m_ext.ext_arg2 = NULL;
		m->m_ext.ext_size = size;
		m->m_ext.ext_type = m_gettype(size);
		m->m_ext.ext_flags = EXT_FLAG_EMBREF;
		m->m_ext.ext_count = 1;
	}

	return (0);
}

/*
 * The Packet secondary zone's init routine, executed on the
 * object's transition from mbuf keg slab to zone cache.
 */
static int
mb_zinit_pack(void *mem, int size, int how)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;		/* m is virgin. */
	if (uma_zalloc_arg(zone_clust, m, how) == NULL ||
	    m->m_ext.ext_buf == NULL)
		return (ENOMEM);
	m->m_ext.ext_type = EXT_PACKET;	/* Override. */
#if defined(INVARIANTS) && !defined(KMSAN)
	trash_init(m->m_ext.ext_buf, MCLBYTES, how);
#endif
	return (0);
}

/*
 * The Packet secondary zone's fini routine, executed on the
 * object's transition from zone cache to keg slab.
 */
static void
mb_zfini_pack(void *mem, int size)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;
#if defined(INVARIANTS) && !defined(KMSAN)
	trash_fini(m->m_ext.ext_buf, MCLBYTES);
#endif
	uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
#if defined(INVARIANTS) && !defined(KMSAN)
	trash_dtor(mem, size, zone_clust);
#endif
}

/*
 * The "packet" keg constructor.
 */
static int
mb_ctor_pack(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;
	struct mb_args *args;
	int error, flags;
	short type;

	m = (struct mbuf *)mem;
	args = (struct mb_args *)arg;
	flags = args->flags;
	type = args->type;
	MPASS((flags & M_NOFREE) == 0);

#if defined(INVARIANTS) && !defined(KMSAN)
	trash_ctor(m->m_ext.ext_buf, MCLBYTES, zone_clust, how);
#endif

	error = m_init(m, how, type, flags);

	/* m_ext is already initialized. */
	m->m_data = m->m_ext.ext_buf;
	m->m_flags = (flags | M_EXT);

	return (error);
}

/*
 * This is the protocol drain routine.  Called by UMA whenever any of the
 * mbuf zones is close to its limit.
 */
static void
mb_reclaim(uma_zone_t zone __unused, int pending __unused)
{

	EVENTHANDLER_INVOKE(mbuf_lowmem, VM_LOW_MBUFS);
}

/*
 * Free "count" units of I/O from an mbuf chain.  They could be held
 * in M_EXTPG or just as a normal mbuf.  This code is intended to be
 * called in an error path (I/O error, closed connection, etc).
 */
void
mb_free_notready(struct mbuf *m, int count)
{
	int i;

	for (i = 0; i < count && m != NULL; i++) {
		if ((m->m_flags & M_EXTPG) != 0) {
			m->m_epg_nrdy--;
			if (m->m_epg_nrdy != 0)
				continue;
		}
		m = m_free(m);
	}
	KASSERT(i == count, ("Removed only %d items from %p", i, m));
}

/*
 * Compress an unmapped mbuf into a simple mbuf when it holds a small
 * amount of data.  This is used as a DoS defense to avoid having
 * small packets tie up wired pages, an ext_pgs structure, and an
 * mbuf.  Since this converts the existing mbuf in place, it can only
 * be used if there are no other references to 'm'.
 */
int
mb_unmapped_compress(struct mbuf *m)
{
	volatile u_int *refcnt;
	char buf[MLEN];

	/*
	 * Assert that 'm' does not have a packet header.  If 'm' had
	 * a packet header, it would only be able to hold MHLEN bytes
	 * and m_data would have to be initialized differently.
	 */
	KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXTPG),
	    ("%s: m %p !M_EXTPG or M_PKTHDR", __func__, m));
	KASSERT(m->m_len <= MLEN, ("m_len too large %p", m));

	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
		refcnt = &m->m_ext.ext_count;
	} else {
		KASSERT(m->m_ext.ext_cnt != NULL,
		    ("%s: no refcounting pointer on %p", __func__, m));
		refcnt = m->m_ext.ext_cnt;
	}

	if (*refcnt != 1)
		return (EBUSY);

	m_copydata(m, 0, m->m_len, buf);

	/* Free the backing pages. */
	m->m_ext.ext_free(m);

	/* Turn 'm' into a "normal" mbuf. */
	m->m_flags &= ~(M_EXT | M_RDONLY | M_EXTPG);
	m->m_data = m->m_dat;

	/* Copy data back into m. */
	bcopy(buf, mtod(m, char *), m->m_len);

	return (0);
}
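
/*
 * Illustrative caller sketch (not part of the original code): a consumer
 * holding the only reference to a small, headerless M_EXTPG mbuf may
 * compress it in place, keeping the unmapped form when the conversion
 * is refused:
 *
 *	if ((m->m_flags & (M_EXTPG | M_PKTHDR)) == M_EXTPG &&
 *	    m->m_len <= MLEN)
 *		(void)mb_unmapped_compress(m);
 */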

/*
 * These next few routines are used to permit downgrading an unmapped
 * mbuf to a chain of mapped mbufs.  This is used when an interface
 * doesn't support unmapped mbufs or if checksums need to be
 * computed in software.
 *
 * Each unmapped mbuf is converted to a chain of mbufs.  First, any
 * TLS header data is stored in a regular mbuf.  Second, each page of
 * unmapped data is stored in an mbuf with an EXT_SFBUF external
 * cluster.  These mbufs use an sf_buf to provide a valid KVA for the
 * associated physical page.  They also hold a reference on the
 * original M_EXTPG mbuf to ensure the physical page doesn't go away.
 * Finally, any TLS trailer data is stored in a regular mbuf.
 *
 * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF
 * mbufs.  It frees the associated sf_buf and releases its reference
 * on the original M_EXTPG mbuf.
 *
 * _mb_unmapped_to_ext() is a helper function that converts a single
 * unmapped mbuf into a chain of mbufs.
 *
 * mb_unmapped_to_ext() is the public function that walks an mbuf
 * chain converting any unmapped mbufs to mapped mbufs.  It returns
 * the new chain of mapped mbufs on success.  On failure it frees
 * the original mbuf chain and returns NULL.
 */
static void
mb_unmapped_free_mext(struct mbuf *m)
{
	struct sf_buf *sf;
	struct mbuf *old_m;

	sf = m->m_ext.ext_arg1;
	sf_buf_free(sf);

	/* Drop the reference on the backing M_EXTPG mbuf. */
	old_m = m->m_ext.ext_arg2;
	mb_free_extpg(old_m);
}

static struct mbuf *
_mb_unmapped_to_ext(struct mbuf *m)
{
	struct mbuf *m_new, *top, *prev, *mref;
	struct sf_buf *sf;
	vm_page_t pg;
	int i, len, off, pglen, pgoff, seglen, segoff;
	volatile u_int *refcnt;
	u_int ref_inc = 0;

	M_ASSERTEXTPG(m);
	len = m->m_len;
	KASSERT(m->m_epg_tls == NULL, ("%s: can't convert TLS mbuf %p",
	    __func__, m));

	/* See if this is the mbuf that holds the embedded refcount. */
	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
		refcnt = &m->m_ext.ext_count;
		mref = m;
	} else {
		KASSERT(m->m_ext.ext_cnt != NULL,
		    ("%s: no refcounting pointer on %p", __func__, m));
		refcnt = m->m_ext.ext_cnt;
		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
	}

	/* Skip over any data removed from the front. */
	off = mtod(m, vm_offset_t);

	top = NULL;
	if (m->m_epg_hdrlen != 0) {
		if (off >= m->m_epg_hdrlen) {
			off -= m->m_epg_hdrlen;
		} else {
			seglen = m->m_epg_hdrlen - off;
			segoff = off;
			seglen = min(seglen, len);
			off = 0;
			len -= seglen;
			m_new = m_get(M_NOWAIT, MT_DATA);
			if (m_new == NULL)
				goto fail;
			m_new->m_len = seglen;
			prev = top = m_new;
			memcpy(mtod(m_new, void *), &m->m_epg_hdr[segoff],
			    seglen);
		}
	}
	pgoff = m->m_epg_1st_off;
	for (i = 0; i < m->m_epg_npgs && len > 0; i++) {
		pglen = m_epg_pagelen(m, i, pgoff);
		if (off >= pglen) {
			off -= pglen;
			pgoff = 0;
			continue;
		}
		seglen = pglen - off;
		segoff = pgoff + off;
		off = 0;
		seglen = min(seglen, len);
		len -= seglen;

		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
		m_new = m_get(M_NOWAIT, MT_DATA);
		if (m_new == NULL)
			goto fail;
		if (top == NULL) {
			top = prev = m_new;
		} else {
			prev->m_next = m_new;
			prev = m_new;
		}
		sf = sf_buf_alloc(pg, SFB_NOWAIT);
		if (sf == NULL)
			goto fail;

		ref_inc++;
		m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE,
		    mb_unmapped_free_mext, sf, mref, m->m_flags & M_RDONLY,
		    EXT_SFBUF);
		m_new->m_data += segoff;
		m_new->m_len = seglen;

		pgoff = 0;
	}
	if (len != 0) {
		KASSERT((off + len) <= m->m_epg_trllen,
		    ("off + len > trail (%d + %d > %d)", off, len,
		    m->m_epg_trllen));
		m_new = m_get(M_NOWAIT, MT_DATA);
		if (m_new == NULL)
			goto fail;
		if (top == NULL)
			top = m_new;
		else
			prev->m_next = m_new;
		m_new->m_len = len;
		memcpy(mtod(m_new, void *), &m->m_epg_trail[off], len);
	}

	if (ref_inc != 0) {
		/*
		 * Obtain an additional reference on the old mbuf for
		 * each created EXT_SFBUF mbuf.  They will be dropped
		 * in mb_unmapped_free_mext().
		 */
		if (*refcnt == 1)
			*refcnt += ref_inc;
		else
			atomic_add_int(refcnt, ref_inc);
	}
	m_free(m);
	return (top);

fail:
	if (ref_inc != 0) {
		/*
		 * Obtain an additional reference on the old mbuf for
		 * each created EXT_SFBUF mbuf.  They will be
		 * immediately dropped when these mbufs are freed
		 * below.
		 */
		if (*refcnt == 1)
			*refcnt += ref_inc;
		else
			atomic_add_int(refcnt, ref_inc);
	}
	m_free(m);
	m_freem(top);
	return (NULL);
}

struct mbuf *
mb_unmapped_to_ext(struct mbuf *top)
{
	struct mbuf *m, *next, *prev = NULL;

	prev = NULL;
	for (m = top; m != NULL; m = next) {
		/* m might be freed, so cache the next pointer. */
		next = m->m_next;
		if (m->m_flags & M_EXTPG) {
			if (prev != NULL) {
				/*
				 * Remove 'm' from the new chain so
				 * that the 'top' chain terminates
				 * before 'm' in case 'top' is freed
				 * due to an error.
				 */
				prev->m_next = NULL;
			}
			m = _mb_unmapped_to_ext(m);
			if (m == NULL) {
				m_freem(top);
				m_freem(next);
				return (NULL);
			}
			if (prev == NULL) {
				top = m;
			} else {
				prev->m_next = m;
			}

			/*
			 * Replaced one mbuf with a chain, so we must
			 * find the end of chain.
			 */
			prev = m_last(m);
		} else {
			if (prev != NULL) {
				prev->m_next = m;
			}
			prev = m;
		}
	}
	return (top);
}
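
/*
 * Illustrative sketch (the predicate below is hypothetical, not part of
 * this file): a transmit path whose driver cannot handle M_EXTPG mbufs
 * might downgrade the chain right before handing it to the hardware:
 *
 *	if (!hw_supports_unmapped) {
 *		m = mb_unmapped_to_ext(m);
 *		if (m == NULL)
 *			return (ENOBUFS);	(original chain already freed)
 *	}
 */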

/*
 * Allocate an empty M_EXTPG mbuf.  The ext_free routine is
 * responsible for freeing any pages backing this mbuf when it is
 * freed.
 */
struct mbuf *
mb_alloc_ext_pgs(int how, m_ext_free_t ext_free, int flags)
{
	struct mbuf *m;

	m = m_get(how, MT_DATA);
	if (m == NULL)
		return (NULL);

	m->m_epg_npgs = 0;
	m->m_epg_nrdy = 0;
	m->m_epg_1st_off = 0;
	m->m_epg_last_len = 0;
	m->m_epg_flags = 0;
	m->m_epg_hdrlen = 0;
	m->m_epg_trllen = 0;
	m->m_epg_tls = NULL;
	m->m_epg_so = NULL;
	m->m_data = NULL;
	m->m_flags |= M_EXT | M_EXTPG | flags;
	m->m_ext.ext_flags = EXT_FLAG_EMBREF;
	m->m_ext.ext_count = 1;
	m->m_ext.ext_size = 0;
	m->m_ext.ext_free = ext_free;
	return (m);
}

/*
 * Clean up after mbufs with M_EXT storage attached to them if the
 * reference count hits 1.
 */
void
mb_free_ext(struct mbuf *m)
{
	volatile u_int *refcnt;
	struct mbuf *mref;
	int freembuf;

	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));

	/* See if this is the mbuf that holds the embedded refcount. */
	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
		refcnt = &m->m_ext.ext_count;
		mref = m;
	} else {
		KASSERT(m->m_ext.ext_cnt != NULL,
		    ("%s: no refcounting pointer on %p", __func__, m));
		refcnt = m->m_ext.ext_cnt;
		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
	}

	/*
	 * Check if the header is embedded in the cluster.  It is
	 * important that we don't touch any of the mbuf fields
	 * after we have freed the external storage, since the mbuf
	 * could have been embedded in it.  For now, the mbufs
	 * embedded into the cluster are always of type EXT_EXTREF,
	 * and for this type we won't free the mref.
	 */
	if (m->m_flags & M_NOFREE) {
		freembuf = 0;
		KASSERT(m->m_ext.ext_type == EXT_EXTREF ||
		    m->m_ext.ext_type == EXT_RXRING,
		    ("%s: no-free mbuf %p has wrong type", __func__, m));
	} else
		freembuf = 1;

	/* Free attached storage if this mbuf is the only reference to it. */
	if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) {
		switch (m->m_ext.ext_type) {
		case EXT_PACKET:
			/* The packet zone is special. */
			if (*refcnt == 0)
				*refcnt = 1;
			uma_zfree(zone_pack, mref);
			break;
		case EXT_CLUSTER:
			uma_zfree(zone_clust, m->m_ext.ext_buf);
			m_free_raw(mref);
			break;
		case EXT_JUMBOP:
			uma_zfree(zone_jumbop, m->m_ext.ext_buf);
			m_free_raw(mref);
			break;
		case EXT_JUMBO9:
			uma_zfree(zone_jumbo9, m->m_ext.ext_buf);
			m_free_raw(mref);
			break;
		case EXT_JUMBO16:
			uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
			m_free_raw(mref);
			break;
		case EXT_SFBUF:
		case EXT_NET_DRV:
		case EXT_CTL:
		case EXT_MOD_TYPE:
		case EXT_DISPOSABLE:
			KASSERT(mref->m_ext.ext_free != NULL,
			    ("%s: ext_free not set", __func__));
			mref->m_ext.ext_free(mref);
			m_free_raw(mref);
			break;
		case EXT_EXTREF:
			KASSERT(m->m_ext.ext_free != NULL,
			    ("%s: ext_free not set", __func__));
			m->m_ext.ext_free(m);
			break;
		case EXT_RXRING:
			KASSERT(m->m_ext.ext_free == NULL,
			    ("%s: ext_free is set", __func__));
			break;
		default:
			KASSERT(m->m_ext.ext_type == 0,
			    ("%s: unknown ext_type", __func__));
		}
	}

	if (freembuf && m != mref)
		m_free_raw(m);
}

/*
 * Clean up after mbufs with M_EXTPG storage attached to them if the
 * reference count hits 1.
 */
void
mb_free_extpg(struct mbuf *m)
{
	volatile u_int *refcnt;
	struct mbuf *mref;

	M_ASSERTEXTPG(m);

	/* See if this is the mbuf that holds the embedded refcount. */
	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
		refcnt = &m->m_ext.ext_count;
		mref = m;
	} else {
		KASSERT(m->m_ext.ext_cnt != NULL,
		    ("%s: no refcounting pointer on %p", __func__, m));
		refcnt = m->m_ext.ext_cnt;
		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
	}

	/* Free attached storage if this mbuf is the only reference to it. */
	if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) {
		KASSERT(mref->m_ext.ext_free != NULL,
		    ("%s: ext_free not set", __func__));

		mref->m_ext.ext_free(mref);
#ifdef KERN_TLS
		if (mref->m_epg_tls != NULL &&
		    !refcount_release_if_not_last(&mref->m_epg_tls->refcount))
			ktls_enqueue_to_free(mref);
		else
#endif
			m_free_raw(mref);
	}

	if (m != mref)
		m_free_raw(m);
}

/*
 * Official mbuf(9) allocation KPI for stack and drivers:
 *
 * m_get()	- a single mbuf without any attachments, sys/mbuf.h.
 * m_gethdr()	- a single mbuf initialized as M_PKTHDR, sys/mbuf.h.
 * m_getcl()	- an mbuf + 2k cluster, sys/mbuf.h.
 * m_clget()	- attach cluster to already allocated mbuf.
 * m_cljget()	- attach jumbo cluster to already allocated mbuf.
 * m_get2()	- allocate minimum mbuf that would fit size argument.
 * m_getm2()	- allocate a chain of mbufs/clusters.
 * m_extadd()	- attach external cluster to mbuf.
 *
 * m_free()	- free single mbuf with its tags and ext, sys/mbuf.h.
 * m_freem()	- free chain of mbufs.
 */

int
m_clget(struct mbuf *m, int how)
{

	KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
	    __func__, m));
	m->m_ext.ext_buf = (char *)NULL;
	uma_zalloc_arg(zone_clust, m, how);
	/*
	 * On a cluster allocation failure, drain the packet zone and retry;
	 * we might be able to loosen a few clusters up on the drain.
	 */
	if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) {
		uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN);
		uma_zalloc_arg(zone_clust, m, how);
	}
	MBUF_PROBE2(m__clget, m, how);
	return (m->m_flags & M_EXT);
}
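
/*
 * Illustrative caller pattern (a sketch, not from the original code):
 * m_clget() reports success through the M_EXT flag in its return value,
 * so a caller that already holds a bare mbuf typically does:
 *
 *	m = m_gethdr(M_NOWAIT, MT_DATA);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	if (m_clget(m, M_NOWAIT) == 0) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 */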

/*
 * m_cljget() is different from m_clget() as it can allocate clusters without
 * attaching them to an mbuf.  In that case the return value is the pointer
 * to the cluster of the requested size.  If an mbuf was specified, it gets
 * the cluster attached to it and the return value can be safely ignored.
 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
 */
void *
m_cljget(struct mbuf *m, int how, int size)
{
	uma_zone_t zone;
	void *retval;

	if (m != NULL) {
		KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
		    __func__, m));
		m->m_ext.ext_buf = NULL;
	}

	zone = m_getzone(size);
	retval = uma_zalloc_arg(zone, m, how);

	MBUF_PROBE4(m__cljget, m, how, size, retval);

	return (retval);
}

/*
 * m_get2() allocates the minimum mbuf that would fit the "size" argument.
 */
struct mbuf *
m_get2(int size, int how, short type, int flags)
{
	struct mb_args args;
	struct mbuf *m, *n;

	args.flags = flags;
	args.type = type;

	if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0))
		return (uma_zalloc_arg(zone_mbuf, &args, how));
	if (size <= MCLBYTES)
		return (uma_zalloc_arg(zone_pack, &args, how));

	if (size > MJUMPAGESIZE)
		return (NULL);

	m = uma_zalloc_arg(zone_mbuf, &args, how);
	if (m == NULL)
		return (NULL);

	n = uma_zalloc_arg(zone_jumbop, m, how);
	if (n == NULL) {
		m_free_raw(m);
		return (NULL);
	}

	return (m);
}

/*
 * m_get3() allocates the minimum mbuf that would fit the "size" argument.
 * Unlike m_get2() it can allocate clusters up to MJUM16BYTES.
 */
struct mbuf *
m_get3(int size, int how, short type, int flags)
{
	struct mb_args args;
	struct mbuf *m, *n;
	uma_zone_t zone;

	if (size <= MJUMPAGESIZE)
		return (m_get2(size, how, type, flags));

	if (size > MJUM16BYTES)
		return (NULL);

	args.flags = flags;
	args.type = type;

	m = uma_zalloc_arg(zone_mbuf, &args, how);
	if (m == NULL)
		return (NULL);

	if (size <= MJUM9BYTES)
		zone = zone_jumbo9;
	else
		zone = zone_jumbo16;

	n = uma_zalloc_arg(zone, m, how);
	if (n == NULL) {
		m_free_raw(m);
		return (NULL);
	}

	return (m);
}
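
/*
 * Sizing sketch (illustrative, assuming 4k pages): for a 1500 byte
 * payload m_get2() returns a packet zone mbuf with a 2k cluster; for a
 * 12000 byte payload m_get2() returns NULL because the size exceeds
 * MJUMPAGESIZE, while m_get3() attaches a 16k jumbo cluster instead:
 *
 *	m = m_get2(1500, M_NOWAIT, MT_DATA, M_PKTHDR);
 *	n = m_get3(12000, M_NOWAIT, MT_DATA, M_PKTHDR);
 */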

/*
 * m_getjcl() returns an mbuf with a cluster of the specified size attached.
 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
 */
struct mbuf *
m_getjcl(int how, short type, int flags, int size)
{
	struct mb_args args;
	struct mbuf *m, *n;
	uma_zone_t zone;

	if (size == MCLBYTES)
		return m_getcl(how, type, flags);

	args.flags = flags;
	args.type = type;

	m = uma_zalloc_arg(zone_mbuf, &args, how);
	if (m == NULL)
		return (NULL);

	zone = m_getzone(size);
	n = uma_zalloc_arg(zone, m, how);
	if (n == NULL) {
		m_free_raw(m);
		return (NULL);
	}
	MBUF_PROBE5(m__getjcl, how, type, flags, size, m);
	return (m);
}

/*
 * Allocate an mchain of a given length of mbufs and/or clusters (whatever
 * fits best).  May fail due to ENOMEM.  In case of failure the state of the
 * mchain is inconsistent.
 */
int
mc_get(struct mchain *mc, u_int length, int how, short type, int flags)
{
	struct mbuf *mb;
	u_int progress;

	MPASS(length >= 0);

	*mc = MCHAIN_INITIALIZER(mc);
	flags &= (M_PKTHDR | M_EOR);
	progress = 0;

	/* Loop and append maximum sized mbufs to the chain tail. */
	do {
		if (length - progress > MCLBYTES) {
			/*
			 * M_NOWAIT here is intentional, it avoids blocking if
			 * the jumbop zone is exhausted.  See 796d4eb89e2c and
			 * D26150 for more detail.
			 */
			mb = m_getjcl(M_NOWAIT, type, (flags & M_PKTHDR),
			    MJUMPAGESIZE);
		} else
			mb = NULL;
		if (mb == NULL) {
			if (length - progress >= MINCLSIZE)
				mb = m_getcl(how, type, (flags & M_PKTHDR));
			else if (flags & M_PKTHDR)
				mb = m_gethdr(how, type);
			else
				mb = m_get(how, type);

			/*
			 * Fail the whole operation if one mbuf can't be
			 * allocated.
			 */
			if (mb == NULL) {
				m_freem(mc_first(mc));
				return (ENOMEM);
			}
		}

		progress += M_SIZE(mb);
		mc_append(mc, mb);
		/* Only valid on the first mbuf. */
		flags &= ~M_PKTHDR;
	} while (progress < length);
	if (flags & M_EOR)
		/* Only valid on the last mbuf. */
		mc_last(mc)->m_flags |= M_EOR;

	return (0);
}

/*
 * Allocate a given length worth of mbufs and/or clusters (whatever fits
 * best) and return a pointer to the top of the allocated chain.  If an
 * existing mbuf chain is provided, then we will append the new chain
 * to the existing one and return a pointer to the provided mbuf.
 */
struct mbuf *
m_getm2(struct mbuf *m, int len, int how, short type, int flags)
{
	struct mchain mc;

	/* Packet header mbuf must be first in chain. */
	if (m != NULL && (flags & M_PKTHDR))
		flags &= ~M_PKTHDR;

	if (__predict_false(mc_get(&mc, len, how, type, flags) != 0))
		return (NULL);

	/* If mbuf was supplied, append new chain to the end of it. */
	if (m != NULL) {
		struct mbuf *mtail;

		mtail = m_last(m);
		mtail->m_next = mc_first(&mc);
		mtail->m_flags &= ~M_EOR;
	} else
		m = mc_first(&mc);

	return (m);
}
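
/*
 * Usage sketch for the chain allocators above (illustrative only): to
 * set aside roughly 64 KB of buffer space up front, a caller can use
 * the mchain interface directly, or the older m_getm2() wrapper:
 *
 *	struct mchain mc;
 *	struct mbuf *m;
 *
 *	if (mc_get(&mc, 65536, M_WAITOK, MT_DATA, M_PKTHDR) != 0)
 *		return (ENOMEM);
 *	m = mc_first(&mc);
 *	...
 *	m_freem(m);
 */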

/*-
 * Configure a provided mbuf to refer to the provided external storage
 * buffer and setup a reference count for said buffer.
 *
 * Arguments:
 *    mb     The existing mbuf to which to attach the provided buffer.
 *    buf    The address of the provided external storage buffer.
 *    size   The size of the provided buffer.
 *    freef  A pointer to a routine that is responsible for freeing the
 *           provided external storage buffer.
 *    arg1, arg2
 *           Arguments (of any type) to be passed to the provided freef
 *           routine (may be NULL).
 *    flags  Any other flags to be passed to the provided mbuf.
 *    type   The type that the external storage buffer should be
 *           labeled with.
 *
 * Returns:
 *    Nothing.
 */
void
m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef,
    void *arg1, void *arg2, int flags, int type)
{

	KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__));

	mb->m_flags |= (M_EXT | flags);
	mb->m_ext.ext_buf = buf;
	mb->m_data = mb->m_ext.ext_buf;
	mb->m_ext.ext_size = size;
	mb->m_ext.ext_free = freef;
	mb->m_ext.ext_arg1 = arg1;
	mb->m_ext.ext_arg2 = arg2;
	mb->m_ext.ext_type = type;

	if (type != EXT_EXTREF) {
		mb->m_ext.ext_count = 1;
		mb->m_ext.ext_flags = EXT_FLAG_EMBREF;
	} else
		mb->m_ext.ext_flags = 0;
}

/*
 * Free an entire chain of mbufs and associated external buffers, if
 * applicable.
 */
void
m_freem(struct mbuf *mb)
{

	MBUF_PROBE1(m__freem, mb);
	while (mb != NULL)
		mb = m_free(mb);
}

/*
 * Free an entire chain of mbufs and associated external buffers, following
 * both m_next and m_nextpkt linkage.
 * Note: doesn't support NULL argument.
 */
void
m_freemp(struct mbuf *m)
{
	struct mbuf *n;

	MBUF_PROBE1(m__freemp, m);
	do {
		n = m->m_nextpkt;
		while (m != NULL)
			m = m_free(m);
		m = n;
	} while (m != NULL);
}

/*
 * Temporary primitive to allow freeing without going through m_free.
 */
void
m_free_raw(struct mbuf *mb)
{

	uma_zfree(zone_mbuf, mb);
}

int
m_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
    struct m_snd_tag **mstp)
{

	return (if_snd_tag_alloc(ifp, params, mstp));
}

void
m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp,
    const struct if_snd_tag_sw *sw)
{

	if_ref(ifp);
	mst->ifp = ifp;
	refcount_init(&mst->refcount, 1);
	mst->sw = sw;
	counter_u64_add(snd_tag_count, 1);
}

void
m_snd_tag_destroy(struct m_snd_tag *mst)
{
	struct ifnet *ifp;

	ifp = mst->ifp;
	mst->sw->snd_tag_free(mst);
	if_rele(ifp);
	counter_u64_add(snd_tag_count, -1);
}

void
m_rcvif_serialize(struct mbuf *m)
{
	u_short idx, gen;

	M_ASSERTPKTHDR(m);
	idx = if_getindex(m->m_pkthdr.rcvif);
	gen = if_getidxgen(m->m_pkthdr.rcvif);
	m->m_pkthdr.rcvidx = idx;
	m->m_pkthdr.rcvgen = gen;
	if (__predict_false(m->m_pkthdr.leaf_rcvif != NULL)) {
		idx = if_getindex(m->m_pkthdr.leaf_rcvif);
		gen = if_getidxgen(m->m_pkthdr.leaf_rcvif);
	} else {
		idx = -1;
		gen = 0;
	}
	m->m_pkthdr.leaf_rcvidx = idx;
	m->m_pkthdr.leaf_rcvgen = gen;
}

struct ifnet *
m_rcvif_restore(struct mbuf *m)
{
	struct ifnet *ifp, *leaf_ifp;

	M_ASSERTPKTHDR(m);
	NET_EPOCH_ASSERT();

	ifp = ifnet_byindexgen(m->m_pkthdr.rcvidx, m->m_pkthdr.rcvgen);
	if (ifp == NULL || (if_getflags(ifp) & IFF_DYING))
		return (NULL);

	if (__predict_true(m->m_pkthdr.leaf_rcvidx == (u_short)-1)) {
		leaf_ifp = NULL;
	} else {
		leaf_ifp = ifnet_byindexgen(m->m_pkthdr.leaf_rcvidx,
		    m->m_pkthdr.leaf_rcvgen);
		if (__predict_false(leaf_ifp != NULL &&
		    (if_getflags(leaf_ifp) & IFF_DYING)))
			leaf_ifp = NULL;
	}

	m->m_pkthdr.leaf_rcvif = leaf_ifp;
	m->m_pkthdr.rcvif = ifp;

	return (ifp);
}

/*
 * Allocate an mbuf with anonymous external pages.
 */
struct mbuf *
mb_alloc_ext_plus_pages(int len, int how)
{
	struct mbuf *m;
	vm_page_t pg;
	int i, npgs;

	m = mb_alloc_ext_pgs(how, mb_free_mext_pgs, 0);
	if (m == NULL)
		return (NULL);
	m->m_epg_flags |= EPG_FLAG_ANON;
	npgs = howmany(len, PAGE_SIZE);
	for (i = 0; i < npgs; i++) {
		do {
			pg = vm_page_alloc_noobj(VM_ALLOC_NODUMP |
			    VM_ALLOC_WIRED);
			if (pg == NULL) {
				if (how == M_NOWAIT) {
					m->m_epg_npgs = i;
					m_free(m);
					return (NULL);
				}
				vm_wait(NULL);
			}
		} while (pg == NULL);
		m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg);
	}
	m->m_epg_npgs = npgs;
	return (m);
}

/*
 * Copy the data in the mbuf chain to a chain of mbufs with anonymous external
 * unmapped pages.
 * len is the length of data in the input mbuf chain.
 * mlen is the maximum number of bytes put into each ext_page mbuf.
 */
struct mbuf *
mb_mapped_to_unmapped(struct mbuf *mp, int len, int mlen, int how,
    struct mbuf **mlast)
{
	struct mbuf *m, *mout;
	char *pgpos, *mbpos;
	int i, mblen, mbufsiz, pglen, xfer;

	if (len == 0)
		return (NULL);
	mbufsiz = min(mlen, len);
	m = mout = mb_alloc_ext_plus_pages(mbufsiz, how);
	if (m == NULL)
		return (m);
	pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[0]);
	pglen = PAGE_SIZE;
	mblen = 0;
	i = 0;
	do {
		if (pglen == 0) {
			if (++i == m->m_epg_npgs) {
				m->m_epg_last_len = PAGE_SIZE;
				mbufsiz = min(mlen, len);
				m->m_next = mb_alloc_ext_plus_pages(mbufsiz,
				    how);
				m = m->m_next;
				if (m == NULL) {
					m_freem(mout);
					return (m);
				}
				i = 0;
			}
			pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]);
			pglen = PAGE_SIZE;
		}
		while (mblen == 0) {
			if (mp == NULL) {
				m_freem(mout);
				return (NULL);
			}
			KASSERT((mp->m_flags & M_EXTPG) == 0,
			    ("mb_copym_ext_pgs: ext_pgs input mbuf"));
			mbpos = mtod(mp, char *);
			mblen = mp->m_len;
			mp = mp->m_next;
		}
		xfer = min(mblen, pglen);
		memcpy(pgpos, mbpos, xfer);
		pgpos += xfer;
		mbpos += xfer;
		pglen -= xfer;
		mblen -= xfer;
		len -= xfer;
		m->m_len += xfer;
	} while (len > 0);
	m->m_epg_last_len = PAGE_SIZE - pglen;
	if (mlast != NULL)
		*mlast = m;
	return (mout);
}
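
/*
 * Usage sketch for mb_mapped_to_unmapped() (illustrative, sizes are
 * assumptions): a sender handing a mapped chain of 'len' bytes to a
 * consumer that requires M_EXTPG mbufs could copy it into anonymous
 * unmapped pages, capping each M_EXTPG mbuf at 16 KB of data:
 *
 *	struct mbuf *u, *ulast;
 *
 *	u = mb_mapped_to_unmapped(m, len, 16 * 1024, M_WAITOK, &ulast);
 *	if (u == NULL)
 *		return (ENOMEM);
 */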