1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2004, 2005, 5 * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 #include "opt_param.h" 32 #include "opt_kern_tls.h" 33 34 #include <sys/param.h> 35 #include <sys/conf.h> 36 #include <sys/domainset.h> 37 #include <sys/malloc.h> 38 #include <sys/systm.h> 39 #include <sys/mbuf.h> 40 #include <sys/eventhandler.h> 41 #include <sys/kernel.h> 42 #include <sys/ktls.h> 43 #include <sys/limits.h> 44 #include <sys/lock.h> 45 #include <sys/mutex.h> 46 #include <sys/refcount.h> 47 #include <sys/sf_buf.h> 48 #include <sys/smp.h> 49 #include <sys/socket.h> 50 #include <sys/sysctl.h> 51 52 #include <net/if.h> 53 #include <net/if_var.h> 54 55 #include <vm/vm.h> 56 #include <vm/vm_extern.h> 57 #include <vm/vm_kern.h> 58 #include <vm/vm_page.h> 59 #include <vm/vm_pageout.h> 60 #include <vm/vm_map.h> 61 #include <vm/uma.h> 62 #include <vm/uma_dbg.h> 63 64 _Static_assert(MJUMPAGESIZE > MCLBYTES, 65 "Cluster must be smaller than a jumbo page"); 66 67 /* 68 * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA 69 * Zones. 70 * 71 * Mbuf Clusters (2K, contiguous) are allocated from the Cluster 72 * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the 73 * administrator so desires. 74 * 75 * Mbufs are allocated from a UMA Primary Zone called the Mbuf 76 * Zone. 77 * 78 * Additionally, FreeBSD provides a Packet Zone, which it 79 * configures as a Secondary Zone to the Mbuf Primary Zone, 80 * thus sharing backend Slab kegs with the Mbuf Primary Zone. 81 * 82 * Thus common-case allocations and locking are simplified: 83 * 84 * m_clget() m_getcl() 85 * | | 86 * | .------------>[(Packet Cache)] m_get(), m_gethdr() 87 * | | [ Packet ] | 88 * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ] 89 * [ Cluster Zone ] [ Zone ] [ Mbuf Primary Zone ] 90 * | \________ | 91 * [ Cluster Keg ] \ / 92 * | [ Mbuf Keg ] 93 * [ Cluster Slabs ] | 94 * | [ Mbuf Slabs ] 95 * \____________(VM)_________________/ 96 * 97 * 98 * Whenever an object is allocated with uma_zalloc() out of 99 * one of the Zones its _ctor_ function is executed. The same 100 * for any deallocation through uma_zfree() the _dtor_ function 101 * is executed. 102 * 103 * Caches are per-CPU and are filled from the Primary Zone. 104 * 105 * Whenever an object is allocated from the underlying global 106 * memory pool it gets pre-initialized with the _zinit_ functions. 107 * When the Keg's are overfull objects get decommissioned with 108 * _zfini_ functions and free'd back to the global memory pool. 109 * 110 */ 111 112 int nmbufs; /* limits number of mbufs */ 113 int nmbclusters; /* limits number of mbuf clusters */ 114 int nmbjumbop; /* limits number of page size jumbo clusters */ 115 int nmbjumbo9; /* limits number of 9k jumbo clusters */ 116 int nmbjumbo16; /* limits number of 16k jumbo clusters */ 117 118 bool mb_use_ext_pgs = false; /* use M_EXTPG mbufs for sendfile & TLS */ 119 120 static int 121 sysctl_mb_use_ext_pgs(SYSCTL_HANDLER_ARGS) 122 { 123 int error, extpg; 124 125 extpg = mb_use_ext_pgs; 126 error = sysctl_handle_int(oidp, &extpg, 0, req); 127 if (error == 0 && req->newptr != NULL) { 128 if (extpg != 0 && !PMAP_HAS_DMAP) 129 error = EOPNOTSUPP; 130 else 131 mb_use_ext_pgs = extpg != 0; 132 } 133 return (error); 134 } 135 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_use_ext_pgs, 136 CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH, 137 &mb_use_ext_pgs, 0, sysctl_mb_use_ext_pgs, "IU", 138 "Use unmapped mbufs for sendfile(2) and TLS offload"); 139 140 static quad_t maxmbufmem; /* overall real memory limit for all mbufs */ 141 142 SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0, 143 "Maximum real memory allocatable to various mbuf types"); 144 145 static counter_u64_t snd_tag_count; 146 SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW, 147 &snd_tag_count, "# of active mbuf send tags"); 148 149 /* 150 * tunable_mbinit() has to be run before any mbuf allocations are done. 151 */ 152 static void 153 tunable_mbinit(void *dummy) 154 { 155 quad_t realmem; 156 int extpg; 157 158 /* 159 * The default limit for all mbuf related memory is 1/2 of all 160 * available kernel memory (physical or kmem). 161 * At most it can be 3/4 of available kernel memory. 162 */ 163 realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size); 164 maxmbufmem = realmem / 2; 165 TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem); 166 if (maxmbufmem > realmem / 4 * 3) 167 maxmbufmem = realmem / 4 * 3; 168 169 TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); 170 if (nmbclusters == 0) 171 nmbclusters = maxmbufmem / MCLBYTES / 4; 172 173 TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop); 174 if (nmbjumbop == 0) 175 nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4; 176 177 TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9); 178 if (nmbjumbo9 == 0) 179 nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6; 180 181 TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16); 182 if (nmbjumbo16 == 0) 183 nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6; 184 185 /* 186 * We need at least as many mbufs as we have clusters of 187 * the various types added together. 188 */ 189 TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); 190 if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) 191 nmbufs = lmax(maxmbufmem / MSIZE / 5, 192 nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16); 193 194 /* 195 * Unmapped mbufs can only safely be used on platforms with a direct 196 * map. 197 */ 198 if (PMAP_HAS_DMAP) { 199 extpg = 1; 200 TUNABLE_INT_FETCH("kern.ipc.mb_use_ext_pgs", &extpg); 201 mb_use_ext_pgs = extpg != 0; 202 } 203 } 204 SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL); 205 206 static int 207 sysctl_nmbclusters(SYSCTL_HANDLER_ARGS) 208 { 209 int error, newnmbclusters; 210 211 newnmbclusters = nmbclusters; 212 error = sysctl_handle_int(oidp, &newnmbclusters, 0, req); 213 if (error == 0 && req->newptr && newnmbclusters != nmbclusters) { 214 if (newnmbclusters > nmbclusters && 215 nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { 216 nmbclusters = newnmbclusters; 217 nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); 218 EVENTHANDLER_INVOKE(nmbclusters_change); 219 } else 220 error = EINVAL; 221 } 222 return (error); 223 } 224 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, 225 CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, 226 &nmbclusters, 0, sysctl_nmbclusters, "IU", 227 "Maximum number of mbuf clusters allowed"); 228 229 static int 230 sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS) 231 { 232 int error, newnmbjumbop; 233 234 newnmbjumbop = nmbjumbop; 235 error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req); 236 if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) { 237 if (newnmbjumbop > nmbjumbop && 238 nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { 239 nmbjumbop = newnmbjumbop; 240 nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); 241 } else 242 error = EINVAL; 243 } 244 return (error); 245 } 246 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, 247 CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, 248 &nmbjumbop, 0, sysctl_nmbjumbop, "IU", 249 "Maximum number of mbuf page size jumbo clusters allowed"); 250 251 static int 252 sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS) 253 { 254 int error, newnmbjumbo9; 255 256 newnmbjumbo9 = nmbjumbo9; 257 error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req); 258 if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) { 259 if (newnmbjumbo9 > nmbjumbo9 && 260 nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { 261 nmbjumbo9 = newnmbjumbo9; 262 nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); 263 } else 264 error = EINVAL; 265 } 266 return (error); 267 } 268 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, 269 CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, 270 &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU", 271 "Maximum number of mbuf 9k jumbo clusters allowed"); 272 273 static int 274 sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS) 275 { 276 int error, newnmbjumbo16; 277 278 newnmbjumbo16 = nmbjumbo16; 279 error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); 280 if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) { 281 if (newnmbjumbo16 > nmbjumbo16 && 282 nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { 283 nmbjumbo16 = newnmbjumbo16; 284 nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); 285 } else 286 error = EINVAL; 287 } 288 return (error); 289 } 290 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, 291 CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, 292 &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU", 293 "Maximum number of mbuf 16k jumbo clusters allowed"); 294 295 static int 296 sysctl_nmbufs(SYSCTL_HANDLER_ARGS) 297 { 298 int error, newnmbufs; 299 300 newnmbufs = nmbufs; 301 error = sysctl_handle_int(oidp, &newnmbufs, 0, req); 302 if (error == 0 && req->newptr && newnmbufs != nmbufs) { 303 if (newnmbufs > nmbufs) { 304 nmbufs = newnmbufs; 305 nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); 306 EVENTHANDLER_INVOKE(nmbufs_change); 307 } else 308 error = EINVAL; 309 } 310 return (error); 311 } 312 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, 313 CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, 314 &nmbufs, 0, sysctl_nmbufs, "IU", 315 "Maximum number of mbufs allowed"); 316 317 /* 318 * Zones from which we allocate. 319 */ 320 uma_zone_t zone_mbuf; 321 uma_zone_t zone_clust; 322 uma_zone_t zone_pack; 323 uma_zone_t zone_jumbop; 324 uma_zone_t zone_jumbo9; 325 uma_zone_t zone_jumbo16; 326 327 /* 328 * Local prototypes. 329 */ 330 static int mb_ctor_mbuf(void *, int, void *, int); 331 static int mb_ctor_clust(void *, int, void *, int); 332 static int mb_ctor_pack(void *, int, void *, int); 333 static void mb_dtor_mbuf(void *, int, void *); 334 static void mb_dtor_pack(void *, int, void *); 335 static int mb_zinit_pack(void *, int, int); 336 static void mb_zfini_pack(void *, int); 337 static void mb_reclaim(uma_zone_t, int); 338 339 /* Ensure that MSIZE is a power of 2. */ 340 CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); 341 342 _Static_assert(sizeof(struct mbuf) <= MSIZE, 343 "size of mbuf exceeds MSIZE"); 344 /* 345 * Initialize FreeBSD Network buffer allocation. 346 */ 347 static void 348 mbuf_init(void *dummy) 349 { 350 351 /* 352 * Configure UMA zones for Mbufs, Clusters, and Packets. 353 */ 354 zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE, 355 mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL, 356 MSIZE - 1, UMA_ZONE_CONTIG | UMA_ZONE_MAXBUCKET); 357 if (nmbufs > 0) 358 nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); 359 uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached"); 360 uma_zone_set_maxaction(zone_mbuf, mb_reclaim); 361 362 zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES, 363 mb_ctor_clust, NULL, NULL, NULL, 364 UMA_ALIGN_PTR, UMA_ZONE_CONTIG); 365 if (nmbclusters > 0) 366 nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); 367 uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached"); 368 uma_zone_set_maxaction(zone_clust, mb_reclaim); 369 370 zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack, 371 mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf); 372 373 /* Make jumbo frame zone too. Page size, 9k and 16k. */ 374 zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE, 375 mb_ctor_clust, NULL, NULL, NULL, 376 UMA_ALIGN_PTR, UMA_ZONE_CONTIG); 377 if (nmbjumbop > 0) 378 nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); 379 uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached"); 380 uma_zone_set_maxaction(zone_jumbop, mb_reclaim); 381 382 zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES, 383 mb_ctor_clust, NULL, NULL, NULL, 384 UMA_ALIGN_PTR, UMA_ZONE_CONTIG); 385 if (nmbjumbo9 > 0) 386 nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); 387 uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached"); 388 uma_zone_set_maxaction(zone_jumbo9, mb_reclaim); 389 390 zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES, 391 mb_ctor_clust, NULL, NULL, NULL, 392 UMA_ALIGN_PTR, UMA_ZONE_CONTIG); 393 if (nmbjumbo16 > 0) 394 nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); 395 uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached"); 396 uma_zone_set_maxaction(zone_jumbo16, mb_reclaim); 397 398 snd_tag_count = counter_u64_alloc(M_WAITOK); 399 } 400 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); 401 402 #ifdef DEBUGNET 403 /* 404 * debugnet makes use of a pre-allocated pool of mbufs and clusters. When 405 * debugnet is configured, we initialize a set of UMA cache zones which return 406 * items from this pool. At panic-time, the regular UMA zone pointers are 407 * overwritten with those of the cache zones so that drivers may allocate and 408 * free mbufs and clusters without attempting to allocate physical memory. 409 * 410 * We keep mbufs and clusters in a pair of mbuf queues. In particular, for 411 * the purpose of caching clusters, we treat them as mbufs. 412 */ 413 static struct mbufq dn_mbufq = 414 { STAILQ_HEAD_INITIALIZER(dn_mbufq.mq_head), 0, INT_MAX }; 415 static struct mbufq dn_clustq = 416 { STAILQ_HEAD_INITIALIZER(dn_clustq.mq_head), 0, INT_MAX }; 417 418 static int dn_clsize; 419 static uma_zone_t dn_zone_mbuf; 420 static uma_zone_t dn_zone_clust; 421 static uma_zone_t dn_zone_pack; 422 423 static struct debugnet_saved_zones { 424 uma_zone_t dsz_mbuf; 425 uma_zone_t dsz_clust; 426 uma_zone_t dsz_pack; 427 uma_zone_t dsz_jumbop; 428 uma_zone_t dsz_jumbo9; 429 uma_zone_t dsz_jumbo16; 430 bool dsz_debugnet_zones_enabled; 431 } dn_saved_zones; 432 433 static int 434 dn_buf_import(void *arg, void **store, int count, int domain __unused, 435 int flags) 436 { 437 struct mbufq *q; 438 struct mbuf *m; 439 int i; 440 441 q = arg; 442 443 for (i = 0; i < count; i++) { 444 m = mbufq_dequeue(q); 445 if (m == NULL) 446 break; 447 trash_init(m, q == &dn_mbufq ? MSIZE : dn_clsize, flags); 448 store[i] = m; 449 } 450 KASSERT((flags & M_WAITOK) == 0 || i == count, 451 ("%s: ran out of pre-allocated mbufs", __func__)); 452 return (i); 453 } 454 455 static void 456 dn_buf_release(void *arg, void **store, int count) 457 { 458 struct mbufq *q; 459 struct mbuf *m; 460 int i; 461 462 q = arg; 463 464 for (i = 0; i < count; i++) { 465 m = store[i]; 466 (void)mbufq_enqueue(q, m); 467 } 468 } 469 470 static int 471 dn_pack_import(void *arg __unused, void **store, int count, int domain __unused, 472 int flags __unused) 473 { 474 struct mbuf *m; 475 void *clust; 476 int i; 477 478 for (i = 0; i < count; i++) { 479 m = m_get(M_NOWAIT, MT_DATA); 480 if (m == NULL) 481 break; 482 clust = uma_zalloc(dn_zone_clust, M_NOWAIT); 483 if (clust == NULL) { 484 m_free(m); 485 break; 486 } 487 mb_ctor_clust(clust, dn_clsize, m, 0); 488 store[i] = m; 489 } 490 KASSERT((flags & M_WAITOK) == 0 || i == count, 491 ("%s: ran out of pre-allocated mbufs", __func__)); 492 return (i); 493 } 494 495 static void 496 dn_pack_release(void *arg __unused, void **store, int count) 497 { 498 struct mbuf *m; 499 void *clust; 500 int i; 501 502 for (i = 0; i < count; i++) { 503 m = store[i]; 504 clust = m->m_ext.ext_buf; 505 uma_zfree(dn_zone_clust, clust); 506 uma_zfree(dn_zone_mbuf, m); 507 } 508 } 509 510 /* 511 * Free the pre-allocated mbufs and clusters reserved for debugnet, and destroy 512 * the corresponding UMA cache zones. 513 */ 514 void 515 debugnet_mbuf_drain(void) 516 { 517 struct mbuf *m; 518 void *item; 519 520 if (dn_zone_mbuf != NULL) { 521 uma_zdestroy(dn_zone_mbuf); 522 dn_zone_mbuf = NULL; 523 } 524 if (dn_zone_clust != NULL) { 525 uma_zdestroy(dn_zone_clust); 526 dn_zone_clust = NULL; 527 } 528 if (dn_zone_pack != NULL) { 529 uma_zdestroy(dn_zone_pack); 530 dn_zone_pack = NULL; 531 } 532 533 while ((m = mbufq_dequeue(&dn_mbufq)) != NULL) 534 m_free(m); 535 while ((item = mbufq_dequeue(&dn_clustq)) != NULL) 536 uma_zfree(m_getzone(dn_clsize), item); 537 } 538 539 /* 540 * Callback invoked immediately prior to starting a debugnet connection. 541 */ 542 void 543 debugnet_mbuf_start(void) 544 { 545 546 MPASS(!dn_saved_zones.dsz_debugnet_zones_enabled); 547 548 /* Save the old zone pointers to restore when debugnet is closed. */ 549 dn_saved_zones = (struct debugnet_saved_zones) { 550 .dsz_debugnet_zones_enabled = true, 551 .dsz_mbuf = zone_mbuf, 552 .dsz_clust = zone_clust, 553 .dsz_pack = zone_pack, 554 .dsz_jumbop = zone_jumbop, 555 .dsz_jumbo9 = zone_jumbo9, 556 .dsz_jumbo16 = zone_jumbo16, 557 }; 558 559 /* 560 * All cluster zones return buffers of the size requested by the 561 * drivers. It's up to the driver to reinitialize the zones if the 562 * MTU of a debugnet-enabled interface changes. 563 */ 564 printf("debugnet: overwriting mbuf zone pointers\n"); 565 zone_mbuf = dn_zone_mbuf; 566 zone_clust = dn_zone_clust; 567 zone_pack = dn_zone_pack; 568 zone_jumbop = dn_zone_clust; 569 zone_jumbo9 = dn_zone_clust; 570 zone_jumbo16 = dn_zone_clust; 571 } 572 573 /* 574 * Callback invoked when a debugnet connection is closed/finished. 575 */ 576 void 577 debugnet_mbuf_finish(void) 578 { 579 580 MPASS(dn_saved_zones.dsz_debugnet_zones_enabled); 581 582 printf("debugnet: restoring mbuf zone pointers\n"); 583 zone_mbuf = dn_saved_zones.dsz_mbuf; 584 zone_clust = dn_saved_zones.dsz_clust; 585 zone_pack = dn_saved_zones.dsz_pack; 586 zone_jumbop = dn_saved_zones.dsz_jumbop; 587 zone_jumbo9 = dn_saved_zones.dsz_jumbo9; 588 zone_jumbo16 = dn_saved_zones.dsz_jumbo16; 589 590 memset(&dn_saved_zones, 0, sizeof(dn_saved_zones)); 591 } 592 593 /* 594 * Reinitialize the debugnet mbuf+cluster pool and cache zones. 595 */ 596 void 597 debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize) 598 { 599 struct mbuf *m; 600 void *item; 601 602 debugnet_mbuf_drain(); 603 604 dn_clsize = clsize; 605 606 dn_zone_mbuf = uma_zcache_create("debugnet_" MBUF_MEM_NAME, 607 MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL, 608 dn_buf_import, dn_buf_release, 609 &dn_mbufq, UMA_ZONE_NOBUCKET); 610 611 dn_zone_clust = uma_zcache_create("debugnet_" MBUF_CLUSTER_MEM_NAME, 612 clsize, mb_ctor_clust, NULL, NULL, NULL, 613 dn_buf_import, dn_buf_release, 614 &dn_clustq, UMA_ZONE_NOBUCKET); 615 616 dn_zone_pack = uma_zcache_create("debugnet_" MBUF_PACKET_MEM_NAME, 617 MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL, 618 dn_pack_import, dn_pack_release, 619 NULL, UMA_ZONE_NOBUCKET); 620 621 while (nmbuf-- > 0) { 622 m = m_get(M_WAITOK, MT_DATA); 623 uma_zfree(dn_zone_mbuf, m); 624 } 625 while (nclust-- > 0) { 626 item = uma_zalloc(m_getzone(dn_clsize), M_WAITOK); 627 uma_zfree(dn_zone_clust, item); 628 } 629 } 630 #endif /* DEBUGNET */ 631 632 /* 633 * Constructor for Mbuf primary zone. 634 * 635 * The 'arg' pointer points to a mb_args structure which 636 * contains call-specific information required to support the 637 * mbuf allocation API. See mbuf.h. 638 */ 639 static int 640 mb_ctor_mbuf(void *mem, int size, void *arg, int how) 641 { 642 struct mbuf *m; 643 struct mb_args *args; 644 int error; 645 int flags; 646 short type; 647 648 args = (struct mb_args *)arg; 649 type = args->type; 650 651 /* 652 * The mbuf is initialized later. The caller has the 653 * responsibility to set up any MAC labels too. 654 */ 655 if (type == MT_NOINIT) 656 return (0); 657 658 m = (struct mbuf *)mem; 659 flags = args->flags; 660 MPASS((flags & M_NOFREE) == 0); 661 662 error = m_init(m, how, type, flags); 663 664 return (error); 665 } 666 667 /* 668 * The Mbuf primary zone destructor. 669 */ 670 static void 671 mb_dtor_mbuf(void *mem, int size, void *arg) 672 { 673 struct mbuf *m; 674 unsigned long flags __diagused; 675 676 m = (struct mbuf *)mem; 677 flags = (unsigned long)arg; 678 679 KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__)); 680 KASSERT((flags & 0x1) == 0, ("%s: obsolete MB_DTOR_SKIP passed", __func__)); 681 if ((m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags)) 682 m_tag_delete_chain(m, NULL); 683 } 684 685 /* 686 * The Mbuf Packet zone destructor. 687 */ 688 static void 689 mb_dtor_pack(void *mem, int size, void *arg) 690 { 691 struct mbuf *m; 692 693 m = (struct mbuf *)mem; 694 if ((m->m_flags & M_PKTHDR) != 0) 695 m_tag_delete_chain(m, NULL); 696 697 /* Make sure we've got a clean cluster back. */ 698 KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__)); 699 KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__)); 700 KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__)); 701 KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__)); 702 KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__)); 703 KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__)); 704 KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__)); 705 #if defined(INVARIANTS) && !defined(KMSAN) 706 trash_dtor(m->m_ext.ext_buf, MCLBYTES, zone_clust); 707 #endif 708 /* 709 * If there are processes blocked on zone_clust, waiting for pages 710 * to be freed up, cause them to be woken up by draining the 711 * packet zone. We are exposed to a race here (in the check for 712 * the UMA_ZFLAG_FULL) where we might miss the flag set, but that 713 * is deliberate. We don't want to acquire the zone lock for every 714 * mbuf free. 715 */ 716 if (uma_zone_exhausted(zone_clust)) 717 uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN); 718 } 719 720 /* 721 * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor. 722 * 723 * Here the 'arg' pointer points to the Mbuf which we 724 * are configuring cluster storage for. If 'arg' is 725 * empty we allocate just the cluster without setting 726 * the mbuf to it. See mbuf.h. 727 */ 728 static int 729 mb_ctor_clust(void *mem, int size, void *arg, int how) 730 { 731 struct mbuf *m; 732 733 m = (struct mbuf *)arg; 734 if (m != NULL) { 735 m->m_ext.ext_buf = (char *)mem; 736 m->m_data = m->m_ext.ext_buf; 737 m->m_flags |= M_EXT; 738 m->m_ext.ext_free = NULL; 739 m->m_ext.ext_arg1 = NULL; 740 m->m_ext.ext_arg2 = NULL; 741 m->m_ext.ext_size = size; 742 m->m_ext.ext_type = m_gettype(size); 743 m->m_ext.ext_flags = EXT_FLAG_EMBREF; 744 m->m_ext.ext_count = 1; 745 } 746 747 return (0); 748 } 749 750 /* 751 * The Packet secondary zone's init routine, executed on the 752 * object's transition from mbuf keg slab to zone cache. 753 */ 754 static int 755 mb_zinit_pack(void *mem, int size, int how) 756 { 757 struct mbuf *m; 758 759 m = (struct mbuf *)mem; /* m is virgin. */ 760 if (uma_zalloc_arg(zone_clust, m, how) == NULL || 761 m->m_ext.ext_buf == NULL) 762 return (ENOMEM); 763 m->m_ext.ext_type = EXT_PACKET; /* Override. */ 764 #if defined(INVARIANTS) && !defined(KMSAN) 765 trash_init(m->m_ext.ext_buf, MCLBYTES, how); 766 #endif 767 return (0); 768 } 769 770 /* 771 * The Packet secondary zone's fini routine, executed on the 772 * object's transition from zone cache to keg slab. 773 */ 774 static void 775 mb_zfini_pack(void *mem, int size) 776 { 777 struct mbuf *m; 778 779 m = (struct mbuf *)mem; 780 #if defined(INVARIANTS) && !defined(KMSAN) 781 trash_fini(m->m_ext.ext_buf, MCLBYTES); 782 #endif 783 uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL); 784 #if defined(INVARIANTS) && !defined(KMSAN) 785 trash_dtor(mem, size, zone_clust); 786 #endif 787 } 788 789 /* 790 * The "packet" keg constructor. 791 */ 792 static int 793 mb_ctor_pack(void *mem, int size, void *arg, int how) 794 { 795 struct mbuf *m; 796 struct mb_args *args; 797 int error, flags; 798 short type; 799 800 m = (struct mbuf *)mem; 801 args = (struct mb_args *)arg; 802 flags = args->flags; 803 type = args->type; 804 MPASS((flags & M_NOFREE) == 0); 805 806 #if defined(INVARIANTS) && !defined(KMSAN) 807 trash_ctor(m->m_ext.ext_buf, MCLBYTES, zone_clust, how); 808 #endif 809 810 error = m_init(m, how, type, flags); 811 812 /* m_ext is already initialized. */ 813 m->m_data = m->m_ext.ext_buf; 814 m->m_flags = (flags | M_EXT); 815 816 return (error); 817 } 818 819 /* 820 * This is the protocol drain routine. Called by UMA whenever any of the 821 * mbuf zones is closed to its limit. 822 */ 823 static void 824 mb_reclaim(uma_zone_t zone __unused, int pending __unused) 825 { 826 827 EVENTHANDLER_INVOKE(mbuf_lowmem, VM_LOW_MBUFS); 828 } 829 830 /* 831 * Free "count" units of I/O from an mbuf chain. They could be held 832 * in M_EXTPG or just as a normal mbuf. This code is intended to be 833 * called in an error path (I/O error, closed connection, etc). 834 */ 835 void 836 mb_free_notready(struct mbuf *m, int count) 837 { 838 int i; 839 840 for (i = 0; i < count && m != NULL; i++) { 841 if ((m->m_flags & M_EXTPG) != 0) { 842 m->m_epg_nrdy--; 843 if (m->m_epg_nrdy != 0) 844 continue; 845 } 846 m = m_free(m); 847 } 848 KASSERT(i == count, ("Removed only %d items from %p", i, m)); 849 } 850 851 /* 852 * Compress an unmapped mbuf into a simple mbuf when it holds a small 853 * amount of data. This is used as a DOS defense to avoid having 854 * small packets tie up wired pages, an ext_pgs structure, and an 855 * mbuf. Since this converts the existing mbuf in place, it can only 856 * be used if there are no other references to 'm'. 857 */ 858 int 859 mb_unmapped_compress(struct mbuf *m) 860 { 861 volatile u_int *refcnt; 862 char buf[MLEN]; 863 864 /* 865 * Assert that 'm' does not have a packet header. If 'm' had 866 * a packet header, it would only be able to hold MHLEN bytes 867 * and m_data would have to be initialized differently. 868 */ 869 KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXTPG), 870 ("%s: m %p !M_EXTPG or M_PKTHDR", __func__, m)); 871 KASSERT(m->m_len <= MLEN, ("m_len too large %p", m)); 872 873 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 874 refcnt = &m->m_ext.ext_count; 875 } else { 876 KASSERT(m->m_ext.ext_cnt != NULL, 877 ("%s: no refcounting pointer on %p", __func__, m)); 878 refcnt = m->m_ext.ext_cnt; 879 } 880 881 if (*refcnt != 1) 882 return (EBUSY); 883 884 m_copydata(m, 0, m->m_len, buf); 885 886 /* Free the backing pages. */ 887 m->m_ext.ext_free(m); 888 889 /* Turn 'm' into a "normal" mbuf. */ 890 m->m_flags &= ~(M_EXT | M_RDONLY | M_EXTPG); 891 m->m_data = m->m_dat; 892 893 /* Copy data back into m. */ 894 bcopy(buf, mtod(m, char *), m->m_len); 895 896 return (0); 897 } 898 899 /* 900 * These next few routines are used to permit downgrading an unmapped 901 * mbuf to a chain of mapped mbufs. This is used when an interface 902 * doesn't supported unmapped mbufs or if checksums need to be 903 * computed in software. 904 * 905 * Each unmapped mbuf is converted to a chain of mbufs. First, any 906 * TLS header data is stored in a regular mbuf. Second, each page of 907 * unmapped data is stored in an mbuf with an EXT_SFBUF external 908 * cluster. These mbufs use an sf_buf to provide a valid KVA for the 909 * associated physical page. They also hold a reference on the 910 * original M_EXTPG mbuf to ensure the physical page doesn't go away. 911 * Finally, any TLS trailer data is stored in a regular mbuf. 912 * 913 * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF 914 * mbufs. It frees the associated sf_buf and releases its reference 915 * on the original M_EXTPG mbuf. 916 * 917 * _mb_unmapped_to_ext() is a helper function that converts a single 918 * unmapped mbuf into a chain of mbufs. 919 * 920 * mb_unmapped_to_ext() is the public function that walks an mbuf 921 * chain converting any unmapped mbufs to mapped mbufs. It returns 922 * the new chain of unmapped mbufs on success. On failure it frees 923 * the original mbuf chain and returns NULL. 924 */ 925 static void 926 mb_unmapped_free_mext(struct mbuf *m) 927 { 928 struct sf_buf *sf; 929 struct mbuf *old_m; 930 931 sf = m->m_ext.ext_arg1; 932 sf_buf_free(sf); 933 934 /* Drop the reference on the backing M_EXTPG mbuf. */ 935 old_m = m->m_ext.ext_arg2; 936 mb_free_extpg(old_m); 937 } 938 939 static struct mbuf * 940 _mb_unmapped_to_ext(struct mbuf *m) 941 { 942 struct mbuf *m_new, *top, *prev, *mref; 943 struct sf_buf *sf; 944 vm_page_t pg; 945 int i, len, off, pglen, pgoff, seglen, segoff; 946 volatile u_int *refcnt; 947 u_int ref_inc = 0; 948 949 M_ASSERTEXTPG(m); 950 len = m->m_len; 951 KASSERT(m->m_epg_tls == NULL, ("%s: can't convert TLS mbuf %p", 952 __func__, m)); 953 954 /* See if this is the mbuf that holds the embedded refcount. */ 955 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 956 refcnt = &m->m_ext.ext_count; 957 mref = m; 958 } else { 959 KASSERT(m->m_ext.ext_cnt != NULL, 960 ("%s: no refcounting pointer on %p", __func__, m)); 961 refcnt = m->m_ext.ext_cnt; 962 mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); 963 } 964 965 /* Skip over any data removed from the front. */ 966 off = mtod(m, vm_offset_t); 967 968 top = NULL; 969 if (m->m_epg_hdrlen != 0) { 970 if (off >= m->m_epg_hdrlen) { 971 off -= m->m_epg_hdrlen; 972 } else { 973 seglen = m->m_epg_hdrlen - off; 974 segoff = off; 975 seglen = min(seglen, len); 976 off = 0; 977 len -= seglen; 978 m_new = m_get(M_NOWAIT, MT_DATA); 979 if (m_new == NULL) 980 goto fail; 981 m_new->m_len = seglen; 982 prev = top = m_new; 983 memcpy(mtod(m_new, void *), &m->m_epg_hdr[segoff], 984 seglen); 985 } 986 } 987 pgoff = m->m_epg_1st_off; 988 for (i = 0; i < m->m_epg_npgs && len > 0; i++) { 989 pglen = m_epg_pagelen(m, i, pgoff); 990 if (off >= pglen) { 991 off -= pglen; 992 pgoff = 0; 993 continue; 994 } 995 seglen = pglen - off; 996 segoff = pgoff + off; 997 off = 0; 998 seglen = min(seglen, len); 999 len -= seglen; 1000 1001 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); 1002 m_new = m_get(M_NOWAIT, MT_DATA); 1003 if (m_new == NULL) 1004 goto fail; 1005 if (top == NULL) { 1006 top = prev = m_new; 1007 } else { 1008 prev->m_next = m_new; 1009 prev = m_new; 1010 } 1011 sf = sf_buf_alloc(pg, SFB_NOWAIT); 1012 if (sf == NULL) 1013 goto fail; 1014 1015 ref_inc++; 1016 m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE, 1017 mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF); 1018 m_new->m_data += segoff; 1019 m_new->m_len = seglen; 1020 1021 pgoff = 0; 1022 }; 1023 if (len != 0) { 1024 KASSERT((off + len) <= m->m_epg_trllen, 1025 ("off + len > trail (%d + %d > %d)", off, len, 1026 m->m_epg_trllen)); 1027 m_new = m_get(M_NOWAIT, MT_DATA); 1028 if (m_new == NULL) 1029 goto fail; 1030 if (top == NULL) 1031 top = m_new; 1032 else 1033 prev->m_next = m_new; 1034 m_new->m_len = len; 1035 memcpy(mtod(m_new, void *), &m->m_epg_trail[off], len); 1036 } 1037 1038 if (ref_inc != 0) { 1039 /* 1040 * Obtain an additional reference on the old mbuf for 1041 * each created EXT_SFBUF mbuf. They will be dropped 1042 * in mb_unmapped_free_mext(). 1043 */ 1044 if (*refcnt == 1) 1045 *refcnt += ref_inc; 1046 else 1047 atomic_add_int(refcnt, ref_inc); 1048 } 1049 m_free(m); 1050 return (top); 1051 1052 fail: 1053 if (ref_inc != 0) { 1054 /* 1055 * Obtain an additional reference on the old mbuf for 1056 * each created EXT_SFBUF mbuf. They will be 1057 * immediately dropped when these mbufs are freed 1058 * below. 1059 */ 1060 if (*refcnt == 1) 1061 *refcnt += ref_inc; 1062 else 1063 atomic_add_int(refcnt, ref_inc); 1064 } 1065 m_free(m); 1066 m_freem(top); 1067 return (NULL); 1068 } 1069 1070 struct mbuf * 1071 mb_unmapped_to_ext(struct mbuf *top) 1072 { 1073 struct mbuf *m, *next, *prev = NULL; 1074 1075 prev = NULL; 1076 for (m = top; m != NULL; m = next) { 1077 /* m might be freed, so cache the next pointer. */ 1078 next = m->m_next; 1079 if (m->m_flags & M_EXTPG) { 1080 if (prev != NULL) { 1081 /* 1082 * Remove 'm' from the new chain so 1083 * that the 'top' chain terminates 1084 * before 'm' in case 'top' is freed 1085 * due to an error. 1086 */ 1087 prev->m_next = NULL; 1088 } 1089 m = _mb_unmapped_to_ext(m); 1090 if (m == NULL) { 1091 m_freem(top); 1092 m_freem(next); 1093 return (NULL); 1094 } 1095 if (prev == NULL) { 1096 top = m; 1097 } else { 1098 prev->m_next = m; 1099 } 1100 1101 /* 1102 * Replaced one mbuf with a chain, so we must 1103 * find the end of chain. 1104 */ 1105 prev = m_last(m); 1106 } else { 1107 if (prev != NULL) { 1108 prev->m_next = m; 1109 } 1110 prev = m; 1111 } 1112 } 1113 return (top); 1114 } 1115 1116 /* 1117 * Allocate an empty M_EXTPG mbuf. The ext_free routine is 1118 * responsible for freeing any pages backing this mbuf when it is 1119 * freed. 1120 */ 1121 struct mbuf * 1122 mb_alloc_ext_pgs(int how, m_ext_free_t ext_free) 1123 { 1124 struct mbuf *m; 1125 1126 m = m_get(how, MT_DATA); 1127 if (m == NULL) 1128 return (NULL); 1129 1130 m->m_epg_npgs = 0; 1131 m->m_epg_nrdy = 0; 1132 m->m_epg_1st_off = 0; 1133 m->m_epg_last_len = 0; 1134 m->m_epg_flags = 0; 1135 m->m_epg_hdrlen = 0; 1136 m->m_epg_trllen = 0; 1137 m->m_epg_tls = NULL; 1138 m->m_epg_so = NULL; 1139 m->m_data = NULL; 1140 m->m_flags |= (M_EXT | M_RDONLY | M_EXTPG); 1141 m->m_ext.ext_flags = EXT_FLAG_EMBREF; 1142 m->m_ext.ext_count = 1; 1143 m->m_ext.ext_size = 0; 1144 m->m_ext.ext_free = ext_free; 1145 return (m); 1146 } 1147 1148 /* 1149 * Clean up after mbufs with M_EXT storage attached to them if the 1150 * reference count hits 1. 1151 */ 1152 void 1153 mb_free_ext(struct mbuf *m) 1154 { 1155 volatile u_int *refcnt; 1156 struct mbuf *mref; 1157 int freembuf; 1158 1159 KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m)); 1160 1161 /* See if this is the mbuf that holds the embedded refcount. */ 1162 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 1163 refcnt = &m->m_ext.ext_count; 1164 mref = m; 1165 } else { 1166 KASSERT(m->m_ext.ext_cnt != NULL, 1167 ("%s: no refcounting pointer on %p", __func__, m)); 1168 refcnt = m->m_ext.ext_cnt; 1169 mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); 1170 } 1171 1172 /* 1173 * Check if the header is embedded in the cluster. It is 1174 * important that we can't touch any of the mbuf fields 1175 * after we have freed the external storage, since mbuf 1176 * could have been embedded in it. For now, the mbufs 1177 * embedded into the cluster are always of type EXT_EXTREF, 1178 * and for this type we won't free the mref. 1179 */ 1180 if (m->m_flags & M_NOFREE) { 1181 freembuf = 0; 1182 KASSERT(m->m_ext.ext_type == EXT_EXTREF || 1183 m->m_ext.ext_type == EXT_RXRING, 1184 ("%s: no-free mbuf %p has wrong type", __func__, m)); 1185 } else 1186 freembuf = 1; 1187 1188 /* Free attached storage if this mbuf is the only reference to it. */ 1189 if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) { 1190 switch (m->m_ext.ext_type) { 1191 case EXT_PACKET: 1192 /* The packet zone is special. */ 1193 if (*refcnt == 0) 1194 *refcnt = 1; 1195 uma_zfree(zone_pack, mref); 1196 break; 1197 case EXT_CLUSTER: 1198 uma_zfree(zone_clust, m->m_ext.ext_buf); 1199 m_free_raw(mref); 1200 break; 1201 case EXT_JUMBOP: 1202 uma_zfree(zone_jumbop, m->m_ext.ext_buf); 1203 m_free_raw(mref); 1204 break; 1205 case EXT_JUMBO9: 1206 uma_zfree(zone_jumbo9, m->m_ext.ext_buf); 1207 m_free_raw(mref); 1208 break; 1209 case EXT_JUMBO16: 1210 uma_zfree(zone_jumbo16, m->m_ext.ext_buf); 1211 m_free_raw(mref); 1212 break; 1213 case EXT_SFBUF: 1214 case EXT_NET_DRV: 1215 case EXT_MOD_TYPE: 1216 case EXT_DISPOSABLE: 1217 KASSERT(mref->m_ext.ext_free != NULL, 1218 ("%s: ext_free not set", __func__)); 1219 mref->m_ext.ext_free(mref); 1220 m_free_raw(mref); 1221 break; 1222 case EXT_EXTREF: 1223 KASSERT(m->m_ext.ext_free != NULL, 1224 ("%s: ext_free not set", __func__)); 1225 m->m_ext.ext_free(m); 1226 break; 1227 case EXT_RXRING: 1228 KASSERT(m->m_ext.ext_free == NULL, 1229 ("%s: ext_free is set", __func__)); 1230 break; 1231 default: 1232 KASSERT(m->m_ext.ext_type == 0, 1233 ("%s: unknown ext_type", __func__)); 1234 } 1235 } 1236 1237 if (freembuf && m != mref) 1238 m_free_raw(m); 1239 } 1240 1241 /* 1242 * Clean up after mbufs with M_EXTPG storage attached to them if the 1243 * reference count hits 1. 1244 */ 1245 void 1246 mb_free_extpg(struct mbuf *m) 1247 { 1248 volatile u_int *refcnt; 1249 struct mbuf *mref; 1250 1251 M_ASSERTEXTPG(m); 1252 1253 /* See if this is the mbuf that holds the embedded refcount. */ 1254 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 1255 refcnt = &m->m_ext.ext_count; 1256 mref = m; 1257 } else { 1258 KASSERT(m->m_ext.ext_cnt != NULL, 1259 ("%s: no refcounting pointer on %p", __func__, m)); 1260 refcnt = m->m_ext.ext_cnt; 1261 mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); 1262 } 1263 1264 /* Free attached storage if this mbuf is the only reference to it. */ 1265 if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) { 1266 KASSERT(mref->m_ext.ext_free != NULL, 1267 ("%s: ext_free not set", __func__)); 1268 1269 mref->m_ext.ext_free(mref); 1270 #ifdef KERN_TLS 1271 if (mref->m_epg_tls != NULL && 1272 !refcount_release_if_not_last(&mref->m_epg_tls->refcount)) 1273 ktls_enqueue_to_free(mref); 1274 else 1275 #endif 1276 m_free_raw(mref); 1277 } 1278 1279 if (m != mref) 1280 m_free_raw(m); 1281 } 1282 1283 /* 1284 * Official mbuf(9) allocation KPI for stack and drivers: 1285 * 1286 * m_get() - a single mbuf without any attachments, sys/mbuf.h. 1287 * m_gethdr() - a single mbuf initialized as M_PKTHDR, sys/mbuf.h. 1288 * m_getcl() - an mbuf + 2k cluster, sys/mbuf.h. 1289 * m_clget() - attach cluster to already allocated mbuf. 1290 * m_cljget() - attach jumbo cluster to already allocated mbuf. 1291 * m_get2() - allocate minimum mbuf that would fit size argument. 1292 * m_getm2() - allocate a chain of mbufs/clusters. 1293 * m_extadd() - attach external cluster to mbuf. 1294 * 1295 * m_free() - free single mbuf with its tags and ext, sys/mbuf.h. 1296 * m_freem() - free chain of mbufs. 1297 */ 1298 1299 int 1300 m_clget(struct mbuf *m, int how) 1301 { 1302 1303 KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", 1304 __func__, m)); 1305 m->m_ext.ext_buf = (char *)NULL; 1306 uma_zalloc_arg(zone_clust, m, how); 1307 /* 1308 * On a cluster allocation failure, drain the packet zone and retry, 1309 * we might be able to loosen a few clusters up on the drain. 1310 */ 1311 if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) { 1312 uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN); 1313 uma_zalloc_arg(zone_clust, m, how); 1314 } 1315 MBUF_PROBE2(m__clget, m, how); 1316 return (m->m_flags & M_EXT); 1317 } 1318 1319 /* 1320 * m_cljget() is different from m_clget() as it can allocate clusters without 1321 * attaching them to an mbuf. In that case the return value is the pointer 1322 * to the cluster of the requested size. If an mbuf was specified, it gets 1323 * the cluster attached to it and the return value can be safely ignored. 1324 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. 1325 */ 1326 void * 1327 m_cljget(struct mbuf *m, int how, int size) 1328 { 1329 uma_zone_t zone; 1330 void *retval; 1331 1332 if (m != NULL) { 1333 KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", 1334 __func__, m)); 1335 m->m_ext.ext_buf = NULL; 1336 } 1337 1338 zone = m_getzone(size); 1339 retval = uma_zalloc_arg(zone, m, how); 1340 1341 MBUF_PROBE4(m__cljget, m, how, size, retval); 1342 1343 return (retval); 1344 } 1345 1346 /* 1347 * m_get2() allocates minimum mbuf that would fit "size" argument. 1348 */ 1349 struct mbuf * 1350 m_get2(int size, int how, short type, int flags) 1351 { 1352 struct mb_args args; 1353 struct mbuf *m, *n; 1354 1355 args.flags = flags; 1356 args.type = type; 1357 1358 if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0)) 1359 return (uma_zalloc_arg(zone_mbuf, &args, how)); 1360 if (size <= MCLBYTES) 1361 return (uma_zalloc_arg(zone_pack, &args, how)); 1362 1363 if (size > MJUMPAGESIZE) 1364 return (NULL); 1365 1366 m = uma_zalloc_arg(zone_mbuf, &args, how); 1367 if (m == NULL) 1368 return (NULL); 1369 1370 n = uma_zalloc_arg(zone_jumbop, m, how); 1371 if (n == NULL) { 1372 m_free_raw(m); 1373 return (NULL); 1374 } 1375 1376 return (m); 1377 } 1378 1379 /* 1380 * m_get3() allocates minimum mbuf that would fit "size" argument. 1381 * Unlike m_get2() it can allocate clusters up to MJUM16BYTES. 1382 */ 1383 struct mbuf * 1384 m_get3(int size, int how, short type, int flags) 1385 { 1386 struct mb_args args; 1387 struct mbuf *m, *n; 1388 uma_zone_t zone; 1389 1390 if (size <= MJUMPAGESIZE) 1391 return (m_get2(size, how, type, flags)); 1392 1393 if (size > MJUM16BYTES) 1394 return (NULL); 1395 1396 args.flags = flags; 1397 args.type = type; 1398 1399 m = uma_zalloc_arg(zone_mbuf, &args, how); 1400 if (m == NULL) 1401 return (NULL); 1402 1403 if (size <= MJUM9BYTES) 1404 zone = zone_jumbo9; 1405 else 1406 zone = zone_jumbo16; 1407 1408 n = uma_zalloc_arg(zone, m, how); 1409 if (n == NULL) { 1410 m_free_raw(m); 1411 return (NULL); 1412 } 1413 1414 return (m); 1415 } 1416 1417 /* 1418 * m_getjcl() returns an mbuf with a cluster of the specified size attached. 1419 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. 1420 */ 1421 struct mbuf * 1422 m_getjcl(int how, short type, int flags, int size) 1423 { 1424 struct mb_args args; 1425 struct mbuf *m, *n; 1426 uma_zone_t zone; 1427 1428 if (size == MCLBYTES) 1429 return m_getcl(how, type, flags); 1430 1431 args.flags = flags; 1432 args.type = type; 1433 1434 m = uma_zalloc_arg(zone_mbuf, &args, how); 1435 if (m == NULL) 1436 return (NULL); 1437 1438 zone = m_getzone(size); 1439 n = uma_zalloc_arg(zone, m, how); 1440 if (n == NULL) { 1441 m_free_raw(m); 1442 return (NULL); 1443 } 1444 MBUF_PROBE5(m__getjcl, how, type, flags, size, m); 1445 return (m); 1446 } 1447 1448 /* 1449 * Allocate mchain of a given length of mbufs and/or clusters (whatever fits 1450 * best). May fail due to ENOMEM. In case of failure state of mchain is 1451 * inconsistent. 1452 */ 1453 int 1454 mc_get(struct mchain *mc, u_int length, int how, short type, int flags) 1455 { 1456 struct mbuf *mb; 1457 u_int progress; 1458 1459 MPASS(length >= 0); 1460 1461 *mc = MCHAIN_INITIALIZER(mc); 1462 flags &= (M_PKTHDR | M_EOR); 1463 progress = 0; 1464 1465 /* Loop and append maximum sized mbufs to the chain tail. */ 1466 do { 1467 if (length - progress > MCLBYTES) { 1468 /* 1469 * M_NOWAIT here is intentional, it avoids blocking if 1470 * the jumbop zone is exhausted. See 796d4eb89e2c and 1471 * D26150 for more detail. 1472 */ 1473 mb = m_getjcl(M_NOWAIT, type, (flags & M_PKTHDR), 1474 MJUMPAGESIZE); 1475 } else 1476 mb = NULL; 1477 if (mb == NULL) { 1478 if (length - progress >= MINCLSIZE) 1479 mb = m_getcl(how, type, (flags & M_PKTHDR)); 1480 else if (flags & M_PKTHDR) 1481 mb = m_gethdr(how, type); 1482 else 1483 mb = m_get(how, type); 1484 1485 /* 1486 * Fail the whole operation if one mbuf can't be 1487 * allocated. 1488 */ 1489 if (mb == NULL) { 1490 m_freem(mc_first(mc)); 1491 return (ENOMEM); 1492 } 1493 } 1494 1495 progress += M_SIZE(mb); 1496 mc_append(mc, mb); 1497 /* Only valid on the first mbuf. */ 1498 flags &= ~M_PKTHDR; 1499 } while (progress < length); 1500 if (flags & M_EOR) 1501 /* Only valid on the last mbuf. */ 1502 mc_last(mc)->m_flags |= M_EOR; 1503 1504 return (0); 1505 } 1506 1507 /* 1508 * Allocate a given length worth of mbufs and/or clusters (whatever fits 1509 * best) and return a pointer to the top of the allocated chain. If an 1510 * existing mbuf chain is provided, then we will append the new chain 1511 * to the existing one and return a pointer to the provided mbuf. 1512 */ 1513 struct mbuf * 1514 m_getm2(struct mbuf *m, int len, int how, short type, int flags) 1515 { 1516 struct mchain mc; 1517 1518 /* Packet header mbuf must be first in chain. */ 1519 if (m != NULL && (flags & M_PKTHDR)) 1520 flags &= ~M_PKTHDR; 1521 1522 if (__predict_false(mc_get(&mc, len, how, type, flags) != 0)) 1523 return (NULL); 1524 1525 /* If mbuf was supplied, append new chain to the end of it. */ 1526 if (m != NULL) { 1527 struct mbuf *mtail; 1528 1529 mtail = m_last(m); 1530 mtail->m_next = mc_first(&mc); 1531 mtail->m_flags &= ~M_EOR; 1532 } else 1533 m = mc_first(&mc); 1534 1535 return (m); 1536 } 1537 1538 /*- 1539 * Configure a provided mbuf to refer to the provided external storage 1540 * buffer and setup a reference count for said buffer. 1541 * 1542 * Arguments: 1543 * mb The existing mbuf to which to attach the provided buffer. 1544 * buf The address of the provided external storage buffer. 1545 * size The size of the provided buffer. 1546 * freef A pointer to a routine that is responsible for freeing the 1547 * provided external storage buffer. 1548 * args A pointer to an argument structure (of any type) to be passed 1549 * to the provided freef routine (may be NULL). 1550 * flags Any other flags to be passed to the provided mbuf. 1551 * type The type that the external storage buffer should be 1552 * labeled with. 1553 * 1554 * Returns: 1555 * Nothing. 1556 */ 1557 void 1558 m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef, 1559 void *arg1, void *arg2, int flags, int type) 1560 { 1561 1562 KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__)); 1563 1564 mb->m_flags |= (M_EXT | flags); 1565 mb->m_ext.ext_buf = buf; 1566 mb->m_data = mb->m_ext.ext_buf; 1567 mb->m_ext.ext_size = size; 1568 mb->m_ext.ext_free = freef; 1569 mb->m_ext.ext_arg1 = arg1; 1570 mb->m_ext.ext_arg2 = arg2; 1571 mb->m_ext.ext_type = type; 1572 1573 if (type != EXT_EXTREF) { 1574 mb->m_ext.ext_count = 1; 1575 mb->m_ext.ext_flags = EXT_FLAG_EMBREF; 1576 } else 1577 mb->m_ext.ext_flags = 0; 1578 } 1579 1580 /* 1581 * Free an entire chain of mbufs and associated external buffers, if 1582 * applicable. 1583 */ 1584 void 1585 m_freem(struct mbuf *mb) 1586 { 1587 1588 MBUF_PROBE1(m__freem, mb); 1589 while (mb != NULL) 1590 mb = m_free(mb); 1591 } 1592 1593 /* 1594 * Temporary primitive to allow freeing without going through m_free. 1595 */ 1596 void 1597 m_free_raw(struct mbuf *mb) 1598 { 1599 1600 uma_zfree(zone_mbuf, mb); 1601 } 1602 1603 int 1604 m_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, 1605 struct m_snd_tag **mstp) 1606 { 1607 1608 return (if_snd_tag_alloc(ifp, params, mstp)); 1609 } 1610 1611 void 1612 m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp, 1613 const struct if_snd_tag_sw *sw) 1614 { 1615 1616 if_ref(ifp); 1617 mst->ifp = ifp; 1618 refcount_init(&mst->refcount, 1); 1619 mst->sw = sw; 1620 counter_u64_add(snd_tag_count, 1); 1621 } 1622 1623 void 1624 m_snd_tag_destroy(struct m_snd_tag *mst) 1625 { 1626 struct ifnet *ifp; 1627 1628 ifp = mst->ifp; 1629 mst->sw->snd_tag_free(mst); 1630 if_rele(ifp); 1631 counter_u64_add(snd_tag_count, -1); 1632 } 1633 1634 void 1635 m_rcvif_serialize(struct mbuf *m) 1636 { 1637 u_short idx, gen; 1638 1639 M_ASSERTPKTHDR(m); 1640 idx = if_getindex(m->m_pkthdr.rcvif); 1641 gen = if_getidxgen(m->m_pkthdr.rcvif); 1642 m->m_pkthdr.rcvidx = idx; 1643 m->m_pkthdr.rcvgen = gen; 1644 if (__predict_false(m->m_pkthdr.leaf_rcvif != NULL)) { 1645 idx = if_getindex(m->m_pkthdr.leaf_rcvif); 1646 gen = if_getidxgen(m->m_pkthdr.leaf_rcvif); 1647 } else { 1648 idx = -1; 1649 gen = 0; 1650 } 1651 m->m_pkthdr.leaf_rcvidx = idx; 1652 m->m_pkthdr.leaf_rcvgen = gen; 1653 } 1654 1655 struct ifnet * 1656 m_rcvif_restore(struct mbuf *m) 1657 { 1658 struct ifnet *ifp, *leaf_ifp; 1659 1660 M_ASSERTPKTHDR(m); 1661 NET_EPOCH_ASSERT(); 1662 1663 ifp = ifnet_byindexgen(m->m_pkthdr.rcvidx, m->m_pkthdr.rcvgen); 1664 if (ifp == NULL || (if_getflags(ifp) & IFF_DYING)) 1665 return (NULL); 1666 1667 if (__predict_true(m->m_pkthdr.leaf_rcvidx == (u_short)-1)) { 1668 leaf_ifp = NULL; 1669 } else { 1670 leaf_ifp = ifnet_byindexgen(m->m_pkthdr.leaf_rcvidx, 1671 m->m_pkthdr.leaf_rcvgen); 1672 if (__predict_false(leaf_ifp != NULL && (if_getflags(leaf_ifp) & IFF_DYING))) 1673 leaf_ifp = NULL; 1674 } 1675 1676 m->m_pkthdr.leaf_rcvif = leaf_ifp; 1677 m->m_pkthdr.rcvif = ifp; 1678 1679 return (ifp); 1680 } 1681 1682 /* 1683 * Allocate an mbuf with anonymous external pages. 1684 */ 1685 struct mbuf * 1686 mb_alloc_ext_plus_pages(int len, int how) 1687 { 1688 struct mbuf *m; 1689 vm_page_t pg; 1690 int i, npgs; 1691 1692 m = mb_alloc_ext_pgs(how, mb_free_mext_pgs); 1693 if (m == NULL) 1694 return (NULL); 1695 m->m_epg_flags |= EPG_FLAG_ANON; 1696 npgs = howmany(len, PAGE_SIZE); 1697 for (i = 0; i < npgs; i++) { 1698 do { 1699 pg = vm_page_alloc_noobj(VM_ALLOC_NODUMP | 1700 VM_ALLOC_WIRED); 1701 if (pg == NULL) { 1702 if (how == M_NOWAIT) { 1703 m->m_epg_npgs = i; 1704 m_free(m); 1705 return (NULL); 1706 } 1707 vm_wait(NULL); 1708 } 1709 } while (pg == NULL); 1710 m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg); 1711 } 1712 m->m_epg_npgs = npgs; 1713 return (m); 1714 } 1715 1716 /* 1717 * Copy the data in the mbuf chain to a chain of mbufs with anonymous external 1718 * unmapped pages. 1719 * len is the length of data in the input mbuf chain. 1720 * mlen is the maximum number of bytes put into each ext_page mbuf. 1721 */ 1722 struct mbuf * 1723 mb_mapped_to_unmapped(struct mbuf *mp, int len, int mlen, int how, 1724 struct mbuf **mlast) 1725 { 1726 struct mbuf *m, *mout; 1727 char *pgpos, *mbpos; 1728 int i, mblen, mbufsiz, pglen, xfer; 1729 1730 if (len == 0) 1731 return (NULL); 1732 mbufsiz = min(mlen, len); 1733 m = mout = mb_alloc_ext_plus_pages(mbufsiz, how); 1734 if (m == NULL) 1735 return (m); 1736 pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[0]); 1737 pglen = PAGE_SIZE; 1738 mblen = 0; 1739 i = 0; 1740 do { 1741 if (pglen == 0) { 1742 if (++i == m->m_epg_npgs) { 1743 m->m_epg_last_len = PAGE_SIZE; 1744 mbufsiz = min(mlen, len); 1745 m->m_next = mb_alloc_ext_plus_pages(mbufsiz, 1746 how); 1747 m = m->m_next; 1748 if (m == NULL) { 1749 m_freem(mout); 1750 return (m); 1751 } 1752 i = 0; 1753 } 1754 pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]); 1755 pglen = PAGE_SIZE; 1756 } 1757 while (mblen == 0) { 1758 if (mp == NULL) { 1759 m_freem(mout); 1760 return (NULL); 1761 } 1762 KASSERT((mp->m_flags & M_EXTPG) == 0, 1763 ("mb_copym_ext_pgs: ext_pgs input mbuf")); 1764 mbpos = mtod(mp, char *); 1765 mblen = mp->m_len; 1766 mp = mp->m_next; 1767 } 1768 xfer = min(mblen, pglen); 1769 memcpy(pgpos, mbpos, xfer); 1770 pgpos += xfer; 1771 mbpos += xfer; 1772 pglen -= xfer; 1773 mblen -= xfer; 1774 len -= xfer; 1775 m->m_len += xfer; 1776 } while (len > 0); 1777 m->m_epg_last_len = PAGE_SIZE - pglen; 1778 if (mlast != NULL) 1779 *mlast = m; 1780 return (mout); 1781 } 1782