1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2004, 2005, 5 * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_param.h" 34 #include "opt_kern_tls.h" 35 36 #include <sys/param.h> 37 #include <sys/conf.h> 38 #include <sys/domainset.h> 39 #include <sys/malloc.h> 40 #include <sys/systm.h> 41 #include <sys/mbuf.h> 42 #include <sys/domain.h> 43 #include <sys/eventhandler.h> 44 #include <sys/kernel.h> 45 #include <sys/ktls.h> 46 #include <sys/limits.h> 47 #include <sys/lock.h> 48 #include <sys/mutex.h> 49 #include <sys/protosw.h> 50 #include <sys/refcount.h> 51 #include <sys/sf_buf.h> 52 #include <sys/smp.h> 53 #include <sys/socket.h> 54 #include <sys/sysctl.h> 55 56 #include <net/if.h> 57 #include <net/if_var.h> 58 59 #include <vm/vm.h> 60 #include <vm/vm_extern.h> 61 #include <vm/vm_kern.h> 62 #include <vm/vm_page.h> 63 #include <vm/vm_pageout.h> 64 #include <vm/vm_map.h> 65 #include <vm/uma.h> 66 #include <vm/uma_dbg.h> 67 68 /* 69 * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA 70 * Zones. 71 * 72 * Mbuf Clusters (2K, contiguous) are allocated from the Cluster 73 * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the 74 * administrator so desires. 75 * 76 * Mbufs are allocated from a UMA Primary Zone called the Mbuf 77 * Zone. 78 * 79 * Additionally, FreeBSD provides a Packet Zone, which it 80 * configures as a Secondary Zone to the Mbuf Primary Zone, 81 * thus sharing backend Slab kegs with the Mbuf Primary Zone. 82 * 83 * Thus common-case allocations and locking are simplified: 84 * 85 * m_clget() m_getcl() 86 * | | 87 * | .------------>[(Packet Cache)] m_get(), m_gethdr() 88 * | | [ Packet ] | 89 * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ] 90 * [ Cluster Zone ] [ Zone ] [ Mbuf Primary Zone ] 91 * | \________ | 92 * [ Cluster Keg ] \ / 93 * | [ Mbuf Keg ] 94 * [ Cluster Slabs ] | 95 * | [ Mbuf Slabs ] 96 * \____________(VM)_________________/ 97 * 98 * 99 * Whenever an object is allocated with uma_zalloc() out of 100 * one of the Zones its _ctor_ function is executed. The same 101 * for any deallocation through uma_zfree() the _dtor_ function 102 * is executed. 103 * 104 * Caches are per-CPU and are filled from the Primary Zone. 105 * 106 * Whenever an object is allocated from the underlying global 107 * memory pool it gets pre-initialized with the _zinit_ functions. 108 * When the Keg's are overfull objects get decommissioned with 109 * _zfini_ functions and free'd back to the global memory pool. 110 * 111 */ 112 113 int nmbufs; /* limits number of mbufs */ 114 int nmbclusters; /* limits number of mbuf clusters */ 115 int nmbjumbop; /* limits number of page size jumbo clusters */ 116 int nmbjumbo9; /* limits number of 9k jumbo clusters */ 117 int nmbjumbo16; /* limits number of 16k jumbo clusters */ 118 119 bool mb_use_ext_pgs; /* use M_EXTPG mbufs for sendfile & TLS */ 120 SYSCTL_BOOL(_kern_ipc, OID_AUTO, mb_use_ext_pgs, CTLFLAG_RWTUN, 121 &mb_use_ext_pgs, 0, 122 "Use unmapped mbufs for sendfile(2) and TLS offload"); 123 124 static quad_t maxmbufmem; /* overall real memory limit for all mbufs */ 125 126 SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0, 127 "Maximum real memory allocatable to various mbuf types"); 128 129 static counter_u64_t snd_tag_count; 130 SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW, 131 &snd_tag_count, "# of active mbuf send tags"); 132 133 /* 134 * tunable_mbinit() has to be run before any mbuf allocations are done. 135 */ 136 static void 137 tunable_mbinit(void *dummy) 138 { 139 quad_t realmem; 140 141 /* 142 * The default limit for all mbuf related memory is 1/2 of all 143 * available kernel memory (physical or kmem). 144 * At most it can be 3/4 of available kernel memory. 145 */ 146 realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size); 147 maxmbufmem = realmem / 2; 148 TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem); 149 if (maxmbufmem > realmem / 4 * 3) 150 maxmbufmem = realmem / 4 * 3; 151 152 TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); 153 if (nmbclusters == 0) 154 nmbclusters = maxmbufmem / MCLBYTES / 4; 155 156 TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop); 157 if (nmbjumbop == 0) 158 nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4; 159 160 TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9); 161 if (nmbjumbo9 == 0) 162 nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6; 163 164 TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16); 165 if (nmbjumbo16 == 0) 166 nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6; 167 168 /* 169 * We need at least as many mbufs as we have clusters of 170 * the various types added together. 171 */ 172 TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); 173 if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) 174 nmbufs = lmax(maxmbufmem / MSIZE / 5, 175 nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16); 176 } 177 SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL); 178 179 static int 180 sysctl_nmbclusters(SYSCTL_HANDLER_ARGS) 181 { 182 int error, newnmbclusters; 183 184 newnmbclusters = nmbclusters; 185 error = sysctl_handle_int(oidp, &newnmbclusters, 0, req); 186 if (error == 0 && req->newptr && newnmbclusters != nmbclusters) { 187 if (newnmbclusters > nmbclusters && 188 nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { 189 nmbclusters = newnmbclusters; 190 nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); 191 EVENTHANDLER_INVOKE(nmbclusters_change); 192 } else 193 error = EINVAL; 194 } 195 return (error); 196 } 197 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, 198 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbclusters, 0, 199 sysctl_nmbclusters, "IU", 200 "Maximum number of mbuf clusters allowed"); 201 202 static int 203 sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS) 204 { 205 int error, newnmbjumbop; 206 207 newnmbjumbop = nmbjumbop; 208 error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req); 209 if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) { 210 if (newnmbjumbop > nmbjumbop && 211 nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { 212 nmbjumbop = newnmbjumbop; 213 nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); 214 } else 215 error = EINVAL; 216 } 217 return (error); 218 } 219 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, 220 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbop, 0, 221 sysctl_nmbjumbop, "IU", 222 "Maximum number of mbuf page size jumbo clusters allowed"); 223 224 static int 225 sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS) 226 { 227 int error, newnmbjumbo9; 228 229 newnmbjumbo9 = nmbjumbo9; 230 error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req); 231 if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) { 232 if (newnmbjumbo9 > nmbjumbo9 && 233 nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { 234 nmbjumbo9 = newnmbjumbo9; 235 nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); 236 } else 237 error = EINVAL; 238 } 239 return (error); 240 } 241 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, 242 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbo9, 0, 243 sysctl_nmbjumbo9, "IU", 244 "Maximum number of mbuf 9k jumbo clusters allowed"); 245 246 static int 247 sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS) 248 { 249 int error, newnmbjumbo16; 250 251 newnmbjumbo16 = nmbjumbo16; 252 error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); 253 if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) { 254 if (newnmbjumbo16 > nmbjumbo16 && 255 nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { 256 nmbjumbo16 = newnmbjumbo16; 257 nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); 258 } else 259 error = EINVAL; 260 } 261 return (error); 262 } 263 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, 264 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbo16, 0, 265 sysctl_nmbjumbo16, "IU", 266 "Maximum number of mbuf 16k jumbo clusters allowed"); 267 268 static int 269 sysctl_nmbufs(SYSCTL_HANDLER_ARGS) 270 { 271 int error, newnmbufs; 272 273 newnmbufs = nmbufs; 274 error = sysctl_handle_int(oidp, &newnmbufs, 0, req); 275 if (error == 0 && req->newptr && newnmbufs != nmbufs) { 276 if (newnmbufs > nmbufs) { 277 nmbufs = newnmbufs; 278 nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); 279 EVENTHANDLER_INVOKE(nmbufs_change); 280 } else 281 error = EINVAL; 282 } 283 return (error); 284 } 285 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, 286 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 287 &nmbufs, 0, sysctl_nmbufs, "IU", 288 "Maximum number of mbufs allowed"); 289 290 /* 291 * Zones from which we allocate. 292 */ 293 uma_zone_t zone_mbuf; 294 uma_zone_t zone_clust; 295 uma_zone_t zone_pack; 296 uma_zone_t zone_jumbop; 297 uma_zone_t zone_jumbo9; 298 uma_zone_t zone_jumbo16; 299 300 /* 301 * Local prototypes. 302 */ 303 static int mb_ctor_mbuf(void *, int, void *, int); 304 static int mb_ctor_clust(void *, int, void *, int); 305 static int mb_ctor_pack(void *, int, void *, int); 306 static void mb_dtor_mbuf(void *, int, void *); 307 static void mb_dtor_pack(void *, int, void *); 308 static int mb_zinit_pack(void *, int, int); 309 static void mb_zfini_pack(void *, int); 310 static void mb_reclaim(uma_zone_t, int); 311 312 /* Ensure that MSIZE is a power of 2. */ 313 CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); 314 315 _Static_assert(sizeof(struct mbuf) <= MSIZE, 316 "size of mbuf exceeds MSIZE"); 317 /* 318 * Initialize FreeBSD Network buffer allocation. 319 */ 320 static void 321 mbuf_init(void *dummy) 322 { 323 324 /* 325 * Configure UMA zones for Mbufs, Clusters, and Packets. 326 */ 327 zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE, 328 mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL, 329 MSIZE - 1, UMA_ZONE_CONTIG | UMA_ZONE_MAXBUCKET); 330 if (nmbufs > 0) 331 nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); 332 uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached"); 333 uma_zone_set_maxaction(zone_mbuf, mb_reclaim); 334 335 zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES, 336 mb_ctor_clust, NULL, NULL, NULL, 337 UMA_ALIGN_PTR, UMA_ZONE_CONTIG); 338 if (nmbclusters > 0) 339 nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); 340 uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached"); 341 uma_zone_set_maxaction(zone_clust, mb_reclaim); 342 343 zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack, 344 mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf); 345 346 /* Make jumbo frame zone too. Page size, 9k and 16k. */ 347 zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE, 348 mb_ctor_clust, NULL, NULL, NULL, 349 UMA_ALIGN_PTR, UMA_ZONE_CONTIG); 350 if (nmbjumbop > 0) 351 nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); 352 uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached"); 353 uma_zone_set_maxaction(zone_jumbop, mb_reclaim); 354 355 zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES, 356 mb_ctor_clust, NULL, NULL, NULL, 357 UMA_ALIGN_PTR, UMA_ZONE_CONTIG); 358 if (nmbjumbo9 > 0) 359 nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); 360 uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached"); 361 uma_zone_set_maxaction(zone_jumbo9, mb_reclaim); 362 363 zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES, 364 mb_ctor_clust, NULL, NULL, NULL, 365 UMA_ALIGN_PTR, UMA_ZONE_CONTIG); 366 if (nmbjumbo16 > 0) 367 nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); 368 uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached"); 369 uma_zone_set_maxaction(zone_jumbo16, mb_reclaim); 370 371 /* 372 * Hook event handler for low-memory situation, used to 373 * drain protocols and push data back to the caches (UMA 374 * later pushes it back to VM). 375 */ 376 EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL, 377 EVENTHANDLER_PRI_FIRST); 378 379 snd_tag_count = counter_u64_alloc(M_WAITOK); 380 } 381 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); 382 383 #ifdef DEBUGNET 384 /* 385 * debugnet makes use of a pre-allocated pool of mbufs and clusters. When 386 * debugnet is configured, we initialize a set of UMA cache zones which return 387 * items from this pool. At panic-time, the regular UMA zone pointers are 388 * overwritten with those of the cache zones so that drivers may allocate and 389 * free mbufs and clusters without attempting to allocate physical memory. 390 * 391 * We keep mbufs and clusters in a pair of mbuf queues. In particular, for 392 * the purpose of caching clusters, we treat them as mbufs. 393 */ 394 static struct mbufq dn_mbufq = 395 { STAILQ_HEAD_INITIALIZER(dn_mbufq.mq_head), 0, INT_MAX }; 396 static struct mbufq dn_clustq = 397 { STAILQ_HEAD_INITIALIZER(dn_clustq.mq_head), 0, INT_MAX }; 398 399 static int dn_clsize; 400 static uma_zone_t dn_zone_mbuf; 401 static uma_zone_t dn_zone_clust; 402 static uma_zone_t dn_zone_pack; 403 404 static struct debugnet_saved_zones { 405 uma_zone_t dsz_mbuf; 406 uma_zone_t dsz_clust; 407 uma_zone_t dsz_pack; 408 uma_zone_t dsz_jumbop; 409 uma_zone_t dsz_jumbo9; 410 uma_zone_t dsz_jumbo16; 411 bool dsz_debugnet_zones_enabled; 412 } dn_saved_zones; 413 414 static int 415 dn_buf_import(void *arg, void **store, int count, int domain __unused, 416 int flags) 417 { 418 struct mbufq *q; 419 struct mbuf *m; 420 int i; 421 422 q = arg; 423 424 for (i = 0; i < count; i++) { 425 m = mbufq_dequeue(q); 426 if (m == NULL) 427 break; 428 trash_init(m, q == &dn_mbufq ? MSIZE : dn_clsize, flags); 429 store[i] = m; 430 } 431 KASSERT((flags & M_WAITOK) == 0 || i == count, 432 ("%s: ran out of pre-allocated mbufs", __func__)); 433 return (i); 434 } 435 436 static void 437 dn_buf_release(void *arg, void **store, int count) 438 { 439 struct mbufq *q; 440 struct mbuf *m; 441 int i; 442 443 q = arg; 444 445 for (i = 0; i < count; i++) { 446 m = store[i]; 447 (void)mbufq_enqueue(q, m); 448 } 449 } 450 451 static int 452 dn_pack_import(void *arg __unused, void **store, int count, int domain __unused, 453 int flags __unused) 454 { 455 struct mbuf *m; 456 void *clust; 457 int i; 458 459 for (i = 0; i < count; i++) { 460 m = m_get(MT_DATA, M_NOWAIT); 461 if (m == NULL) 462 break; 463 clust = uma_zalloc(dn_zone_clust, M_NOWAIT); 464 if (clust == NULL) { 465 m_free(m); 466 break; 467 } 468 mb_ctor_clust(clust, dn_clsize, m, 0); 469 store[i] = m; 470 } 471 KASSERT((flags & M_WAITOK) == 0 || i == count, 472 ("%s: ran out of pre-allocated mbufs", __func__)); 473 return (i); 474 } 475 476 static void 477 dn_pack_release(void *arg __unused, void **store, int count) 478 { 479 struct mbuf *m; 480 void *clust; 481 int i; 482 483 for (i = 0; i < count; i++) { 484 m = store[i]; 485 clust = m->m_ext.ext_buf; 486 uma_zfree(dn_zone_clust, clust); 487 uma_zfree(dn_zone_mbuf, m); 488 } 489 } 490 491 /* 492 * Free the pre-allocated mbufs and clusters reserved for debugnet, and destroy 493 * the corresponding UMA cache zones. 494 */ 495 void 496 debugnet_mbuf_drain(void) 497 { 498 struct mbuf *m; 499 void *item; 500 501 if (dn_zone_mbuf != NULL) { 502 uma_zdestroy(dn_zone_mbuf); 503 dn_zone_mbuf = NULL; 504 } 505 if (dn_zone_clust != NULL) { 506 uma_zdestroy(dn_zone_clust); 507 dn_zone_clust = NULL; 508 } 509 if (dn_zone_pack != NULL) { 510 uma_zdestroy(dn_zone_pack); 511 dn_zone_pack = NULL; 512 } 513 514 while ((m = mbufq_dequeue(&dn_mbufq)) != NULL) 515 m_free(m); 516 while ((item = mbufq_dequeue(&dn_clustq)) != NULL) 517 uma_zfree(m_getzone(dn_clsize), item); 518 } 519 520 /* 521 * Callback invoked immediately prior to starting a debugnet connection. 522 */ 523 void 524 debugnet_mbuf_start(void) 525 { 526 527 MPASS(!dn_saved_zones.dsz_debugnet_zones_enabled); 528 529 /* Save the old zone pointers to restore when debugnet is closed. */ 530 dn_saved_zones = (struct debugnet_saved_zones) { 531 .dsz_debugnet_zones_enabled = true, 532 .dsz_mbuf = zone_mbuf, 533 .dsz_clust = zone_clust, 534 .dsz_pack = zone_pack, 535 .dsz_jumbop = zone_jumbop, 536 .dsz_jumbo9 = zone_jumbo9, 537 .dsz_jumbo16 = zone_jumbo16, 538 }; 539 540 /* 541 * All cluster zones return buffers of the size requested by the 542 * drivers. It's up to the driver to reinitialize the zones if the 543 * MTU of a debugnet-enabled interface changes. 544 */ 545 printf("debugnet: overwriting mbuf zone pointers\n"); 546 zone_mbuf = dn_zone_mbuf; 547 zone_clust = dn_zone_clust; 548 zone_pack = dn_zone_pack; 549 zone_jumbop = dn_zone_clust; 550 zone_jumbo9 = dn_zone_clust; 551 zone_jumbo16 = dn_zone_clust; 552 } 553 554 /* 555 * Callback invoked when a debugnet connection is closed/finished. 556 */ 557 void 558 debugnet_mbuf_finish(void) 559 { 560 561 MPASS(dn_saved_zones.dsz_debugnet_zones_enabled); 562 563 printf("debugnet: restoring mbuf zone pointers\n"); 564 zone_mbuf = dn_saved_zones.dsz_mbuf; 565 zone_clust = dn_saved_zones.dsz_clust; 566 zone_pack = dn_saved_zones.dsz_pack; 567 zone_jumbop = dn_saved_zones.dsz_jumbop; 568 zone_jumbo9 = dn_saved_zones.dsz_jumbo9; 569 zone_jumbo16 = dn_saved_zones.dsz_jumbo16; 570 571 memset(&dn_saved_zones, 0, sizeof(dn_saved_zones)); 572 } 573 574 /* 575 * Reinitialize the debugnet mbuf+cluster pool and cache zones. 576 */ 577 void 578 debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize) 579 { 580 struct mbuf *m; 581 void *item; 582 583 debugnet_mbuf_drain(); 584 585 dn_clsize = clsize; 586 587 dn_zone_mbuf = uma_zcache_create("debugnet_" MBUF_MEM_NAME, 588 MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL, 589 dn_buf_import, dn_buf_release, 590 &dn_mbufq, UMA_ZONE_NOBUCKET); 591 592 dn_zone_clust = uma_zcache_create("debugnet_" MBUF_CLUSTER_MEM_NAME, 593 clsize, mb_ctor_clust, NULL, NULL, NULL, 594 dn_buf_import, dn_buf_release, 595 &dn_clustq, UMA_ZONE_NOBUCKET); 596 597 dn_zone_pack = uma_zcache_create("debugnet_" MBUF_PACKET_MEM_NAME, 598 MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL, 599 dn_pack_import, dn_pack_release, 600 NULL, UMA_ZONE_NOBUCKET); 601 602 while (nmbuf-- > 0) { 603 m = m_get(MT_DATA, M_WAITOK); 604 uma_zfree(dn_zone_mbuf, m); 605 } 606 while (nclust-- > 0) { 607 item = uma_zalloc(m_getzone(dn_clsize), M_WAITOK); 608 uma_zfree(dn_zone_clust, item); 609 } 610 } 611 #endif /* DEBUGNET */ 612 613 /* 614 * Constructor for Mbuf primary zone. 615 * 616 * The 'arg' pointer points to a mb_args structure which 617 * contains call-specific information required to support the 618 * mbuf allocation API. See mbuf.h. 619 */ 620 static int 621 mb_ctor_mbuf(void *mem, int size, void *arg, int how) 622 { 623 struct mbuf *m; 624 struct mb_args *args; 625 int error; 626 int flags; 627 short type; 628 629 args = (struct mb_args *)arg; 630 type = args->type; 631 632 /* 633 * The mbuf is initialized later. The caller has the 634 * responsibility to set up any MAC labels too. 635 */ 636 if (type == MT_NOINIT) 637 return (0); 638 639 m = (struct mbuf *)mem; 640 flags = args->flags; 641 MPASS((flags & M_NOFREE) == 0); 642 643 error = m_init(m, how, type, flags); 644 645 return (error); 646 } 647 648 /* 649 * The Mbuf primary zone destructor. 650 */ 651 static void 652 mb_dtor_mbuf(void *mem, int size, void *arg) 653 { 654 struct mbuf *m; 655 unsigned long flags; 656 657 m = (struct mbuf *)mem; 658 flags = (unsigned long)arg; 659 660 KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__)); 661 if (!(flags & MB_DTOR_SKIP) && (m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags)) 662 m_tag_delete_chain(m, NULL); 663 } 664 665 /* 666 * The Mbuf Packet zone destructor. 667 */ 668 static void 669 mb_dtor_pack(void *mem, int size, void *arg) 670 { 671 struct mbuf *m; 672 673 m = (struct mbuf *)mem; 674 if ((m->m_flags & M_PKTHDR) != 0) 675 m_tag_delete_chain(m, NULL); 676 677 /* Make sure we've got a clean cluster back. */ 678 KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__)); 679 KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__)); 680 KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__)); 681 KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__)); 682 KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__)); 683 KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__)); 684 KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__)); 685 #ifdef INVARIANTS 686 trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg); 687 #endif 688 /* 689 * If there are processes blocked on zone_clust, waiting for pages 690 * to be freed up, cause them to be woken up by draining the 691 * packet zone. We are exposed to a race here (in the check for 692 * the UMA_ZFLAG_FULL) where we might miss the flag set, but that 693 * is deliberate. We don't want to acquire the zone lock for every 694 * mbuf free. 695 */ 696 if (uma_zone_exhausted(zone_clust)) 697 uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN); 698 } 699 700 /* 701 * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor. 702 * 703 * Here the 'arg' pointer points to the Mbuf which we 704 * are configuring cluster storage for. If 'arg' is 705 * empty we allocate just the cluster without setting 706 * the mbuf to it. See mbuf.h. 707 */ 708 static int 709 mb_ctor_clust(void *mem, int size, void *arg, int how) 710 { 711 struct mbuf *m; 712 713 m = (struct mbuf *)arg; 714 if (m != NULL) { 715 m->m_ext.ext_buf = (char *)mem; 716 m->m_data = m->m_ext.ext_buf; 717 m->m_flags |= M_EXT; 718 m->m_ext.ext_free = NULL; 719 m->m_ext.ext_arg1 = NULL; 720 m->m_ext.ext_arg2 = NULL; 721 m->m_ext.ext_size = size; 722 m->m_ext.ext_type = m_gettype(size); 723 m->m_ext.ext_flags = EXT_FLAG_EMBREF; 724 m->m_ext.ext_count = 1; 725 } 726 727 return (0); 728 } 729 730 /* 731 * The Packet secondary zone's init routine, executed on the 732 * object's transition from mbuf keg slab to zone cache. 733 */ 734 static int 735 mb_zinit_pack(void *mem, int size, int how) 736 { 737 struct mbuf *m; 738 739 m = (struct mbuf *)mem; /* m is virgin. */ 740 if (uma_zalloc_arg(zone_clust, m, how) == NULL || 741 m->m_ext.ext_buf == NULL) 742 return (ENOMEM); 743 m->m_ext.ext_type = EXT_PACKET; /* Override. */ 744 #ifdef INVARIANTS 745 trash_init(m->m_ext.ext_buf, MCLBYTES, how); 746 #endif 747 return (0); 748 } 749 750 /* 751 * The Packet secondary zone's fini routine, executed on the 752 * object's transition from zone cache to keg slab. 753 */ 754 static void 755 mb_zfini_pack(void *mem, int size) 756 { 757 struct mbuf *m; 758 759 m = (struct mbuf *)mem; 760 #ifdef INVARIANTS 761 trash_fini(m->m_ext.ext_buf, MCLBYTES); 762 #endif 763 uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL); 764 #ifdef INVARIANTS 765 trash_dtor(mem, size, NULL); 766 #endif 767 } 768 769 /* 770 * The "packet" keg constructor. 771 */ 772 static int 773 mb_ctor_pack(void *mem, int size, void *arg, int how) 774 { 775 struct mbuf *m; 776 struct mb_args *args; 777 int error, flags; 778 short type; 779 780 m = (struct mbuf *)mem; 781 args = (struct mb_args *)arg; 782 flags = args->flags; 783 type = args->type; 784 MPASS((flags & M_NOFREE) == 0); 785 786 #ifdef INVARIANTS 787 trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how); 788 #endif 789 790 error = m_init(m, how, type, flags); 791 792 /* m_ext is already initialized. */ 793 m->m_data = m->m_ext.ext_buf; 794 m->m_flags = (flags | M_EXT); 795 796 return (error); 797 } 798 799 /* 800 * This is the protocol drain routine. Called by UMA whenever any of the 801 * mbuf zones is closed to its limit. 802 * 803 * No locks should be held when this is called. The drain routines have to 804 * presently acquire some locks which raises the possibility of lock order 805 * reversal. 806 */ 807 static void 808 mb_reclaim(uma_zone_t zone __unused, int pending __unused) 809 { 810 struct epoch_tracker et; 811 struct domain *dp; 812 struct protosw *pr; 813 814 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__); 815 816 NET_EPOCH_ENTER(et); 817 for (dp = domains; dp != NULL; dp = dp->dom_next) 818 for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) 819 if (pr->pr_drain != NULL) 820 (*pr->pr_drain)(); 821 NET_EPOCH_EXIT(et); 822 } 823 824 /* 825 * Free "count" units of I/O from an mbuf chain. They could be held 826 * in M_EXTPG or just as a normal mbuf. This code is intended to be 827 * called in an error path (I/O error, closed connection, etc). 828 */ 829 void 830 mb_free_notready(struct mbuf *m, int count) 831 { 832 int i; 833 834 for (i = 0; i < count && m != NULL; i++) { 835 if ((m->m_flags & M_EXTPG) != 0) { 836 m->m_epg_nrdy--; 837 if (m->m_epg_nrdy != 0) 838 continue; 839 } 840 m = m_free(m); 841 } 842 KASSERT(i == count, ("Removed only %d items from %p", i, m)); 843 } 844 845 /* 846 * Compress an unmapped mbuf into a simple mbuf when it holds a small 847 * amount of data. This is used as a DOS defense to avoid having 848 * small packets tie up wired pages, an ext_pgs structure, and an 849 * mbuf. Since this converts the existing mbuf in place, it can only 850 * be used if there are no other references to 'm'. 851 */ 852 int 853 mb_unmapped_compress(struct mbuf *m) 854 { 855 volatile u_int *refcnt; 856 char buf[MLEN]; 857 858 /* 859 * Assert that 'm' does not have a packet header. If 'm' had 860 * a packet header, it would only be able to hold MHLEN bytes 861 * and m_data would have to be initialized differently. 862 */ 863 KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXTPG), 864 ("%s: m %p !M_EXTPG or M_PKTHDR", __func__, m)); 865 KASSERT(m->m_len <= MLEN, ("m_len too large %p", m)); 866 867 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 868 refcnt = &m->m_ext.ext_count; 869 } else { 870 KASSERT(m->m_ext.ext_cnt != NULL, 871 ("%s: no refcounting pointer on %p", __func__, m)); 872 refcnt = m->m_ext.ext_cnt; 873 } 874 875 if (*refcnt != 1) 876 return (EBUSY); 877 878 m_copydata(m, 0, m->m_len, buf); 879 880 /* Free the backing pages. */ 881 m->m_ext.ext_free(m); 882 883 /* Turn 'm' into a "normal" mbuf. */ 884 m->m_flags &= ~(M_EXT | M_RDONLY | M_EXTPG); 885 m->m_data = m->m_dat; 886 887 /* Copy data back into m. */ 888 bcopy(buf, mtod(m, char *), m->m_len); 889 890 return (0); 891 } 892 893 /* 894 * These next few routines are used to permit downgrading an unmapped 895 * mbuf to a chain of mapped mbufs. This is used when an interface 896 * doesn't supported unmapped mbufs or if checksums need to be 897 * computed in software. 898 * 899 * Each unmapped mbuf is converted to a chain of mbufs. First, any 900 * TLS header data is stored in a regular mbuf. Second, each page of 901 * unmapped data is stored in an mbuf with an EXT_SFBUF external 902 * cluster. These mbufs use an sf_buf to provide a valid KVA for the 903 * associated physical page. They also hold a reference on the 904 * original M_EXTPG mbuf to ensure the physical page doesn't go away. 905 * Finally, any TLS trailer data is stored in a regular mbuf. 906 * 907 * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF 908 * mbufs. It frees the associated sf_buf and releases its reference 909 * on the original M_EXTPG mbuf. 910 * 911 * _mb_unmapped_to_ext() is a helper function that converts a single 912 * unmapped mbuf into a chain of mbufs. 913 * 914 * mb_unmapped_to_ext() is the public function that walks an mbuf 915 * chain converting any unmapped mbufs to mapped mbufs. It returns 916 * the new chain of unmapped mbufs on success. On failure it frees 917 * the original mbuf chain and returns NULL. 918 */ 919 static void 920 mb_unmapped_free_mext(struct mbuf *m) 921 { 922 struct sf_buf *sf; 923 struct mbuf *old_m; 924 925 sf = m->m_ext.ext_arg1; 926 sf_buf_free(sf); 927 928 /* Drop the reference on the backing M_EXTPG mbuf. */ 929 old_m = m->m_ext.ext_arg2; 930 mb_free_extpg(old_m); 931 } 932 933 static struct mbuf * 934 _mb_unmapped_to_ext(struct mbuf *m) 935 { 936 struct mbuf *m_new, *top, *prev, *mref; 937 struct sf_buf *sf; 938 vm_page_t pg; 939 int i, len, off, pglen, pgoff, seglen, segoff; 940 volatile u_int *refcnt; 941 u_int ref_inc = 0; 942 943 M_ASSERTEXTPG(m); 944 len = m->m_len; 945 KASSERT(m->m_epg_tls == NULL, ("%s: can't convert TLS mbuf %p", 946 __func__, m)); 947 948 /* See if this is the mbuf that holds the embedded refcount. */ 949 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 950 refcnt = &m->m_ext.ext_count; 951 mref = m; 952 } else { 953 KASSERT(m->m_ext.ext_cnt != NULL, 954 ("%s: no refcounting pointer on %p", __func__, m)); 955 refcnt = m->m_ext.ext_cnt; 956 mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); 957 } 958 959 /* Skip over any data removed from the front. */ 960 off = mtod(m, vm_offset_t); 961 962 top = NULL; 963 if (m->m_epg_hdrlen != 0) { 964 if (off >= m->m_epg_hdrlen) { 965 off -= m->m_epg_hdrlen; 966 } else { 967 seglen = m->m_epg_hdrlen - off; 968 segoff = off; 969 seglen = min(seglen, len); 970 off = 0; 971 len -= seglen; 972 m_new = m_get(M_NOWAIT, MT_DATA); 973 if (m_new == NULL) 974 goto fail; 975 m_new->m_len = seglen; 976 prev = top = m_new; 977 memcpy(mtod(m_new, void *), &m->m_epg_hdr[segoff], 978 seglen); 979 } 980 } 981 pgoff = m->m_epg_1st_off; 982 for (i = 0; i < m->m_epg_npgs && len > 0; i++) { 983 pglen = m_epg_pagelen(m, i, pgoff); 984 if (off >= pglen) { 985 off -= pglen; 986 pgoff = 0; 987 continue; 988 } 989 seglen = pglen - off; 990 segoff = pgoff + off; 991 off = 0; 992 seglen = min(seglen, len); 993 len -= seglen; 994 995 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); 996 m_new = m_get(M_NOWAIT, MT_DATA); 997 if (m_new == NULL) 998 goto fail; 999 if (top == NULL) { 1000 top = prev = m_new; 1001 } else { 1002 prev->m_next = m_new; 1003 prev = m_new; 1004 } 1005 sf = sf_buf_alloc(pg, SFB_NOWAIT); 1006 if (sf == NULL) 1007 goto fail; 1008 1009 ref_inc++; 1010 m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE, 1011 mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF); 1012 m_new->m_data += segoff; 1013 m_new->m_len = seglen; 1014 1015 pgoff = 0; 1016 }; 1017 if (len != 0) { 1018 KASSERT((off + len) <= m->m_epg_trllen, 1019 ("off + len > trail (%d + %d > %d)", off, len, 1020 m->m_epg_trllen)); 1021 m_new = m_get(M_NOWAIT, MT_DATA); 1022 if (m_new == NULL) 1023 goto fail; 1024 if (top == NULL) 1025 top = m_new; 1026 else 1027 prev->m_next = m_new; 1028 m_new->m_len = len; 1029 memcpy(mtod(m_new, void *), &m->m_epg_trail[off], len); 1030 } 1031 1032 if (ref_inc != 0) { 1033 /* 1034 * Obtain an additional reference on the old mbuf for 1035 * each created EXT_SFBUF mbuf. They will be dropped 1036 * in mb_unmapped_free_mext(). 1037 */ 1038 if (*refcnt == 1) 1039 *refcnt += ref_inc; 1040 else 1041 atomic_add_int(refcnt, ref_inc); 1042 } 1043 m_free(m); 1044 return (top); 1045 1046 fail: 1047 if (ref_inc != 0) { 1048 /* 1049 * Obtain an additional reference on the old mbuf for 1050 * each created EXT_SFBUF mbuf. They will be 1051 * immediately dropped when these mbufs are freed 1052 * below. 1053 */ 1054 if (*refcnt == 1) 1055 *refcnt += ref_inc; 1056 else 1057 atomic_add_int(refcnt, ref_inc); 1058 } 1059 m_free(m); 1060 m_freem(top); 1061 return (NULL); 1062 } 1063 1064 struct mbuf * 1065 mb_unmapped_to_ext(struct mbuf *top) 1066 { 1067 struct mbuf *m, *next, *prev = NULL; 1068 1069 prev = NULL; 1070 for (m = top; m != NULL; m = next) { 1071 /* m might be freed, so cache the next pointer. */ 1072 next = m->m_next; 1073 if (m->m_flags & M_EXTPG) { 1074 if (prev != NULL) { 1075 /* 1076 * Remove 'm' from the new chain so 1077 * that the 'top' chain terminates 1078 * before 'm' in case 'top' is freed 1079 * due to an error. 1080 */ 1081 prev->m_next = NULL; 1082 } 1083 m = _mb_unmapped_to_ext(m); 1084 if (m == NULL) { 1085 m_freem(top); 1086 m_freem(next); 1087 return (NULL); 1088 } 1089 if (prev == NULL) { 1090 top = m; 1091 } else { 1092 prev->m_next = m; 1093 } 1094 1095 /* 1096 * Replaced one mbuf with a chain, so we must 1097 * find the end of chain. 1098 */ 1099 prev = m_last(m); 1100 } else { 1101 if (prev != NULL) { 1102 prev->m_next = m; 1103 } 1104 prev = m; 1105 } 1106 } 1107 return (top); 1108 } 1109 1110 /* 1111 * Allocate an empty M_EXTPG mbuf. The ext_free routine is 1112 * responsible for freeing any pages backing this mbuf when it is 1113 * freed. 1114 */ 1115 struct mbuf * 1116 mb_alloc_ext_pgs(int how, m_ext_free_t ext_free) 1117 { 1118 struct mbuf *m; 1119 1120 m = m_get(how, MT_DATA); 1121 if (m == NULL) 1122 return (NULL); 1123 1124 m->m_epg_npgs = 0; 1125 m->m_epg_nrdy = 0; 1126 m->m_epg_1st_off = 0; 1127 m->m_epg_last_len = 0; 1128 m->m_epg_flags = 0; 1129 m->m_epg_hdrlen = 0; 1130 m->m_epg_trllen = 0; 1131 m->m_epg_tls = NULL; 1132 m->m_epg_so = NULL; 1133 m->m_data = NULL; 1134 m->m_flags |= (M_EXT | M_RDONLY | M_EXTPG); 1135 m->m_ext.ext_flags = EXT_FLAG_EMBREF; 1136 m->m_ext.ext_count = 1; 1137 m->m_ext.ext_size = 0; 1138 m->m_ext.ext_free = ext_free; 1139 return (m); 1140 } 1141 1142 /* 1143 * Clean up after mbufs with M_EXT storage attached to them if the 1144 * reference count hits 1. 1145 */ 1146 void 1147 mb_free_ext(struct mbuf *m) 1148 { 1149 volatile u_int *refcnt; 1150 struct mbuf *mref; 1151 int freembuf; 1152 1153 KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m)); 1154 1155 /* See if this is the mbuf that holds the embedded refcount. */ 1156 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 1157 refcnt = &m->m_ext.ext_count; 1158 mref = m; 1159 } else { 1160 KASSERT(m->m_ext.ext_cnt != NULL, 1161 ("%s: no refcounting pointer on %p", __func__, m)); 1162 refcnt = m->m_ext.ext_cnt; 1163 mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); 1164 } 1165 1166 /* 1167 * Check if the header is embedded in the cluster. It is 1168 * important that we can't touch any of the mbuf fields 1169 * after we have freed the external storage, since mbuf 1170 * could have been embedded in it. For now, the mbufs 1171 * embedded into the cluster are always of type EXT_EXTREF, 1172 * and for this type we won't free the mref. 1173 */ 1174 if (m->m_flags & M_NOFREE) { 1175 freembuf = 0; 1176 KASSERT(m->m_ext.ext_type == EXT_EXTREF || 1177 m->m_ext.ext_type == EXT_RXRING, 1178 ("%s: no-free mbuf %p has wrong type", __func__, m)); 1179 } else 1180 freembuf = 1; 1181 1182 /* Free attached storage if this mbuf is the only reference to it. */ 1183 if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) { 1184 switch (m->m_ext.ext_type) { 1185 case EXT_PACKET: 1186 /* The packet zone is special. */ 1187 if (*refcnt == 0) 1188 *refcnt = 1; 1189 uma_zfree(zone_pack, mref); 1190 break; 1191 case EXT_CLUSTER: 1192 uma_zfree(zone_clust, m->m_ext.ext_buf); 1193 uma_zfree(zone_mbuf, mref); 1194 break; 1195 case EXT_JUMBOP: 1196 uma_zfree(zone_jumbop, m->m_ext.ext_buf); 1197 uma_zfree(zone_mbuf, mref); 1198 break; 1199 case EXT_JUMBO9: 1200 uma_zfree(zone_jumbo9, m->m_ext.ext_buf); 1201 uma_zfree(zone_mbuf, mref); 1202 break; 1203 case EXT_JUMBO16: 1204 uma_zfree(zone_jumbo16, m->m_ext.ext_buf); 1205 uma_zfree(zone_mbuf, mref); 1206 break; 1207 case EXT_SFBUF: 1208 case EXT_NET_DRV: 1209 case EXT_MOD_TYPE: 1210 case EXT_DISPOSABLE: 1211 KASSERT(mref->m_ext.ext_free != NULL, 1212 ("%s: ext_free not set", __func__)); 1213 mref->m_ext.ext_free(mref); 1214 uma_zfree(zone_mbuf, mref); 1215 break; 1216 case EXT_EXTREF: 1217 KASSERT(m->m_ext.ext_free != NULL, 1218 ("%s: ext_free not set", __func__)); 1219 m->m_ext.ext_free(m); 1220 break; 1221 case EXT_RXRING: 1222 KASSERT(m->m_ext.ext_free == NULL, 1223 ("%s: ext_free is set", __func__)); 1224 break; 1225 default: 1226 KASSERT(m->m_ext.ext_type == 0, 1227 ("%s: unknown ext_type", __func__)); 1228 } 1229 } 1230 1231 if (freembuf && m != mref) 1232 uma_zfree(zone_mbuf, m); 1233 } 1234 1235 /* 1236 * Clean up after mbufs with M_EXTPG storage attached to them if the 1237 * reference count hits 1. 1238 */ 1239 void 1240 mb_free_extpg(struct mbuf *m) 1241 { 1242 volatile u_int *refcnt; 1243 struct mbuf *mref; 1244 1245 M_ASSERTEXTPG(m); 1246 1247 /* See if this is the mbuf that holds the embedded refcount. */ 1248 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 1249 refcnt = &m->m_ext.ext_count; 1250 mref = m; 1251 } else { 1252 KASSERT(m->m_ext.ext_cnt != NULL, 1253 ("%s: no refcounting pointer on %p", __func__, m)); 1254 refcnt = m->m_ext.ext_cnt; 1255 mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); 1256 } 1257 1258 /* Free attached storage if this mbuf is the only reference to it. */ 1259 if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) { 1260 KASSERT(mref->m_ext.ext_free != NULL, 1261 ("%s: ext_free not set", __func__)); 1262 1263 mref->m_ext.ext_free(mref); 1264 #ifdef KERN_TLS 1265 if (mref->m_epg_tls != NULL && 1266 !refcount_release_if_not_last(&mref->m_epg_tls->refcount)) 1267 ktls_enqueue_to_free(mref); 1268 else 1269 #endif 1270 uma_zfree(zone_mbuf, mref); 1271 } 1272 1273 if (m != mref) 1274 uma_zfree(zone_mbuf, m); 1275 } 1276 1277 /* 1278 * Official mbuf(9) allocation KPI for stack and drivers: 1279 * 1280 * m_get() - a single mbuf without any attachments, sys/mbuf.h. 1281 * m_gethdr() - a single mbuf initialized as M_PKTHDR, sys/mbuf.h. 1282 * m_getcl() - an mbuf + 2k cluster, sys/mbuf.h. 1283 * m_clget() - attach cluster to already allocated mbuf. 1284 * m_cljget() - attach jumbo cluster to already allocated mbuf. 1285 * m_get2() - allocate minimum mbuf that would fit size argument. 1286 * m_getm2() - allocate a chain of mbufs/clusters. 1287 * m_extadd() - attach external cluster to mbuf. 1288 * 1289 * m_free() - free single mbuf with its tags and ext, sys/mbuf.h. 1290 * m_freem() - free chain of mbufs. 1291 */ 1292 1293 int 1294 m_clget(struct mbuf *m, int how) 1295 { 1296 1297 KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", 1298 __func__, m)); 1299 m->m_ext.ext_buf = (char *)NULL; 1300 uma_zalloc_arg(zone_clust, m, how); 1301 /* 1302 * On a cluster allocation failure, drain the packet zone and retry, 1303 * we might be able to loosen a few clusters up on the drain. 1304 */ 1305 if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) { 1306 uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN); 1307 uma_zalloc_arg(zone_clust, m, how); 1308 } 1309 MBUF_PROBE2(m__clget, m, how); 1310 return (m->m_flags & M_EXT); 1311 } 1312 1313 /* 1314 * m_cljget() is different from m_clget() as it can allocate clusters without 1315 * attaching them to an mbuf. In that case the return value is the pointer 1316 * to the cluster of the requested size. If an mbuf was specified, it gets 1317 * the cluster attached to it and the return value can be safely ignored. 1318 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. 1319 */ 1320 void * 1321 m_cljget(struct mbuf *m, int how, int size) 1322 { 1323 uma_zone_t zone; 1324 void *retval; 1325 1326 if (m != NULL) { 1327 KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", 1328 __func__, m)); 1329 m->m_ext.ext_buf = NULL; 1330 } 1331 1332 zone = m_getzone(size); 1333 retval = uma_zalloc_arg(zone, m, how); 1334 1335 MBUF_PROBE4(m__cljget, m, how, size, retval); 1336 1337 return (retval); 1338 } 1339 1340 /* 1341 * m_get2() allocates minimum mbuf that would fit "size" argument. 1342 */ 1343 struct mbuf * 1344 m_get2(int size, int how, short type, int flags) 1345 { 1346 struct mb_args args; 1347 struct mbuf *m, *n; 1348 1349 args.flags = flags; 1350 args.type = type; 1351 1352 if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0)) 1353 return (uma_zalloc_arg(zone_mbuf, &args, how)); 1354 if (size <= MCLBYTES) 1355 return (uma_zalloc_arg(zone_pack, &args, how)); 1356 1357 if (size > MJUMPAGESIZE) 1358 return (NULL); 1359 1360 m = uma_zalloc_arg(zone_mbuf, &args, how); 1361 if (m == NULL) 1362 return (NULL); 1363 1364 n = uma_zalloc_arg(zone_jumbop, m, how); 1365 if (n == NULL) { 1366 uma_zfree(zone_mbuf, m); 1367 return (NULL); 1368 } 1369 1370 return (m); 1371 } 1372 1373 /* 1374 * m_getjcl() returns an mbuf with a cluster of the specified size attached. 1375 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. 1376 */ 1377 struct mbuf * 1378 m_getjcl(int how, short type, int flags, int size) 1379 { 1380 struct mb_args args; 1381 struct mbuf *m, *n; 1382 uma_zone_t zone; 1383 1384 if (size == MCLBYTES) 1385 return m_getcl(how, type, flags); 1386 1387 args.flags = flags; 1388 args.type = type; 1389 1390 m = uma_zalloc_arg(zone_mbuf, &args, how); 1391 if (m == NULL) 1392 return (NULL); 1393 1394 zone = m_getzone(size); 1395 n = uma_zalloc_arg(zone, m, how); 1396 if (n == NULL) { 1397 uma_zfree(zone_mbuf, m); 1398 return (NULL); 1399 } 1400 return (m); 1401 } 1402 1403 /* 1404 * Allocate a given length worth of mbufs and/or clusters (whatever fits 1405 * best) and return a pointer to the top of the allocated chain. If an 1406 * existing mbuf chain is provided, then we will append the new chain 1407 * to the existing one and return a pointer to the provided mbuf. 1408 */ 1409 struct mbuf * 1410 m_getm2(struct mbuf *m, int len, int how, short type, int flags) 1411 { 1412 struct mbuf *mb, *nm = NULL, *mtail = NULL; 1413 1414 KASSERT(len >= 0, ("%s: len is < 0", __func__)); 1415 1416 /* Validate flags. */ 1417 flags &= (M_PKTHDR | M_EOR); 1418 1419 /* Packet header mbuf must be first in chain. */ 1420 if ((flags & M_PKTHDR) && m != NULL) 1421 flags &= ~M_PKTHDR; 1422 1423 /* Loop and append maximum sized mbufs to the chain tail. */ 1424 while (len > 0) { 1425 if (len > MCLBYTES) 1426 mb = m_getjcl(how, type, (flags & M_PKTHDR), 1427 MJUMPAGESIZE); 1428 else if (len >= MINCLSIZE) 1429 mb = m_getcl(how, type, (flags & M_PKTHDR)); 1430 else if (flags & M_PKTHDR) 1431 mb = m_gethdr(how, type); 1432 else 1433 mb = m_get(how, type); 1434 1435 /* Fail the whole operation if one mbuf can't be allocated. */ 1436 if (mb == NULL) { 1437 if (nm != NULL) 1438 m_freem(nm); 1439 return (NULL); 1440 } 1441 1442 /* Book keeping. */ 1443 len -= M_SIZE(mb); 1444 if (mtail != NULL) 1445 mtail->m_next = mb; 1446 else 1447 nm = mb; 1448 mtail = mb; 1449 flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */ 1450 } 1451 if (flags & M_EOR) 1452 mtail->m_flags |= M_EOR; /* Only valid on the last mbuf. */ 1453 1454 /* If mbuf was supplied, append new chain to the end of it. */ 1455 if (m != NULL) { 1456 for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next) 1457 ; 1458 mtail->m_next = nm; 1459 mtail->m_flags &= ~M_EOR; 1460 } else 1461 m = nm; 1462 1463 return (m); 1464 } 1465 1466 /*- 1467 * Configure a provided mbuf to refer to the provided external storage 1468 * buffer and setup a reference count for said buffer. 1469 * 1470 * Arguments: 1471 * mb The existing mbuf to which to attach the provided buffer. 1472 * buf The address of the provided external storage buffer. 1473 * size The size of the provided buffer. 1474 * freef A pointer to a routine that is responsible for freeing the 1475 * provided external storage buffer. 1476 * args A pointer to an argument structure (of any type) to be passed 1477 * to the provided freef routine (may be NULL). 1478 * flags Any other flags to be passed to the provided mbuf. 1479 * type The type that the external storage buffer should be 1480 * labeled with. 1481 * 1482 * Returns: 1483 * Nothing. 1484 */ 1485 void 1486 m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef, 1487 void *arg1, void *arg2, int flags, int type) 1488 { 1489 1490 KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__)); 1491 1492 mb->m_flags |= (M_EXT | flags); 1493 mb->m_ext.ext_buf = buf; 1494 mb->m_data = mb->m_ext.ext_buf; 1495 mb->m_ext.ext_size = size; 1496 mb->m_ext.ext_free = freef; 1497 mb->m_ext.ext_arg1 = arg1; 1498 mb->m_ext.ext_arg2 = arg2; 1499 mb->m_ext.ext_type = type; 1500 1501 if (type != EXT_EXTREF) { 1502 mb->m_ext.ext_count = 1; 1503 mb->m_ext.ext_flags = EXT_FLAG_EMBREF; 1504 } else 1505 mb->m_ext.ext_flags = 0; 1506 } 1507 1508 /* 1509 * Free an entire chain of mbufs and associated external buffers, if 1510 * applicable. 1511 */ 1512 void 1513 m_freem(struct mbuf *mb) 1514 { 1515 1516 MBUF_PROBE1(m__freem, mb); 1517 while (mb != NULL) 1518 mb = m_free(mb); 1519 } 1520 1521 void 1522 m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp) 1523 { 1524 1525 if_ref(ifp); 1526 mst->ifp = ifp; 1527 refcount_init(&mst->refcount, 1); 1528 counter_u64_add(snd_tag_count, 1); 1529 } 1530 1531 void 1532 m_snd_tag_destroy(struct m_snd_tag *mst) 1533 { 1534 struct ifnet *ifp; 1535 1536 ifp = mst->ifp; 1537 ifp->if_snd_tag_free(mst); 1538 if_rele(ifp); 1539 counter_u64_add(snd_tag_count, -1); 1540 } 1541 1542 /* 1543 * Allocate an mbuf with anonymous external pages. 1544 */ 1545 struct mbuf * 1546 mb_alloc_ext_plus_pages(int len, int how) 1547 { 1548 struct mbuf *m; 1549 vm_page_t pg; 1550 int i, npgs; 1551 1552 m = mb_alloc_ext_pgs(how, mb_free_mext_pgs); 1553 if (m == NULL) 1554 return (NULL); 1555 m->m_epg_flags |= EPG_FLAG_ANON; 1556 npgs = howmany(len, PAGE_SIZE); 1557 for (i = 0; i < npgs; i++) { 1558 do { 1559 pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 1560 VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | VM_ALLOC_WIRED); 1561 if (pg == NULL) { 1562 if (how == M_NOWAIT) { 1563 m->m_epg_npgs = i; 1564 m_free(m); 1565 return (NULL); 1566 } 1567 vm_wait(NULL); 1568 } 1569 } while (pg == NULL); 1570 m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg); 1571 } 1572 m->m_epg_npgs = npgs; 1573 return (m); 1574 } 1575 1576 /* 1577 * Copy the data in the mbuf chain to a chain of mbufs with anonymous external 1578 * unmapped pages. 1579 * len is the length of data in the input mbuf chain. 1580 * mlen is the maximum number of bytes put into each ext_page mbuf. 1581 */ 1582 struct mbuf * 1583 mb_mapped_to_unmapped(struct mbuf *mp, int len, int mlen, int how, 1584 struct mbuf **mlast) 1585 { 1586 struct mbuf *m, *mout; 1587 char *pgpos, *mbpos; 1588 int i, mblen, mbufsiz, pglen, xfer; 1589 1590 if (len == 0) 1591 return (NULL); 1592 mbufsiz = min(mlen, len); 1593 m = mout = mb_alloc_ext_plus_pages(mbufsiz, how); 1594 if (m == NULL) 1595 return (m); 1596 pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[0]); 1597 pglen = PAGE_SIZE; 1598 mblen = 0; 1599 i = 0; 1600 do { 1601 if (pglen == 0) { 1602 if (++i == m->m_epg_npgs) { 1603 m->m_epg_last_len = PAGE_SIZE; 1604 mbufsiz = min(mlen, len); 1605 m->m_next = mb_alloc_ext_plus_pages(mbufsiz, 1606 how); 1607 m = m->m_next; 1608 if (m == NULL) { 1609 m_freem(mout); 1610 return (m); 1611 } 1612 i = 0; 1613 } 1614 pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]); 1615 pglen = PAGE_SIZE; 1616 } 1617 while (mblen == 0) { 1618 if (mp == NULL) { 1619 m_freem(mout); 1620 return (NULL); 1621 } 1622 KASSERT((mp->m_flags & M_EXTPG) == 0, 1623 ("mb_copym_ext_pgs: ext_pgs input mbuf")); 1624 mbpos = mtod(mp, char *); 1625 mblen = mp->m_len; 1626 mp = mp->m_next; 1627 } 1628 xfer = min(mblen, pglen); 1629 memcpy(pgpos, mbpos, xfer); 1630 pgpos += xfer; 1631 mbpos += xfer; 1632 pglen -= xfer; 1633 mblen -= xfer; 1634 len -= xfer; 1635 m->m_len += xfer; 1636 } while (len > 0); 1637 m->m_epg_last_len = PAGE_SIZE - pglen; 1638 if (mlast != NULL) 1639 *mlast = m; 1640 return (mout); 1641 } 1642