1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1991, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 * 31 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 32 */ 33 34 #include <sys/cdefs.h> 35 __FBSDID("$FreeBSD$"); 36 37 #include "opt_param.h" 38 #include "opt_mbuf_stress_test.h" 39 #include "opt_mbuf_profiling.h" 40 41 #include <sys/param.h> 42 #include <sys/systm.h> 43 #include <sys/kernel.h> 44 #include <sys/limits.h> 45 #include <sys/lock.h> 46 #include <sys/malloc.h> 47 #include <sys/mbuf.h> 48 #include <sys/sysctl.h> 49 #include <sys/domain.h> 50 #include <sys/protosw.h> 51 #include <sys/uio.h> 52 #include <sys/vmmeter.h> 53 #include <sys/sbuf.h> 54 #include <sys/sdt.h> 55 #include <vm/vm.h> 56 #include <vm/vm_pageout.h> 57 #include <vm/vm_page.h> 58 59 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init, 60 "struct mbuf *", "mbufinfo_t *", 61 "uint32_t", "uint32_t", 62 "uint16_t", "uint16_t", 63 "uint32_t", "uint32_t", 64 "uint32_t", "uint32_t"); 65 66 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr_raw, 67 "uint32_t", "uint32_t", 68 "uint16_t", "uint16_t", 69 "struct mbuf *", "mbufinfo_t *"); 70 71 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr, 72 "uint32_t", "uint32_t", 73 "uint16_t", "uint16_t", 74 "struct mbuf *", "mbufinfo_t *"); 75 76 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get_raw, 77 "uint32_t", "uint32_t", 78 "uint16_t", "uint16_t", 79 "struct mbuf *", "mbufinfo_t *"); 80 81 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get, 82 "uint32_t", "uint32_t", 83 "uint16_t", "uint16_t", 84 "struct mbuf *", "mbufinfo_t *"); 85 86 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl, 87 "uint32_t", "uint32_t", 88 "uint16_t", "uint16_t", 89 "uint32_t", "uint32_t", 90 "struct mbuf *", "mbufinfo_t *"); 91 92 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__getjcl, 93 "uint32_t", "uint32_t", 94 "uint16_t", "uint16_t", 95 "uint32_t", "uint32_t", 96 "uint32_t", "uint32_t", 97 "struct mbuf *", "mbufinfo_t *"); 98 99 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget, 100 "struct mbuf *", "mbufinfo_t *", 101 
"uint32_t", "uint32_t", 102 "uint32_t", "uint32_t"); 103 104 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget, 105 "struct mbuf *", "mbufinfo_t *", 106 "uint32_t", "uint32_t", 107 "uint32_t", "uint32_t", 108 "void*", "void*"); 109 110 SDT_PROBE_DEFINE(sdt, , , m__cljset); 111 112 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free, 113 "struct mbuf *", "mbufinfo_t *"); 114 115 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem, 116 "struct mbuf *", "mbufinfo_t *"); 117 118 #include <security/mac/mac_framework.h> 119 120 /* 121 * Provide minimum possible defaults for link and protocol header space, 122 * assuming IPv4 over Ethernet. Enabling IPv6, IEEE802.11 or some other 123 * protocol may grow these values. 124 */ 125 u_int max_linkhdr = 16; 126 u_int max_protohdr = 40; 127 u_int max_hdr = 16 + 40; 128 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD, 129 &max_linkhdr, 16, "Size of largest link layer header"); 130 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD, 131 &max_protohdr, 40, "Size of largest protocol layer header"); 132 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD, 133 &max_hdr, 16 + 40, "Size of largest link plus protocol header"); 134 135 static void 136 max_hdr_grow(void) 137 { 138 139 max_hdr = max_linkhdr + max_protohdr; 140 MPASS(max_hdr <= MHLEN); 141 } 142 143 void 144 max_linkhdr_grow(u_int new) 145 { 146 147 if (new > max_linkhdr) { 148 max_linkhdr = new; 149 max_hdr_grow(); 150 } 151 } 152 153 void 154 max_protohdr_grow(u_int new) 155 { 156 157 if (new > max_protohdr) { 158 max_protohdr = new; 159 max_hdr_grow(); 160 } 161 } 162 163 #ifdef MBUF_STRESS_TEST 164 int m_defragpackets; 165 int m_defragbytes; 166 int m_defraguseless; 167 int m_defragfailure; 168 int m_defragrandomfailures; 169 170 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD, 171 &m_defragpackets, 0, ""); 172 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD, 173 &m_defragbytes, 0, ""); 174 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD, 175 &m_defraguseless, 0, ""); 176 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD, 177 &m_defragfailure, 0, ""); 178 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW, 179 &m_defragrandomfailures, 0, ""); 180 #endif 181 182 /* 183 * Ensure the correct size of various mbuf parameters. It could be off due 184 * to compiler-induced padding and alignment artifacts. 185 */ 186 CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN); 187 CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN); 188 189 /* 190 * mbuf data storage should be 64-bit aligned regardless of architectural 191 * pointer size; check this is the case with and without a packet header. 192 */ 193 CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0); 194 CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0); 195 196 /* 197 * While the specific values here don't matter too much (i.e., +/- a few 198 * words), we do want to ensure that changes to these values are carefully 199 * reasoned about and properly documented. This is especially the case as 200 * network-protocol and device-driver modules encode these layouts, and must 201 * be recompiled if the structures change. Check these values at compile time 202 * against the ones documented in comments in mbuf.h. 203 * 204 * NB: Possibly they should be documented there via #define's and not just 205 * comments. 
206 */ 207 #if defined(__LP64__) 208 CTASSERT(offsetof(struct mbuf, m_dat) == 32); 209 CTASSERT(sizeof(struct pkthdr) == 64); 210 CTASSERT(sizeof(struct m_ext) == 160); 211 #else 212 CTASSERT(offsetof(struct mbuf, m_dat) == 24); 213 CTASSERT(sizeof(struct pkthdr) == 56); 214 #if defined(__powerpc__) && defined(BOOKE) 215 /* PowerPC booke has 64-bit physical pointers. */ 216 CTASSERT(sizeof(struct m_ext) == 176); 217 #else 218 CTASSERT(sizeof(struct m_ext) == 172); 219 #endif 220 #endif 221 222 /* 223 * Assert that the queue(3) macros produce code of the same size as an old 224 * plain pointer does. 225 */ 226 #ifdef INVARIANTS 227 static struct mbuf __used m_assertbuf; 228 CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next)); 229 CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next)); 230 CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt)); 231 CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt)); 232 #endif 233 234 /* 235 * Attach the cluster from *m to *n, set up m_ext in *n 236 * and bump the refcount of the cluster. 237 */ 238 void 239 mb_dupcl(struct mbuf *n, struct mbuf *m) 240 { 241 volatile u_int *refcnt; 242 243 KASSERT(m->m_flags & (M_EXT|M_EXTPG), 244 ("%s: M_EXT|M_EXTPG not set on %p", __func__, m)); 245 KASSERT(!(n->m_flags & (M_EXT|M_EXTPG)), 246 ("%s: M_EXT|M_EXTPG set on %p", __func__, n)); 247 248 /* 249 * Cache access optimization. 250 * 251 * o Regular M_EXT storage doesn't need full copy of m_ext, since 252 * the holder of the 'ext_count' is responsible to carry the free 253 * routine and its arguments. 254 * o M_EXTPG data is split between main part of mbuf and m_ext, the 255 * main part is copied in full, the m_ext part is similar to M_EXT. 256 * o EXT_EXTREF, where 'ext_cnt' doesn't point into mbuf at all, is 257 * special - it needs full copy of m_ext into each mbuf, since any 258 * copy could end up as the last to free. 259 */ 260 if (m->m_flags & M_EXTPG) { 261 bcopy(&m->m_epg_startcopy, &n->m_epg_startcopy, 262 __rangeof(struct mbuf, m_epg_startcopy, m_epg_endcopy)); 263 bcopy(&m->m_ext, &n->m_ext, m_epg_ext_copylen); 264 } else if (m->m_ext.ext_type == EXT_EXTREF) 265 bcopy(&m->m_ext, &n->m_ext, sizeof(struct m_ext)); 266 else 267 bcopy(&m->m_ext, &n->m_ext, m_ext_copylen); 268 269 n->m_flags |= m->m_flags & (M_RDONLY | M_EXT | M_EXTPG); 270 271 /* See if this is the mbuf that holds the embedded refcount. */ 272 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 273 refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count; 274 n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF; 275 } else { 276 KASSERT(m->m_ext.ext_cnt != NULL, 277 ("%s: no refcounting pointer on %p", __func__, m)); 278 refcnt = m->m_ext.ext_cnt; 279 } 280 281 if (*refcnt == 1) 282 *refcnt += 1; 283 else 284 atomic_add_int(refcnt, 1); 285 } 286 287 void 288 m_demote_pkthdr(struct mbuf *m) 289 { 290 291 M_ASSERTPKTHDR(m); 292 M_ASSERT_NO_SND_TAG(m); 293 294 m_tag_delete_chain(m, NULL); 295 m->m_flags &= ~M_PKTHDR; 296 bzero(&m->m_pkthdr, sizeof(struct pkthdr)); 297 } 298 299 /* 300 * Clean up mbuf (chain) from any tags and packet headers. 301 * If "all" is set then the first mbuf in the chain will be 302 * cleaned too. 303 */ 304 void 305 m_demote(struct mbuf *m0, int all, int flags) 306 { 307 struct mbuf *m; 308 309 flags |= M_DEMOTEFLAGS; 310 311 for (m = all ? 
	    m0 : m0->m_next; m != NULL; m = m->m_next) {
		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p",
		    __func__, m, m0));
		if (m->m_flags & M_PKTHDR)
			m_demote_pkthdr(m);
		m->m_flags &= flags;
	}
}

/*
 * Sanity checks on mbuf (chain) for use in KASSERT() and general
 * debugging.
 * Returns 0 or panics when bad, 1 when all tests pass.
 * The "sanitize" argument: 0 to run M_SANITY_ACTION, 1 to garble things
 * so they blow up later.
 */
int
m_sanity(struct mbuf *m0, int sanitize)
{
	struct mbuf *m;
	caddr_t a, b;
	int pktlen = 0;

#ifdef INVARIANTS
#define	M_SANITY_ACTION(s)	panic("mbuf %p: " s, m)
#else
#define	M_SANITY_ACTION(s)	printf("mbuf %p: " s, m)
#endif

	for (m = m0; m != NULL; m = m->m_next) {
		/*
		 * Basic pointer checks.  If any of these fails then some
		 * unrelated kernel memory before or after us is trashed.
		 * No way to recover from that.
		 */
		a = M_START(m);
		b = a + M_SIZE(m);
		if ((caddr_t)m->m_data < a)
			M_SANITY_ACTION("m_data outside mbuf data range left");
		if ((caddr_t)m->m_data > b)
			M_SANITY_ACTION("m_data outside mbuf data range right");
		if ((caddr_t)m->m_data + m->m_len > b)
			M_SANITY_ACTION("m_data + m_len exceeds mbuf space");

		/* m->m_nextpkt may only be set on first mbuf in chain. */
		if (m != m0 && m->m_nextpkt != NULL) {
			if (sanitize) {
				m_freem(m->m_nextpkt);
				m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
			} else
				M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
		}

		/* packet length (not mbuf length!) calculation */
		if (m0->m_flags & M_PKTHDR)
			pktlen += m->m_len;

		/* m_tags may only be attached to first mbuf in chain. */
		if (m != m0 && m->m_flags & M_PKTHDR &&
		    !SLIST_EMPTY(&m->m_pkthdr.tags)) {
			if (sanitize) {
				m_tag_delete_chain(m, NULL);
				/* put in 0xDEADC0DE perhaps? */
			} else
				M_SANITY_ACTION("m_tags on in-chain mbuf");
		}

		/* M_PKTHDR may only be set on first mbuf in chain */
		if (m != m0 && m->m_flags & M_PKTHDR) {
			if (sanitize) {
				bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
				m->m_flags &= ~M_PKTHDR;
				/* put in 0xDEADCODE and leave hdr flag in */
			} else
				M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
		}
	}
	m = m0;
	if (pktlen && pktlen != m->m_pkthdr.len) {
		if (sanitize)
			m->m_pkthdr.len = 0;
		else
			M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
	}
	return 1;

#undef	M_SANITY_ACTION
}

/*
 * Non-inlined part of m_init().
 */
int
m_pkthdr_init(struct mbuf *m, int how)
{
#ifdef MAC
	int error;
#endif
	m->m_data = m->m_pktdat;
	bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
#ifdef NUMA
	m->m_pkthdr.numa_domain = M_NODOM;
#endif
#ifdef MAC
	/* If the label init fails, fail the alloc */
	error = mac_mbuf_init(m, how);
	if (error)
		return (error);
#endif

	return (0);
}

/*
 * "Move" mbuf pkthdr from "from" to "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 */
void
m_move_pkthdr(struct mbuf *to, struct mbuf *from)
{

#if 0
	/* see below for why these are not enabled */
	M_ASSERTPKTHDR(to);
	/* Note: with MAC, this may not be a good assertion. */
	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
	    ("m_move_pkthdr: to has tags"));
#endif
#ifdef MAC
	/*
	 * XXXMAC: It could be this should also occur for non-MAC?
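	 * Either way, deleting "to"'s tag chain here matters because the
	 * wholesale pkthdr assignment below overwrites the SLIST head in
	 * to->m_pkthdr, so any tags already hanging off "to" would
	 * otherwise be leaked rather than freed.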
442 */ 443 if (to->m_flags & M_PKTHDR) 444 m_tag_delete_chain(to, NULL); 445 #endif 446 to->m_flags = (from->m_flags & M_COPYFLAGS) | 447 (to->m_flags & (M_EXT | M_EXTPG)); 448 if ((to->m_flags & M_EXT) == 0) 449 to->m_data = to->m_pktdat; 450 to->m_pkthdr = from->m_pkthdr; /* especially tags */ 451 SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ 452 from->m_flags &= ~M_PKTHDR; 453 if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) { 454 from->m_pkthdr.csum_flags &= ~CSUM_SND_TAG; 455 from->m_pkthdr.snd_tag = NULL; 456 } 457 } 458 459 /* 460 * Duplicate "from"'s mbuf pkthdr in "to". 461 * "from" must have M_PKTHDR set, and "to" must be empty. 462 * In particular, this does a deep copy of the packet tags. 463 */ 464 int 465 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how) 466 { 467 468 #if 0 469 /* 470 * The mbuf allocator only initializes the pkthdr 471 * when the mbuf is allocated with m_gethdr(). Many users 472 * (e.g. m_copy*, m_prepend) use m_get() and then 473 * smash the pkthdr as needed causing these 474 * assertions to trip. For now just disable them. 475 */ 476 M_ASSERTPKTHDR(to); 477 /* Note: with MAC, this may not be a good assertion. */ 478 KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags")); 479 #endif 480 MBUF_CHECKSLEEP(how); 481 #ifdef MAC 482 if (to->m_flags & M_PKTHDR) 483 m_tag_delete_chain(to, NULL); 484 #endif 485 to->m_flags = (from->m_flags & M_COPYFLAGS) | 486 (to->m_flags & (M_EXT | M_EXTPG)); 487 if ((to->m_flags & M_EXT) == 0) 488 to->m_data = to->m_pktdat; 489 to->m_pkthdr = from->m_pkthdr; 490 if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) 491 m_snd_tag_ref(from->m_pkthdr.snd_tag); 492 SLIST_INIT(&to->m_pkthdr.tags); 493 return (m_tag_copy_chain(to, from, how)); 494 } 495 496 /* 497 * Lesser-used path for M_PREPEND: 498 * allocate new mbuf to prepend to chain, 499 * copy junk along. 500 */ 501 struct mbuf * 502 m_prepend(struct mbuf *m, int len, int how) 503 { 504 struct mbuf *mn; 505 506 if (m->m_flags & M_PKTHDR) 507 mn = m_gethdr(how, m->m_type); 508 else 509 mn = m_get(how, m->m_type); 510 if (mn == NULL) { 511 m_freem(m); 512 return (NULL); 513 } 514 if (m->m_flags & M_PKTHDR) 515 m_move_pkthdr(mn, m); 516 mn->m_next = m; 517 m = mn; 518 if (len < M_SIZE(m)) 519 M_ALIGN(m, len); 520 m->m_len = len; 521 return (m); 522 } 523 524 /* 525 * Make a copy of an mbuf chain starting "off0" bytes from the beginning, 526 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. 527 * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller. 528 * Note that the copy is read-only, because clusters are not copied, 529 * only their reference counts are incremented. 
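 *
 * A minimal usage sketch (illustrative only; "hlen" and the error
 * handling are hypothetical, not taken from any particular caller):
 *
 *	struct mbuf *n;
 *
 *	n = m_copym(m, 0, hlen, M_NOWAIT);
 *	if (n == NULL)
 *		return (ENOBUFS);
 *
 * Any clusters in "n" are shared with "m", so the copy must be treated
 * as read-only unless M_WRITABLE() says otherwise.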
530 */ 531 struct mbuf * 532 m_copym(struct mbuf *m, int off0, int len, int wait) 533 { 534 struct mbuf *n, **np; 535 int off = off0; 536 struct mbuf *top; 537 int copyhdr = 0; 538 539 KASSERT(off >= 0, ("m_copym, negative off %d", off)); 540 KASSERT(len >= 0, ("m_copym, negative len %d", len)); 541 MBUF_CHECKSLEEP(wait); 542 if (off == 0 && m->m_flags & M_PKTHDR) 543 copyhdr = 1; 544 while (off > 0) { 545 KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain")); 546 if (off < m->m_len) 547 break; 548 off -= m->m_len; 549 m = m->m_next; 550 } 551 np = ⊤ 552 top = NULL; 553 while (len > 0) { 554 if (m == NULL) { 555 KASSERT(len == M_COPYALL, 556 ("m_copym, length > size of mbuf chain")); 557 break; 558 } 559 if (copyhdr) 560 n = m_gethdr(wait, m->m_type); 561 else 562 n = m_get(wait, m->m_type); 563 *np = n; 564 if (n == NULL) 565 goto nospace; 566 if (copyhdr) { 567 if (!m_dup_pkthdr(n, m, wait)) 568 goto nospace; 569 if (len == M_COPYALL) 570 n->m_pkthdr.len -= off0; 571 else 572 n->m_pkthdr.len = len; 573 copyhdr = 0; 574 } 575 n->m_len = min(len, m->m_len - off); 576 if (m->m_flags & (M_EXT|M_EXTPG)) { 577 n->m_data = m->m_data + off; 578 mb_dupcl(n, m); 579 } else 580 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), 581 (u_int)n->m_len); 582 if (len != M_COPYALL) 583 len -= n->m_len; 584 off = 0; 585 m = m->m_next; 586 np = &n->m_next; 587 } 588 589 return (top); 590 nospace: 591 m_freem(top); 592 return (NULL); 593 } 594 595 /* 596 * Copy an entire packet, including header (which must be present). 597 * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. 598 * Note that the copy is read-only, because clusters are not copied, 599 * only their reference counts are incremented. 600 * Preserve alignment of the first mbuf so if the creator has left 601 * some room at the beginning (e.g. for inserting protocol headers) 602 * the copies still have the room available. 
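 *
 * A sketch of the intended use (purely illustrative): a copy made with
 *
 *	n = m_copypacket(m, M_NOWAIT);
 *
 * keeps whatever leading space the original first mbuf had, so a later
 * M_PREPEND() on the copy can still succeed without reallocating.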
603 */ 604 struct mbuf * 605 m_copypacket(struct mbuf *m, int how) 606 { 607 struct mbuf *top, *n, *o; 608 609 MBUF_CHECKSLEEP(how); 610 n = m_get(how, m->m_type); 611 top = n; 612 if (n == NULL) 613 goto nospace; 614 615 if (!m_dup_pkthdr(n, m, how)) 616 goto nospace; 617 n->m_len = m->m_len; 618 if (m->m_flags & (M_EXT|M_EXTPG)) { 619 n->m_data = m->m_data; 620 mb_dupcl(n, m); 621 } else { 622 n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat ); 623 bcopy(mtod(m, char *), mtod(n, char *), n->m_len); 624 } 625 626 m = m->m_next; 627 while (m) { 628 o = m_get(how, m->m_type); 629 if (o == NULL) 630 goto nospace; 631 632 n->m_next = o; 633 n = n->m_next; 634 635 n->m_len = m->m_len; 636 if (m->m_flags & (M_EXT|M_EXTPG)) { 637 n->m_data = m->m_data; 638 mb_dupcl(n, m); 639 } else { 640 bcopy(mtod(m, char *), mtod(n, char *), n->m_len); 641 } 642 643 m = m->m_next; 644 } 645 return top; 646 nospace: 647 m_freem(top); 648 return (NULL); 649 } 650 651 static void 652 m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp) 653 { 654 struct iovec iov; 655 struct uio uio; 656 int error __diagused; 657 658 KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off)); 659 KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len)); 660 KASSERT(off < m->m_len, 661 ("m_copyfromunmapped: len exceeds mbuf length")); 662 iov.iov_base = cp; 663 iov.iov_len = len; 664 uio.uio_resid = len; 665 uio.uio_iov = &iov; 666 uio.uio_segflg = UIO_SYSSPACE; 667 uio.uio_iovcnt = 1; 668 uio.uio_offset = 0; 669 uio.uio_rw = UIO_READ; 670 error = m_unmapped_uiomove(m, off, &uio, len); 671 KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off, 672 len)); 673 } 674 675 /* 676 * Copy data from an mbuf chain starting "off" bytes from the beginning, 677 * continuing for "len" bytes, into the indicated buffer. 678 */ 679 void 680 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp) 681 { 682 u_int count; 683 684 KASSERT(off >= 0, ("m_copydata, negative off %d", off)); 685 KASSERT(len >= 0, ("m_copydata, negative len %d", len)); 686 while (off > 0) { 687 KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain")); 688 if (off < m->m_len) 689 break; 690 off -= m->m_len; 691 m = m->m_next; 692 } 693 while (len > 0) { 694 KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); 695 count = min(m->m_len - off, len); 696 if ((m->m_flags & M_EXTPG) != 0) 697 m_copyfromunmapped(m, off, count, cp); 698 else 699 bcopy(mtod(m, caddr_t) + off, cp, count); 700 len -= count; 701 cp += count; 702 off = 0; 703 m = m->m_next; 704 } 705 } 706 707 /* 708 * Copy a packet header mbuf chain into a completely new chain, including 709 * copying any mbuf clusters. Use this instead of m_copypacket() when 710 * you need a writable copy of an mbuf chain. 
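 *
 * A usage sketch (names are illustrative): where a reference-counted
 * copy is not good enough, e.g. before rewriting payload in place,
 *
 *	w = m_dup(m, M_NOWAIT);
 *	if (w == NULL)
 *		return (ENOBUFS);
 *
 * gives a chain backed by freshly allocated mbufs/clusters that is
 * safe to modify.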
711 */ 712 struct mbuf * 713 m_dup(const struct mbuf *m, int how) 714 { 715 struct mbuf **p, *top = NULL; 716 int remain, moff, nsize; 717 718 MBUF_CHECKSLEEP(how); 719 /* Sanity check */ 720 if (m == NULL) 721 return (NULL); 722 M_ASSERTPKTHDR(m); 723 724 /* While there's more data, get a new mbuf, tack it on, and fill it */ 725 remain = m->m_pkthdr.len; 726 moff = 0; 727 p = ⊤ 728 while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */ 729 struct mbuf *n; 730 731 /* Get the next new mbuf */ 732 if (remain >= MINCLSIZE) { 733 n = m_getcl(how, m->m_type, 0); 734 nsize = MCLBYTES; 735 } else { 736 n = m_get(how, m->m_type); 737 nsize = MLEN; 738 } 739 if (n == NULL) 740 goto nospace; 741 742 if (top == NULL) { /* First one, must be PKTHDR */ 743 if (!m_dup_pkthdr(n, m, how)) { 744 m_free(n); 745 goto nospace; 746 } 747 if ((n->m_flags & M_EXT) == 0) 748 nsize = MHLEN; 749 n->m_flags &= ~M_RDONLY; 750 } 751 n->m_len = 0; 752 753 /* Link it into the new chain */ 754 *p = n; 755 p = &n->m_next; 756 757 /* Copy data from original mbuf(s) into new mbuf */ 758 while (n->m_len < nsize && m != NULL) { 759 int chunk = min(nsize - n->m_len, m->m_len - moff); 760 761 m_copydata(m, moff, chunk, n->m_data + n->m_len); 762 moff += chunk; 763 n->m_len += chunk; 764 remain -= chunk; 765 if (moff == m->m_len) { 766 m = m->m_next; 767 moff = 0; 768 } 769 } 770 771 /* Check correct total mbuf length */ 772 KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL), 773 ("%s: bogus m_pkthdr.len", __func__)); 774 } 775 return (top); 776 777 nospace: 778 m_freem(top); 779 return (NULL); 780 } 781 782 /* 783 * Concatenate mbuf chain n to m. 784 * Both chains must be of the same type (e.g. MT_DATA). 785 * Any m_pkthdr is not updated. 786 */ 787 void 788 m_cat(struct mbuf *m, struct mbuf *n) 789 { 790 while (m->m_next) 791 m = m->m_next; 792 while (n) { 793 if (!M_WRITABLE(m) || 794 (n->m_flags & M_EXTPG) != 0 || 795 M_TRAILINGSPACE(m) < n->m_len) { 796 /* just join the two chains */ 797 m->m_next = n; 798 return; 799 } 800 /* splat the data from one into the other */ 801 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, 802 (u_int)n->m_len); 803 m->m_len += n->m_len; 804 n = m_free(n); 805 } 806 } 807 808 /* 809 * Concatenate two pkthdr mbuf chains. 810 */ 811 void 812 m_catpkt(struct mbuf *m, struct mbuf *n) 813 { 814 815 M_ASSERTPKTHDR(m); 816 M_ASSERTPKTHDR(n); 817 818 m->m_pkthdr.len += n->m_pkthdr.len; 819 m_demote(n, 1, 0); 820 821 m_cat(m, n); 822 } 823 824 void 825 m_adj(struct mbuf *mp, int req_len) 826 { 827 int len = req_len; 828 struct mbuf *m; 829 int count; 830 831 if ((m = mp) == NULL) 832 return; 833 if (len >= 0) { 834 /* 835 * Trim from head. 836 */ 837 while (m != NULL && len > 0) { 838 if (m->m_len <= len) { 839 len -= m->m_len; 840 m->m_len = 0; 841 m = m->m_next; 842 } else { 843 m->m_len -= len; 844 m->m_data += len; 845 len = 0; 846 } 847 } 848 if (mp->m_flags & M_PKTHDR) 849 mp->m_pkthdr.len -= (req_len - len); 850 } else { 851 /* 852 * Trim from tail. Scan the mbuf chain, 853 * calculating its length and finding the last mbuf. 854 * If the adjustment only affects this mbuf, then just 855 * adjust and return. Otherwise, rescan and truncate 856 * after the remaining size. 
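		 *
		 * For example (illustrative numbers): trimming 8 bytes from
		 * the tail of a chain with lengths 10/20/5 finds count = 35
		 * on the first pass; since the last mbuf holds only 5 bytes,
		 * the second pass truncates the chain to 27 bytes, leaving
		 * the middle mbuf with 17 bytes and freeing the last one.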
857 */ 858 len = -len; 859 count = 0; 860 for (;;) { 861 count += m->m_len; 862 if (m->m_next == (struct mbuf *)0) 863 break; 864 m = m->m_next; 865 } 866 if (m->m_len >= len) { 867 m->m_len -= len; 868 if (mp->m_flags & M_PKTHDR) 869 mp->m_pkthdr.len -= len; 870 return; 871 } 872 count -= len; 873 if (count < 0) 874 count = 0; 875 /* 876 * Correct length for chain is "count". 877 * Find the mbuf with last data, adjust its length, 878 * and toss data from remaining mbufs on chain. 879 */ 880 m = mp; 881 if (m->m_flags & M_PKTHDR) 882 m->m_pkthdr.len = count; 883 for (; m; m = m->m_next) { 884 if (m->m_len >= count) { 885 m->m_len = count; 886 if (m->m_next != NULL) { 887 m_freem(m->m_next); 888 m->m_next = NULL; 889 } 890 break; 891 } 892 count -= m->m_len; 893 } 894 } 895 } 896 897 void 898 m_adj_decap(struct mbuf *mp, int len) 899 { 900 uint8_t rsstype; 901 902 m_adj(mp, len); 903 if ((mp->m_flags & M_PKTHDR) != 0) { 904 /* 905 * If flowid was calculated by card from the inner 906 * headers, move flowid to the decapsulated mbuf 907 * chain, otherwise clear. This depends on the 908 * internals of m_adj, which keeps pkthdr as is, in 909 * particular not changing rsstype and flowid. 910 */ 911 rsstype = mp->m_pkthdr.rsstype; 912 if ((rsstype & M_HASHTYPE_INNER) != 0) { 913 M_HASHTYPE_SET(mp, rsstype & ~M_HASHTYPE_INNER); 914 } else { 915 M_HASHTYPE_CLEAR(mp); 916 } 917 } 918 } 919 920 /* 921 * Rearange an mbuf chain so that len bytes are contiguous 922 * and in the data area of an mbuf (so that mtod will work 923 * for a structure of size len). Returns the resulting 924 * mbuf chain on success, frees it and returns null on failure. 925 * If there is room, it will add up to max_protohdr-len extra bytes to the 926 * contiguous region in an attempt to avoid being called next time. 927 */ 928 struct mbuf * 929 m_pullup(struct mbuf *n, int len) 930 { 931 struct mbuf *m; 932 int count; 933 int space; 934 935 KASSERT((n->m_flags & M_EXTPG) == 0, 936 ("%s: unmapped mbuf %p", __func__, n)); 937 938 /* 939 * If first mbuf has no cluster, and has room for len bytes 940 * without shifting current data, pullup into it, 941 * otherwise allocate a new mbuf to prepend to the chain. 942 */ 943 if ((n->m_flags & M_EXT) == 0 && 944 n->m_data + len < &n->m_dat[MLEN] && n->m_next) { 945 if (n->m_len >= len) 946 return (n); 947 m = n; 948 n = n->m_next; 949 len -= m->m_len; 950 } else { 951 if (len > MHLEN) 952 goto bad; 953 m = m_get(M_NOWAIT, n->m_type); 954 if (m == NULL) 955 goto bad; 956 if (n->m_flags & M_PKTHDR) 957 m_move_pkthdr(m, n); 958 } 959 space = &m->m_dat[MLEN] - (m->m_data + m->m_len); 960 do { 961 count = min(min(max(len, max_protohdr), space), n->m_len); 962 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, 963 (u_int)count); 964 len -= count; 965 m->m_len += count; 966 n->m_len -= count; 967 space -= count; 968 if (n->m_len) 969 n->m_data += count; 970 else 971 n = m_free(n); 972 } while (len > 0 && n); 973 if (len > 0) { 974 (void) m_free(m); 975 goto bad; 976 } 977 m->m_next = n; 978 return (m); 979 bad: 980 m_freem(n); 981 return (NULL); 982 } 983 984 /* 985 * Like m_pullup(), except a new mbuf is always allocated, and we allow 986 * the amount of empty space before the data in the new mbuf to be specified 987 * (in the event that the caller expects to prepend later). 
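 *
 * A usage sketch (sizes and names are hypothetical): to get a contiguous
 * header while reserving room for a link-level header to be prepended
 * later, a caller might use
 *
 *	m = m_copyup(m, sizeof(struct ip), max_linkhdr);
 *	if (m == NULL)
 *		return;
 *
 * As with m_pullup(), a NULL return means the chain has already been
 * freed.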
988 */ 989 struct mbuf * 990 m_copyup(struct mbuf *n, int len, int dstoff) 991 { 992 struct mbuf *m; 993 int count, space; 994 995 if (len > (MHLEN - dstoff)) 996 goto bad; 997 m = m_get(M_NOWAIT, n->m_type); 998 if (m == NULL) 999 goto bad; 1000 if (n->m_flags & M_PKTHDR) 1001 m_move_pkthdr(m, n); 1002 m->m_data += dstoff; 1003 space = &m->m_dat[MLEN] - (m->m_data + m->m_len); 1004 do { 1005 count = min(min(max(len, max_protohdr), space), n->m_len); 1006 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t), 1007 (unsigned)count); 1008 len -= count; 1009 m->m_len += count; 1010 n->m_len -= count; 1011 space -= count; 1012 if (n->m_len) 1013 n->m_data += count; 1014 else 1015 n = m_free(n); 1016 } while (len > 0 && n); 1017 if (len > 0) { 1018 (void) m_free(m); 1019 goto bad; 1020 } 1021 m->m_next = n; 1022 return (m); 1023 bad: 1024 m_freem(n); 1025 return (NULL); 1026 } 1027 1028 /* 1029 * Partition an mbuf chain in two pieces, returning the tail -- 1030 * all but the first len0 bytes. In case of failure, it returns NULL and 1031 * attempts to restore the chain to its original state. 1032 * 1033 * Note that the resulting mbufs might be read-only, because the new 1034 * mbuf can end up sharing an mbuf cluster with the original mbuf if 1035 * the "breaking point" happens to lie within a cluster mbuf. Use the 1036 * M_WRITABLE() macro to check for this case. 1037 */ 1038 struct mbuf * 1039 m_split(struct mbuf *m0, int len0, int wait) 1040 { 1041 struct mbuf *m, *n; 1042 u_int len = len0, remain; 1043 1044 MBUF_CHECKSLEEP(wait); 1045 for (m = m0; m && len > m->m_len; m = m->m_next) 1046 len -= m->m_len; 1047 if (m == NULL) 1048 return (NULL); 1049 remain = m->m_len - len; 1050 if (m0->m_flags & M_PKTHDR && remain == 0) { 1051 n = m_gethdr(wait, m0->m_type); 1052 if (n == NULL) 1053 return (NULL); 1054 n->m_next = m->m_next; 1055 m->m_next = NULL; 1056 if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) { 1057 n->m_pkthdr.snd_tag = 1058 m_snd_tag_ref(m0->m_pkthdr.snd_tag); 1059 n->m_pkthdr.csum_flags |= CSUM_SND_TAG; 1060 } else 1061 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; 1062 n->m_pkthdr.len = m0->m_pkthdr.len - len0; 1063 m0->m_pkthdr.len = len0; 1064 return (n); 1065 } else if (m0->m_flags & M_PKTHDR) { 1066 n = m_gethdr(wait, m0->m_type); 1067 if (n == NULL) 1068 return (NULL); 1069 if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) { 1070 n->m_pkthdr.snd_tag = 1071 m_snd_tag_ref(m0->m_pkthdr.snd_tag); 1072 n->m_pkthdr.csum_flags |= CSUM_SND_TAG; 1073 } else 1074 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; 1075 n->m_pkthdr.len = m0->m_pkthdr.len - len0; 1076 m0->m_pkthdr.len = len0; 1077 if (m->m_flags & (M_EXT|M_EXTPG)) 1078 goto extpacket; 1079 if (remain > MHLEN) { 1080 /* m can't be the lead packet */ 1081 M_ALIGN(n, 0); 1082 n->m_next = m_split(m, len, wait); 1083 if (n->m_next == NULL) { 1084 (void) m_free(n); 1085 return (NULL); 1086 } else { 1087 n->m_len = 0; 1088 return (n); 1089 } 1090 } else 1091 M_ALIGN(n, remain); 1092 } else if (remain == 0) { 1093 n = m->m_next; 1094 m->m_next = NULL; 1095 return (n); 1096 } else { 1097 n = m_get(wait, m->m_type); 1098 if (n == NULL) 1099 return (NULL); 1100 M_ALIGN(n, remain); 1101 } 1102 extpacket: 1103 if (m->m_flags & (M_EXT|M_EXTPG)) { 1104 n->m_data = m->m_data + len; 1105 mb_dupcl(n, m); 1106 } else { 1107 bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); 1108 } 1109 n->m_len = remain; 1110 m->m_len = len; 1111 n->m_next = m->m_next; 1112 m->m_next = NULL; 1113 return (n); 1114 } 1115 /* 1116 * Routine to copy from device local memory into 
mbufs. 1117 * Note that `off' argument is offset into first mbuf of target chain from 1118 * which to begin copying the data to. 1119 */ 1120 struct mbuf * 1121 m_devget(char *buf, int totlen, int off, struct ifnet *ifp, 1122 void (*copy)(char *from, caddr_t to, u_int len)) 1123 { 1124 struct mbuf *m; 1125 struct mbuf *top = NULL, **mp = ⊤ 1126 int len; 1127 1128 if (off < 0 || off > MHLEN) 1129 return (NULL); 1130 1131 while (totlen > 0) { 1132 if (top == NULL) { /* First one, must be PKTHDR */ 1133 if (totlen + off >= MINCLSIZE) { 1134 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 1135 len = MCLBYTES; 1136 } else { 1137 m = m_gethdr(M_NOWAIT, MT_DATA); 1138 len = MHLEN; 1139 1140 /* Place initial small packet/header at end of mbuf */ 1141 if (m && totlen + off + max_linkhdr <= MHLEN) { 1142 m->m_data += max_linkhdr; 1143 len -= max_linkhdr; 1144 } 1145 } 1146 if (m == NULL) 1147 return NULL; 1148 m->m_pkthdr.rcvif = ifp; 1149 m->m_pkthdr.len = totlen; 1150 } else { 1151 if (totlen + off >= MINCLSIZE) { 1152 m = m_getcl(M_NOWAIT, MT_DATA, 0); 1153 len = MCLBYTES; 1154 } else { 1155 m = m_get(M_NOWAIT, MT_DATA); 1156 len = MLEN; 1157 } 1158 if (m == NULL) { 1159 m_freem(top); 1160 return NULL; 1161 } 1162 } 1163 if (off) { 1164 m->m_data += off; 1165 len -= off; 1166 off = 0; 1167 } 1168 m->m_len = len = min(totlen, len); 1169 if (copy) 1170 copy(buf, mtod(m, caddr_t), (u_int)len); 1171 else 1172 bcopy(buf, mtod(m, caddr_t), (u_int)len); 1173 buf += len; 1174 *mp = m; 1175 mp = &m->m_next; 1176 totlen -= len; 1177 } 1178 return (top); 1179 } 1180 1181 static void 1182 m_copytounmapped(const struct mbuf *m, int off, int len, c_caddr_t cp) 1183 { 1184 struct iovec iov; 1185 struct uio uio; 1186 int error __diagused; 1187 1188 KASSERT(off >= 0, ("m_copytounmapped: negative off %d", off)); 1189 KASSERT(len >= 0, ("m_copytounmapped: negative len %d", len)); 1190 KASSERT(off < m->m_len, ("m_copytounmapped: len exceeds mbuf length")); 1191 iov.iov_base = __DECONST(caddr_t, cp); 1192 iov.iov_len = len; 1193 uio.uio_resid = len; 1194 uio.uio_iov = &iov; 1195 uio.uio_segflg = UIO_SYSSPACE; 1196 uio.uio_iovcnt = 1; 1197 uio.uio_offset = 0; 1198 uio.uio_rw = UIO_WRITE; 1199 error = m_unmapped_uiomove(m, off, &uio, len); 1200 KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off, 1201 len)); 1202 } 1203 1204 /* 1205 * Copy data from a buffer back into the indicated mbuf chain, 1206 * starting "off" bytes from the beginning, extending the mbuf 1207 * chain if necessary. 
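 *
 * A typical call rewrites bytes in place (names are illustrative):
 *
 *	m_copyback(m, hdroff, sizeof(hdr), (c_caddr_t)&hdr);
 *
 * Note that any extension of the chain below uses M_NOWAIT allocations,
 * so on allocation failure the copy silently stops short of "len" bytes.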
1208 */ 1209 void 1210 m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp) 1211 { 1212 int mlen; 1213 struct mbuf *m = m0, *n; 1214 int totlen = 0; 1215 1216 if (m0 == NULL) 1217 return; 1218 while (off > (mlen = m->m_len)) { 1219 off -= mlen; 1220 totlen += mlen; 1221 if (m->m_next == NULL) { 1222 n = m_get(M_NOWAIT, m->m_type); 1223 if (n == NULL) 1224 goto out; 1225 bzero(mtod(n, caddr_t), MLEN); 1226 n->m_len = min(MLEN, len + off); 1227 m->m_next = n; 1228 } 1229 m = m->m_next; 1230 } 1231 while (len > 0) { 1232 if (m->m_next == NULL && (len > m->m_len - off)) { 1233 m->m_len += min(len - (m->m_len - off), 1234 M_TRAILINGSPACE(m)); 1235 } 1236 mlen = min (m->m_len - off, len); 1237 if ((m->m_flags & M_EXTPG) != 0) 1238 m_copytounmapped(m, off, mlen, cp); 1239 else 1240 bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen); 1241 cp += mlen; 1242 len -= mlen; 1243 mlen += off; 1244 off = 0; 1245 totlen += mlen; 1246 if (len == 0) 1247 break; 1248 if (m->m_next == NULL) { 1249 n = m_get(M_NOWAIT, m->m_type); 1250 if (n == NULL) 1251 break; 1252 n->m_len = min(MLEN, len); 1253 m->m_next = n; 1254 } 1255 m = m->m_next; 1256 } 1257 out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) 1258 m->m_pkthdr.len = totlen; 1259 } 1260 1261 /* 1262 * Append the specified data to the indicated mbuf chain, 1263 * Extend the mbuf chain if the new data does not fit in 1264 * existing space. 1265 * 1266 * Return 1 if able to complete the job; otherwise 0. 1267 */ 1268 int 1269 m_append(struct mbuf *m0, int len, c_caddr_t cp) 1270 { 1271 struct mbuf *m, *n; 1272 int remainder, space; 1273 1274 for (m = m0; m->m_next != NULL; m = m->m_next) 1275 ; 1276 remainder = len; 1277 space = M_TRAILINGSPACE(m); 1278 if (space > 0) { 1279 /* 1280 * Copy into available space. 1281 */ 1282 if (space > remainder) 1283 space = remainder; 1284 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 1285 m->m_len += space; 1286 cp += space, remainder -= space; 1287 } 1288 while (remainder > 0) { 1289 /* 1290 * Allocate a new mbuf; could check space 1291 * and allocate a cluster instead. 
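		 * As written, each pass adds at most MLEN bytes in a plain
		 * mbuf, so appending a large amount of data produces a long
		 * chain of small mbufs rather than a few clusters.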
1292 */ 1293 n = m_get(M_NOWAIT, m->m_type); 1294 if (n == NULL) 1295 break; 1296 n->m_len = min(MLEN, remainder); 1297 bcopy(cp, mtod(n, caddr_t), n->m_len); 1298 cp += n->m_len, remainder -= n->m_len; 1299 m->m_next = n; 1300 m = n; 1301 } 1302 if (m0->m_flags & M_PKTHDR) 1303 m0->m_pkthdr.len += len - remainder; 1304 return (remainder == 0); 1305 } 1306 1307 static int 1308 m_apply_extpg_one(struct mbuf *m, int off, int len, 1309 int (*f)(void *, void *, u_int), void *arg) 1310 { 1311 void *p; 1312 u_int i, count, pgoff, pglen; 1313 int rval; 1314 1315 KASSERT(PMAP_HAS_DMAP, 1316 ("m_apply_extpg_one does not support unmapped mbufs")); 1317 off += mtod(m, vm_offset_t); 1318 if (off < m->m_epg_hdrlen) { 1319 count = min(m->m_epg_hdrlen - off, len); 1320 rval = f(arg, m->m_epg_hdr + off, count); 1321 if (rval) 1322 return (rval); 1323 len -= count; 1324 off = 0; 1325 } else 1326 off -= m->m_epg_hdrlen; 1327 pgoff = m->m_epg_1st_off; 1328 for (i = 0; i < m->m_epg_npgs && len > 0; i++) { 1329 pglen = m_epg_pagelen(m, i, pgoff); 1330 if (off < pglen) { 1331 count = min(pglen - off, len); 1332 p = (void *)PHYS_TO_DMAP(m->m_epg_pa[i] + pgoff + off); 1333 rval = f(arg, p, count); 1334 if (rval) 1335 return (rval); 1336 len -= count; 1337 off = 0; 1338 } else 1339 off -= pglen; 1340 pgoff = 0; 1341 } 1342 if (len > 0) { 1343 KASSERT(off < m->m_epg_trllen, 1344 ("m_apply_extpg_one: offset beyond trailer")); 1345 KASSERT(len <= m->m_epg_trllen - off, 1346 ("m_apply_extpg_one: length beyond trailer")); 1347 return (f(arg, m->m_epg_trail + off, len)); 1348 } 1349 return (0); 1350 } 1351 1352 /* Apply function f to the data in a single mbuf. */ 1353 static int 1354 m_apply_one(struct mbuf *m, int off, int len, 1355 int (*f)(void *, void *, u_int), void *arg) 1356 { 1357 if ((m->m_flags & M_EXTPG) != 0) 1358 return (m_apply_extpg_one(m, off, len, f, arg)); 1359 else 1360 return (f(arg, mtod(m, caddr_t) + off, len)); 1361 } 1362 1363 /* 1364 * Apply function f to the data in an mbuf chain starting "off" bytes from 1365 * the beginning, continuing for "len" bytes. 1366 */ 1367 int 1368 m_apply(struct mbuf *m, int off, int len, 1369 int (*f)(void *, void *, u_int), void *arg) 1370 { 1371 u_int count; 1372 int rval; 1373 1374 KASSERT(off >= 0, ("m_apply, negative off %d", off)); 1375 KASSERT(len >= 0, ("m_apply, negative len %d", len)); 1376 while (off > 0) { 1377 KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); 1378 if (off < m->m_len) 1379 break; 1380 off -= m->m_len; 1381 m = m->m_next; 1382 } 1383 while (len > 0) { 1384 KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); 1385 count = min(m->m_len - off, len); 1386 rval = m_apply_one(m, off, count, f, arg); 1387 if (rval) 1388 return (rval); 1389 len -= count; 1390 off = 0; 1391 m = m->m_next; 1392 } 1393 return (0); 1394 } 1395 1396 /* 1397 * Return a pointer to mbuf/offset of location in mbuf chain. 1398 */ 1399 struct mbuf * 1400 m_getptr(struct mbuf *m, int loc, int *off) 1401 { 1402 1403 while (loc >= 0) { 1404 /* Normal end of search. */ 1405 if (m->m_len > loc) { 1406 *off = loc; 1407 return (m); 1408 } else { 1409 loc -= m->m_len; 1410 if (m->m_next == NULL) { 1411 if (loc == 0) { 1412 /* Point at the end of valid data. 
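				 * (This is the case where "loc" equals
				 * the total length of the chain; the
				 * last mbuf is returned rather than
				 * NULL.)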
*/ 1413 *off = m->m_len; 1414 return (m); 1415 } 1416 return (NULL); 1417 } 1418 m = m->m_next; 1419 } 1420 } 1421 return (NULL); 1422 } 1423 1424 void 1425 m_print(const struct mbuf *m, int maxlen) 1426 { 1427 int len; 1428 int pdata; 1429 const struct mbuf *m2; 1430 1431 if (m == NULL) { 1432 printf("mbuf: %p\n", m); 1433 return; 1434 } 1435 1436 if (m->m_flags & M_PKTHDR) 1437 len = m->m_pkthdr.len; 1438 else 1439 len = -1; 1440 m2 = m; 1441 while (m2 != NULL && (len == -1 || len)) { 1442 pdata = m2->m_len; 1443 if (maxlen != -1 && pdata > maxlen) 1444 pdata = maxlen; 1445 printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len, 1446 m2->m_next, m2->m_flags, "\20\20freelist\17skipfw" 1447 "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly" 1448 "\3eor\2pkthdr\1ext", pdata ? "" : "\n"); 1449 if (pdata) 1450 printf(", %*D\n", pdata, (u_char *)m2->m_data, "-"); 1451 if (len != -1) 1452 len -= m2->m_len; 1453 m2 = m2->m_next; 1454 } 1455 if (len > 0) 1456 printf("%d bytes unaccounted for.\n", len); 1457 return; 1458 } 1459 1460 u_int 1461 m_fixhdr(struct mbuf *m0) 1462 { 1463 u_int len; 1464 1465 len = m_length(m0, NULL); 1466 m0->m_pkthdr.len = len; 1467 return (len); 1468 } 1469 1470 u_int 1471 m_length(struct mbuf *m0, struct mbuf **last) 1472 { 1473 struct mbuf *m; 1474 u_int len; 1475 1476 len = 0; 1477 for (m = m0; m != NULL; m = m->m_next) { 1478 len += m->m_len; 1479 if (m->m_next == NULL) 1480 break; 1481 } 1482 if (last != NULL) 1483 *last = m; 1484 return (len); 1485 } 1486 1487 /* 1488 * Defragment a mbuf chain, returning the shortest possible 1489 * chain of mbufs and clusters. If allocation fails and 1490 * this cannot be completed, NULL will be returned, but 1491 * the passed in chain will be unchanged. Upon success, 1492 * the original chain will be freed, and the new chain 1493 * will be returned. 1494 * 1495 * If a non-packet header is passed in, the original 1496 * mbuf (chain?) will be returned unharmed. 
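 *
 * A usage sketch (illustrative only): a transmit path that could not
 * map a long chain for DMA might retry with
 *
 *	n = m_defrag(m, M_NOWAIT);
 *	if (n == NULL) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 *	m = n;
 *
 * On failure the original chain is left intact (see above), so the
 * caller decides whether to free it or try again later.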
1497 */ 1498 struct mbuf * 1499 m_defrag(struct mbuf *m0, int how) 1500 { 1501 struct mbuf *m_new = NULL, *m_final = NULL; 1502 int progress = 0, length; 1503 1504 MBUF_CHECKSLEEP(how); 1505 if (!(m0->m_flags & M_PKTHDR)) 1506 return (m0); 1507 1508 m_fixhdr(m0); /* Needed sanity check */ 1509 1510 #ifdef MBUF_STRESS_TEST 1511 if (m_defragrandomfailures) { 1512 int temp = arc4random() & 0xff; 1513 if (temp == 0xba) 1514 goto nospace; 1515 } 1516 #endif 1517 1518 if (m0->m_pkthdr.len > MHLEN) 1519 m_final = m_getcl(how, MT_DATA, M_PKTHDR); 1520 else 1521 m_final = m_gethdr(how, MT_DATA); 1522 1523 if (m_final == NULL) 1524 goto nospace; 1525 1526 if (m_dup_pkthdr(m_final, m0, how) == 0) 1527 goto nospace; 1528 1529 m_new = m_final; 1530 1531 while (progress < m0->m_pkthdr.len) { 1532 length = m0->m_pkthdr.len - progress; 1533 if (length > MCLBYTES) 1534 length = MCLBYTES; 1535 1536 if (m_new == NULL) { 1537 if (length > MLEN) 1538 m_new = m_getcl(how, MT_DATA, 0); 1539 else 1540 m_new = m_get(how, MT_DATA); 1541 if (m_new == NULL) 1542 goto nospace; 1543 } 1544 1545 m_copydata(m0, progress, length, mtod(m_new, caddr_t)); 1546 progress += length; 1547 m_new->m_len = length; 1548 if (m_new != m_final) 1549 m_cat(m_final, m_new); 1550 m_new = NULL; 1551 } 1552 #ifdef MBUF_STRESS_TEST 1553 if (m0->m_next == NULL) 1554 m_defraguseless++; 1555 #endif 1556 m_freem(m0); 1557 m0 = m_final; 1558 #ifdef MBUF_STRESS_TEST 1559 m_defragpackets++; 1560 m_defragbytes += m0->m_pkthdr.len; 1561 #endif 1562 return (m0); 1563 nospace: 1564 #ifdef MBUF_STRESS_TEST 1565 m_defragfailure++; 1566 #endif 1567 if (m_final) 1568 m_freem(m_final); 1569 return (NULL); 1570 } 1571 1572 /* 1573 * Return the number of fragments an mbuf will use. This is usually 1574 * used as a proxy for the number of scatter/gather elements needed by 1575 * a DMA engine to access an mbuf. In general mapped mbufs are 1576 * assumed to be backed by physically contiguous buffers that only 1577 * need a single fragment. Unmapped mbufs, on the other hand, can 1578 * span disjoint physical pages. 1579 */ 1580 static int 1581 frags_per_mbuf(struct mbuf *m) 1582 { 1583 int frags; 1584 1585 if ((m->m_flags & M_EXTPG) == 0) 1586 return (1); 1587 1588 /* 1589 * The header and trailer are counted as a single fragment 1590 * each when present. 1591 * 1592 * XXX: This overestimates the number of fragments by assuming 1593 * all the backing physical pages are disjoint. 1594 */ 1595 frags = 0; 1596 if (m->m_epg_hdrlen != 0) 1597 frags++; 1598 frags += m->m_epg_npgs; 1599 if (m->m_epg_trllen != 0) 1600 frags++; 1601 1602 return (frags); 1603 } 1604 1605 /* 1606 * Defragment an mbuf chain, returning at most maxfrags separate 1607 * mbufs+clusters. If this is not possible NULL is returned and 1608 * the original mbuf chain is left in its present (potentially 1609 * modified) state. We use two techniques: collapsing consecutive 1610 * mbufs and replacing consecutive mbufs by a cluster. 1611 * 1612 * NB: this should really be named m_defrag but that name is taken 1613 */ 1614 struct mbuf * 1615 m_collapse(struct mbuf *m0, int how, int maxfrags) 1616 { 1617 struct mbuf *m, *n, *n2, **prev; 1618 u_int curfrags; 1619 1620 /* 1621 * Calculate the current number of frags. 1622 */ 1623 curfrags = 0; 1624 for (m = m0; m != NULL; m = m->m_next) 1625 curfrags += frags_per_mbuf(m); 1626 /* 1627 * First, try to collapse mbufs. Note that we always collapse 1628 * towards the front so we don't need to deal with moving the 1629 * pkthdr. 
This may be suboptimal if the first mbuf has much 1630 * less data than the following. 1631 */ 1632 m = m0; 1633 again: 1634 for (;;) { 1635 n = m->m_next; 1636 if (n == NULL) 1637 break; 1638 if (M_WRITABLE(m) && 1639 n->m_len < M_TRAILINGSPACE(m)) { 1640 m_copydata(n, 0, n->m_len, 1641 mtod(m, char *) + m->m_len); 1642 m->m_len += n->m_len; 1643 m->m_next = n->m_next; 1644 curfrags -= frags_per_mbuf(n); 1645 m_free(n); 1646 if (curfrags <= maxfrags) 1647 return m0; 1648 } else 1649 m = n; 1650 } 1651 KASSERT(maxfrags > 1, 1652 ("maxfrags %u, but normal collapse failed", maxfrags)); 1653 /* 1654 * Collapse consecutive mbufs to a cluster. 1655 */ 1656 prev = &m0->m_next; /* NB: not the first mbuf */ 1657 while ((n = *prev) != NULL) { 1658 if ((n2 = n->m_next) != NULL && 1659 n->m_len + n2->m_len < MCLBYTES) { 1660 m = m_getcl(how, MT_DATA, 0); 1661 if (m == NULL) 1662 goto bad; 1663 m_copydata(n, 0, n->m_len, mtod(m, char *)); 1664 m_copydata(n2, 0, n2->m_len, 1665 mtod(m, char *) + n->m_len); 1666 m->m_len = n->m_len + n2->m_len; 1667 m->m_next = n2->m_next; 1668 *prev = m; 1669 curfrags += 1; /* For the new cluster */ 1670 curfrags -= frags_per_mbuf(n); 1671 curfrags -= frags_per_mbuf(n2); 1672 m_free(n); 1673 m_free(n2); 1674 if (curfrags <= maxfrags) 1675 return m0; 1676 /* 1677 * Still not there, try the normal collapse 1678 * again before we allocate another cluster. 1679 */ 1680 goto again; 1681 } 1682 prev = &n->m_next; 1683 } 1684 /* 1685 * No place where we can collapse to a cluster; punt. 1686 * This can occur if, for example, you request 2 frags 1687 * but the packet requires that both be clusters (we 1688 * never reallocate the first mbuf to avoid moving the 1689 * packet header). 1690 */ 1691 bad: 1692 return NULL; 1693 } 1694 1695 #ifdef MBUF_STRESS_TEST 1696 1697 /* 1698 * Fragment an mbuf chain. There's no reason you'd ever want to do 1699 * this in normal usage, but it's great for stress testing various 1700 * mbuf consumers. 1701 * 1702 * If fragmentation is not possible, the original chain will be 1703 * returned. 
1704 * 1705 * Possible length values: 1706 * 0 no fragmentation will occur 1707 * > 0 each fragment will be of the specified length 1708 * -1 each fragment will be the same random value in length 1709 * -2 each fragment's length will be entirely random 1710 * (Random values range from 1 to 256) 1711 */ 1712 struct mbuf * 1713 m_fragment(struct mbuf *m0, int how, int length) 1714 { 1715 struct mbuf *m_first, *m_last; 1716 int divisor = 255, progress = 0, fraglen; 1717 1718 if (!(m0->m_flags & M_PKTHDR)) 1719 return (m0); 1720 1721 if (length == 0 || length < -2) 1722 return (m0); 1723 if (length > MCLBYTES) 1724 length = MCLBYTES; 1725 if (length < 0 && divisor > MCLBYTES) 1726 divisor = MCLBYTES; 1727 if (length == -1) 1728 length = 1 + (arc4random() % divisor); 1729 if (length > 0) 1730 fraglen = length; 1731 1732 m_fixhdr(m0); /* Needed sanity check */ 1733 1734 m_first = m_getcl(how, MT_DATA, M_PKTHDR); 1735 if (m_first == NULL) 1736 goto nospace; 1737 1738 if (m_dup_pkthdr(m_first, m0, how) == 0) 1739 goto nospace; 1740 1741 m_last = m_first; 1742 1743 while (progress < m0->m_pkthdr.len) { 1744 if (length == -2) 1745 fraglen = 1 + (arc4random() % divisor); 1746 if (fraglen > m0->m_pkthdr.len - progress) 1747 fraglen = m0->m_pkthdr.len - progress; 1748 1749 if (progress != 0) { 1750 struct mbuf *m_new = m_getcl(how, MT_DATA, 0); 1751 if (m_new == NULL) 1752 goto nospace; 1753 1754 m_last->m_next = m_new; 1755 m_last = m_new; 1756 } 1757 1758 m_copydata(m0, progress, fraglen, mtod(m_last, caddr_t)); 1759 progress += fraglen; 1760 m_last->m_len = fraglen; 1761 } 1762 m_freem(m0); 1763 m0 = m_first; 1764 return (m0); 1765 nospace: 1766 if (m_first) 1767 m_freem(m_first); 1768 /* Return the original chain on failure */ 1769 return (m0); 1770 } 1771 1772 #endif 1773 1774 /* 1775 * Free pages from mbuf_ext_pgs, assuming they were allocated via 1776 * vm_page_alloc() and aren't associated with any object. Complement 1777 * to allocator from m_uiotombuf_nomap(). 1778 */ 1779 void 1780 mb_free_mext_pgs(struct mbuf *m) 1781 { 1782 vm_page_t pg; 1783 1784 M_ASSERTEXTPG(m); 1785 for (int i = 0; i < m->m_epg_npgs; i++) { 1786 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); 1787 vm_page_unwire_noq(pg); 1788 vm_page_free(pg); 1789 } 1790 } 1791 1792 static struct mbuf * 1793 m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags) 1794 { 1795 struct mbuf *m, *mb, *prev; 1796 vm_page_t pg_array[MBUF_PEXT_MAX_PGS]; 1797 int error, length, i, needed; 1798 ssize_t total; 1799 int pflags = malloc2vm_flags(how) | VM_ALLOC_NODUMP | VM_ALLOC_WIRED; 1800 1801 MPASS((flags & M_PKTHDR) == 0); 1802 MPASS((how & M_ZERO) == 0); 1803 1804 /* 1805 * len can be zero or an arbitrary large value bound by 1806 * the total data supplied by the uio. 1807 */ 1808 if (len > 0) 1809 total = MIN(uio->uio_resid, len); 1810 else 1811 total = uio->uio_resid; 1812 1813 if (maxseg == 0) 1814 maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE; 1815 1816 /* 1817 * If total is zero, return an empty mbuf. This can occur 1818 * for TLS 1.0 connections which send empty fragments as 1819 * a countermeasure against the known-IV weakness in CBC 1820 * ciphersuites. 
1821 */ 1822 if (__predict_false(total == 0)) { 1823 mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs); 1824 if (mb == NULL) 1825 return (NULL); 1826 mb->m_epg_flags = EPG_FLAG_ANON; 1827 return (mb); 1828 } 1829 1830 /* 1831 * Allocate the pages 1832 */ 1833 m = NULL; 1834 while (total > 0) { 1835 mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs); 1836 if (mb == NULL) 1837 goto failed; 1838 if (m == NULL) 1839 m = mb; 1840 else 1841 prev->m_next = mb; 1842 prev = mb; 1843 mb->m_epg_flags = EPG_FLAG_ANON; 1844 needed = length = MIN(maxseg, total); 1845 for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) { 1846 retry_page: 1847 pg_array[i] = vm_page_alloc_noobj(pflags); 1848 if (pg_array[i] == NULL) { 1849 if (how & M_NOWAIT) { 1850 goto failed; 1851 } else { 1852 vm_wait(NULL); 1853 goto retry_page; 1854 } 1855 } 1856 mb->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg_array[i]); 1857 mb->m_epg_npgs++; 1858 } 1859 mb->m_epg_last_len = length - PAGE_SIZE * (mb->m_epg_npgs - 1); 1860 MBUF_EXT_PGS_ASSERT_SANITY(mb); 1861 total -= length; 1862 error = uiomove_fromphys(pg_array, 0, length, uio); 1863 if (error != 0) 1864 goto failed; 1865 mb->m_len = length; 1866 mb->m_ext.ext_size += PAGE_SIZE * mb->m_epg_npgs; 1867 if (flags & M_PKTHDR) 1868 m->m_pkthdr.len += length; 1869 } 1870 return (m); 1871 1872 failed: 1873 m_freem(m); 1874 return (NULL); 1875 } 1876 1877 /* 1878 * Copy the contents of uio into a properly sized mbuf chain. 1879 */ 1880 struct mbuf * 1881 m_uiotombuf(struct uio *uio, int how, int len, int align, int flags) 1882 { 1883 struct mbuf *m, *mb; 1884 int error, length; 1885 ssize_t total; 1886 int progress = 0; 1887 1888 if (flags & M_EXTPG) 1889 return (m_uiotombuf_nomap(uio, how, len, align, flags)); 1890 1891 /* 1892 * len can be zero or an arbitrary large value bound by 1893 * the total data supplied by the uio. 1894 */ 1895 if (len > 0) 1896 total = (uio->uio_resid < len) ? uio->uio_resid : len; 1897 else 1898 total = uio->uio_resid; 1899 1900 /* 1901 * The smallest unit returned by m_getm2() is a single mbuf 1902 * with pkthdr. We can't align past it. 1903 */ 1904 if (align >= MHLEN) 1905 return (NULL); 1906 1907 /* 1908 * Give us the full allocation or nothing. 1909 * If len is zero return the smallest empty mbuf. 1910 */ 1911 m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags); 1912 if (m == NULL) 1913 return (NULL); 1914 m->m_data += align; 1915 1916 /* Fill all mbufs with uio data and update header information. */ 1917 for (mb = m; mb != NULL; mb = mb->m_next) { 1918 length = min(M_TRAILINGSPACE(mb), total - progress); 1919 1920 error = uiomove(mtod(mb, void *), length, uio); 1921 if (error) { 1922 m_freem(m); 1923 return (NULL); 1924 } 1925 1926 mb->m_len = length; 1927 progress += length; 1928 if (flags & M_PKTHDR) { 1929 m->m_pkthdr.len += length; 1930 m->m_pkthdr.memlen += MSIZE; 1931 if (mb->m_flags & M_EXT) 1932 m->m_pkthdr.memlen += mb->m_ext.ext_size; 1933 } 1934 } 1935 KASSERT(progress == total, ("%s: progress != total", __func__)); 1936 1937 return (m); 1938 } 1939 1940 /* 1941 * Copy data to/from an unmapped mbuf into a uio limited by len if set. 1942 */ 1943 int 1944 m_unmapped_uiomove(const struct mbuf *m, int m_off, struct uio *uio, int len) 1945 { 1946 vm_page_t pg; 1947 int error, i, off, pglen, pgoff, seglen, segoff; 1948 1949 M_ASSERTEXTPG(m); 1950 error = 0; 1951 1952 /* Skip over any data removed from the front. 
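	 * For unmapped (M_EXTPG) mbufs m_data is maintained as a byte
	 * offset rather than a pointer, so mtod() here yields how much of
	 * the header/page data has already been trimmed (e.g. by m_adj()).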
*/ 1953 off = mtod(m, vm_offset_t); 1954 1955 off += m_off; 1956 if (m->m_epg_hdrlen != 0) { 1957 if (off >= m->m_epg_hdrlen) { 1958 off -= m->m_epg_hdrlen; 1959 } else { 1960 seglen = m->m_epg_hdrlen - off; 1961 segoff = off; 1962 seglen = min(seglen, len); 1963 off = 0; 1964 len -= seglen; 1965 error = uiomove(__DECONST(void *, 1966 &m->m_epg_hdr[segoff]), seglen, uio); 1967 } 1968 } 1969 pgoff = m->m_epg_1st_off; 1970 for (i = 0; i < m->m_epg_npgs && error == 0 && len > 0; i++) { 1971 pglen = m_epg_pagelen(m, i, pgoff); 1972 if (off >= pglen) { 1973 off -= pglen; 1974 pgoff = 0; 1975 continue; 1976 } 1977 seglen = pglen - off; 1978 segoff = pgoff + off; 1979 off = 0; 1980 seglen = min(seglen, len); 1981 len -= seglen; 1982 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); 1983 error = uiomove_fromphys(&pg, segoff, seglen, uio); 1984 pgoff = 0; 1985 }; 1986 if (len != 0 && error == 0) { 1987 KASSERT((off + len) <= m->m_epg_trllen, 1988 ("off + len > trail (%d + %d > %d, m_off = %d)", off, len, 1989 m->m_epg_trllen, m_off)); 1990 error = uiomove(__DECONST(void *, &m->m_epg_trail[off]), 1991 len, uio); 1992 } 1993 return (error); 1994 } 1995 1996 /* 1997 * Copy an mbuf chain into a uio limited by len if set. 1998 */ 1999 int 2000 m_mbuftouio(struct uio *uio, const struct mbuf *m, int len) 2001 { 2002 int error, length, total; 2003 int progress = 0; 2004 2005 if (len > 0) 2006 total = min(uio->uio_resid, len); 2007 else 2008 total = uio->uio_resid; 2009 2010 /* Fill the uio with data from the mbufs. */ 2011 for (; m != NULL; m = m->m_next) { 2012 length = min(m->m_len, total - progress); 2013 2014 if ((m->m_flags & M_EXTPG) != 0) 2015 error = m_unmapped_uiomove(m, 0, uio, length); 2016 else 2017 error = uiomove(mtod(m, void *), length, uio); 2018 if (error) 2019 return (error); 2020 2021 progress += length; 2022 } 2023 2024 return (0); 2025 } 2026 2027 /* 2028 * Create a writable copy of the mbuf chain. While doing this 2029 * we compact the chain with a goal of producing a chain with 2030 * at most two mbufs. The second mbuf in this chain is likely 2031 * to be a cluster. The primary purpose of this work is to create 2032 * a writable packet for encryption, compression, etc. The 2033 * secondary goal is to linearize the data so the data can be 2034 * passed to crypto hardware in the most efficient manner possible. 2035 */ 2036 struct mbuf * 2037 m_unshare(struct mbuf *m0, int how) 2038 { 2039 struct mbuf *m, *mprev; 2040 struct mbuf *n, *mfirst, *mlast; 2041 int len, off; 2042 2043 mprev = NULL; 2044 for (m = m0; m != NULL; m = mprev->m_next) { 2045 /* 2046 * Regular mbufs are ignored unless there's a cluster 2047 * in front of it that we can use to coalesce. We do 2048 * the latter mainly so later clusters can be coalesced 2049 * also w/o having to handle them specially (i.e. convert 2050 * mbuf+cluster -> cluster). This optimization is heavily 2051 * influenced by the assumption that we're running over 2052 * Ethernet where MCLBYTES is large enough that the max 2053 * packet size will permit lots of coalescing into a 2054 * single cluster. This in turn permits efficient 2055 * crypto operations, especially when using hardware. 
2056 */ 2057 if ((m->m_flags & M_EXT) == 0) { 2058 if (mprev && (mprev->m_flags & M_EXT) && 2059 m->m_len <= M_TRAILINGSPACE(mprev)) { 2060 /* XXX: this ignores mbuf types */ 2061 memcpy(mtod(mprev, caddr_t) + mprev->m_len, 2062 mtod(m, caddr_t), m->m_len); 2063 mprev->m_len += m->m_len; 2064 mprev->m_next = m->m_next; /* unlink from chain */ 2065 m_free(m); /* reclaim mbuf */ 2066 } else { 2067 mprev = m; 2068 } 2069 continue; 2070 } 2071 /* 2072 * Writable mbufs are left alone (for now). 2073 */ 2074 if (M_WRITABLE(m)) { 2075 mprev = m; 2076 continue; 2077 } 2078 2079 /* 2080 * Not writable, replace with a copy or coalesce with 2081 * the previous mbuf if possible (since we have to copy 2082 * it anyway, we try to reduce the number of mbufs and 2083 * clusters so that future work is easier). 2084 */ 2085 KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags)); 2086 /* NB: we only coalesce into a cluster or larger */ 2087 if (mprev != NULL && (mprev->m_flags & M_EXT) && 2088 m->m_len <= M_TRAILINGSPACE(mprev)) { 2089 /* XXX: this ignores mbuf types */ 2090 memcpy(mtod(mprev, caddr_t) + mprev->m_len, 2091 mtod(m, caddr_t), m->m_len); 2092 mprev->m_len += m->m_len; 2093 mprev->m_next = m->m_next; /* unlink from chain */ 2094 m_free(m); /* reclaim mbuf */ 2095 continue; 2096 } 2097 2098 /* 2099 * Allocate new space to hold the copy and copy the data. 2100 * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by 2101 * splitting them into clusters. We could just malloc a 2102 * buffer and make it external but too many device drivers 2103 * don't know how to break up the non-contiguous memory when 2104 * doing DMA. 2105 */ 2106 n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS); 2107 if (n == NULL) { 2108 m_freem(m0); 2109 return (NULL); 2110 } 2111 if (m->m_flags & M_PKTHDR) { 2112 KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR", 2113 __func__, m0, m)); 2114 m_move_pkthdr(n, m); 2115 } 2116 len = m->m_len; 2117 off = 0; 2118 mfirst = n; 2119 mlast = NULL; 2120 for (;;) { 2121 int cc = min(len, MCLBYTES); 2122 memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc); 2123 n->m_len = cc; 2124 if (mlast != NULL) 2125 mlast->m_next = n; 2126 mlast = n; 2127 #if 0 2128 newipsecstat.ips_clcopied++; 2129 #endif 2130 2131 len -= cc; 2132 if (len <= 0) 2133 break; 2134 off += cc; 2135 2136 n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS); 2137 if (n == NULL) { 2138 m_freem(mfirst); 2139 m_freem(m0); 2140 return (NULL); 2141 } 2142 } 2143 n->m_next = m->m_next; 2144 if (mprev == NULL) 2145 m0 = mfirst; /* new head of chain */ 2146 else 2147 mprev->m_next = mfirst; /* replace old mbuf */ 2148 m_free(m); /* release old mbuf */ 2149 mprev = mfirst; 2150 } 2151 return (m0); 2152 } 2153 2154 #ifdef MBUF_PROFILING 2155 2156 #define MP_BUCKETS 32 /* don't just change this as things may overflow.*/ 2157 struct mbufprofile { 2158 uintmax_t wasted[MP_BUCKETS]; 2159 uintmax_t used[MP_BUCKETS]; 2160 uintmax_t segments[MP_BUCKETS]; 2161 } mbprof; 2162 2163 void 2164 m_profile(struct mbuf *m) 2165 { 2166 int segments = 0; 2167 int used = 0; 2168 int wasted = 0; 2169 2170 while (m) { 2171 segments++; 2172 used += m->m_len; 2173 if (m->m_flags & M_EXT) { 2174 wasted += MHLEN - sizeof(m->m_ext) + 2175 m->m_ext.ext_size - m->m_len; 2176 } else { 2177 if (m->m_flags & M_PKTHDR) 2178 wasted += MHLEN - m->m_len; 2179 else 2180 wasted += MLEN - m->m_len; 2181 } 2182 m = m->m_next; 2183 } 2184 /* be paranoid.. 
it helps */ 2185 if (segments > MP_BUCKETS - 1) 2186 segments = MP_BUCKETS - 1; 2187 if (used > 100000) 2188 used = 100000; 2189 if (wasted > 100000) 2190 wasted = 100000; 2191 /* store in the appropriate bucket */ 2192 /* don't bother locking. if it's slightly off, so what? */ 2193 mbprof.segments[segments]++; 2194 mbprof.used[fls(used)]++; 2195 mbprof.wasted[fls(wasted)]++; 2196 } 2197 2198 static int 2199 mbprof_handler(SYSCTL_HANDLER_ARGS) 2200 { 2201 char buf[256]; 2202 struct sbuf sb; 2203 int error; 2204 uint64_t *p; 2205 2206 sbuf_new_for_sysctl(&sb, buf, sizeof(buf), req); 2207 2208 p = &mbprof.wasted[0]; 2209 sbuf_printf(&sb, 2210 "wasted:\n" 2211 "%ju %ju %ju %ju %ju %ju %ju %ju " 2212 "%ju %ju %ju %ju %ju %ju %ju %ju\n", 2213 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 2214 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 2215 #ifdef BIG_ARRAY 2216 p = &mbprof.wasted[16]; 2217 sbuf_printf(&sb, 2218 "%ju %ju %ju %ju %ju %ju %ju %ju " 2219 "%ju %ju %ju %ju %ju %ju %ju %ju\n", 2220 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 2221 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 2222 #endif 2223 p = &mbprof.used[0]; 2224 sbuf_printf(&sb, 2225 "used:\n" 2226 "%ju %ju %ju %ju %ju %ju %ju %ju " 2227 "%ju %ju %ju %ju %ju %ju %ju %ju\n", 2228 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 2229 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 2230 #ifdef BIG_ARRAY 2231 p = &mbprof.used[16]; 2232 sbuf_printf(&sb, 2233 "%ju %ju %ju %ju %ju %ju %ju %ju " 2234 "%ju %ju %ju %ju %ju %ju %ju %ju\n", 2235 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 2236 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 2237 #endif 2238 p = &mbprof.segments[0]; 2239 sbuf_printf(&sb, 2240 "segments:\n" 2241 "%ju %ju %ju %ju %ju %ju %ju %ju " 2242 "%ju %ju %ju %ju %ju %ju %ju %ju\n", 2243 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 2244 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 2245 #ifdef BIG_ARRAY 2246 p = &mbprof.segments[16]; 2247 sbuf_printf(&sb, 2248 "%ju %ju %ju %ju %ju %ju %ju %ju " 2249 "%ju %ju %ju %ju %ju %ju %ju %jju", 2250 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 2251 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 2252 #endif 2253 2254 error = sbuf_finish(&sb); 2255 sbuf_delete(&sb); 2256 return (error); 2257 } 2258 2259 static int 2260 mbprof_clr_handler(SYSCTL_HANDLER_ARGS) 2261 { 2262 int clear, error; 2263 2264 clear = 0; 2265 error = sysctl_handle_int(oidp, &clear, 0, req); 2266 if (error || !req->newptr) 2267 return (error); 2268 2269 if (clear) { 2270 bzero(&mbprof, sizeof(mbprof)); 2271 } 2272 2273 return (error); 2274 } 2275 2276 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile, 2277 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 2278 mbprof_handler, "A", 2279 "mbuf profiling statistics"); 2280 2281 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr, 2282 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, 2283 mbprof_clr_handler, "I", 2284 "clear mbuf profiling statistics"); 2285 #endif 2286
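
/*
 * Example of consuming the profiling data above (assumes a kernel built
 * with "options MBUF_PROFILING"; the output format is whatever
 * mbprof_handler() emits):
 *
 *	# sysctl kern.ipc.mbufprofile
 *	# sysctl kern.ipc.mbufprofileclr=1
 *
 * The first prints the wasted/used/segments buckets, the second clears
 * them.
 */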