1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1991, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 * 31 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 32 */ 33 34 #include <sys/cdefs.h> 35 #include "opt_param.h" 36 #include "opt_mbuf_stress_test.h" 37 #include "opt_mbuf_profiling.h" 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/kernel.h> 42 #include <sys/limits.h> 43 #include <sys/lock.h> 44 #include <sys/malloc.h> 45 #include <sys/mbuf.h> 46 #include <sys/sysctl.h> 47 #include <sys/domain.h> 48 #include <sys/protosw.h> 49 #include <sys/uio.h> 50 #include <sys/vmmeter.h> 51 #include <sys/sbuf.h> 52 #include <sys/sdt.h> 53 #include <vm/vm.h> 54 #include <vm/vm_pageout.h> 55 #include <vm/vm_page.h> 56 57 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init, 58 "struct mbuf *", "mbufinfo_t *", 59 "uint32_t", "uint32_t", 60 "uint16_t", "uint16_t", 61 "uint32_t", "uint32_t", 62 "uint32_t", "uint32_t"); 63 64 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr_raw, 65 "uint32_t", "uint32_t", 66 "uint16_t", "uint16_t", 67 "struct mbuf *", "mbufinfo_t *"); 68 69 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr, 70 "uint32_t", "uint32_t", 71 "uint16_t", "uint16_t", 72 "struct mbuf *", "mbufinfo_t *"); 73 74 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get_raw, 75 "uint32_t", "uint32_t", 76 "uint16_t", "uint16_t", 77 "struct mbuf *", "mbufinfo_t *"); 78 79 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get, 80 "uint32_t", "uint32_t", 81 "uint16_t", "uint16_t", 82 "struct mbuf *", "mbufinfo_t *"); 83 84 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl, 85 "uint32_t", "uint32_t", 86 "uint16_t", "uint16_t", 87 "uint32_t", "uint32_t", 88 "struct mbuf *", "mbufinfo_t *"); 89 90 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__getjcl, 91 "uint32_t", "uint32_t", 92 "uint16_t", "uint16_t", 93 "uint32_t", "uint32_t", 94 "uint32_t", "uint32_t", 95 "struct mbuf *", "mbufinfo_t *"); 96 97 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget, 98 "struct mbuf *", "mbufinfo_t *", 99 "uint32_t", "uint32_t", 100 
"uint32_t", "uint32_t"); 101 102 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget, 103 "struct mbuf *", "mbufinfo_t *", 104 "uint32_t", "uint32_t", 105 "uint32_t", "uint32_t", 106 "void*", "void*"); 107 108 SDT_PROBE_DEFINE(sdt, , , m__cljset); 109 110 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free, 111 "struct mbuf *", "mbufinfo_t *"); 112 113 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem, 114 "struct mbuf *", "mbufinfo_t *"); 115 116 #include <security/mac/mac_framework.h> 117 118 /* 119 * Provide minimum possible defaults for link and protocol header space, 120 * assuming IPv4 over Ethernet. Enabling IPv6, IEEE802.11 or some other 121 * protocol may grow these values. 122 */ 123 u_int max_linkhdr = 16; 124 u_int max_protohdr = 40; 125 u_int max_hdr = 16 + 40; 126 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD, 127 &max_linkhdr, 16, "Size of largest link layer header"); 128 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD, 129 &max_protohdr, 40, "Size of largest protocol layer header"); 130 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD, 131 &max_hdr, 16 + 40, "Size of largest link plus protocol header"); 132 133 static void 134 max_hdr_grow(void) 135 { 136 137 max_hdr = max_linkhdr + max_protohdr; 138 MPASS(max_hdr <= MHLEN); 139 } 140 141 void 142 max_linkhdr_grow(u_int new) 143 { 144 145 if (new > max_linkhdr) { 146 max_linkhdr = new; 147 max_hdr_grow(); 148 } 149 } 150 151 void 152 max_protohdr_grow(u_int new) 153 { 154 155 if (new > max_protohdr) { 156 max_protohdr = new; 157 max_hdr_grow(); 158 } 159 } 160 161 #ifdef MBUF_STRESS_TEST 162 int m_defragpackets; 163 int m_defragbytes; 164 int m_defraguseless; 165 int m_defragfailure; 166 int m_defragrandomfailures; 167 168 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD, 169 &m_defragpackets, 0, ""); 170 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD, 171 &m_defragbytes, 0, ""); 172 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD, 173 &m_defraguseless, 0, ""); 174 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD, 175 &m_defragfailure, 0, ""); 176 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW, 177 &m_defragrandomfailures, 0, ""); 178 #endif 179 180 /* 181 * Ensure the correct size of various mbuf parameters. It could be off due 182 * to compiler-induced padding and alignment artifacts. 183 */ 184 CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN); 185 CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN); 186 187 /* 188 * mbuf data storage should be 64-bit aligned regardless of architectural 189 * pointer size; check this is the case with and without a packet header. 190 */ 191 CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0); 192 CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0); 193 194 /* 195 * While the specific values here don't matter too much (i.e., +/- a few 196 * words), we do want to ensure that changes to these values are carefully 197 * reasoned about and properly documented. This is especially the case as 198 * network-protocol and device-driver modules encode these layouts, and must 199 * be recompiled if the structures change. Check these values at compile time 200 * against the ones documented in comments in mbuf.h. 201 * 202 * NB: Possibly they should be documented there via #define's and not just 203 * comments. 
204 */ 205 #if defined(__LP64__) 206 CTASSERT(offsetof(struct mbuf, m_dat) == 32); 207 CTASSERT(sizeof(struct pkthdr) == 64); 208 CTASSERT(sizeof(struct m_ext) == 160); 209 #else 210 CTASSERT(offsetof(struct mbuf, m_dat) == 24); 211 CTASSERT(sizeof(struct pkthdr) == 56); 212 #if defined(__powerpc__) && defined(BOOKE) 213 /* PowerPC booke has 64-bit physical pointers. */ 214 CTASSERT(sizeof(struct m_ext) == 176); 215 #else 216 CTASSERT(sizeof(struct m_ext) == 172); 217 #endif 218 #endif 219 220 /* 221 * Assert that the queue(3) macros produce code of the same size as an old 222 * plain pointer does. 223 */ 224 #ifdef INVARIANTS 225 static struct mbuf __used m_assertbuf; 226 CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next)); 227 CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next)); 228 CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt)); 229 CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt)); 230 #endif 231 232 /* 233 * Attach the cluster from *m to *n, set up m_ext in *n 234 * and bump the refcount of the cluster. 235 */ 236 void 237 mb_dupcl(struct mbuf *n, struct mbuf *m) 238 { 239 volatile u_int *refcnt; 240 241 KASSERT(m->m_flags & (M_EXT|M_EXTPG), 242 ("%s: M_EXT|M_EXTPG not set on %p", __func__, m)); 243 KASSERT(!(n->m_flags & (M_EXT|M_EXTPG)), 244 ("%s: M_EXT|M_EXTPG set on %p", __func__, n)); 245 246 /* 247 * Cache access optimization. 248 * 249 * o Regular M_EXT storage doesn't need full copy of m_ext, since 250 * the holder of the 'ext_count' is responsible to carry the free 251 * routine and its arguments. 252 * o M_EXTPG data is split between main part of mbuf and m_ext, the 253 * main part is copied in full, the m_ext part is similar to M_EXT. 254 * o EXT_EXTREF, where 'ext_cnt' doesn't point into mbuf at all, is 255 * special - it needs full copy of m_ext into each mbuf, since any 256 * copy could end up as the last to free. 257 */ 258 if (m->m_flags & M_EXTPG) { 259 bcopy(&m->m_epg_startcopy, &n->m_epg_startcopy, 260 __rangeof(struct mbuf, m_epg_startcopy, m_epg_endcopy)); 261 bcopy(&m->m_ext, &n->m_ext, m_epg_ext_copylen); 262 } else if (m->m_ext.ext_type == EXT_EXTREF) 263 bcopy(&m->m_ext, &n->m_ext, sizeof(struct m_ext)); 264 else 265 bcopy(&m->m_ext, &n->m_ext, m_ext_copylen); 266 267 n->m_flags |= m->m_flags & (M_RDONLY | M_EXT | M_EXTPG); 268 269 /* See if this is the mbuf that holds the embedded refcount. */ 270 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 271 refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count; 272 n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF; 273 } else { 274 KASSERT(m->m_ext.ext_cnt != NULL, 275 ("%s: no refcounting pointer on %p", __func__, m)); 276 refcnt = m->m_ext.ext_cnt; 277 } 278 279 if (*refcnt == 1) 280 *refcnt += 1; 281 else 282 atomic_add_int(refcnt, 1); 283 } 284 285 void 286 m_demote_pkthdr(struct mbuf *m) 287 { 288 289 M_ASSERTPKTHDR(m); 290 M_ASSERT_NO_SND_TAG(m); 291 292 m_tag_delete_chain(m, NULL); 293 m->m_flags &= ~M_PKTHDR; 294 bzero(&m->m_pkthdr, sizeof(struct pkthdr)); 295 } 296 297 /* 298 * Clean up mbuf (chain) from any tags and packet headers. 299 * If "all" is set then the first mbuf in the chain will be 300 * cleaned too. 301 */ 302 void 303 m_demote(struct mbuf *m0, int all, int flags) 304 { 305 struct mbuf *m; 306 307 flags |= M_DEMOTEFLAGS; 308 309 for (m = all ? 
m0 : m0->m_next; m != NULL; m = m->m_next) { 310 KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p", 311 __func__, m, m0)); 312 if (m->m_flags & M_PKTHDR) 313 m_demote_pkthdr(m); 314 m->m_flags &= flags; 315 } 316 } 317 318 /* 319 * Sanity checks on mbuf (chain) for use in KASSERT() and general 320 * debugging. 321 * Returns 0 or panics when bad and 1 on all tests passed. 322 * Sanitize, 0 to run M_SANITY_ACTION, 1 to garble things so they 323 * blow up later. 324 */ 325 int 326 m_sanity(struct mbuf *m0, int sanitize) 327 { 328 struct mbuf *m; 329 caddr_t a, b; 330 int pktlen = 0; 331 332 #ifdef INVARIANTS 333 #define M_SANITY_ACTION(s) panic("mbuf %p: " s, m) 334 #else 335 #define M_SANITY_ACTION(s) printf("mbuf %p: " s, m) 336 #endif 337 338 for (m = m0; m != NULL; m = m->m_next) { 339 /* 340 * Basic pointer checks. If any of these fails then some 341 * unrelated kernel memory before or after us is trashed. 342 * No way to recover from that. 343 */ 344 a = M_START(m); 345 b = a + M_SIZE(m); 346 if ((caddr_t)m->m_data < a) 347 M_SANITY_ACTION("m_data outside mbuf data range left"); 348 if ((caddr_t)m->m_data > b) 349 M_SANITY_ACTION("m_data outside mbuf data range right"); 350 if ((caddr_t)m->m_data + m->m_len > b) 351 M_SANITY_ACTION("m_data + m_len exeeds mbuf space"); 352 353 /* m->m_nextpkt may only be set on first mbuf in chain. */ 354 if (m != m0 && m->m_nextpkt != NULL) { 355 if (sanitize) { 356 m_freem(m->m_nextpkt); 357 m->m_nextpkt = (struct mbuf *)0xDEADC0DE; 358 } else 359 M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf"); 360 } 361 362 /* packet length (not mbuf length!) calculation */ 363 if (m0->m_flags & M_PKTHDR) 364 pktlen += m->m_len; 365 366 /* m_tags may only be attached to first mbuf in chain. */ 367 if (m != m0 && m->m_flags & M_PKTHDR && 368 !SLIST_EMPTY(&m->m_pkthdr.tags)) { 369 if (sanitize) { 370 m_tag_delete_chain(m, NULL); 371 /* put in 0xDEADC0DE perhaps? */ 372 } else 373 M_SANITY_ACTION("m_tags on in-chain mbuf"); 374 } 375 376 /* M_PKTHDR may only be set on first mbuf in chain */ 377 if (m != m0 && m->m_flags & M_PKTHDR) { 378 if (sanitize) { 379 bzero(&m->m_pkthdr, sizeof(m->m_pkthdr)); 380 m->m_flags &= ~M_PKTHDR; 381 /* put in 0xDEADCODE and leave hdr flag in */ 382 } else 383 M_SANITY_ACTION("M_PKTHDR on in-chain mbuf"); 384 } 385 } 386 m = m0; 387 if (pktlen && pktlen != m->m_pkthdr.len) { 388 if (sanitize) 389 m->m_pkthdr.len = 0; 390 else 391 M_SANITY_ACTION("m_pkthdr.len != mbuf chain length"); 392 } 393 return 1; 394 395 #undef M_SANITY_ACTION 396 } 397 398 /* 399 * Non-inlined part of m_init(). 400 */ 401 int 402 m_pkthdr_init(struct mbuf *m, int how) 403 { 404 #ifdef MAC 405 int error; 406 #endif 407 m->m_data = m->m_pktdat; 408 bzero(&m->m_pkthdr, sizeof(m->m_pkthdr)); 409 #ifdef NUMA 410 m->m_pkthdr.numa_domain = M_NODOM; 411 #endif 412 #ifdef MAC 413 /* If the label init fails, fail the alloc */ 414 error = mac_mbuf_init(m, how); 415 if (error) 416 return (error); 417 #endif 418 419 return (0); 420 } 421 422 /* 423 * "Move" mbuf pkthdr from "from" to "to". 424 * "from" must have M_PKTHDR set, and "to" must be empty. 425 */ 426 void 427 m_move_pkthdr(struct mbuf *to, struct mbuf *from) 428 { 429 430 #if 0 431 /* see below for why these are not enabled */ 432 M_ASSERTPKTHDR(to); 433 /* Note: with MAC, this may not be a good assertion. */ 434 KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), 435 ("m_move_pkthdr: to has tags")); 436 #endif 437 #ifdef MAC 438 /* 439 * XXXMAC: It could be this should also occur for non-MAC? 
440 */ 441 if (to->m_flags & M_PKTHDR) 442 m_tag_delete_chain(to, NULL); 443 #endif 444 to->m_flags = (from->m_flags & M_COPYFLAGS) | 445 (to->m_flags & (M_EXT | M_EXTPG)); 446 if ((to->m_flags & M_EXT) == 0) 447 to->m_data = to->m_pktdat; 448 to->m_pkthdr = from->m_pkthdr; /* especially tags */ 449 SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ 450 from->m_flags &= ~M_PKTHDR; 451 if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) { 452 from->m_pkthdr.csum_flags &= ~CSUM_SND_TAG; 453 from->m_pkthdr.snd_tag = NULL; 454 } 455 } 456 457 /* 458 * Duplicate "from"'s mbuf pkthdr in "to". 459 * "from" must have M_PKTHDR set, and "to" must be empty. 460 * In particular, this does a deep copy of the packet tags. 461 */ 462 int 463 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how) 464 { 465 466 #if 0 467 /* 468 * The mbuf allocator only initializes the pkthdr 469 * when the mbuf is allocated with m_gethdr(). Many users 470 * (e.g. m_copy*, m_prepend) use m_get() and then 471 * smash the pkthdr as needed causing these 472 * assertions to trip. For now just disable them. 473 */ 474 M_ASSERTPKTHDR(to); 475 /* Note: with MAC, this may not be a good assertion. */ 476 KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags")); 477 #endif 478 MBUF_CHECKSLEEP(how); 479 #ifdef MAC 480 if (to->m_flags & M_PKTHDR) 481 m_tag_delete_chain(to, NULL); 482 #endif 483 to->m_flags = (from->m_flags & M_COPYFLAGS) | 484 (to->m_flags & (M_EXT | M_EXTPG)); 485 if ((to->m_flags & M_EXT) == 0) 486 to->m_data = to->m_pktdat; 487 to->m_pkthdr = from->m_pkthdr; 488 if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) 489 m_snd_tag_ref(from->m_pkthdr.snd_tag); 490 SLIST_INIT(&to->m_pkthdr.tags); 491 return (m_tag_copy_chain(to, from, how)); 492 } 493 494 /* 495 * Lesser-used path for M_PREPEND: 496 * allocate new mbuf to prepend to chain, 497 * copy junk along. 498 */ 499 struct mbuf * 500 m_prepend(struct mbuf *m, int len, int how) 501 { 502 struct mbuf *mn; 503 504 if (m->m_flags & M_PKTHDR) 505 mn = m_gethdr(how, m->m_type); 506 else 507 mn = m_get(how, m->m_type); 508 if (mn == NULL) { 509 m_freem(m); 510 return (NULL); 511 } 512 if (m->m_flags & M_PKTHDR) 513 m_move_pkthdr(mn, m); 514 mn->m_next = m; 515 m = mn; 516 if (len < M_SIZE(m)) 517 M_ALIGN(m, len); 518 m->m_len = len; 519 return (m); 520 } 521 522 /* 523 * Make a copy of an mbuf chain starting "off0" bytes from the beginning, 524 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. 525 * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller. 526 * Note that the copy is read-only, because clusters are not copied, 527 * only their reference counts are incremented. 
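 *
 * A minimal usage sketch (illustrative only, not part of the original
 * comment; assumes 'm' is a valid pkthdr chain obtained elsewhere):
 *
 *	struct mbuf *n;
 *
 *	n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 *	if (n == NULL)
 *		return (ENOBUFS);
 *
 * The copy 'n' shares m's clusters, so its data must be treated as
 * read-only unless M_WRITABLE() says otherwise.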
528 */ 529 struct mbuf * 530 m_copym(struct mbuf *m, int off0, int len, int wait) 531 { 532 struct mbuf *n, **np; 533 int off = off0; 534 struct mbuf *top; 535 int copyhdr = 0; 536 537 KASSERT(off >= 0, ("m_copym, negative off %d", off)); 538 KASSERT(len >= 0, ("m_copym, negative len %d", len)); 539 MBUF_CHECKSLEEP(wait); 540 if (off == 0 && m->m_flags & M_PKTHDR) 541 copyhdr = 1; 542 while (off > 0) { 543 KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain")); 544 if (off < m->m_len) 545 break; 546 off -= m->m_len; 547 m = m->m_next; 548 } 549 np = ⊤ 550 top = NULL; 551 while (len > 0) { 552 if (m == NULL) { 553 KASSERT(len == M_COPYALL, 554 ("m_copym, length > size of mbuf chain")); 555 break; 556 } 557 if (copyhdr) 558 n = m_gethdr(wait, m->m_type); 559 else 560 n = m_get(wait, m->m_type); 561 *np = n; 562 if (n == NULL) 563 goto nospace; 564 if (copyhdr) { 565 if (!m_dup_pkthdr(n, m, wait)) 566 goto nospace; 567 if (len == M_COPYALL) 568 n->m_pkthdr.len -= off0; 569 else 570 n->m_pkthdr.len = len; 571 copyhdr = 0; 572 } 573 n->m_len = min(len, m->m_len - off); 574 if (m->m_flags & (M_EXT|M_EXTPG)) { 575 n->m_data = m->m_data + off; 576 mb_dupcl(n, m); 577 } else 578 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), 579 (u_int)n->m_len); 580 if (len != M_COPYALL) 581 len -= n->m_len; 582 off = 0; 583 m = m->m_next; 584 np = &n->m_next; 585 } 586 587 return (top); 588 nospace: 589 m_freem(top); 590 return (NULL); 591 } 592 593 /* 594 * Copy an entire packet, including header (which must be present). 595 * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. 596 * Note that the copy is read-only, because clusters are not copied, 597 * only their reference counts are incremented. 598 * Preserve alignment of the first mbuf so if the creator has left 599 * some room at the beginning (e.g. for inserting protocol headers) 600 * the copies still have the room available. 
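 *
 * Illustrative note (not from the original comment): the call
 *
 *	n = m_copypacket(m, M_NOWAIT);
 *
 * behaves like m_copym(m, 0, M_COPYALL, M_NOWAIT), except that the leading
 * space in the first mbuf is preserved so headers can still be prepended
 * to the copy.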
601 */ 602 struct mbuf * 603 m_copypacket(struct mbuf *m, int how) 604 { 605 struct mbuf *top, *n, *o; 606 607 MBUF_CHECKSLEEP(how); 608 n = m_get(how, m->m_type); 609 top = n; 610 if (n == NULL) 611 goto nospace; 612 613 if (!m_dup_pkthdr(n, m, how)) 614 goto nospace; 615 n->m_len = m->m_len; 616 if (m->m_flags & (M_EXT|M_EXTPG)) { 617 n->m_data = m->m_data; 618 mb_dupcl(n, m); 619 } else { 620 n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat ); 621 bcopy(mtod(m, char *), mtod(n, char *), n->m_len); 622 } 623 624 m = m->m_next; 625 while (m) { 626 o = m_get(how, m->m_type); 627 if (o == NULL) 628 goto nospace; 629 630 n->m_next = o; 631 n = n->m_next; 632 633 n->m_len = m->m_len; 634 if (m->m_flags & (M_EXT|M_EXTPG)) { 635 n->m_data = m->m_data; 636 mb_dupcl(n, m); 637 } else { 638 bcopy(mtod(m, char *), mtod(n, char *), n->m_len); 639 } 640 641 m = m->m_next; 642 } 643 return top; 644 nospace: 645 m_freem(top); 646 return (NULL); 647 } 648 649 static void 650 m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp) 651 { 652 struct iovec iov; 653 struct uio uio; 654 int error __diagused; 655 656 KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off)); 657 KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len)); 658 KASSERT(off < m->m_len, 659 ("m_copyfromunmapped: len exceeds mbuf length")); 660 iov.iov_base = cp; 661 iov.iov_len = len; 662 uio.uio_resid = len; 663 uio.uio_iov = &iov; 664 uio.uio_segflg = UIO_SYSSPACE; 665 uio.uio_iovcnt = 1; 666 uio.uio_offset = 0; 667 uio.uio_rw = UIO_READ; 668 error = m_unmapped_uiomove(m, off, &uio, len); 669 KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off, 670 len)); 671 } 672 673 /* 674 * Copy data from an mbuf chain starting "off" bytes from the beginning, 675 * continuing for "len" bytes, into the indicated buffer. 676 */ 677 void 678 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp) 679 { 680 u_int count; 681 682 KASSERT(off >= 0, ("m_copydata, negative off %d", off)); 683 KASSERT(len >= 0, ("m_copydata, negative len %d", len)); 684 while (off > 0) { 685 KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain")); 686 if (off < m->m_len) 687 break; 688 off -= m->m_len; 689 m = m->m_next; 690 } 691 while (len > 0) { 692 KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); 693 count = min(m->m_len - off, len); 694 if ((m->m_flags & M_EXTPG) != 0) 695 m_copyfromunmapped(m, off, count, cp); 696 else 697 bcopy(mtod(m, caddr_t) + off, cp, count); 698 len -= count; 699 cp += count; 700 off = 0; 701 m = m->m_next; 702 } 703 } 704 705 /* 706 * Copy a packet header mbuf chain into a completely new chain, including 707 * copying any mbuf clusters. Use this instead of m_copypacket() when 708 * you need a writable copy of an mbuf chain. 
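 *
 * A minimal sketch of the intended use (illustrative only; 'm' is assumed
 * to be a read-only pkthdr chain, e.g. one produced by m_copym()):
 *
 *	struct mbuf *w;
 *
 *	w = m_dup(m, M_NOWAIT);
 *	if (w == NULL)
 *		return (ENOBUFS);
 *
 * Here 'w' is a deep, writable copy, and 'm' is left untouched.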
709 */ 710 struct mbuf * 711 m_dup(const struct mbuf *m, int how) 712 { 713 struct mbuf **p, *top = NULL; 714 int remain, moff, nsize; 715 716 MBUF_CHECKSLEEP(how); 717 /* Sanity check */ 718 if (m == NULL) 719 return (NULL); 720 M_ASSERTPKTHDR(m); 721 722 /* While there's more data, get a new mbuf, tack it on, and fill it */ 723 remain = m->m_pkthdr.len; 724 moff = 0; 725 p = ⊤ 726 while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */ 727 struct mbuf *n; 728 729 /* Get the next new mbuf */ 730 if (remain >= MINCLSIZE) { 731 n = m_getcl(how, m->m_type, 0); 732 nsize = MCLBYTES; 733 } else { 734 n = m_get(how, m->m_type); 735 nsize = MLEN; 736 } 737 if (n == NULL) 738 goto nospace; 739 740 if (top == NULL) { /* First one, must be PKTHDR */ 741 if (!m_dup_pkthdr(n, m, how)) { 742 m_free(n); 743 goto nospace; 744 } 745 if ((n->m_flags & M_EXT) == 0) 746 nsize = MHLEN; 747 n->m_flags &= ~M_RDONLY; 748 } 749 n->m_len = 0; 750 751 /* Link it into the new chain */ 752 *p = n; 753 p = &n->m_next; 754 755 /* Copy data from original mbuf(s) into new mbuf */ 756 while (n->m_len < nsize && m != NULL) { 757 int chunk = min(nsize - n->m_len, m->m_len - moff); 758 759 m_copydata(m, moff, chunk, n->m_data + n->m_len); 760 moff += chunk; 761 n->m_len += chunk; 762 remain -= chunk; 763 if (moff == m->m_len) { 764 m = m->m_next; 765 moff = 0; 766 } 767 } 768 769 /* Check correct total mbuf length */ 770 KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL), 771 ("%s: bogus m_pkthdr.len", __func__)); 772 } 773 return (top); 774 775 nospace: 776 m_freem(top); 777 return (NULL); 778 } 779 780 /* 781 * Concatenate mbuf chain n to m. 782 * Both chains must be of the same type (e.g. MT_DATA). 783 * Any m_pkthdr is not updated. 784 */ 785 void 786 m_cat(struct mbuf *m, struct mbuf *n) 787 { 788 while (m->m_next) 789 m = m->m_next; 790 while (n) { 791 if (!M_WRITABLE(m) || 792 (n->m_flags & M_EXTPG) != 0 || 793 M_TRAILINGSPACE(m) < n->m_len) { 794 /* just join the two chains */ 795 m->m_next = n; 796 return; 797 } 798 /* splat the data from one into the other */ 799 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, 800 (u_int)n->m_len); 801 m->m_len += n->m_len; 802 n = m_free(n); 803 } 804 } 805 806 /* 807 * Concatenate two pkthdr mbuf chains. 808 */ 809 void 810 m_catpkt(struct mbuf *m, struct mbuf *n) 811 { 812 813 M_ASSERTPKTHDR(m); 814 M_ASSERTPKTHDR(n); 815 816 m->m_pkthdr.len += n->m_pkthdr.len; 817 m_demote(n, 1, 0); 818 819 m_cat(m, n); 820 } 821 822 void 823 m_adj(struct mbuf *mp, int req_len) 824 { 825 int len = req_len; 826 struct mbuf *m; 827 int count; 828 829 if ((m = mp) == NULL) 830 return; 831 if (len >= 0) { 832 /* 833 * Trim from head. 834 */ 835 while (m != NULL && len > 0) { 836 if (m->m_len <= len) { 837 len -= m->m_len; 838 m->m_len = 0; 839 m = m->m_next; 840 } else { 841 m->m_len -= len; 842 m->m_data += len; 843 len = 0; 844 } 845 } 846 if (mp->m_flags & M_PKTHDR) 847 mp->m_pkthdr.len -= (req_len - len); 848 } else { 849 /* 850 * Trim from tail. Scan the mbuf chain, 851 * calculating its length and finding the last mbuf. 852 * If the adjustment only affects this mbuf, then just 853 * adjust and return. Otherwise, rescan and truncate 854 * after the remaining size. 
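 *
 * (Clarifying note, not in the original comment: a positive req_len
 * trims req_len bytes from the head of the chain, while a negative
 * req_len trims -req_len bytes from the tail, which is the case
 * handled by this branch.)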
855 */ 856 len = -len; 857 count = 0; 858 for (;;) { 859 count += m->m_len; 860 if (m->m_next == (struct mbuf *)0) 861 break; 862 m = m->m_next; 863 } 864 if (m->m_len >= len) { 865 m->m_len -= len; 866 if (mp->m_flags & M_PKTHDR) 867 mp->m_pkthdr.len -= len; 868 return; 869 } 870 count -= len; 871 if (count < 0) 872 count = 0; 873 /* 874 * Correct length for chain is "count". 875 * Find the mbuf with last data, adjust its length, 876 * and toss data from remaining mbufs on chain. 877 */ 878 m = mp; 879 if (m->m_flags & M_PKTHDR) 880 m->m_pkthdr.len = count; 881 for (; m; m = m->m_next) { 882 if (m->m_len >= count) { 883 m->m_len = count; 884 if (m->m_next != NULL) { 885 m_freem(m->m_next); 886 m->m_next = NULL; 887 } 888 break; 889 } 890 count -= m->m_len; 891 } 892 } 893 } 894 895 void 896 m_adj_decap(struct mbuf *mp, int len) 897 { 898 uint8_t rsstype; 899 900 m_adj(mp, len); 901 if ((mp->m_flags & M_PKTHDR) != 0) { 902 /* 903 * If flowid was calculated by card from the inner 904 * headers, move flowid to the decapsulated mbuf 905 * chain, otherwise clear. This depends on the 906 * internals of m_adj, which keeps pkthdr as is, in 907 * particular not changing rsstype and flowid. 908 */ 909 rsstype = mp->m_pkthdr.rsstype; 910 if ((rsstype & M_HASHTYPE_INNER) != 0) { 911 M_HASHTYPE_SET(mp, rsstype & ~M_HASHTYPE_INNER); 912 } else { 913 M_HASHTYPE_CLEAR(mp); 914 } 915 } 916 } 917 918 /* 919 * Rearange an mbuf chain so that len bytes are contiguous 920 * and in the data area of an mbuf (so that mtod will work 921 * for a structure of size len). Returns the resulting 922 * mbuf chain on success, frees it and returns null on failure. 923 * If there is room, it will add up to max_protohdr-len extra bytes to the 924 * contiguous region in an attempt to avoid being called next time. 925 */ 926 struct mbuf * 927 m_pullup(struct mbuf *n, int len) 928 { 929 struct mbuf *m; 930 int count; 931 int space; 932 933 KASSERT((n->m_flags & M_EXTPG) == 0, 934 ("%s: unmapped mbuf %p", __func__, n)); 935 936 /* 937 * If first mbuf has no cluster, and has room for len bytes 938 * without shifting current data, pullup into it, 939 * otherwise allocate a new mbuf to prepend to the chain. 940 */ 941 if ((n->m_flags & M_EXT) == 0 && 942 n->m_data + len < &n->m_dat[MLEN] && n->m_next) { 943 if (n->m_len >= len) 944 return (n); 945 m = n; 946 n = n->m_next; 947 len -= m->m_len; 948 } else { 949 if (len > MHLEN) 950 goto bad; 951 m = m_get(M_NOWAIT, n->m_type); 952 if (m == NULL) 953 goto bad; 954 if (n->m_flags & M_PKTHDR) 955 m_move_pkthdr(m, n); 956 } 957 space = &m->m_dat[MLEN] - (m->m_data + m->m_len); 958 do { 959 count = min(min(max(len, max_protohdr), space), n->m_len); 960 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, 961 (u_int)count); 962 len -= count; 963 m->m_len += count; 964 n->m_len -= count; 965 space -= count; 966 if (n->m_len) 967 n->m_data += count; 968 else 969 n = m_free(n); 970 } while (len > 0 && n); 971 if (len > 0) { 972 (void) m_free(m); 973 goto bad; 974 } 975 m->m_next = n; 976 return (m); 977 bad: 978 m_freem(n); 979 return (NULL); 980 } 981 982 /* 983 * Like m_pullup(), except a new mbuf is always allocated, and we allow 984 * the amount of empty space before the data in the new mbuf to be specified 985 * (in the event that the caller expects to prepend later). 
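 *
 * For illustration (a hedged sketch, not from the original source): a
 * caller that wants a contiguous IP header but expects to prepend a
 * link-layer header afterwards might use
 *
 *	m = m_copyup(m, sizeof(struct ip), max_linkhdr);
 *	if (m == NULL)
 *		return;
 *
 * noting that the chain has already been freed when NULL is returned.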
986 */ 987 struct mbuf * 988 m_copyup(struct mbuf *n, int len, int dstoff) 989 { 990 struct mbuf *m; 991 int count, space; 992 993 if (len > (MHLEN - dstoff)) 994 goto bad; 995 m = m_get(M_NOWAIT, n->m_type); 996 if (m == NULL) 997 goto bad; 998 if (n->m_flags & M_PKTHDR) 999 m_move_pkthdr(m, n); 1000 m->m_data += dstoff; 1001 space = &m->m_dat[MLEN] - (m->m_data + m->m_len); 1002 do { 1003 count = min(min(max(len, max_protohdr), space), n->m_len); 1004 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t), 1005 (unsigned)count); 1006 len -= count; 1007 m->m_len += count; 1008 n->m_len -= count; 1009 space -= count; 1010 if (n->m_len) 1011 n->m_data += count; 1012 else 1013 n = m_free(n); 1014 } while (len > 0 && n); 1015 if (len > 0) { 1016 (void) m_free(m); 1017 goto bad; 1018 } 1019 m->m_next = n; 1020 return (m); 1021 bad: 1022 m_freem(n); 1023 return (NULL); 1024 } 1025 1026 /* 1027 * Partition an mbuf chain in two pieces, returning the tail -- 1028 * all but the first len0 bytes. In case of failure, it returns NULL and 1029 * attempts to restore the chain to its original state. 1030 * 1031 * Note that the resulting mbufs might be read-only, because the new 1032 * mbuf can end up sharing an mbuf cluster with the original mbuf if 1033 * the "breaking point" happens to lie within a cluster mbuf. Use the 1034 * M_WRITABLE() macro to check for this case. 1035 */ 1036 struct mbuf * 1037 m_split(struct mbuf *m0, int len0, int wait) 1038 { 1039 struct mbuf *m, *n; 1040 u_int len = len0, remain; 1041 1042 MBUF_CHECKSLEEP(wait); 1043 for (m = m0; m && len > m->m_len; m = m->m_next) 1044 len -= m->m_len; 1045 if (m == NULL) 1046 return (NULL); 1047 remain = m->m_len - len; 1048 if (m0->m_flags & M_PKTHDR && remain == 0) { 1049 n = m_gethdr(wait, m0->m_type); 1050 if (n == NULL) 1051 return (NULL); 1052 n->m_next = m->m_next; 1053 m->m_next = NULL; 1054 if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) { 1055 n->m_pkthdr.snd_tag = 1056 m_snd_tag_ref(m0->m_pkthdr.snd_tag); 1057 n->m_pkthdr.csum_flags |= CSUM_SND_TAG; 1058 } else 1059 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; 1060 n->m_pkthdr.len = m0->m_pkthdr.len - len0; 1061 m0->m_pkthdr.len = len0; 1062 return (n); 1063 } else if (m0->m_flags & M_PKTHDR) { 1064 n = m_gethdr(wait, m0->m_type); 1065 if (n == NULL) 1066 return (NULL); 1067 if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) { 1068 n->m_pkthdr.snd_tag = 1069 m_snd_tag_ref(m0->m_pkthdr.snd_tag); 1070 n->m_pkthdr.csum_flags |= CSUM_SND_TAG; 1071 } else 1072 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; 1073 n->m_pkthdr.len = m0->m_pkthdr.len - len0; 1074 m0->m_pkthdr.len = len0; 1075 if (m->m_flags & (M_EXT|M_EXTPG)) 1076 goto extpacket; 1077 if (remain > MHLEN) { 1078 /* m can't be the lead packet */ 1079 M_ALIGN(n, 0); 1080 n->m_next = m_split(m, len, wait); 1081 if (n->m_next == NULL) { 1082 (void) m_free(n); 1083 return (NULL); 1084 } else { 1085 n->m_len = 0; 1086 return (n); 1087 } 1088 } else 1089 M_ALIGN(n, remain); 1090 } else if (remain == 0) { 1091 n = m->m_next; 1092 m->m_next = NULL; 1093 return (n); 1094 } else { 1095 n = m_get(wait, m->m_type); 1096 if (n == NULL) 1097 return (NULL); 1098 M_ALIGN(n, remain); 1099 } 1100 extpacket: 1101 if (m->m_flags & (M_EXT|M_EXTPG)) { 1102 n->m_data = m->m_data + len; 1103 mb_dupcl(n, m); 1104 } else { 1105 bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); 1106 } 1107 n->m_len = remain; 1108 m->m_len = len; 1109 n->m_next = m->m_next; 1110 m->m_next = NULL; 1111 return (n); 1112 } 1113 /* 1114 * Routine to copy from device local memory into mbufs. 
1115 * Note that `off' argument is offset into first mbuf of target chain from 1116 * which to begin copying the data to. 1117 */ 1118 struct mbuf * 1119 m_devget(char *buf, int totlen, int off, struct ifnet *ifp, 1120 void (*copy)(char *from, caddr_t to, u_int len)) 1121 { 1122 struct mbuf *m; 1123 struct mbuf *top = NULL, **mp = ⊤ 1124 int len; 1125 1126 if (off < 0 || off > MHLEN) 1127 return (NULL); 1128 1129 while (totlen > 0) { 1130 if (top == NULL) { /* First one, must be PKTHDR */ 1131 if (totlen + off >= MINCLSIZE) { 1132 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 1133 len = MCLBYTES; 1134 } else { 1135 m = m_gethdr(M_NOWAIT, MT_DATA); 1136 len = MHLEN; 1137 1138 /* Place initial small packet/header at end of mbuf */ 1139 if (m && totlen + off + max_linkhdr <= MHLEN) { 1140 m->m_data += max_linkhdr; 1141 len -= max_linkhdr; 1142 } 1143 } 1144 if (m == NULL) 1145 return NULL; 1146 m->m_pkthdr.rcvif = ifp; 1147 m->m_pkthdr.len = totlen; 1148 } else { 1149 if (totlen + off >= MINCLSIZE) { 1150 m = m_getcl(M_NOWAIT, MT_DATA, 0); 1151 len = MCLBYTES; 1152 } else { 1153 m = m_get(M_NOWAIT, MT_DATA); 1154 len = MLEN; 1155 } 1156 if (m == NULL) { 1157 m_freem(top); 1158 return NULL; 1159 } 1160 } 1161 if (off) { 1162 m->m_data += off; 1163 len -= off; 1164 off = 0; 1165 } 1166 m->m_len = len = min(totlen, len); 1167 if (copy) 1168 copy(buf, mtod(m, caddr_t), (u_int)len); 1169 else 1170 bcopy(buf, mtod(m, caddr_t), (u_int)len); 1171 buf += len; 1172 *mp = m; 1173 mp = &m->m_next; 1174 totlen -= len; 1175 } 1176 return (top); 1177 } 1178 1179 static void 1180 m_copytounmapped(const struct mbuf *m, int off, int len, c_caddr_t cp) 1181 { 1182 struct iovec iov; 1183 struct uio uio; 1184 int error __diagused; 1185 1186 KASSERT(off >= 0, ("m_copytounmapped: negative off %d", off)); 1187 KASSERT(len >= 0, ("m_copytounmapped: negative len %d", len)); 1188 KASSERT(off < m->m_len, ("m_copytounmapped: len exceeds mbuf length")); 1189 iov.iov_base = __DECONST(caddr_t, cp); 1190 iov.iov_len = len; 1191 uio.uio_resid = len; 1192 uio.uio_iov = &iov; 1193 uio.uio_segflg = UIO_SYSSPACE; 1194 uio.uio_iovcnt = 1; 1195 uio.uio_offset = 0; 1196 uio.uio_rw = UIO_WRITE; 1197 error = m_unmapped_uiomove(m, off, &uio, len); 1198 KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off, 1199 len)); 1200 } 1201 1202 /* 1203 * Copy data from a buffer back into the indicated mbuf chain, 1204 * starting "off" bytes from the beginning, extending the mbuf 1205 * chain if necessary. 
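 *
 * A minimal usage sketch (illustrative only; 'm0' is an existing chain and
 * 'hdr'/'hlen' describe a caller-supplied buffer):
 *
 *	m_copyback(m0, 0, hlen, (c_caddr_t)hdr);
 *
 * If the chain has to be extended and an mbuf allocation fails, the copy
 * is silently truncated, so callers that care should re-check the chain
 * length afterwards.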
1206 */ 1207 void 1208 m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp) 1209 { 1210 int mlen; 1211 struct mbuf *m = m0, *n; 1212 int totlen = 0; 1213 1214 if (m0 == NULL) 1215 return; 1216 while (off > (mlen = m->m_len)) { 1217 off -= mlen; 1218 totlen += mlen; 1219 if (m->m_next == NULL) { 1220 n = m_get(M_NOWAIT, m->m_type); 1221 if (n == NULL) 1222 goto out; 1223 bzero(mtod(n, caddr_t), MLEN); 1224 n->m_len = min(MLEN, len + off); 1225 m->m_next = n; 1226 } 1227 m = m->m_next; 1228 } 1229 while (len > 0) { 1230 if (m->m_next == NULL && (len > m->m_len - off)) { 1231 m->m_len += min(len - (m->m_len - off), 1232 M_TRAILINGSPACE(m)); 1233 } 1234 mlen = min (m->m_len - off, len); 1235 if ((m->m_flags & M_EXTPG) != 0) 1236 m_copytounmapped(m, off, mlen, cp); 1237 else 1238 bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen); 1239 cp += mlen; 1240 len -= mlen; 1241 mlen += off; 1242 off = 0; 1243 totlen += mlen; 1244 if (len == 0) 1245 break; 1246 if (m->m_next == NULL) { 1247 n = m_get(M_NOWAIT, m->m_type); 1248 if (n == NULL) 1249 break; 1250 n->m_len = min(MLEN, len); 1251 m->m_next = n; 1252 } 1253 m = m->m_next; 1254 } 1255 out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) 1256 m->m_pkthdr.len = totlen; 1257 } 1258 1259 /* 1260 * Append the specified data to the indicated mbuf chain, 1261 * Extend the mbuf chain if the new data does not fit in 1262 * existing space. 1263 * 1264 * Return 1 if able to complete the job; otherwise 0. 1265 */ 1266 int 1267 m_append(struct mbuf *m0, int len, c_caddr_t cp) 1268 { 1269 struct mbuf *m, *n; 1270 int remainder, space; 1271 1272 for (m = m0; m->m_next != NULL; m = m->m_next) 1273 ; 1274 remainder = len; 1275 space = M_TRAILINGSPACE(m); 1276 if (space > 0) { 1277 /* 1278 * Copy into available space. 1279 */ 1280 if (space > remainder) 1281 space = remainder; 1282 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 1283 m->m_len += space; 1284 cp += space, remainder -= space; 1285 } 1286 while (remainder > 0) { 1287 /* 1288 * Allocate a new mbuf; could check space 1289 * and allocate a cluster instead. 
1290 */ 1291 n = m_get(M_NOWAIT, m->m_type); 1292 if (n == NULL) 1293 break; 1294 n->m_len = min(MLEN, remainder); 1295 bcopy(cp, mtod(n, caddr_t), n->m_len); 1296 cp += n->m_len, remainder -= n->m_len; 1297 m->m_next = n; 1298 m = n; 1299 } 1300 if (m0->m_flags & M_PKTHDR) 1301 m0->m_pkthdr.len += len - remainder; 1302 return (remainder == 0); 1303 } 1304 1305 static int 1306 m_apply_extpg_one(struct mbuf *m, int off, int len, 1307 int (*f)(void *, void *, u_int), void *arg) 1308 { 1309 void *p; 1310 u_int i, count, pgoff, pglen; 1311 int rval; 1312 1313 KASSERT(PMAP_HAS_DMAP, 1314 ("m_apply_extpg_one does not support unmapped mbufs")); 1315 off += mtod(m, vm_offset_t); 1316 if (off < m->m_epg_hdrlen) { 1317 count = min(m->m_epg_hdrlen - off, len); 1318 rval = f(arg, m->m_epg_hdr + off, count); 1319 if (rval) 1320 return (rval); 1321 len -= count; 1322 off = 0; 1323 } else 1324 off -= m->m_epg_hdrlen; 1325 pgoff = m->m_epg_1st_off; 1326 for (i = 0; i < m->m_epg_npgs && len > 0; i++) { 1327 pglen = m_epg_pagelen(m, i, pgoff); 1328 if (off < pglen) { 1329 count = min(pglen - off, len); 1330 p = (void *)PHYS_TO_DMAP(m->m_epg_pa[i] + pgoff + off); 1331 rval = f(arg, p, count); 1332 if (rval) 1333 return (rval); 1334 len -= count; 1335 off = 0; 1336 } else 1337 off -= pglen; 1338 pgoff = 0; 1339 } 1340 if (len > 0) { 1341 KASSERT(off < m->m_epg_trllen, 1342 ("m_apply_extpg_one: offset beyond trailer")); 1343 KASSERT(len <= m->m_epg_trllen - off, 1344 ("m_apply_extpg_one: length beyond trailer")); 1345 return (f(arg, m->m_epg_trail + off, len)); 1346 } 1347 return (0); 1348 } 1349 1350 /* Apply function f to the data in a single mbuf. */ 1351 static int 1352 m_apply_one(struct mbuf *m, int off, int len, 1353 int (*f)(void *, void *, u_int), void *arg) 1354 { 1355 if ((m->m_flags & M_EXTPG) != 0) 1356 return (m_apply_extpg_one(m, off, len, f, arg)); 1357 else 1358 return (f(arg, mtod(m, caddr_t) + off, len)); 1359 } 1360 1361 /* 1362 * Apply function f to the data in an mbuf chain starting "off" bytes from 1363 * the beginning, continuing for "len" bytes. 1364 */ 1365 int 1366 m_apply(struct mbuf *m, int off, int len, 1367 int (*f)(void *, void *, u_int), void *arg) 1368 { 1369 u_int count; 1370 int rval; 1371 1372 KASSERT(off >= 0, ("m_apply, negative off %d", off)); 1373 KASSERT(len >= 0, ("m_apply, negative len %d", len)); 1374 while (off > 0) { 1375 KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); 1376 if (off < m->m_len) 1377 break; 1378 off -= m->m_len; 1379 m = m->m_next; 1380 } 1381 while (len > 0) { 1382 KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); 1383 count = min(m->m_len - off, len); 1384 rval = m_apply_one(m, off, count, f, arg); 1385 if (rval) 1386 return (rval); 1387 len -= count; 1388 off = 0; 1389 m = m->m_next; 1390 } 1391 return (0); 1392 } 1393 1394 /* 1395 * Return a pointer to mbuf/offset of location in mbuf chain. 1396 */ 1397 struct mbuf * 1398 m_getptr(struct mbuf *m, int loc, int *off) 1399 { 1400 1401 while (loc >= 0) { 1402 /* Normal end of search. */ 1403 if (m->m_len > loc) { 1404 *off = loc; 1405 return (m); 1406 } else { 1407 loc -= m->m_len; 1408 if (m->m_next == NULL) { 1409 if (loc == 0) { 1410 /* Point at the end of valid data. 
*/ 1411 *off = m->m_len; 1412 return (m); 1413 } 1414 return (NULL); 1415 } 1416 m = m->m_next; 1417 } 1418 } 1419 return (NULL); 1420 } 1421 1422 void 1423 m_print(const struct mbuf *m, int maxlen) 1424 { 1425 int len; 1426 int pdata; 1427 const struct mbuf *m2; 1428 1429 if (m == NULL) { 1430 printf("mbuf: %p\n", m); 1431 return; 1432 } 1433 1434 if (m->m_flags & M_PKTHDR) 1435 len = m->m_pkthdr.len; 1436 else 1437 len = -1; 1438 m2 = m; 1439 while (m2 != NULL && (len == -1 || len)) { 1440 pdata = m2->m_len; 1441 if (maxlen != -1 && pdata > maxlen) 1442 pdata = maxlen; 1443 printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len, 1444 m2->m_next, m2->m_flags, "\20\20freelist\17skipfw" 1445 "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly" 1446 "\3eor\2pkthdr\1ext", pdata ? "" : "\n"); 1447 if (pdata) 1448 printf(", %*D\n", pdata, (u_char *)m2->m_data, "-"); 1449 if (len != -1) 1450 len -= m2->m_len; 1451 m2 = m2->m_next; 1452 } 1453 if (len > 0) 1454 printf("%d bytes unaccounted for.\n", len); 1455 return; 1456 } 1457 1458 u_int 1459 m_fixhdr(struct mbuf *m0) 1460 { 1461 u_int len; 1462 1463 len = m_length(m0, NULL); 1464 m0->m_pkthdr.len = len; 1465 return (len); 1466 } 1467 1468 u_int 1469 m_length(struct mbuf *m0, struct mbuf **last) 1470 { 1471 struct mbuf *m; 1472 u_int len; 1473 1474 len = 0; 1475 for (m = m0; m != NULL; m = m->m_next) { 1476 len += m->m_len; 1477 if (m->m_next == NULL) 1478 break; 1479 } 1480 if (last != NULL) 1481 *last = m; 1482 return (len); 1483 } 1484 1485 /* 1486 * Defragment a mbuf chain, returning the shortest possible 1487 * chain of mbufs and clusters. If allocation fails and 1488 * this cannot be completed, NULL will be returned, but 1489 * the passed in chain will be unchanged. Upon success, 1490 * the original chain will be freed, and the new chain 1491 * will be returned. 1492 * 1493 * If a non-packet header is passed in, the original 1494 * mbuf (chain?) will be returned unharmed. 
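 *
 * A typical transmit-path sketch (illustrative only; 'm' is a pkthdr chain
 * that maps to too many DMA segments for a hypothetical driver):
 *
 *	n = m_defrag(m, M_NOWAIT);
 *	if (n == NULL) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 *	m = n;
 *
 * On failure the original chain is still intact and owned by the caller;
 * on success it has been freed and replaced by the defragmented chain.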
1495 */ 1496 struct mbuf * 1497 m_defrag(struct mbuf *m0, int how) 1498 { 1499 struct mbuf *m_new = NULL, *m_final = NULL; 1500 int progress = 0, length; 1501 1502 MBUF_CHECKSLEEP(how); 1503 if (!(m0->m_flags & M_PKTHDR)) 1504 return (m0); 1505 1506 m_fixhdr(m0); /* Needed sanity check */ 1507 1508 #ifdef MBUF_STRESS_TEST 1509 if (m_defragrandomfailures) { 1510 int temp = arc4random() & 0xff; 1511 if (temp == 0xba) 1512 goto nospace; 1513 } 1514 #endif 1515 1516 if (m0->m_pkthdr.len > MHLEN) 1517 m_final = m_getcl(how, MT_DATA, M_PKTHDR); 1518 else 1519 m_final = m_gethdr(how, MT_DATA); 1520 1521 if (m_final == NULL) 1522 goto nospace; 1523 1524 if (m_dup_pkthdr(m_final, m0, how) == 0) 1525 goto nospace; 1526 1527 m_new = m_final; 1528 1529 while (progress < m0->m_pkthdr.len) { 1530 length = m0->m_pkthdr.len - progress; 1531 if (length > MCLBYTES) 1532 length = MCLBYTES; 1533 1534 if (m_new == NULL) { 1535 if (length > MLEN) 1536 m_new = m_getcl(how, MT_DATA, 0); 1537 else 1538 m_new = m_get(how, MT_DATA); 1539 if (m_new == NULL) 1540 goto nospace; 1541 } 1542 1543 m_copydata(m0, progress, length, mtod(m_new, caddr_t)); 1544 progress += length; 1545 m_new->m_len = length; 1546 if (m_new != m_final) 1547 m_cat(m_final, m_new); 1548 m_new = NULL; 1549 } 1550 #ifdef MBUF_STRESS_TEST 1551 if (m0->m_next == NULL) 1552 m_defraguseless++; 1553 #endif 1554 m_freem(m0); 1555 m0 = m_final; 1556 #ifdef MBUF_STRESS_TEST 1557 m_defragpackets++; 1558 m_defragbytes += m0->m_pkthdr.len; 1559 #endif 1560 return (m0); 1561 nospace: 1562 #ifdef MBUF_STRESS_TEST 1563 m_defragfailure++; 1564 #endif 1565 if (m_final) 1566 m_freem(m_final); 1567 return (NULL); 1568 } 1569 1570 /* 1571 * Return the number of fragments an mbuf will use. This is usually 1572 * used as a proxy for the number of scatter/gather elements needed by 1573 * a DMA engine to access an mbuf. In general mapped mbufs are 1574 * assumed to be backed by physically contiguous buffers that only 1575 * need a single fragment. Unmapped mbufs, on the other hand, can 1576 * span disjoint physical pages. 1577 */ 1578 static int 1579 frags_per_mbuf(struct mbuf *m) 1580 { 1581 int frags; 1582 1583 if ((m->m_flags & M_EXTPG) == 0) 1584 return (1); 1585 1586 /* 1587 * The header and trailer are counted as a single fragment 1588 * each when present. 1589 * 1590 * XXX: This overestimates the number of fragments by assuming 1591 * all the backing physical pages are disjoint. 1592 */ 1593 frags = 0; 1594 if (m->m_epg_hdrlen != 0) 1595 frags++; 1596 frags += m->m_epg_npgs; 1597 if (m->m_epg_trllen != 0) 1598 frags++; 1599 1600 return (frags); 1601 } 1602 1603 /* 1604 * Defragment an mbuf chain, returning at most maxfrags separate 1605 * mbufs+clusters. If this is not possible NULL is returned and 1606 * the original mbuf chain is left in its present (potentially 1607 * modified) state. We use two techniques: collapsing consecutive 1608 * mbufs and replacing consecutive mbufs by a cluster. 1609 * 1610 * NB: this should really be named m_defrag but that name is taken 1611 */ 1612 struct mbuf * 1613 m_collapse(struct mbuf *m0, int how, int maxfrags) 1614 { 1615 struct mbuf *m, *n, *n2, **prev; 1616 u_int curfrags; 1617 1618 /* 1619 * Calculate the current number of frags. 1620 */ 1621 curfrags = 0; 1622 for (m = m0; m != NULL; m = m->m_next) 1623 curfrags += frags_per_mbuf(m); 1624 /* 1625 * First, try to collapse mbufs. Note that we always collapse 1626 * towards the front so we don't need to deal with moving the 1627 * pkthdr. 
This may be suboptimal if the first mbuf has much 1628 * less data than the following. 1629 */ 1630 m = m0; 1631 again: 1632 for (;;) { 1633 n = m->m_next; 1634 if (n == NULL) 1635 break; 1636 if (M_WRITABLE(m) && 1637 n->m_len < M_TRAILINGSPACE(m)) { 1638 m_copydata(n, 0, n->m_len, 1639 mtod(m, char *) + m->m_len); 1640 m->m_len += n->m_len; 1641 m->m_next = n->m_next; 1642 curfrags -= frags_per_mbuf(n); 1643 m_free(n); 1644 if (curfrags <= maxfrags) 1645 return m0; 1646 } else 1647 m = n; 1648 } 1649 KASSERT(maxfrags > 1, 1650 ("maxfrags %u, but normal collapse failed", maxfrags)); 1651 /* 1652 * Collapse consecutive mbufs to a cluster. 1653 */ 1654 prev = &m0->m_next; /* NB: not the first mbuf */ 1655 while ((n = *prev) != NULL) { 1656 if ((n2 = n->m_next) != NULL && 1657 n->m_len + n2->m_len < MCLBYTES) { 1658 m = m_getcl(how, MT_DATA, 0); 1659 if (m == NULL) 1660 goto bad; 1661 m_copydata(n, 0, n->m_len, mtod(m, char *)); 1662 m_copydata(n2, 0, n2->m_len, 1663 mtod(m, char *) + n->m_len); 1664 m->m_len = n->m_len + n2->m_len; 1665 m->m_next = n2->m_next; 1666 *prev = m; 1667 curfrags += 1; /* For the new cluster */ 1668 curfrags -= frags_per_mbuf(n); 1669 curfrags -= frags_per_mbuf(n2); 1670 m_free(n); 1671 m_free(n2); 1672 if (curfrags <= maxfrags) 1673 return m0; 1674 /* 1675 * Still not there, try the normal collapse 1676 * again before we allocate another cluster. 1677 */ 1678 goto again; 1679 } 1680 prev = &n->m_next; 1681 } 1682 /* 1683 * No place where we can collapse to a cluster; punt. 1684 * This can occur if, for example, you request 2 frags 1685 * but the packet requires that both be clusters (we 1686 * never reallocate the first mbuf to avoid moving the 1687 * packet header). 1688 */ 1689 bad: 1690 return NULL; 1691 } 1692 1693 #ifdef MBUF_STRESS_TEST 1694 1695 /* 1696 * Fragment an mbuf chain. There's no reason you'd ever want to do 1697 * this in normal usage, but it's great for stress testing various 1698 * mbuf consumers. 1699 * 1700 * If fragmentation is not possible, the original chain will be 1701 * returned. 
1702 * 1703 * Possible length values: 1704 * 0 no fragmentation will occur 1705 * > 0 each fragment will be of the specified length 1706 * -1 each fragment will be the same random value in length 1707 * -2 each fragment's length will be entirely random 1708 * (Random values range from 1 to 256) 1709 */ 1710 struct mbuf * 1711 m_fragment(struct mbuf *m0, int how, int length) 1712 { 1713 struct mbuf *m_first, *m_last; 1714 int divisor = 255, progress = 0, fraglen; 1715 1716 if (!(m0->m_flags & M_PKTHDR)) 1717 return (m0); 1718 1719 if (length == 0 || length < -2) 1720 return (m0); 1721 if (length > MCLBYTES) 1722 length = MCLBYTES; 1723 if (length < 0 && divisor > MCLBYTES) 1724 divisor = MCLBYTES; 1725 if (length == -1) 1726 length = 1 + (arc4random() % divisor); 1727 if (length > 0) 1728 fraglen = length; 1729 1730 m_fixhdr(m0); /* Needed sanity check */ 1731 1732 m_first = m_getcl(how, MT_DATA, M_PKTHDR); 1733 if (m_first == NULL) 1734 goto nospace; 1735 1736 if (m_dup_pkthdr(m_first, m0, how) == 0) 1737 goto nospace; 1738 1739 m_last = m_first; 1740 1741 while (progress < m0->m_pkthdr.len) { 1742 if (length == -2) 1743 fraglen = 1 + (arc4random() % divisor); 1744 if (fraglen > m0->m_pkthdr.len - progress) 1745 fraglen = m0->m_pkthdr.len - progress; 1746 1747 if (progress != 0) { 1748 struct mbuf *m_new = m_getcl(how, MT_DATA, 0); 1749 if (m_new == NULL) 1750 goto nospace; 1751 1752 m_last->m_next = m_new; 1753 m_last = m_new; 1754 } 1755 1756 m_copydata(m0, progress, fraglen, mtod(m_last, caddr_t)); 1757 progress += fraglen; 1758 m_last->m_len = fraglen; 1759 } 1760 m_freem(m0); 1761 m0 = m_first; 1762 return (m0); 1763 nospace: 1764 if (m_first) 1765 m_freem(m_first); 1766 /* Return the original chain on failure */ 1767 return (m0); 1768 } 1769 1770 #endif 1771 1772 /* 1773 * Free pages from mbuf_ext_pgs, assuming they were allocated via 1774 * vm_page_alloc() and aren't associated with any object. Complement 1775 * to allocator from m_uiotombuf_nomap(). 1776 */ 1777 void 1778 mb_free_mext_pgs(struct mbuf *m) 1779 { 1780 vm_page_t pg; 1781 1782 M_ASSERTEXTPG(m); 1783 for (int i = 0; i < m->m_epg_npgs; i++) { 1784 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); 1785 vm_page_unwire_noq(pg); 1786 vm_page_free(pg); 1787 } 1788 } 1789 1790 static struct mbuf * 1791 m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags) 1792 { 1793 struct mbuf *m, *mb, *prev; 1794 vm_page_t pg_array[MBUF_PEXT_MAX_PGS]; 1795 int error, length, i, needed; 1796 ssize_t total; 1797 int pflags = malloc2vm_flags(how) | VM_ALLOC_NODUMP | VM_ALLOC_WIRED; 1798 1799 MPASS((flags & M_PKTHDR) == 0); 1800 MPASS((how & M_ZERO) == 0); 1801 1802 /* 1803 * len can be zero or an arbitrary large value bound by 1804 * the total data supplied by the uio. 1805 */ 1806 if (len > 0) 1807 total = MIN(uio->uio_resid, len); 1808 else 1809 total = uio->uio_resid; 1810 1811 if (maxseg == 0) 1812 maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE; 1813 1814 /* 1815 * If total is zero, return an empty mbuf. This can occur 1816 * for TLS 1.0 connections which send empty fragments as 1817 * a countermeasure against the known-IV weakness in CBC 1818 * ciphersuites. 
1819 */ 1820 if (__predict_false(total == 0)) { 1821 mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs); 1822 if (mb == NULL) 1823 return (NULL); 1824 mb->m_epg_flags = EPG_FLAG_ANON; 1825 return (mb); 1826 } 1827 1828 /* 1829 * Allocate the pages 1830 */ 1831 m = NULL; 1832 while (total > 0) { 1833 mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs); 1834 if (mb == NULL) 1835 goto failed; 1836 if (m == NULL) 1837 m = mb; 1838 else 1839 prev->m_next = mb; 1840 prev = mb; 1841 mb->m_epg_flags = EPG_FLAG_ANON; 1842 needed = length = MIN(maxseg, total); 1843 for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) { 1844 retry_page: 1845 pg_array[i] = vm_page_alloc_noobj(pflags); 1846 if (pg_array[i] == NULL) { 1847 if (how & M_NOWAIT) { 1848 goto failed; 1849 } else { 1850 vm_wait(NULL); 1851 goto retry_page; 1852 } 1853 } 1854 mb->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg_array[i]); 1855 mb->m_epg_npgs++; 1856 } 1857 mb->m_epg_last_len = length - PAGE_SIZE * (mb->m_epg_npgs - 1); 1858 MBUF_EXT_PGS_ASSERT_SANITY(mb); 1859 total -= length; 1860 error = uiomove_fromphys(pg_array, 0, length, uio); 1861 if (error != 0) 1862 goto failed; 1863 mb->m_len = length; 1864 mb->m_ext.ext_size += PAGE_SIZE * mb->m_epg_npgs; 1865 if (flags & M_PKTHDR) 1866 m->m_pkthdr.len += length; 1867 } 1868 return (m); 1869 1870 failed: 1871 m_freem(m); 1872 return (NULL); 1873 } 1874 1875 /* 1876 * Copy the contents of uio into a properly sized mbuf chain. 1877 */ 1878 struct mbuf * 1879 m_uiotombuf(struct uio *uio, int how, int len, int align, int flags) 1880 { 1881 struct mbuf *m, *mb; 1882 int error, length; 1883 ssize_t total; 1884 int progress = 0; 1885 1886 if (flags & M_EXTPG) 1887 return (m_uiotombuf_nomap(uio, how, len, align, flags)); 1888 1889 /* 1890 * len can be zero or an arbitrary large value bound by 1891 * the total data supplied by the uio. 1892 */ 1893 if (len > 0) 1894 total = (uio->uio_resid < len) ? uio->uio_resid : len; 1895 else 1896 total = uio->uio_resid; 1897 1898 /* 1899 * The smallest unit returned by m_getm2() is a single mbuf 1900 * with pkthdr. We can't align past it. 1901 */ 1902 if (align >= MHLEN) 1903 return (NULL); 1904 1905 /* 1906 * Give us the full allocation or nothing. 1907 * If len is zero return the smallest empty mbuf. 1908 */ 1909 m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags); 1910 if (m == NULL) 1911 return (NULL); 1912 m->m_data += align; 1913 1914 /* Fill all mbufs with uio data and update header information. */ 1915 for (mb = m; mb != NULL; mb = mb->m_next) { 1916 length = min(M_TRAILINGSPACE(mb), total - progress); 1917 1918 error = uiomove(mtod(mb, void *), length, uio); 1919 if (error) { 1920 m_freem(m); 1921 return (NULL); 1922 } 1923 1924 mb->m_len = length; 1925 progress += length; 1926 if (flags & M_PKTHDR) { 1927 m->m_pkthdr.len += length; 1928 m->m_pkthdr.memlen += MSIZE; 1929 if (mb->m_flags & M_EXT) 1930 m->m_pkthdr.memlen += mb->m_ext.ext_size; 1931 } 1932 } 1933 KASSERT(progress == total, ("%s: progress != total", __func__)); 1934 1935 return (m); 1936 } 1937 1938 /* 1939 * Copy data to/from an unmapped mbuf into a uio limited by len if set. 1940 */ 1941 int 1942 m_unmapped_uiomove(const struct mbuf *m, int m_off, struct uio *uio, int len) 1943 { 1944 vm_page_t pg; 1945 int error, i, off, pglen, pgoff, seglen, segoff; 1946 1947 M_ASSERTEXTPG(m); 1948 error = 0; 1949 1950 /* Skip over any data removed from the front. 
*/ 1951 off = mtod(m, vm_offset_t); 1952 1953 off += m_off; 1954 if (m->m_epg_hdrlen != 0) { 1955 if (off >= m->m_epg_hdrlen) { 1956 off -= m->m_epg_hdrlen; 1957 } else { 1958 seglen = m->m_epg_hdrlen - off; 1959 segoff = off; 1960 seglen = min(seglen, len); 1961 off = 0; 1962 len -= seglen; 1963 error = uiomove(__DECONST(void *, 1964 &m->m_epg_hdr[segoff]), seglen, uio); 1965 } 1966 } 1967 pgoff = m->m_epg_1st_off; 1968 for (i = 0; i < m->m_epg_npgs && error == 0 && len > 0; i++) { 1969 pglen = m_epg_pagelen(m, i, pgoff); 1970 if (off >= pglen) { 1971 off -= pglen; 1972 pgoff = 0; 1973 continue; 1974 } 1975 seglen = pglen - off; 1976 segoff = pgoff + off; 1977 off = 0; 1978 seglen = min(seglen, len); 1979 len -= seglen; 1980 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); 1981 error = uiomove_fromphys(&pg, segoff, seglen, uio); 1982 pgoff = 0; 1983 }; 1984 if (len != 0 && error == 0) { 1985 KASSERT((off + len) <= m->m_epg_trllen, 1986 ("off + len > trail (%d + %d > %d, m_off = %d)", off, len, 1987 m->m_epg_trllen, m_off)); 1988 error = uiomove(__DECONST(void *, &m->m_epg_trail[off]), 1989 len, uio); 1990 } 1991 return (error); 1992 } 1993 1994 /* 1995 * Copy an mbuf chain into a uio limited by len if set. 1996 */ 1997 int 1998 m_mbuftouio(struct uio *uio, const struct mbuf *m, int len) 1999 { 2000 int error, length, total; 2001 int progress = 0; 2002 2003 if (len > 0) 2004 total = min(uio->uio_resid, len); 2005 else 2006 total = uio->uio_resid; 2007 2008 /* Fill the uio with data from the mbufs. */ 2009 for (; m != NULL; m = m->m_next) { 2010 length = min(m->m_len, total - progress); 2011 2012 if ((m->m_flags & M_EXTPG) != 0) 2013 error = m_unmapped_uiomove(m, 0, uio, length); 2014 else 2015 error = uiomove(mtod(m, void *), length, uio); 2016 if (error) 2017 return (error); 2018 2019 progress += length; 2020 } 2021 2022 return (0); 2023 } 2024 2025 /* 2026 * Create a writable copy of the mbuf chain. While doing this 2027 * we compact the chain with a goal of producing a chain with 2028 * at most two mbufs. The second mbuf in this chain is likely 2029 * to be a cluster. The primary purpose of this work is to create 2030 * a writable packet for encryption, compression, etc. The 2031 * secondary goal is to linearize the data so the data can be 2032 * passed to crypto hardware in the most efficient manner possible. 2033 */ 2034 struct mbuf * 2035 m_unshare(struct mbuf *m0, int how) 2036 { 2037 struct mbuf *m, *mprev; 2038 struct mbuf *n, *mfirst, *mlast; 2039 int len, off; 2040 2041 mprev = NULL; 2042 for (m = m0; m != NULL; m = mprev->m_next) { 2043 /* 2044 * Regular mbufs are ignored unless there's a cluster 2045 * in front of it that we can use to coalesce. We do 2046 * the latter mainly so later clusters can be coalesced 2047 * also w/o having to handle them specially (i.e. convert 2048 * mbuf+cluster -> cluster). This optimization is heavily 2049 * influenced by the assumption that we're running over 2050 * Ethernet where MCLBYTES is large enough that the max 2051 * packet size will permit lots of coalescing into a 2052 * single cluster. This in turn permits efficient 2053 * crypto operations, especially when using hardware. 
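 *
 * (Illustrative summary, not in the original comment: the overall effect
 * is that small regular mbufs are folded into the trailing space of a
 * preceding cluster, and any remaining read-only mbufs are replaced
 * further below by freshly allocated writable clusters.)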
2054 */ 2055 if ((m->m_flags & M_EXT) == 0) { 2056 if (mprev && (mprev->m_flags & M_EXT) && 2057 m->m_len <= M_TRAILINGSPACE(mprev)) { 2058 /* XXX: this ignores mbuf types */ 2059 memcpy(mtod(mprev, caddr_t) + mprev->m_len, 2060 mtod(m, caddr_t), m->m_len); 2061 mprev->m_len += m->m_len; 2062 mprev->m_next = m->m_next; /* unlink from chain */ 2063 m_free(m); /* reclaim mbuf */ 2064 } else { 2065 mprev = m; 2066 } 2067 continue; 2068 } 2069 /* 2070 * Writable mbufs are left alone (for now). 2071 */ 2072 if (M_WRITABLE(m)) { 2073 mprev = m; 2074 continue; 2075 } 2076 2077 /* 2078 * Not writable, replace with a copy or coalesce with 2079 * the previous mbuf if possible (since we have to copy 2080 * it anyway, we try to reduce the number of mbufs and 2081 * clusters so that future work is easier). 2082 */ 2083 KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags)); 2084 /* NB: we only coalesce into a cluster or larger */ 2085 if (mprev != NULL && (mprev->m_flags & M_EXT) && 2086 m->m_len <= M_TRAILINGSPACE(mprev)) { 2087 /* XXX: this ignores mbuf types */ 2088 memcpy(mtod(mprev, caddr_t) + mprev->m_len, 2089 mtod(m, caddr_t), m->m_len); 2090 mprev->m_len += m->m_len; 2091 mprev->m_next = m->m_next; /* unlink from chain */ 2092 m_free(m); /* reclaim mbuf */ 2093 continue; 2094 } 2095 2096 /* 2097 * Allocate new space to hold the copy and copy the data. 2098 * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by 2099 * splitting them into clusters. We could just malloc a 2100 * buffer and make it external but too many device drivers 2101 * don't know how to break up the non-contiguous memory when 2102 * doing DMA. 2103 */ 2104 n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS); 2105 if (n == NULL) { 2106 m_freem(m0); 2107 return (NULL); 2108 } 2109 if (m->m_flags & M_PKTHDR) { 2110 KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR", 2111 __func__, m0, m)); 2112 m_move_pkthdr(n, m); 2113 } 2114 len = m->m_len; 2115 off = 0; 2116 mfirst = n; 2117 mlast = NULL; 2118 for (;;) { 2119 int cc = min(len, MCLBYTES); 2120 memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc); 2121 n->m_len = cc; 2122 if (mlast != NULL) 2123 mlast->m_next = n; 2124 mlast = n; 2125 #if 0 2126 newipsecstat.ips_clcopied++; 2127 #endif 2128 2129 len -= cc; 2130 if (len <= 0) 2131 break; 2132 off += cc; 2133 2134 n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS); 2135 if (n == NULL) { 2136 m_freem(mfirst); 2137 m_freem(m0); 2138 return (NULL); 2139 } 2140 } 2141 n->m_next = m->m_next; 2142 if (mprev == NULL) 2143 m0 = mfirst; /* new head of chain */ 2144 else 2145 mprev->m_next = mfirst; /* replace old mbuf */ 2146 m_free(m); /* release old mbuf */ 2147 mprev = mfirst; 2148 } 2149 return (m0); 2150 } 2151 2152 #ifdef MBUF_PROFILING 2153 2154 #define MP_BUCKETS 32 /* don't just change this as things may overflow.*/ 2155 struct mbufprofile { 2156 uintmax_t wasted[MP_BUCKETS]; 2157 uintmax_t used[MP_BUCKETS]; 2158 uintmax_t segments[MP_BUCKETS]; 2159 } mbprof; 2160 2161 void 2162 m_profile(struct mbuf *m) 2163 { 2164 int segments = 0; 2165 int used = 0; 2166 int wasted = 0; 2167 2168 while (m) { 2169 segments++; 2170 used += m->m_len; 2171 if (m->m_flags & M_EXT) { 2172 wasted += MHLEN - sizeof(m->m_ext) + 2173 m->m_ext.ext_size - m->m_len; 2174 } else { 2175 if (m->m_flags & M_PKTHDR) 2176 wasted += MHLEN - m->m_len; 2177 else 2178 wasted += MLEN - m->m_len; 2179 } 2180 m = m->m_next; 2181 } 2182 /* be paranoid.. 
it helps */ 2183 if (segments > MP_BUCKETS - 1) 2184 segments = MP_BUCKETS - 1; 2185 if (used > 100000) 2186 used = 100000; 2187 if (wasted > 100000) 2188 wasted = 100000; 2189 /* store in the appropriate bucket */ 2190 /* don't bother locking. if it's slightly off, so what? */ 2191 mbprof.segments[segments]++; 2192 mbprof.used[fls(used)]++; 2193 mbprof.wasted[fls(wasted)]++; 2194 } 2195 2196 static int 2197 mbprof_handler(SYSCTL_HANDLER_ARGS) 2198 { 2199 char buf[256]; 2200 struct sbuf sb; 2201 int error; 2202 uint64_t *p; 2203 2204 sbuf_new_for_sysctl(&sb, buf, sizeof(buf), req); 2205 2206 p = &mbprof.wasted[0]; 2207 sbuf_printf(&sb, 2208 "wasted:\n" 2209 "%ju %ju %ju %ju %ju %ju %ju %ju " 2210 "%ju %ju %ju %ju %ju %ju %ju %ju\n", 2211 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 2212 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 2213 #ifdef BIG_ARRAY 2214 p = &mbprof.wasted[16]; 2215 sbuf_printf(&sb, 2216 "%ju %ju %ju %ju %ju %ju %ju %ju " 2217 "%ju %ju %ju %ju %ju %ju %ju %ju\n", 2218 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 2219 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 2220 #endif 2221 p = &mbprof.used[0]; 2222 sbuf_printf(&sb, 2223 "used:\n" 2224 "%ju %ju %ju %ju %ju %ju %ju %ju " 2225 "%ju %ju %ju %ju %ju %ju %ju %ju\n", 2226 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 2227 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 2228 #ifdef BIG_ARRAY 2229 p = &mbprof.used[16]; 2230 sbuf_printf(&sb, 2231 "%ju %ju %ju %ju %ju %ju %ju %ju " 2232 "%ju %ju %ju %ju %ju %ju %ju %ju\n", 2233 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 2234 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 2235 #endif 2236 p = &mbprof.segments[0]; 2237 sbuf_printf(&sb, 2238 "segments:\n" 2239 "%ju %ju %ju %ju %ju %ju %ju %ju " 2240 "%ju %ju %ju %ju %ju %ju %ju %ju\n", 2241 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 2242 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 2243 #ifdef BIG_ARRAY 2244 p = &mbprof.segments[16]; 2245 sbuf_printf(&sb, 2246 "%ju %ju %ju %ju %ju %ju %ju %ju " 2247 "%ju %ju %ju %ju %ju %ju %ju %jju", 2248 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 2249 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 2250 #endif 2251 2252 error = sbuf_finish(&sb); 2253 sbuf_delete(&sb); 2254 return (error); 2255 } 2256 2257 static int 2258 mbprof_clr_handler(SYSCTL_HANDLER_ARGS) 2259 { 2260 int clear, error; 2261 2262 clear = 0; 2263 error = sysctl_handle_int(oidp, &clear, 0, req); 2264 if (error || !req->newptr) 2265 return (error); 2266 2267 if (clear) { 2268 bzero(&mbprof, sizeof(mbprof)); 2269 } 2270 2271 return (error); 2272 } 2273 2274 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile, 2275 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 2276 mbprof_handler, "A", 2277 "mbuf profiling statistics"); 2278 2279 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr, 2280 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, 2281 mbprof_clr_handler, "I", 2282 "clear mbuf profiling statistics"); 2283 #endif 2284