1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1991, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 #include "opt_param.h" 34 #include "opt_mbuf_stress_test.h" 35 #include "opt_mbuf_profiling.h" 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/kernel.h> 40 #include <sys/limits.h> 41 #include <sys/lock.h> 42 #include <sys/malloc.h> 43 #include <sys/mbuf.h> 44 #include <sys/sysctl.h> 45 #include <sys/domain.h> 46 #include <sys/protosw.h> 47 #include <sys/uio.h> 48 #include <sys/vmmeter.h> 49 #include <sys/sbuf.h> 50 #include <sys/sdt.h> 51 #include <vm/vm.h> 52 #include <vm/vm_pageout.h> 53 #include <vm/vm_page.h> 54 55 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init, 56 "struct mbuf *", "mbufinfo_t *", 57 "uint32_t", "uint32_t", 58 "uint16_t", "uint16_t", 59 "uint32_t", "uint32_t", 60 "uint32_t", "uint32_t"); 61 62 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr_raw, 63 "uint32_t", "uint32_t", 64 "uint16_t", "uint16_t", 65 "struct mbuf *", "mbufinfo_t *"); 66 67 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr, 68 "uint32_t", "uint32_t", 69 "uint16_t", "uint16_t", 70 "struct mbuf *", "mbufinfo_t *"); 71 72 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get_raw, 73 "uint32_t", "uint32_t", 74 "uint16_t", "uint16_t", 75 "struct mbuf *", "mbufinfo_t *"); 76 77 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get, 78 "uint32_t", "uint32_t", 79 "uint16_t", "uint16_t", 80 "struct mbuf *", "mbufinfo_t *"); 81 82 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl, 83 "uint32_t", "uint32_t", 84 "uint16_t", "uint16_t", 85 "uint32_t", "uint32_t", 86 "struct mbuf *", "mbufinfo_t *"); 87 88 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__getjcl, 89 "uint32_t", "uint32_t", 90 "uint16_t", "uint16_t", 91 "uint32_t", "uint32_t", 92 "uint32_t", "uint32_t", 93 "struct mbuf *", "mbufinfo_t *"); 94 95 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget, 96 "struct mbuf *", "mbufinfo_t *", 97 "uint32_t", "uint32_t", 98 "uint32_t", "uint32_t"); 99 100 
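/*
 * The _XLATE probes above and below feed mbuf allocator events to DTrace,
 * translating the raw "struct mbuf *" arguments into the stable mbufinfo_t
 * view.  As an illustrative example only (assuming the usual SDT mapping of
 * "__" to "-" in probe names), cluster allocations could be counted by
 * stack trace from userland with:
 *
 *	dtrace -n 'sdt:::m-getcl { @[stack()] = count(); }'
 */
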
SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget, 101 "struct mbuf *", "mbufinfo_t *", 102 "uint32_t", "uint32_t", 103 "uint32_t", "uint32_t", 104 "void*", "void*"); 105 106 SDT_PROBE_DEFINE(sdt, , , m__cljset); 107 108 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free, 109 "struct mbuf *", "mbufinfo_t *"); 110 111 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem, 112 "struct mbuf *", "mbufinfo_t *"); 113 114 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freemp, 115 "struct mbuf *", "mbufinfo_t *"); 116 117 #include <security/mac/mac_framework.h> 118 119 /* 120 * Provide minimum possible defaults for link and protocol header space, 121 * assuming IPv4 over Ethernet. Enabling IPv6, IEEE802.11 or some other 122 * protocol may grow these values. 123 */ 124 u_int max_linkhdr = 16; 125 u_int max_protohdr = 40; 126 u_int max_hdr = 16 + 40; 127 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD, 128 &max_linkhdr, 16, "Size of largest link layer header"); 129 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD, 130 &max_protohdr, 40, "Size of largest protocol layer header"); 131 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD, 132 &max_hdr, 16 + 40, "Size of largest link plus protocol header"); 133 134 static void 135 max_hdr_grow(void) 136 { 137 138 max_hdr = max_linkhdr + max_protohdr; 139 MPASS(max_hdr <= MHLEN); 140 } 141 142 void 143 max_linkhdr_grow(u_int new) 144 { 145 146 if (new > max_linkhdr) { 147 max_linkhdr = new; 148 max_hdr_grow(); 149 } 150 } 151 152 void 153 max_protohdr_grow(u_int new) 154 { 155 156 if (new > max_protohdr) { 157 max_protohdr = new; 158 max_hdr_grow(); 159 } 160 } 161 162 #ifdef MBUF_STRESS_TEST 163 int m_defragpackets; 164 int m_defragbytes; 165 int m_defraguseless; 166 int m_defragfailure; 167 int m_defragrandomfailures; 168 169 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD, 170 &m_defragpackets, 0, ""); 171 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD, 172 &m_defragbytes, 0, ""); 173 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD, 174 &m_defraguseless, 0, ""); 175 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD, 176 &m_defragfailure, 0, ""); 177 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW, 178 &m_defragrandomfailures, 0, ""); 179 #endif 180 181 /* 182 * Ensure the correct size of various mbuf parameters. It could be off due 183 * to compiler-induced padding and alignment artifacts. 184 */ 185 CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN); 186 CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN); 187 188 /* 189 * mbuf data storage should be 64-bit aligned regardless of architectural 190 * pointer size; check this is the case with and without a packet header. 191 */ 192 CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0); 193 CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0); 194 195 /* 196 * While the specific values here don't matter too much (i.e., +/- a few 197 * words), we do want to ensure that changes to these values are carefully 198 * reasoned about and properly documented. This is especially the case as 199 * network-protocol and device-driver modules encode these layouts, and must 200 * be recompiled if the structures change. Check these values at compile time 201 * against the ones documented in comments in mbuf.h. 202 * 203 * NB: Possibly they should be documented there via #define's and not just 204 * comments. 
205 */ 206 #if defined(__LP64__) 207 CTASSERT(offsetof(struct mbuf, m_dat) == 32); 208 CTASSERT(sizeof(struct pkthdr) == 64); 209 CTASSERT(sizeof(struct m_ext) == 160); 210 #else 211 CTASSERT(offsetof(struct mbuf, m_dat) == 24); 212 CTASSERT(sizeof(struct pkthdr) == 56); 213 #if defined(__powerpc__) && defined(BOOKE) 214 /* PowerPC booke has 64-bit physical pointers. */ 215 CTASSERT(sizeof(struct m_ext) == 176); 216 #else 217 CTASSERT(sizeof(struct m_ext) == 172); 218 #endif 219 #endif 220 221 /* 222 * Assert that the queue(3) macros produce code of the same size as an old 223 * plain pointer does. 224 */ 225 #ifdef INVARIANTS 226 static struct mbuf __used m_assertbuf; 227 CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next)); 228 CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next)); 229 CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt)); 230 CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt)); 231 #endif 232 233 /* 234 * Attach the cluster from *m to *n, set up m_ext in *n 235 * and bump the refcount of the cluster. 236 */ 237 void 238 mb_dupcl(struct mbuf *n, struct mbuf *m) 239 { 240 volatile u_int *refcnt; 241 242 KASSERT(m->m_flags & (M_EXT | M_EXTPG), 243 ("%s: M_EXT | M_EXTPG not set on %p", __func__, m)); 244 KASSERT(!(n->m_flags & (M_EXT | M_EXTPG)), 245 ("%s: M_EXT | M_EXTPG set on %p", __func__, n)); 246 247 /* 248 * Cache access optimization. 249 * 250 * o Regular M_EXT storage doesn't need full copy of m_ext, since 251 * the holder of the 'ext_count' is responsible to carry the free 252 * routine and its arguments. 253 * o M_EXTPG data is split between main part of mbuf and m_ext, the 254 * main part is copied in full, the m_ext part is similar to M_EXT. 255 * o EXT_EXTREF, where 'ext_cnt' doesn't point into mbuf at all, is 256 * special - it needs full copy of m_ext into each mbuf, since any 257 * copy could end up as the last to free. 258 */ 259 if (m->m_flags & M_EXTPG) { 260 bcopy(&m->m_epg_startcopy, &n->m_epg_startcopy, 261 __rangeof(struct mbuf, m_epg_startcopy, m_epg_endcopy)); 262 bcopy(&m->m_ext, &n->m_ext, m_epg_ext_copylen); 263 } else if (m->m_ext.ext_type == EXT_EXTREF) 264 bcopy(&m->m_ext, &n->m_ext, sizeof(struct m_ext)); 265 else 266 bcopy(&m->m_ext, &n->m_ext, m_ext_copylen); 267 268 n->m_flags |= m->m_flags & (M_RDONLY | M_EXT | M_EXTPG); 269 270 /* See if this is the mbuf that holds the embedded refcount. */ 271 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 272 refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count; 273 n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF; 274 } else { 275 KASSERT(m->m_ext.ext_cnt != NULL, 276 ("%s: no refcounting pointer on %p", __func__, m)); 277 refcnt = m->m_ext.ext_cnt; 278 } 279 280 if (*refcnt == 1) 281 *refcnt += 1; 282 else 283 atomic_add_int(refcnt, 1); 284 } 285 286 void 287 m_demote_pkthdr(struct mbuf *m) 288 { 289 290 M_ASSERTPKTHDR(m); 291 M_ASSERT_NO_SND_TAG(m); 292 293 m_tag_delete_chain(m, NULL); 294 m->m_flags &= ~M_PKTHDR; 295 bzero(&m->m_pkthdr, sizeof(struct pkthdr)); 296 } 297 298 /* 299 * Clean up mbuf (chain) from any tags and packet headers. 300 * If "all" is set then the first mbuf in the chain will be 301 * cleaned too. 302 */ 303 void 304 m_demote(struct mbuf *m0, int all, int flags) 305 { 306 struct mbuf *m; 307 308 flags |= M_DEMOTEFLAGS; 309 310 for (m = all ? 
m0 : m0->m_next; m != NULL; m = m->m_next) {
		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p",
		    __func__, m, m0));
		if (m->m_flags & M_PKTHDR)
			m_demote_pkthdr(m);
		m->m_flags &= flags;
	}
}

/*
 * Sanity checks on an mbuf (chain) for use in KASSERT() and general
 * debugging.
 * Returns 0 or panics when something is bad, and 1 when all tests pass.
 * The "sanitize" argument selects what happens on failure: 0 runs
 * M_SANITY_ACTION, non-zero garbles the offending data so it blows up
 * later.
 */
int
m_sanity(struct mbuf *m0, int sanitize)
{
	struct mbuf *m;
	caddr_t a, b;
	int pktlen = 0;

#ifdef INVARIANTS
#define	M_SANITY_ACTION(s)	panic("mbuf %p: " s, m)
#else
#define	M_SANITY_ACTION(s)	printf("mbuf %p: " s, m)
#endif

	for (m = m0; m != NULL; m = m->m_next) {
		/*
		 * Basic pointer checks.  If any of these fails then some
		 * unrelated kernel memory before or after us is trashed.
		 * No way to recover from that.
		 */
		a = M_START(m);
		b = a + M_SIZE(m);
		if ((caddr_t)m->m_data < a)
			M_SANITY_ACTION("m_data outside mbuf data range left");
		if ((caddr_t)m->m_data > b)
			M_SANITY_ACTION("m_data outside mbuf data range right");
		if ((caddr_t)m->m_data + m->m_len > b)
			M_SANITY_ACTION("m_data + m_len exceeds mbuf space");

		/* m->m_nextpkt may only be set on first mbuf in chain. */
		if (m != m0 && m->m_nextpkt != NULL) {
			if (sanitize) {
				m_freem(m->m_nextpkt);
				m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
			} else
				M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
		}

		/* packet length (not mbuf length!) calculation */
		if (m0->m_flags & M_PKTHDR)
			pktlen += m->m_len;

		/* m_tags may only be attached to first mbuf in chain. */
		if (m != m0 && m->m_flags & M_PKTHDR &&
		    !SLIST_EMPTY(&m->m_pkthdr.tags)) {
			if (sanitize) {
				m_tag_delete_chain(m, NULL);
				/* put in 0xDEADC0DE perhaps? */
			} else
				M_SANITY_ACTION("m_tags on in-chain mbuf");
		}

		/* M_PKTHDR may only be set on first mbuf in chain */
		if (m != m0 && m->m_flags & M_PKTHDR) {
			if (sanitize) {
				bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
				m->m_flags &= ~M_PKTHDR;
				/* put in 0xDEADC0DE and leave hdr flag in */
			} else
				M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
		}
	}
	m = m0;
	if (pktlen && pktlen != m->m_pkthdr.len) {
		if (sanitize)
			m->m_pkthdr.len = 0;
		else
			M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
	}
	return 1;

#undef	M_SANITY_ACTION
}

/*
 * Non-inlined part of m_init().
 */
int
m_pkthdr_init(struct mbuf *m, int how)
{
#ifdef MAC
	int error;
#endif
	m->m_data = m->m_pktdat;
	bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
#ifdef NUMA
	m->m_pkthdr.numa_domain = M_NODOM;
#endif
#ifdef MAC
	/* If the label init fails, fail the alloc */
	error = mac_mbuf_init(m, how);
	if (error)
		return (error);
#endif

	return (0);
}

/*
 * "Move" mbuf pkthdr from "from" to "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 */
void
m_move_pkthdr(struct mbuf *to, struct mbuf *from)
{

#if 0
	/* see below for why these are not enabled */
	M_ASSERTPKTHDR(to);
	/* Note: with MAC, this may not be a good assertion. */
	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
	    ("m_move_pkthdr: to has tags"));
#endif
#ifdef MAC
	/*
	 * XXXMAC: It could be this should also occur for non-MAC?
441 */ 442 if (to->m_flags & M_PKTHDR) 443 m_tag_delete_chain(to, NULL); 444 #endif 445 to->m_flags = (from->m_flags & M_COPYFLAGS) | 446 (to->m_flags & (M_EXT | M_EXTPG)); 447 if ((to->m_flags & M_EXT) == 0) 448 to->m_data = to->m_pktdat; 449 to->m_pkthdr = from->m_pkthdr; /* especially tags */ 450 SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ 451 from->m_flags &= ~M_PKTHDR; 452 if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) { 453 from->m_pkthdr.csum_flags &= ~CSUM_SND_TAG; 454 from->m_pkthdr.snd_tag = NULL; 455 } 456 } 457 458 /* 459 * Duplicate "from"'s mbuf pkthdr in "to". 460 * "from" must have M_PKTHDR set, and "to" must be empty. 461 * In particular, this does a deep copy of the packet tags. 462 */ 463 int 464 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how) 465 { 466 467 #if 0 468 /* 469 * The mbuf allocator only initializes the pkthdr 470 * when the mbuf is allocated with m_gethdr(). Many users 471 * (e.g. m_copy*, m_prepend) use m_get() and then 472 * smash the pkthdr as needed causing these 473 * assertions to trip. For now just disable them. 474 */ 475 M_ASSERTPKTHDR(to); 476 /* Note: with MAC, this may not be a good assertion. */ 477 KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags")); 478 #endif 479 MBUF_CHECKSLEEP(how); 480 #ifdef MAC 481 if (to->m_flags & M_PKTHDR) 482 m_tag_delete_chain(to, NULL); 483 #endif 484 to->m_flags = (from->m_flags & M_COPYFLAGS) | 485 (to->m_flags & (M_EXT | M_EXTPG)); 486 if ((to->m_flags & M_EXT) == 0) 487 to->m_data = to->m_pktdat; 488 to->m_pkthdr = from->m_pkthdr; 489 if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) 490 m_snd_tag_ref(from->m_pkthdr.snd_tag); 491 SLIST_INIT(&to->m_pkthdr.tags); 492 return (m_tag_copy_chain(to, from, how)); 493 } 494 495 /* 496 * Lesser-used path for M_PREPEND: 497 * allocate new mbuf to prepend to chain, 498 * copy junk along. 499 */ 500 struct mbuf * 501 m_prepend(struct mbuf *m, int len, int how) 502 { 503 struct mbuf *mn; 504 505 if (m->m_flags & M_PKTHDR) 506 mn = m_gethdr(how, m->m_type); 507 else 508 mn = m_get(how, m->m_type); 509 if (mn == NULL) { 510 m_freem(m); 511 return (NULL); 512 } 513 if (m->m_flags & M_PKTHDR) 514 m_move_pkthdr(mn, m); 515 mn->m_next = m; 516 m = mn; 517 if (len < M_SIZE(m)) 518 M_ALIGN(m, len); 519 m->m_len = len; 520 return (m); 521 } 522 523 /* 524 * Make a copy of an mbuf chain starting "off0" bytes from the beginning, 525 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. 526 * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller. 527 * Note that the copy is read-only, because clusters are not copied, 528 * only their reference counts are incremented. 
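 *
 * Illustrative use only: a protocol that must (re)transmit part of a
 * queued chain can take a reference-counted, read-only copy instead of
 * duplicating the data:
 *
 *	n = m_copym(m, off, len, M_NOWAIT);
 *	if (n == NULL)
 *		return (ENOBUFS);	/* the original chain is untouched */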
529 */ 530 struct mbuf * 531 m_copym(struct mbuf *m, int off0, int len, int wait) 532 { 533 struct mbuf *n, **np; 534 int off = off0; 535 struct mbuf *top; 536 int copyhdr = 0; 537 538 KASSERT(off >= 0, ("m_copym, negative off %d", off)); 539 KASSERT(len >= 0, ("m_copym, negative len %d", len)); 540 MBUF_CHECKSLEEP(wait); 541 if (off == 0 && m->m_flags & M_PKTHDR) 542 copyhdr = 1; 543 while (off > 0) { 544 KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain")); 545 if (off < m->m_len) 546 break; 547 off -= m->m_len; 548 m = m->m_next; 549 } 550 np = ⊤ 551 top = NULL; 552 while (len > 0) { 553 if (m == NULL) { 554 KASSERT(len == M_COPYALL, 555 ("m_copym, length > size of mbuf chain")); 556 break; 557 } 558 if (copyhdr) 559 n = m_gethdr(wait, m->m_type); 560 else 561 n = m_get(wait, m->m_type); 562 *np = n; 563 if (n == NULL) 564 goto nospace; 565 if (copyhdr) { 566 if (!m_dup_pkthdr(n, m, wait)) 567 goto nospace; 568 if (len == M_COPYALL) 569 n->m_pkthdr.len -= off0; 570 else 571 n->m_pkthdr.len = len; 572 copyhdr = 0; 573 } 574 n->m_len = min(len, m->m_len - off); 575 if (m->m_flags & (M_EXT | M_EXTPG)) { 576 n->m_data = m->m_data + off; 577 mb_dupcl(n, m); 578 } else 579 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), 580 (u_int)n->m_len); 581 if (len != M_COPYALL) 582 len -= n->m_len; 583 off = 0; 584 m = m->m_next; 585 np = &n->m_next; 586 } 587 588 return (top); 589 nospace: 590 m_freem(top); 591 return (NULL); 592 } 593 594 /* 595 * Copy an entire packet, including header (which must be present). 596 * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. 597 * Note that the copy is read-only, because clusters are not copied, 598 * only their reference counts are incremented. 599 * Preserve alignment of the first mbuf so if the creator has left 600 * some room at the beginning (e.g. for inserting protocol headers) 601 * the copies still have the room available. 
602 */ 603 struct mbuf * 604 m_copypacket(struct mbuf *m, int how) 605 { 606 struct mbuf *top, *n, *o; 607 608 MBUF_CHECKSLEEP(how); 609 n = m_get(how, m->m_type); 610 top = n; 611 if (n == NULL) 612 goto nospace; 613 614 if (!m_dup_pkthdr(n, m, how)) 615 goto nospace; 616 n->m_len = m->m_len; 617 if (m->m_flags & (M_EXT | M_EXTPG)) { 618 n->m_data = m->m_data; 619 mb_dupcl(n, m); 620 } else { 621 n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat ); 622 bcopy(mtod(m, char *), mtod(n, char *), n->m_len); 623 } 624 625 m = m->m_next; 626 while (m) { 627 o = m_get(how, m->m_type); 628 if (o == NULL) 629 goto nospace; 630 631 n->m_next = o; 632 n = n->m_next; 633 634 n->m_len = m->m_len; 635 if (m->m_flags & (M_EXT | M_EXTPG)) { 636 n->m_data = m->m_data; 637 mb_dupcl(n, m); 638 } else { 639 bcopy(mtod(m, char *), mtod(n, char *), n->m_len); 640 } 641 642 m = m->m_next; 643 } 644 return top; 645 nospace: 646 m_freem(top); 647 return (NULL); 648 } 649 650 static void 651 m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp) 652 { 653 struct iovec iov; 654 struct uio uio; 655 int error __diagused; 656 657 KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off)); 658 KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len)); 659 KASSERT(off < m->m_len, 660 ("m_copyfromunmapped: len exceeds mbuf length")); 661 iov.iov_base = cp; 662 iov.iov_len = len; 663 uio.uio_resid = len; 664 uio.uio_iov = &iov; 665 uio.uio_segflg = UIO_SYSSPACE; 666 uio.uio_iovcnt = 1; 667 uio.uio_offset = 0; 668 uio.uio_rw = UIO_READ; 669 error = m_unmapped_uiomove(m, off, &uio, len); 670 KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off, 671 len)); 672 } 673 674 /* 675 * Copy data from an mbuf chain starting "off" bytes from the beginning, 676 * continuing for "len" bytes, into the indicated buffer. 677 */ 678 void 679 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp) 680 { 681 u_int count; 682 683 KASSERT(off >= 0, ("m_copydata, negative off %d", off)); 684 KASSERT(len >= 0, ("m_copydata, negative len %d", len)); 685 while (off > 0) { 686 KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain")); 687 if (off < m->m_len) 688 break; 689 off -= m->m_len; 690 m = m->m_next; 691 } 692 while (len > 0) { 693 KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); 694 count = min(m->m_len - off, len); 695 if ((m->m_flags & M_EXTPG) != 0) 696 m_copyfromunmapped(m, off, count, cp); 697 else 698 bcopy(mtod(m, caddr_t) + off, cp, count); 699 len -= count; 700 cp += count; 701 off = 0; 702 m = m->m_next; 703 } 704 } 705 706 /* 707 * Copy a packet header mbuf chain into a completely new chain, including 708 * copying any mbuf clusters. Use this instead of m_copypacket() when 709 * you need a writable copy of an mbuf chain. 
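 *
 * A minimal illustrative use, for a caller that needs a deep, writable
 * copy of a packet before modifying it in place:
 *
 *	n = m_dup(m, M_NOWAIT);
 *	if (n == NULL)
 *		return (ENOBUFS);
 *	m_freem(m);
 *	m = n;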
710 */ 711 struct mbuf * 712 m_dup(const struct mbuf *m, int how) 713 { 714 struct mbuf **p, *top = NULL; 715 int remain, moff, nsize; 716 717 MBUF_CHECKSLEEP(how); 718 /* Sanity check */ 719 if (m == NULL) 720 return (NULL); 721 M_ASSERTPKTHDR(m); 722 723 /* While there's more data, get a new mbuf, tack it on, and fill it */ 724 remain = m->m_pkthdr.len; 725 moff = 0; 726 p = ⊤ 727 while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */ 728 struct mbuf *n; 729 730 /* Get the next new mbuf */ 731 if (remain >= MINCLSIZE) { 732 n = m_getcl(how, m->m_type, 0); 733 nsize = MCLBYTES; 734 } else { 735 n = m_get(how, m->m_type); 736 nsize = MLEN; 737 } 738 if (n == NULL) 739 goto nospace; 740 741 if (top == NULL) { /* First one, must be PKTHDR */ 742 if (!m_dup_pkthdr(n, m, how)) { 743 m_free(n); 744 goto nospace; 745 } 746 if ((n->m_flags & M_EXT) == 0) 747 nsize = MHLEN; 748 n->m_flags &= ~M_RDONLY; 749 } 750 n->m_len = 0; 751 752 /* Link it into the new chain */ 753 *p = n; 754 p = &n->m_next; 755 756 /* Copy data from original mbuf(s) into new mbuf */ 757 while (n->m_len < nsize && m != NULL) { 758 int chunk = min(nsize - n->m_len, m->m_len - moff); 759 760 m_copydata(m, moff, chunk, n->m_data + n->m_len); 761 moff += chunk; 762 n->m_len += chunk; 763 remain -= chunk; 764 if (moff == m->m_len) { 765 m = m->m_next; 766 moff = 0; 767 } 768 } 769 770 /* Check correct total mbuf length */ 771 KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL), 772 ("%s: bogus m_pkthdr.len", __func__)); 773 } 774 return (top); 775 776 nospace: 777 m_freem(top); 778 return (NULL); 779 } 780 781 /* 782 * Concatenate mbuf chain n to m. 783 * Both chains must be of the same type (e.g. MT_DATA). 784 * Any m_pkthdr is not updated. 785 */ 786 void 787 m_cat(struct mbuf *m, struct mbuf *n) 788 { 789 while (m->m_next) 790 m = m->m_next; 791 while (n) { 792 if (!M_WRITABLE(m) || 793 (n->m_flags & M_EXTPG) != 0 || 794 M_TRAILINGSPACE(m) < n->m_len) { 795 /* just join the two chains */ 796 m->m_next = n; 797 return; 798 } 799 /* splat the data from one into the other */ 800 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, 801 (u_int)n->m_len); 802 m->m_len += n->m_len; 803 n = m_free(n); 804 } 805 } 806 807 /* 808 * Concatenate two pkthdr mbuf chains. 809 */ 810 void 811 m_catpkt(struct mbuf *m, struct mbuf *n) 812 { 813 814 M_ASSERTPKTHDR(m); 815 M_ASSERTPKTHDR(n); 816 817 m->m_pkthdr.len += n->m_pkthdr.len; 818 m_demote(n, 1, 0); 819 820 m_cat(m, n); 821 } 822 823 void 824 m_adj(struct mbuf *mp, int req_len) 825 { 826 int len = req_len; 827 struct mbuf *m; 828 int count; 829 830 if ((m = mp) == NULL) 831 return; 832 if (len >= 0) { 833 /* 834 * Trim from head. 835 */ 836 while (m != NULL && len > 0) { 837 if (m->m_len <= len) { 838 len -= m->m_len; 839 m->m_len = 0; 840 m = m->m_next; 841 } else { 842 m->m_len -= len; 843 m->m_data += len; 844 len = 0; 845 } 846 } 847 if (mp->m_flags & M_PKTHDR) 848 mp->m_pkthdr.len -= (req_len - len); 849 } else { 850 /* 851 * Trim from tail. Scan the mbuf chain, 852 * calculating its length and finding the last mbuf. 853 * If the adjustment only affects this mbuf, then just 854 * adjust and return. Otherwise, rescan and truncate 855 * after the remaining size. 
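		 *
		 * Worked example: for a chain of three mbufs with lengths
		 * 10, 10 and 10, m_adj(m, -15) leaves 15 bytes, so the
		 * second mbuf is trimmed to 5 bytes and the third is freed.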
		 */
		len = -len;
		count = 0;
		for (;;) {
			count += m->m_len;
			if (m->m_next == (struct mbuf *)0)
				break;
			m = m->m_next;
		}
		if (m->m_len >= len) {
			m->m_len -= len;
			if (mp->m_flags & M_PKTHDR)
				mp->m_pkthdr.len -= len;
			return;
		}
		count -= len;
		if (count < 0)
			count = 0;
		/*
		 * Correct length for chain is "count".
		 * Find the mbuf with last data, adjust its length,
		 * and toss data from remaining mbufs on chain.
		 */
		m = mp;
		if (m->m_flags & M_PKTHDR)
			m->m_pkthdr.len = count;
		for (; m; m = m->m_next) {
			if (m->m_len >= count) {
				m->m_len = count;
				if (m->m_next != NULL) {
					m_freem(m->m_next);
					m->m_next = NULL;
				}
				break;
			}
			count -= m->m_len;
		}
	}
}

void
m_adj_decap(struct mbuf *mp, int len)
{
	uint8_t rsstype;

	m_adj(mp, len);
	if ((mp->m_flags & M_PKTHDR) != 0) {
		/*
		 * If flowid was calculated by card from the inner
		 * headers, move flowid to the decapsulated mbuf
		 * chain, otherwise clear.  This depends on the
		 * internals of m_adj, which keeps pkthdr as is, in
		 * particular not changing rsstype and flowid.
		 */
		rsstype = mp->m_pkthdr.rsstype;
		if ((rsstype & M_HASHTYPE_INNER) != 0) {
			M_HASHTYPE_SET(mp, rsstype & ~M_HASHTYPE_INNER);
		} else {
			M_HASHTYPE_CLEAR(mp);
		}
	}
}

/*
 * Rearrange an mbuf chain so that len bytes are contiguous
 * and in the data area of an mbuf (so that mtod will work
 * for a structure of size len).  Returns the resulting
 * mbuf chain on success, frees it and returns null on failure.
 * If there is room, it will add up to max_protohdr-len extra bytes to the
 * contiguous region in an attempt to avoid being called next time.
 */
struct mbuf *
m_pullup(struct mbuf *n, int len)
{
	struct mbuf *m;
	int count;
	int space;

	KASSERT((n->m_flags & M_EXTPG) == 0,
	    ("%s: unmapped mbuf %p", __func__, n));

	/*
	 * If first mbuf has no cluster, and has room for len bytes
	 * without shifting current data, pullup into it,
	 * otherwise allocate a new mbuf to prepend to the chain.
	 */
	if ((n->m_flags & M_EXT) == 0 &&
	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
		if (n->m_len >= len)
			return (n);
		m = n;
		n = n->m_next;
		len -= m->m_len;
	} else {
		if (len > MHLEN)
			goto bad;
		m = m_get(M_NOWAIT, n->m_type);
		if (m == NULL)
			goto bad;
		if (n->m_flags & M_PKTHDR)
			m_move_pkthdr(m, n);
	}
	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	do {
		count = min(min(max(len, max_protohdr), space), n->m_len);
		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
		    (u_int)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len)
			n->m_data += count;
		else
			n = m_free(n);
	} while (len > 0 && n);
	if (len > 0) {
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return (m);
bad:
	m_freem(n);
	return (NULL);
}

/*
 * Like m_pullup(), except a new mbuf is always allocated, and we allow
 * the amount of empty space before the data in the new mbuf to be specified
 * (in the event that the caller expects to prepend later).
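 *
 * The classic m_pullup() idiom, shown here only as a sketch, makes a
 * protocol header contiguous before it is cast with mtod():
 *
 *	if (m->m_len < sizeof(struct ip) &&
 *	    (m = m_pullup(m, sizeof(struct ip))) == NULL)
 *		return;			/* chain was freed on failure */
 *	ip = mtod(m, struct ip *);
 *
 * m_copyup() serves the same purpose when the caller also wants "dstoff"
 * bytes of leading space for headers it intends to prepend.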
987 */ 988 struct mbuf * 989 m_copyup(struct mbuf *n, int len, int dstoff) 990 { 991 struct mbuf *m; 992 int count, space; 993 994 if (len > (MHLEN - dstoff)) 995 goto bad; 996 m = m_get(M_NOWAIT, n->m_type); 997 if (m == NULL) 998 goto bad; 999 if (n->m_flags & M_PKTHDR) 1000 m_move_pkthdr(m, n); 1001 m->m_data += dstoff; 1002 space = &m->m_dat[MLEN] - (m->m_data + m->m_len); 1003 do { 1004 count = min(min(max(len, max_protohdr), space), n->m_len); 1005 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t), 1006 (unsigned)count); 1007 len -= count; 1008 m->m_len += count; 1009 n->m_len -= count; 1010 space -= count; 1011 if (n->m_len) 1012 n->m_data += count; 1013 else 1014 n = m_free(n); 1015 } while (len > 0 && n); 1016 if (len > 0) { 1017 (void) m_free(m); 1018 goto bad; 1019 } 1020 m->m_next = n; 1021 return (m); 1022 bad: 1023 m_freem(n); 1024 return (NULL); 1025 } 1026 1027 /* 1028 * Partition an mbuf chain in two pieces, returning the tail -- 1029 * all but the first len0 bytes. In case of failure, it returns NULL and 1030 * attempts to restore the chain to its original state. 1031 * 1032 * Note that the resulting mbufs might be read-only, because the new 1033 * mbuf can end up sharing an mbuf cluster with the original mbuf if 1034 * the "breaking point" happens to lie within a cluster mbuf. Use the 1035 * M_WRITABLE() macro to check for this case. 1036 */ 1037 struct mbuf * 1038 m_split(struct mbuf *m0, int len0, int wait) 1039 { 1040 struct mbuf *m, *n; 1041 u_int len = len0, remain; 1042 1043 MBUF_CHECKSLEEP(wait); 1044 for (m = m0; m && len > m->m_len; m = m->m_next) 1045 len -= m->m_len; 1046 if (m == NULL) 1047 return (NULL); 1048 remain = m->m_len - len; 1049 if (m0->m_flags & M_PKTHDR && remain == 0) { 1050 n = m_gethdr(wait, m0->m_type); 1051 if (n == NULL) 1052 return (NULL); 1053 n->m_next = m->m_next; 1054 m->m_next = NULL; 1055 if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) { 1056 n->m_pkthdr.snd_tag = 1057 m_snd_tag_ref(m0->m_pkthdr.snd_tag); 1058 n->m_pkthdr.csum_flags |= CSUM_SND_TAG; 1059 } else 1060 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; 1061 n->m_pkthdr.len = m0->m_pkthdr.len - len0; 1062 m0->m_pkthdr.len = len0; 1063 return (n); 1064 } else if (m0->m_flags & M_PKTHDR) { 1065 n = m_gethdr(wait, m0->m_type); 1066 if (n == NULL) 1067 return (NULL); 1068 if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) { 1069 n->m_pkthdr.snd_tag = 1070 m_snd_tag_ref(m0->m_pkthdr.snd_tag); 1071 n->m_pkthdr.csum_flags |= CSUM_SND_TAG; 1072 } else 1073 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; 1074 n->m_pkthdr.len = m0->m_pkthdr.len - len0; 1075 m0->m_pkthdr.len = len0; 1076 if (m->m_flags & (M_EXT | M_EXTPG)) 1077 goto extpacket; 1078 if (remain > MHLEN) { 1079 /* m can't be the lead packet */ 1080 M_ALIGN(n, 0); 1081 n->m_next = m_split(m, len, wait); 1082 if (n->m_next == NULL) { 1083 (void) m_free(n); 1084 return (NULL); 1085 } else { 1086 n->m_len = 0; 1087 return (n); 1088 } 1089 } else 1090 M_ALIGN(n, remain); 1091 } else if (remain == 0) { 1092 n = m->m_next; 1093 m->m_next = NULL; 1094 return (n); 1095 } else { 1096 n = m_get(wait, m->m_type); 1097 if (n == NULL) 1098 return (NULL); 1099 M_ALIGN(n, remain); 1100 } 1101 extpacket: 1102 if (m->m_flags & (M_EXT | M_EXTPG)) { 1103 n->m_data = m->m_data + len; 1104 mb_dupcl(n, m); 1105 } else { 1106 bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); 1107 } 1108 n->m_len = remain; 1109 m->m_len = len; 1110 n->m_next = m->m_next; 1111 m->m_next = NULL; 1112 return (n); 1113 } 1114 1115 /* 1116 * Partition mchain in two pieces, keeping 
len0 bytes in head and transferring 1117 * remainder to tail. In case of failure, both chains to be left untouched. 1118 * M_EOR is observed correctly. 1119 * Resulting mbufs might be read-only. 1120 */ 1121 int 1122 mc_split(struct mchain *head, struct mchain *tail, u_int len0, int wait) 1123 { 1124 struct mbuf *m, *n; 1125 u_int len, mlen, remain; 1126 1127 MPASS(!(mc_first(head)->m_flags & M_PKTHDR)); 1128 MBUF_CHECKSLEEP(wait); 1129 1130 mlen = 0; 1131 len = len0; 1132 STAILQ_FOREACH(m, &head->mc_q, m_stailq) { 1133 mlen += MSIZE; 1134 if (m->m_flags & M_EXT) 1135 mlen += m->m_ext.ext_size; 1136 if (len > m->m_len) 1137 len -= m->m_len; 1138 else 1139 break; 1140 } 1141 if (__predict_false(m == NULL)) { 1142 *tail = MCHAIN_INITIALIZER(tail); 1143 return (0); 1144 } 1145 remain = m->m_len - len; 1146 if (remain > 0) { 1147 if (__predict_false((n = m_get(wait, m->m_type)) == NULL)) 1148 return (ENOMEM); 1149 m_align(n, remain); 1150 if (m->m_flags & M_EXT) { 1151 n->m_data = m->m_data + len; 1152 mb_dupcl(n, m); 1153 } else 1154 bcopy(mtod(m, char *) + len, mtod(n, char *), remain); 1155 } 1156 1157 /* XXXGL: need STAILQ_SPLIT */ 1158 STAILQ_FIRST(&tail->mc_q) = STAILQ_NEXT(m, m_stailq); 1159 tail->mc_q.stqh_last = head->mc_q.stqh_last; 1160 tail->mc_len = head->mc_len - len0; 1161 tail->mc_mlen = head->mc_mlen - mlen; 1162 if (remain > 0) { 1163 MPASS(n->m_len == 0); 1164 mc_prepend(tail, n); 1165 n->m_len = remain; 1166 m->m_len -= remain; 1167 if (m->m_flags & M_EOR) { 1168 m->m_flags &= ~M_EOR; 1169 n->m_flags |= M_EOR; 1170 } 1171 } 1172 head->mc_q.stqh_last = &STAILQ_NEXT(m, m_stailq); 1173 STAILQ_NEXT(m, m_stailq) = NULL; 1174 head->mc_len = len0; 1175 head->mc_mlen = mlen; 1176 1177 return (0); 1178 } 1179 1180 /* 1181 * Routine to copy from device local memory into mbufs. 1182 * Note that `off' argument is offset into first mbuf of target chain from 1183 * which to begin copying the data to. 
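 *
 * Historically (sketch only; "sc" and "pktlen" stand for the driver's
 * own softc and frame length) a non-DMA driver would copy a received
 * frame out of board memory in its interrupt handler with:
 *
 *	m = m_devget(sc->rx_buf, pktlen, ETHER_ALIGN, ifp, NULL);
 *	if (m != NULL)
 *		(*ifp->if_input)(ifp, m);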
1184 */ 1185 struct mbuf * 1186 m_devget(char *buf, int totlen, int off, struct ifnet *ifp, 1187 void (*copy)(char *from, caddr_t to, u_int len)) 1188 { 1189 struct mbuf *m; 1190 struct mbuf *top = NULL, **mp = ⊤ 1191 int len; 1192 1193 if (off < 0 || off > MHLEN) 1194 return (NULL); 1195 1196 while (totlen > 0) { 1197 if (top == NULL) { /* First one, must be PKTHDR */ 1198 if (totlen + off >= MINCLSIZE) { 1199 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 1200 len = MCLBYTES; 1201 } else { 1202 m = m_gethdr(M_NOWAIT, MT_DATA); 1203 len = MHLEN; 1204 1205 /* Place initial small packet/header at end of mbuf */ 1206 if (m && totlen + off + max_linkhdr <= MHLEN) { 1207 m->m_data += max_linkhdr; 1208 len -= max_linkhdr; 1209 } 1210 } 1211 if (m == NULL) 1212 return NULL; 1213 m->m_pkthdr.rcvif = ifp; 1214 m->m_pkthdr.len = totlen; 1215 } else { 1216 if (totlen + off >= MINCLSIZE) { 1217 m = m_getcl(M_NOWAIT, MT_DATA, 0); 1218 len = MCLBYTES; 1219 } else { 1220 m = m_get(M_NOWAIT, MT_DATA); 1221 len = MLEN; 1222 } 1223 if (m == NULL) { 1224 m_freem(top); 1225 return NULL; 1226 } 1227 } 1228 if (off) { 1229 m->m_data += off; 1230 len -= off; 1231 off = 0; 1232 } 1233 m->m_len = len = min(totlen, len); 1234 if (copy) 1235 copy(buf, mtod(m, caddr_t), (u_int)len); 1236 else 1237 bcopy(buf, mtod(m, caddr_t), (u_int)len); 1238 buf += len; 1239 *mp = m; 1240 mp = &m->m_next; 1241 totlen -= len; 1242 } 1243 return (top); 1244 } 1245 1246 static void 1247 m_copytounmapped(const struct mbuf *m, int off, int len, c_caddr_t cp) 1248 { 1249 struct iovec iov; 1250 struct uio uio; 1251 int error __diagused; 1252 1253 KASSERT(off >= 0, ("m_copytounmapped: negative off %d", off)); 1254 KASSERT(len >= 0, ("m_copytounmapped: negative len %d", len)); 1255 KASSERT(off < m->m_len, ("m_copytounmapped: len exceeds mbuf length")); 1256 iov.iov_base = __DECONST(caddr_t, cp); 1257 iov.iov_len = len; 1258 uio.uio_resid = len; 1259 uio.uio_iov = &iov; 1260 uio.uio_segflg = UIO_SYSSPACE; 1261 uio.uio_iovcnt = 1; 1262 uio.uio_offset = 0; 1263 uio.uio_rw = UIO_WRITE; 1264 error = m_unmapped_uiomove(m, off, &uio, len); 1265 KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off, 1266 len)); 1267 } 1268 1269 /* 1270 * Copy data from a buffer back into the indicated mbuf chain, 1271 * starting "off" bytes from the beginning, extending the mbuf 1272 * chain if necessary. 
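 *
 * Illustrative use only ("csum" and "csum_off" are the caller's
 * variables): overwriting a 16-bit checksum field at a known offset in
 * an already constructed packet:
 *
 *	m_copyback(m, csum_off, sizeof(uint16_t), (c_caddr_t)&csum);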
1273 */ 1274 void 1275 m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp) 1276 { 1277 int mlen; 1278 struct mbuf *m = m0, *n; 1279 int totlen = 0; 1280 1281 if (m0 == NULL) 1282 return; 1283 while (off > (mlen = m->m_len)) { 1284 off -= mlen; 1285 totlen += mlen; 1286 if (m->m_next == NULL) { 1287 n = m_get(M_NOWAIT, m->m_type); 1288 if (n == NULL) 1289 goto out; 1290 bzero(mtod(n, caddr_t), MLEN); 1291 n->m_len = min(MLEN, len + off); 1292 m->m_next = n; 1293 } 1294 m = m->m_next; 1295 } 1296 while (len > 0) { 1297 if (m->m_next == NULL && (len > m->m_len - off)) { 1298 m->m_len += min(len - (m->m_len - off), 1299 M_TRAILINGSPACE(m)); 1300 } 1301 mlen = min (m->m_len - off, len); 1302 if ((m->m_flags & M_EXTPG) != 0) 1303 m_copytounmapped(m, off, mlen, cp); 1304 else 1305 bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen); 1306 cp += mlen; 1307 len -= mlen; 1308 mlen += off; 1309 off = 0; 1310 totlen += mlen; 1311 if (len == 0) 1312 break; 1313 if (m->m_next == NULL) { 1314 n = m_get(M_NOWAIT, m->m_type); 1315 if (n == NULL) 1316 break; 1317 n->m_len = min(MLEN, len); 1318 m->m_next = n; 1319 } 1320 m = m->m_next; 1321 } 1322 out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) 1323 m->m_pkthdr.len = totlen; 1324 } 1325 1326 /* 1327 * Append the specified data to the indicated mbuf chain, 1328 * Extend the mbuf chain if the new data does not fit in 1329 * existing space. 1330 * 1331 * Return 1 if able to complete the job; otherwise 0. 1332 */ 1333 int 1334 m_append(struct mbuf *m0, int len, c_caddr_t cp) 1335 { 1336 struct mbuf *m, *n; 1337 int remainder, space; 1338 1339 for (m = m0; m->m_next != NULL; m = m->m_next) 1340 ; 1341 remainder = len; 1342 space = M_TRAILINGSPACE(m); 1343 if (space > 0) { 1344 /* 1345 * Copy into available space. 1346 */ 1347 if (space > remainder) 1348 space = remainder; 1349 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 1350 m->m_len += space; 1351 cp += space, remainder -= space; 1352 } 1353 while (remainder > 0) { 1354 /* 1355 * Allocate a new mbuf; could check space 1356 * and allocate a cluster instead. 
1357 */ 1358 n = m_get(M_NOWAIT, m->m_type); 1359 if (n == NULL) 1360 break; 1361 n->m_len = min(MLEN, remainder); 1362 bcopy(cp, mtod(n, caddr_t), n->m_len); 1363 cp += n->m_len, remainder -= n->m_len; 1364 m->m_next = n; 1365 m = n; 1366 } 1367 if (m0->m_flags & M_PKTHDR) 1368 m0->m_pkthdr.len += len - remainder; 1369 return (remainder == 0); 1370 } 1371 1372 static int 1373 m_apply_extpg_one(struct mbuf *m, int off, int len, 1374 int (*f)(void *, void *, u_int), void *arg) 1375 { 1376 void *p; 1377 u_int i, count, pgoff, pglen; 1378 int rval; 1379 1380 KASSERT(PMAP_HAS_DMAP, 1381 ("m_apply_extpg_one does not support unmapped mbufs")); 1382 off += mtod(m, vm_offset_t); 1383 if (off < m->m_epg_hdrlen) { 1384 count = min(m->m_epg_hdrlen - off, len); 1385 rval = f(arg, m->m_epg_hdr + off, count); 1386 if (rval) 1387 return (rval); 1388 len -= count; 1389 off = 0; 1390 } else 1391 off -= m->m_epg_hdrlen; 1392 pgoff = m->m_epg_1st_off; 1393 for (i = 0; i < m->m_epg_npgs && len > 0; i++) { 1394 pglen = m_epg_pagelen(m, i, pgoff); 1395 if (off < pglen) { 1396 count = min(pglen - off, len); 1397 p = (void *)PHYS_TO_DMAP(m->m_epg_pa[i] + pgoff + off); 1398 rval = f(arg, p, count); 1399 if (rval) 1400 return (rval); 1401 len -= count; 1402 off = 0; 1403 } else 1404 off -= pglen; 1405 pgoff = 0; 1406 } 1407 if (len > 0) { 1408 KASSERT(off < m->m_epg_trllen, 1409 ("m_apply_extpg_one: offset beyond trailer")); 1410 KASSERT(len <= m->m_epg_trllen - off, 1411 ("m_apply_extpg_one: length beyond trailer")); 1412 return (f(arg, m->m_epg_trail + off, len)); 1413 } 1414 return (0); 1415 } 1416 1417 /* Apply function f to the data in a single mbuf. */ 1418 static int 1419 m_apply_one(struct mbuf *m, int off, int len, 1420 int (*f)(void *, void *, u_int), void *arg) 1421 { 1422 if ((m->m_flags & M_EXTPG) != 0) 1423 return (m_apply_extpg_one(m, off, len, f, arg)); 1424 else 1425 return (f(arg, mtod(m, caddr_t) + off, len)); 1426 } 1427 1428 /* 1429 * Apply function f to the data in an mbuf chain starting "off" bytes from 1430 * the beginning, continuing for "len" bytes. 1431 */ 1432 int 1433 m_apply(struct mbuf *m, int off, int len, 1434 int (*f)(void *, void *, u_int), void *arg) 1435 { 1436 u_int count; 1437 int rval; 1438 1439 KASSERT(off >= 0, ("m_apply, negative off %d", off)); 1440 KASSERT(len >= 0, ("m_apply, negative len %d", len)); 1441 while (off > 0) { 1442 KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); 1443 if (off < m->m_len) 1444 break; 1445 off -= m->m_len; 1446 m = m->m_next; 1447 } 1448 while (len > 0) { 1449 KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); 1450 count = min(m->m_len - off, len); 1451 rval = m_apply_one(m, off, count, f, arg); 1452 if (rval) 1453 return (rval); 1454 len -= count; 1455 off = 0; 1456 m = m->m_next; 1457 } 1458 return (0); 1459 } 1460 1461 /* 1462 * Return a pointer to mbuf/offset of location in mbuf chain. 1463 */ 1464 struct mbuf * 1465 m_getptr(struct mbuf *m, int loc, int *off) 1466 { 1467 1468 while (loc >= 0) { 1469 /* Normal end of search. */ 1470 if (m->m_len > loc) { 1471 *off = loc; 1472 return (m); 1473 } else { 1474 loc -= m->m_len; 1475 if (m->m_next == NULL) { 1476 if (loc == 0) { 1477 /* Point at the end of valid data. 
*/ 1478 *off = m->m_len; 1479 return (m); 1480 } 1481 return (NULL); 1482 } 1483 m = m->m_next; 1484 } 1485 } 1486 return (NULL); 1487 } 1488 1489 void 1490 m_print(const struct mbuf *m, int maxlen) 1491 { 1492 int len; 1493 int pdata; 1494 const struct mbuf *m2; 1495 1496 if (m == NULL) { 1497 printf("mbuf: %p\n", m); 1498 return; 1499 } 1500 1501 if (m->m_flags & M_PKTHDR) 1502 len = m->m_pkthdr.len; 1503 else 1504 len = -1; 1505 m2 = m; 1506 while (m2 != NULL && (len == -1 || len)) { 1507 pdata = m2->m_len; 1508 if (maxlen != -1 && pdata > maxlen) 1509 pdata = maxlen; 1510 printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len, 1511 m2->m_next, m2->m_flags, "\20\20freelist\17skipfw" 1512 "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly" 1513 "\3eor\2pkthdr\1ext", pdata ? "" : "\n"); 1514 if (pdata) 1515 printf(", %*D\n", pdata, (u_char *)m2->m_data, "-"); 1516 if (len != -1) 1517 len -= m2->m_len; 1518 m2 = m2->m_next; 1519 } 1520 if (len > 0) 1521 printf("%d bytes unaccounted for.\n", len); 1522 return; 1523 } 1524 1525 u_int 1526 m_fixhdr(struct mbuf *m0) 1527 { 1528 u_int len; 1529 1530 len = m_length(m0, NULL); 1531 m0->m_pkthdr.len = len; 1532 return (len); 1533 } 1534 1535 u_int 1536 m_length(struct mbuf *m0, struct mbuf **last) 1537 { 1538 struct mbuf *m; 1539 u_int len; 1540 1541 len = 0; 1542 for (m = m0; m != NULL; m = m->m_next) { 1543 len += m->m_len; 1544 if (m->m_next == NULL) 1545 break; 1546 } 1547 if (last != NULL) 1548 *last = m; 1549 return (len); 1550 } 1551 1552 /* 1553 * Defragment a mbuf chain, returning the shortest possible 1554 * chain of mbufs and clusters. If allocation fails and 1555 * this cannot be completed, NULL will be returned, but 1556 * the passed in chain will be unchanged. Upon success, 1557 * the original chain will be freed, and the new chain 1558 * will be returned. 1559 * 1560 * If a non-packet header is passed in, the original 1561 * mbuf (chain?) will be returned unharmed. 
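 *
 * A common illustrative use is a driver transmit path that cannot map a
 * long chain for DMA and compacts it before retrying (dropping the
 * packet if even that fails):
 *
 *	n = m_defrag(m, M_NOWAIT);
 *	if (n == NULL) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 *	m = n;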
1562 */ 1563 struct mbuf * 1564 m_defrag(struct mbuf *m0, int how) 1565 { 1566 struct mbuf *m_new = NULL, *m_final = NULL; 1567 int progress = 0, length; 1568 1569 MBUF_CHECKSLEEP(how); 1570 if (!(m0->m_flags & M_PKTHDR)) 1571 return (m0); 1572 1573 m_fixhdr(m0); /* Needed sanity check */ 1574 1575 #ifdef MBUF_STRESS_TEST 1576 if (m_defragrandomfailures) { 1577 int temp = arc4random() & 0xff; 1578 if (temp == 0xba) 1579 goto nospace; 1580 } 1581 #endif 1582 1583 if (m0->m_pkthdr.len > MHLEN) 1584 m_final = m_getcl(how, MT_DATA, M_PKTHDR); 1585 else 1586 m_final = m_gethdr(how, MT_DATA); 1587 1588 if (m_final == NULL) 1589 goto nospace; 1590 1591 if (m_dup_pkthdr(m_final, m0, how) == 0) 1592 goto nospace; 1593 1594 m_new = m_final; 1595 1596 while (progress < m0->m_pkthdr.len) { 1597 length = m0->m_pkthdr.len - progress; 1598 if (length > MCLBYTES) 1599 length = MCLBYTES; 1600 1601 if (m_new == NULL) { 1602 if (length > MLEN) 1603 m_new = m_getcl(how, MT_DATA, 0); 1604 else 1605 m_new = m_get(how, MT_DATA); 1606 if (m_new == NULL) 1607 goto nospace; 1608 } 1609 1610 m_copydata(m0, progress, length, mtod(m_new, caddr_t)); 1611 progress += length; 1612 m_new->m_len = length; 1613 if (m_new != m_final) 1614 m_cat(m_final, m_new); 1615 m_new = NULL; 1616 } 1617 #ifdef MBUF_STRESS_TEST 1618 if (m0->m_next == NULL) 1619 m_defraguseless++; 1620 #endif 1621 m_freem(m0); 1622 m0 = m_final; 1623 #ifdef MBUF_STRESS_TEST 1624 m_defragpackets++; 1625 m_defragbytes += m0->m_pkthdr.len; 1626 #endif 1627 return (m0); 1628 nospace: 1629 #ifdef MBUF_STRESS_TEST 1630 m_defragfailure++; 1631 #endif 1632 if (m_final) 1633 m_freem(m_final); 1634 return (NULL); 1635 } 1636 1637 /* 1638 * Return the number of fragments an mbuf will use. This is usually 1639 * used as a proxy for the number of scatter/gather elements needed by 1640 * a DMA engine to access an mbuf. In general mapped mbufs are 1641 * assumed to be backed by physically contiguous buffers that only 1642 * need a single fragment. Unmapped mbufs, on the other hand, can 1643 * span disjoint physical pages. 1644 */ 1645 static int 1646 frags_per_mbuf(struct mbuf *m) 1647 { 1648 int frags; 1649 1650 if ((m->m_flags & M_EXTPG) == 0) 1651 return (1); 1652 1653 /* 1654 * The header and trailer are counted as a single fragment 1655 * each when present. 1656 * 1657 * XXX: This overestimates the number of fragments by assuming 1658 * all the backing physical pages are disjoint. 1659 */ 1660 frags = 0; 1661 if (m->m_epg_hdrlen != 0) 1662 frags++; 1663 frags += m->m_epg_npgs; 1664 if (m->m_epg_trllen != 0) 1665 frags++; 1666 1667 return (frags); 1668 } 1669 1670 /* 1671 * Defragment an mbuf chain, returning at most maxfrags separate 1672 * mbufs+clusters. If this is not possible NULL is returned and 1673 * the original mbuf chain is left in its present (potentially 1674 * modified) state. We use two techniques: collapsing consecutive 1675 * mbufs and replacing consecutive mbufs by a cluster. 1676 * 1677 * NB: this should really be named m_defrag but that name is taken 1678 */ 1679 struct mbuf * 1680 m_collapse(struct mbuf *m0, int how, int maxfrags) 1681 { 1682 struct mbuf *m, *n, *n2, **prev; 1683 u_int curfrags; 1684 1685 /* 1686 * Calculate the current number of frags. 1687 */ 1688 curfrags = 0; 1689 for (m = m0; m != NULL; m = m->m_next) 1690 curfrags += frags_per_mbuf(m); 1691 /* 1692 * First, try to collapse mbufs. Note that we always collapse 1693 * towards the front so we don't need to deal with moving the 1694 * pkthdr. 
This may be suboptimal if the first mbuf has much 1695 * less data than the following. 1696 */ 1697 m = m0; 1698 again: 1699 for (;;) { 1700 n = m->m_next; 1701 if (n == NULL) 1702 break; 1703 if (M_WRITABLE(m) && 1704 n->m_len < M_TRAILINGSPACE(m)) { 1705 m_copydata(n, 0, n->m_len, 1706 mtod(m, char *) + m->m_len); 1707 m->m_len += n->m_len; 1708 m->m_next = n->m_next; 1709 curfrags -= frags_per_mbuf(n); 1710 m_free(n); 1711 if (curfrags <= maxfrags) 1712 return m0; 1713 } else 1714 m = n; 1715 } 1716 KASSERT(maxfrags > 1, 1717 ("maxfrags %u, but normal collapse failed", maxfrags)); 1718 /* 1719 * Collapse consecutive mbufs to a cluster. 1720 */ 1721 prev = &m0->m_next; /* NB: not the first mbuf */ 1722 while ((n = *prev) != NULL) { 1723 if ((n2 = n->m_next) != NULL && 1724 n->m_len + n2->m_len < MCLBYTES) { 1725 m = m_getcl(how, MT_DATA, 0); 1726 if (m == NULL) 1727 goto bad; 1728 m_copydata(n, 0, n->m_len, mtod(m, char *)); 1729 m_copydata(n2, 0, n2->m_len, 1730 mtod(m, char *) + n->m_len); 1731 m->m_len = n->m_len + n2->m_len; 1732 m->m_next = n2->m_next; 1733 *prev = m; 1734 curfrags += 1; /* For the new cluster */ 1735 curfrags -= frags_per_mbuf(n); 1736 curfrags -= frags_per_mbuf(n2); 1737 m_free(n); 1738 m_free(n2); 1739 if (curfrags <= maxfrags) 1740 return m0; 1741 /* 1742 * Still not there, try the normal collapse 1743 * again before we allocate another cluster. 1744 */ 1745 goto again; 1746 } 1747 prev = &n->m_next; 1748 } 1749 /* 1750 * No place where we can collapse to a cluster; punt. 1751 * This can occur if, for example, you request 2 frags 1752 * but the packet requires that both be clusters (we 1753 * never reallocate the first mbuf to avoid moving the 1754 * packet header). 1755 */ 1756 bad: 1757 return NULL; 1758 } 1759 1760 #ifdef MBUF_STRESS_TEST 1761 1762 /* 1763 * Fragment an mbuf chain. There's no reason you'd ever want to do 1764 * this in normal usage, but it's great for stress testing various 1765 * mbuf consumers. 1766 * 1767 * If fragmentation is not possible, the original chain will be 1768 * returned. 
1769 * 1770 * Possible length values: 1771 * 0 no fragmentation will occur 1772 * > 0 each fragment will be of the specified length 1773 * -1 each fragment will be the same random value in length 1774 * -2 each fragment's length will be entirely random 1775 * (Random values range from 1 to 256) 1776 */ 1777 struct mbuf * 1778 m_fragment(struct mbuf *m0, int how, int length) 1779 { 1780 struct mbuf *m_first, *m_last; 1781 int divisor = 255, progress = 0, fraglen; 1782 1783 if (!(m0->m_flags & M_PKTHDR)) 1784 return (m0); 1785 1786 if (length == 0 || length < -2) 1787 return (m0); 1788 if (length > MCLBYTES) 1789 length = MCLBYTES; 1790 if (length < 0 && divisor > MCLBYTES) 1791 divisor = MCLBYTES; 1792 if (length == -1) 1793 length = 1 + (arc4random() % divisor); 1794 if (length > 0) 1795 fraglen = length; 1796 1797 m_fixhdr(m0); /* Needed sanity check */ 1798 1799 m_first = m_getcl(how, MT_DATA, M_PKTHDR); 1800 if (m_first == NULL) 1801 goto nospace; 1802 1803 if (m_dup_pkthdr(m_first, m0, how) == 0) 1804 goto nospace; 1805 1806 m_last = m_first; 1807 1808 while (progress < m0->m_pkthdr.len) { 1809 if (length == -2) 1810 fraglen = 1 + (arc4random() % divisor); 1811 if (fraglen > m0->m_pkthdr.len - progress) 1812 fraglen = m0->m_pkthdr.len - progress; 1813 1814 if (progress != 0) { 1815 struct mbuf *m_new = m_getcl(how, MT_DATA, 0); 1816 if (m_new == NULL) 1817 goto nospace; 1818 1819 m_last->m_next = m_new; 1820 m_last = m_new; 1821 } 1822 1823 m_copydata(m0, progress, fraglen, mtod(m_last, caddr_t)); 1824 progress += fraglen; 1825 m_last->m_len = fraglen; 1826 } 1827 m_freem(m0); 1828 m0 = m_first; 1829 return (m0); 1830 nospace: 1831 if (m_first) 1832 m_freem(m_first); 1833 /* Return the original chain on failure */ 1834 return (m0); 1835 } 1836 1837 #endif 1838 1839 /* 1840 * Free pages from mbuf_ext_pgs, assuming they were allocated via 1841 * vm_page_alloc() and aren't associated with any object. Complement 1842 * to allocator from m_uiotombuf_nomap(). 1843 */ 1844 void 1845 mb_free_mext_pgs(struct mbuf *m) 1846 { 1847 vm_page_t pg; 1848 1849 M_ASSERTEXTPG(m); 1850 for (int i = 0; i < m->m_epg_npgs; i++) { 1851 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); 1852 vm_page_unwire_noq(pg); 1853 vm_page_free(pg); 1854 } 1855 } 1856 1857 static struct mbuf * 1858 m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags) 1859 { 1860 struct mbuf *m, *mb, *prev; 1861 vm_page_t pg_array[MBUF_PEXT_MAX_PGS]; 1862 int error, length, i, needed; 1863 ssize_t total; 1864 int pflags = malloc2vm_flags(how) | VM_ALLOC_NODUMP | VM_ALLOC_WIRED; 1865 1866 MPASS((flags & M_PKTHDR) == 0); 1867 MPASS((how & M_ZERO) == 0); 1868 1869 /* 1870 * len can be zero or an arbitrary large value bound by 1871 * the total data supplied by the uio. 1872 */ 1873 if (len > 0) 1874 total = MIN(uio->uio_resid, len); 1875 else 1876 total = uio->uio_resid; 1877 1878 if (maxseg == 0) 1879 maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE; 1880 1881 /* 1882 * If total is zero, return an empty mbuf. This can occur 1883 * for TLS 1.0 connections which send empty fragments as 1884 * a countermeasure against the known-IV weakness in CBC 1885 * ciphersuites. 
1886 */ 1887 if (__predict_false(total == 0)) { 1888 mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs); 1889 if (mb == NULL) 1890 return (NULL); 1891 mb->m_epg_flags = EPG_FLAG_ANON; 1892 return (mb); 1893 } 1894 1895 /* 1896 * Allocate the pages 1897 */ 1898 m = NULL; 1899 while (total > 0) { 1900 mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs); 1901 if (mb == NULL) 1902 goto failed; 1903 if (m == NULL) 1904 m = mb; 1905 else 1906 prev->m_next = mb; 1907 prev = mb; 1908 mb->m_epg_flags = EPG_FLAG_ANON; 1909 needed = length = MIN(maxseg, total); 1910 for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) { 1911 retry_page: 1912 pg_array[i] = vm_page_alloc_noobj(pflags); 1913 if (pg_array[i] == NULL) { 1914 if (how & M_NOWAIT) { 1915 goto failed; 1916 } else { 1917 vm_wait(NULL); 1918 goto retry_page; 1919 } 1920 } 1921 mb->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg_array[i]); 1922 mb->m_epg_npgs++; 1923 } 1924 mb->m_epg_last_len = length - PAGE_SIZE * (mb->m_epg_npgs - 1); 1925 MBUF_EXT_PGS_ASSERT_SANITY(mb); 1926 total -= length; 1927 error = uiomove_fromphys(pg_array, 0, length, uio); 1928 if (error != 0) 1929 goto failed; 1930 mb->m_len = length; 1931 mb->m_ext.ext_size += PAGE_SIZE * mb->m_epg_npgs; 1932 if (flags & M_PKTHDR) 1933 m->m_pkthdr.len += length; 1934 } 1935 return (m); 1936 1937 failed: 1938 m_freem(m); 1939 return (NULL); 1940 } 1941 1942 /* 1943 * Copy the contents of uio into a properly sized mbuf chain. 1944 * A compat KPI. Users are recommended to use direct calls to backing 1945 * functions. 1946 */ 1947 struct mbuf * 1948 m_uiotombuf(struct uio *uio, int how, int len, int lspace, int flags) 1949 { 1950 1951 if (flags & M_EXTPG) { 1952 /* XXX: 'lspace' magically becomes maxseg! */ 1953 return (m_uiotombuf_nomap(uio, how, len, lspace, flags)); 1954 } else if (__predict_false(uio->uio_resid == 0)) { 1955 struct mbuf *m; 1956 1957 /* 1958 * m_uiotombuf() is known to return zero length buffer, keep 1959 * this compatibility. mc_uiotomc() won't do that. 1960 */ 1961 if (flags & M_PKTHDR) { 1962 m = m_gethdr(how, MT_DATA); 1963 m->m_pkthdr.memlen = MSIZE; 1964 } else 1965 m = m_get(how, MT_DATA); 1966 if (m != NULL) 1967 m->m_data += lspace; 1968 return (m); 1969 } else { 1970 struct mchain mc; 1971 int error; 1972 1973 error = mc_uiotomc(&mc, uio, len, lspace, how, flags); 1974 if (__predict_true(error == 0)) { 1975 if (flags & M_PKTHDR) { 1976 mc_first(&mc)->m_pkthdr.len = mc.mc_len; 1977 mc_first(&mc)->m_pkthdr.memlen = mc.mc_mlen; 1978 } 1979 return (mc_first(&mc)); 1980 } else 1981 return (NULL); 1982 } 1983 } 1984 1985 /* 1986 * Copy the contents of uio into a properly sized mbuf chain. 1987 * In case of failure state of mchain is inconsistent. 1988 * @param length Limit copyout length. If 0 entire uio_resid is copied. 1989 * @param lspace Provide leading space in the first mbuf in the chain. 
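 *
 * Illustrative call only ("space" stands for whatever limit the caller
 * computed, e.g. available socket buffer space):
 *
 *	struct mchain mc;
 *
 *	error = mc_uiotomc(&mc, uio, space, max_hdr, M_WAITOK, 0);
 *	if (error != 0)
 *		return (error);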
1990 */ 1991 int 1992 mc_uiotomc(struct mchain *mc, struct uio *uio, u_int length, u_int lspace, 1993 int how, int flags) 1994 { 1995 struct mbuf *mb; 1996 u_int total; 1997 int error; 1998 1999 MPASS(lspace < MHLEN); 2000 MPASS(UINT_MAX - lspace >= length); 2001 MPASS(uio->uio_rw == UIO_WRITE); 2002 MPASS(uio->uio_resid >= 0); 2003 2004 if (length > 0) { 2005 if (uio->uio_resid > length) { 2006 total = length; 2007 flags &= ~M_EOR; 2008 } else 2009 total = uio->uio_resid; 2010 } else if (__predict_false(uio->uio_resid + lspace > UINT_MAX)) 2011 return (EOVERFLOW); 2012 else 2013 total = uio->uio_resid; 2014 2015 if (__predict_false(total + lspace == 0)) { 2016 *mc = MCHAIN_INITIALIZER(mc); 2017 return (0); 2018 } 2019 2020 error = mc_get(mc, total + lspace, how, MT_DATA, flags); 2021 if (__predict_false(error)) 2022 return (error); 2023 mc_first(mc)->m_data += lspace; 2024 2025 /* Fill all mbufs with uio data and update header information. */ 2026 STAILQ_FOREACH(mb, &mc->mc_q, m_stailq) { 2027 u_int mlen; 2028 2029 mlen = min(M_TRAILINGSPACE(mb), total - mc->mc_len); 2030 error = uiomove(mtod(mb, void *), mlen, uio); 2031 if (__predict_false(error)) { 2032 mc_freem(mc); 2033 return (error); 2034 } 2035 mb->m_len = mlen; 2036 mc->mc_len += mlen; 2037 } 2038 MPASS(mc->mc_len == total); 2039 2040 return (0); 2041 } 2042 2043 /* 2044 * Copy data to/from an unmapped mbuf into a uio limited by len if set. 2045 */ 2046 int 2047 m_unmapped_uiomove(const struct mbuf *m, int m_off, struct uio *uio, int len) 2048 { 2049 vm_page_t pg; 2050 int error, i, off, pglen, pgoff, seglen, segoff; 2051 2052 M_ASSERTEXTPG(m); 2053 error = 0; 2054 2055 /* Skip over any data removed from the front. */ 2056 off = mtod(m, vm_offset_t); 2057 2058 off += m_off; 2059 if (m->m_epg_hdrlen != 0) { 2060 if (off >= m->m_epg_hdrlen) { 2061 off -= m->m_epg_hdrlen; 2062 } else { 2063 seglen = m->m_epg_hdrlen - off; 2064 segoff = off; 2065 seglen = min(seglen, len); 2066 off = 0; 2067 len -= seglen; 2068 error = uiomove(__DECONST(void *, 2069 &m->m_epg_hdr[segoff]), seglen, uio); 2070 } 2071 } 2072 pgoff = m->m_epg_1st_off; 2073 for (i = 0; i < m->m_epg_npgs && error == 0 && len > 0; i++) { 2074 pglen = m_epg_pagelen(m, i, pgoff); 2075 if (off >= pglen) { 2076 off -= pglen; 2077 pgoff = 0; 2078 continue; 2079 } 2080 seglen = pglen - off; 2081 segoff = pgoff + off; 2082 off = 0; 2083 seglen = min(seglen, len); 2084 len -= seglen; 2085 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); 2086 error = uiomove_fromphys(&pg, segoff, seglen, uio); 2087 pgoff = 0; 2088 }; 2089 if (len != 0 && error == 0) { 2090 KASSERT((off + len) <= m->m_epg_trllen, 2091 ("off + len > trail (%d + %d > %d, m_off = %d)", off, len, 2092 m->m_epg_trllen, m_off)); 2093 error = uiomove(__DECONST(void *, &m->m_epg_trail[off]), 2094 len, uio); 2095 } 2096 return (error); 2097 } 2098 2099 /* 2100 * Copy an mbuf chain into a uio limited by len if set. 2101 */ 2102 int 2103 m_mbuftouio(struct uio *uio, const struct mbuf *m, int len) 2104 { 2105 int error, length, total; 2106 int progress = 0; 2107 2108 if (len > 0) 2109 total = min(uio->uio_resid, len); 2110 else 2111 total = uio->uio_resid; 2112 2113 /* Fill the uio with data from the mbufs. 

/*
 * Copy data to/from an unmapped mbuf into a uio, limited by len if set.
 */
int
m_unmapped_uiomove(const struct mbuf *m, int m_off, struct uio *uio, int len)
{
        vm_page_t pg;
        int error, i, off, pglen, pgoff, seglen, segoff;

        M_ASSERTEXTPG(m);
        error = 0;

        /* Skip over any data removed from the front. */
        off = mtod(m, vm_offset_t);

        off += m_off;
        if (m->m_epg_hdrlen != 0) {
                if (off >= m->m_epg_hdrlen) {
                        off -= m->m_epg_hdrlen;
                } else {
                        seglen = m->m_epg_hdrlen - off;
                        segoff = off;
                        seglen = min(seglen, len);
                        off = 0;
                        len -= seglen;
                        error = uiomove(__DECONST(void *,
                            &m->m_epg_hdr[segoff]), seglen, uio);
                }
        }
        pgoff = m->m_epg_1st_off;
        for (i = 0; i < m->m_epg_npgs && error == 0 && len > 0; i++) {
                pglen = m_epg_pagelen(m, i, pgoff);
                if (off >= pglen) {
                        off -= pglen;
                        pgoff = 0;
                        continue;
                }
                seglen = pglen - off;
                segoff = pgoff + off;
                off = 0;
                seglen = min(seglen, len);
                len -= seglen;
                pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
                error = uiomove_fromphys(&pg, segoff, seglen, uio);
                pgoff = 0;
        }
        if (len != 0 && error == 0) {
                KASSERT((off + len) <= m->m_epg_trllen,
                    ("off + len > trail (%d + %d > %d, m_off = %d)", off, len,
                    m->m_epg_trllen, m_off));
                error = uiomove(__DECONST(void *, &m->m_epg_trail[off]),
                    len, uio);
        }
        return (error);
}

/*
 * Copy an mbuf chain into a uio, limited by len if set.
 */
int
m_mbuftouio(struct uio *uio, const struct mbuf *m, int len)
{
        int error, length, total;
        int progress = 0;

        if (len > 0)
                total = min(uio->uio_resid, len);
        else
                total = uio->uio_resid;

        /* Fill the uio with data from the mbufs. */
        for (; m != NULL; m = m->m_next) {
                length = min(m->m_len, total - progress);

                if ((m->m_flags & M_EXTPG) != 0)
                        error = m_unmapped_uiomove(m, 0, uio, length);
                else
                        error = uiomove(mtod(m, void *), length, uio);
                if (error)
                        return (error);

                progress += length;
        }

        return (0);
}
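
/*
 * Illustrative sketch, not part of the original file: copying a chain out
 * to a caller-supplied uio.  m_mbuftouio() dispatches to
 * m_unmapped_uiomove() for M_EXTPG mbufs, so the caller need not care
 * whether the chain is mapped.  The wrapper name is hypothetical.
 */
#if 0
static int
example_chain_to_uio(const struct mbuf *m, struct uio *uio)
{

        /* len == 0: copy as much as uio_resid allows. */
        return (m_mbuftouio(uio, m, 0));
}
#endif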

/*
 * Create a writable copy of the mbuf chain.  While doing this
 * we compact the chain with a goal of producing a chain with
 * at most two mbufs.  The second mbuf in this chain is likely
 * to be a cluster.  The primary purpose of this work is to create
 * a writable packet for encryption, compression, etc.  The
 * secondary goal is to linearize the data so the data can be
 * passed to crypto hardware in the most efficient manner possible.
 */
struct mbuf *
m_unshare(struct mbuf *m0, int how)
{
        struct mbuf *m, *mprev;
        struct mbuf *n, *mfirst, *mlast;
        int len, off;

        mprev = NULL;
        for (m = m0; m != NULL; m = mprev->m_next) {
                /*
                 * Regular mbufs are ignored unless there's a cluster
                 * in front of them that we can use to coalesce.  We do
                 * the latter mainly so later clusters can be coalesced
                 * also w/o having to handle them specially (i.e. convert
                 * mbuf+cluster -> cluster).  This optimization is heavily
                 * influenced by the assumption that we're running over
                 * Ethernet where MCLBYTES is large enough that the max
                 * packet size will permit lots of coalescing into a
                 * single cluster.  This in turn permits efficient
                 * crypto operations, especially when using hardware.
                 */
                if ((m->m_flags & M_EXT) == 0) {
                        if (mprev && (mprev->m_flags & M_EXT) &&
                            m->m_len <= M_TRAILINGSPACE(mprev)) {
                                /* XXX: this ignores mbuf types */
                                memcpy(mtod(mprev, caddr_t) + mprev->m_len,
                                    mtod(m, caddr_t), m->m_len);
                                mprev->m_len += m->m_len;
                                mprev->m_next = m->m_next;  /* unlink from chain */
                                m_free(m);                  /* reclaim mbuf */
                        } else {
                                mprev = m;
                        }
                        continue;
                }
                /*
                 * Writable mbufs are left alone (for now).
                 */
                if (M_WRITABLE(m)) {
                        mprev = m;
                        continue;
                }

                /*
                 * Not writable, replace with a copy or coalesce with
                 * the previous mbuf if possible (since we have to copy
                 * it anyway, we try to reduce the number of mbufs and
                 * clusters so that future work is easier).
                 */
                KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
                /* NB: we only coalesce into a cluster or larger */
                if (mprev != NULL && (mprev->m_flags & M_EXT) &&
                    m->m_len <= M_TRAILINGSPACE(mprev)) {
                        /* XXX: this ignores mbuf types */
                        memcpy(mtod(mprev, caddr_t) + mprev->m_len,
                            mtod(m, caddr_t), m->m_len);
                        mprev->m_len += m->m_len;
                        mprev->m_next = m->m_next;  /* unlink from chain */
                        m_free(m);                  /* reclaim mbuf */
                        continue;
                }

                /*
                 * Allocate new space to hold the copy and copy the data.
                 * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by
                 * splitting them into clusters.  We could just malloc a
                 * buffer and make it external, but too many device drivers
                 * don't know how to break up the non-contiguous memory when
                 * doing DMA.
                 */
                n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
                if (n == NULL) {
                        m_freem(m0);
                        return (NULL);
                }
                if (m->m_flags & M_PKTHDR) {
                        KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR",
                            __func__, m0, m));
                        m_move_pkthdr(n, m);
                }
                len = m->m_len;
                off = 0;
                mfirst = n;
                mlast = NULL;
                for (;;) {
                        int cc = min(len, MCLBYTES);
                        memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
                        n->m_len = cc;
                        if (mlast != NULL)
                                mlast->m_next = n;
                        mlast = n;
#if 0
                        newipsecstat.ips_clcopied++;
#endif

                        len -= cc;
                        if (len <= 0)
                                break;
                        off += cc;

                        n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
                        if (n == NULL) {
                                m_freem(mfirst);
                                m_freem(m0);
                                return (NULL);
                        }
                }
                n->m_next = m->m_next;
                if (mprev == NULL)
                        m0 = mfirst;             /* new head of chain */
                else
                        mprev->m_next = mfirst;  /* replace old mbuf */
                m_free(m);                       /* release old mbuf */
                mprev = mfirst;
        }
        return (m0);
}
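
/*
 * Illustrative sketch, not part of the original file: a caller that must
 * modify packet data in place (e.g. before encryption) first obtains a
 * writable, compacted chain.  Note that m_unshare() frees the original
 * chain itself on allocation failure.  The wrapper name and ENOBUFS
 * convention are hypothetical.
 */
#if 0
static int
example_make_writable(struct mbuf **mp, int how)
{
        struct mbuf *m;

        m = m_unshare(*mp, how);
        if (m == NULL) {
                *mp = NULL;     /* already freed by m_unshare() */
                return (ENOBUFS);
        }
        *mp = m;
        return (0);
}
#endif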

#ifdef MBUF_PROFILING

#define MP_BUCKETS 32   /* Don't just change this as things may overflow. */
struct mbufprofile {
        uintmax_t wasted[MP_BUCKETS];
        uintmax_t used[MP_BUCKETS];
        uintmax_t segments[MP_BUCKETS];
} mbprof;

void
m_profile(struct mbuf *m)
{
        int segments = 0;
        int used = 0;
        int wasted = 0;

        while (m) {
                segments++;
                used += m->m_len;
                if (m->m_flags & M_EXT) {
                        wasted += MHLEN - sizeof(m->m_ext) +
                            m->m_ext.ext_size - m->m_len;
                } else {
                        if (m->m_flags & M_PKTHDR)
                                wasted += MHLEN - m->m_len;
                        else
                                wasted += MLEN - m->m_len;
                }
                m = m->m_next;
        }
        /* Be paranoid; it helps. */
        if (segments > MP_BUCKETS - 1)
                segments = MP_BUCKETS - 1;
        if (used > 100000)
                used = 100000;
        if (wasted > 100000)
                wasted = 100000;
        /* Store in the appropriate bucket. */
        /* Don't bother locking; if the counts are slightly off, so what? */
        mbprof.segments[segments]++;
        mbprof.used[fls(used)]++;
        mbprof.wasted[fls(wasted)]++;
}

static int
mbprof_handler(SYSCTL_HANDLER_ARGS)
{
        char buf[256];
        struct sbuf sb;
        int error;
        uint64_t *p;

        sbuf_new_for_sysctl(&sb, buf, sizeof(buf), req);

        p = &mbprof.wasted[0];
        sbuf_printf(&sb,
            "wasted:\n"
            "%ju %ju %ju %ju %ju %ju %ju %ju "
            "%ju %ju %ju %ju %ju %ju %ju %ju\n",
            p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
            p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
#ifdef BIG_ARRAY
        p = &mbprof.wasted[16];
        sbuf_printf(&sb,
            "%ju %ju %ju %ju %ju %ju %ju %ju "
            "%ju %ju %ju %ju %ju %ju %ju %ju\n",
            p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
            p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
#endif
        p = &mbprof.used[0];
        sbuf_printf(&sb,
            "used:\n"
            "%ju %ju %ju %ju %ju %ju %ju %ju "
            "%ju %ju %ju %ju %ju %ju %ju %ju\n",
            p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
            p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
#ifdef BIG_ARRAY
        p = &mbprof.used[16];
        sbuf_printf(&sb,
            "%ju %ju %ju %ju %ju %ju %ju %ju "
            "%ju %ju %ju %ju %ju %ju %ju %ju\n",
            p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
            p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
#endif
        p = &mbprof.segments[0];
        sbuf_printf(&sb,
            "segments:\n"
            "%ju %ju %ju %ju %ju %ju %ju %ju "
            "%ju %ju %ju %ju %ju %ju %ju %ju\n",
            p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
            p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
#ifdef BIG_ARRAY
        p = &mbprof.segments[16];
        sbuf_printf(&sb,
            "%ju %ju %ju %ju %ju %ju %ju %ju "
            "%ju %ju %ju %ju %ju %ju %ju %ju",
            p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
            p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
#endif

        error = sbuf_finish(&sb);
        sbuf_delete(&sb);
        return (error);
}

static int
mbprof_clr_handler(SYSCTL_HANDLER_ARGS)
{
        int clear, error;

        clear = 0;
        error = sysctl_handle_int(oidp, &clear, 0, req);
        if (error || !req->newptr)
                return (error);

        if (clear) {
                bzero(&mbprof, sizeof(mbprof));
        }

        return (error);
}

SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    mbprof_handler, "A",
    "mbuf profiling statistics");

SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    mbprof_clr_handler, "I",
    "clear mbuf profiling statistics");
#endif
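
/*
 * Illustrative userland sketch, not part of the original file: the
 * MBUF_PROFILING counters above are exported as the string sysctl
 * kern.ipc.mbufprofile and can be reset via kern.ipc.mbufprofileclr.
 * The consumer below uses sysctlbyname(3); the buffer size is an
 * arbitrary assumption.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        char buf[1024];
        size_t len = sizeof(buf);
        int one = 1;

        /* Dump the wasted/used/segments buckets. */
        if (sysctlbyname("kern.ipc.mbufprofile", buf, &len, NULL, 0) == -1) {
                perror("kern.ipc.mbufprofile");
                return (1);
        }
        printf("%.*s\n", (int)len, buf);

        /* Reset the counters for the next measurement interval. */
        if (sysctlbyname("kern.ipc.mbufprofileclr", NULL, NULL, &one,
            sizeof(one)) == -1) {
                perror("kern.ipc.mbufprofileclr");
                return (1);
        }
        return (0);
}
#endif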