1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1991, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 #include "opt_param.h" 34 #include "opt_mbuf_stress_test.h" 35 #include "opt_mbuf_profiling.h" 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/kernel.h> 40 #include <sys/limits.h> 41 #include <sys/lock.h> 42 #include <sys/malloc.h> 43 #include <sys/mbuf.h> 44 #include <sys/sysctl.h> 45 #include <sys/domain.h> 46 #include <sys/protosw.h> 47 #include <sys/uio.h> 48 #include <sys/vmmeter.h> 49 #include <sys/sbuf.h> 50 #include <sys/sdt.h> 51 #include <vm/vm.h> 52 #include <vm/vm_pageout.h> 53 #include <vm/vm_page.h> 54 55 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init, 56 "struct mbuf *", "mbufinfo_t *", 57 "uint32_t", "uint32_t", 58 "uint16_t", "uint16_t", 59 "uint32_t", "uint32_t", 60 "uint32_t", "uint32_t"); 61 62 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr_raw, 63 "uint32_t", "uint32_t", 64 "uint16_t", "uint16_t", 65 "struct mbuf *", "mbufinfo_t *"); 66 67 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr, 68 "uint32_t", "uint32_t", 69 "uint16_t", "uint16_t", 70 "struct mbuf *", "mbufinfo_t *"); 71 72 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get_raw, 73 "uint32_t", "uint32_t", 74 "uint16_t", "uint16_t", 75 "struct mbuf *", "mbufinfo_t *"); 76 77 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get, 78 "uint32_t", "uint32_t", 79 "uint16_t", "uint16_t", 80 "struct mbuf *", "mbufinfo_t *"); 81 82 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl, 83 "uint32_t", "uint32_t", 84 "uint16_t", "uint16_t", 85 "uint32_t", "uint32_t", 86 "struct mbuf *", "mbufinfo_t *"); 87 88 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__getjcl, 89 "uint32_t", "uint32_t", 90 "uint16_t", "uint16_t", 91 "uint32_t", "uint32_t", 92 "uint32_t", "uint32_t", 93 "struct mbuf *", "mbufinfo_t *"); 94 95 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget, 96 "struct mbuf *", "mbufinfo_t *", 97 "uint32_t", "uint32_t", 98 "uint32_t", "uint32_t"); 99 100 
SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget, 101 "struct mbuf *", "mbufinfo_t *", 102 "uint32_t", "uint32_t", 103 "uint32_t", "uint32_t", 104 "void*", "void*"); 105 106 SDT_PROBE_DEFINE(sdt, , , m__cljset); 107 108 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free, 109 "struct mbuf *", "mbufinfo_t *"); 110 111 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem, 112 "struct mbuf *", "mbufinfo_t *"); 113 114 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freemp, 115 "struct mbuf *", "mbufinfo_t *"); 116 117 #include <security/mac/mac_framework.h> 118 119 /* 120 * Provide minimum possible defaults for link and protocol header space, 121 * assuming IPv4 over Ethernet. Enabling IPv6, IEEE802.11 or some other 122 * protocol may grow these values. 123 */ 124 u_int max_linkhdr = 16; 125 u_int max_protohdr = 40; 126 u_int max_hdr = 16 + 40; 127 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD, 128 &max_linkhdr, 16, "Size of largest link layer header"); 129 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD, 130 &max_protohdr, 40, "Size of largest protocol layer header"); 131 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD, 132 &max_hdr, 16 + 40, "Size of largest link plus protocol header"); 133 134 static void 135 max_hdr_grow(void) 136 { 137 138 max_hdr = max_linkhdr + max_protohdr; 139 MPASS(max_hdr <= MHLEN); 140 } 141 142 void 143 max_linkhdr_grow(u_int new) 144 { 145 146 if (new > max_linkhdr) { 147 max_linkhdr = new; 148 max_hdr_grow(); 149 } 150 } 151 152 void 153 max_protohdr_grow(u_int new) 154 { 155 156 if (new > max_protohdr) { 157 max_protohdr = new; 158 max_hdr_grow(); 159 } 160 } 161 162 #ifdef MBUF_STRESS_TEST 163 int m_defragpackets; 164 int m_defragbytes; 165 int m_defraguseless; 166 int m_defragfailure; 167 int m_defragrandomfailures; 168 169 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD, 170 &m_defragpackets, 0, ""); 171 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD, 172 &m_defragbytes, 0, ""); 173 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD, 174 &m_defraguseless, 0, ""); 175 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD, 176 &m_defragfailure, 0, ""); 177 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW, 178 &m_defragrandomfailures, 0, ""); 179 #endif 180 181 /* 182 * Ensure the correct size of various mbuf parameters. It could be off due 183 * to compiler-induced padding and alignment artifacts. 184 */ 185 CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN); 186 CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN); 187 188 /* 189 * mbuf data storage should be 64-bit aligned regardless of architectural 190 * pointer size; check this is the case with and without a packet header. 191 */ 192 CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0); 193 CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0); 194 195 /* 196 * While the specific values here don't matter too much (i.e., +/- a few 197 * words), we do want to ensure that changes to these values are carefully 198 * reasoned about and properly documented. This is especially the case as 199 * network-protocol and device-driver modules encode these layouts, and must 200 * be recompiled if the structures change. Check these values at compile time 201 * against the ones documented in comments in mbuf.h. 202 * 203 * NB: Possibly they should be documented there via #define's and not just 204 * comments. 
205 */ 206 #if defined(__LP64__) 207 CTASSERT(offsetof(struct mbuf, m_dat) == 32); 208 CTASSERT(sizeof(struct pkthdr) == 64); 209 CTASSERT(sizeof(struct m_ext) == 160); 210 #else 211 CTASSERT(offsetof(struct mbuf, m_dat) == 24); 212 CTASSERT(sizeof(struct pkthdr) == 56); 213 #if defined(__powerpc__) && defined(BOOKE) 214 /* PowerPC booke has 64-bit physical pointers. */ 215 CTASSERT(sizeof(struct m_ext) == 176); 216 #else 217 CTASSERT(sizeof(struct m_ext) == 172); 218 #endif 219 #endif 220 221 /* 222 * Assert that the queue(3) macros produce code of the same size as an old 223 * plain pointer does. 224 */ 225 #ifdef INVARIANTS 226 static struct mbuf __used m_assertbuf; 227 CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next)); 228 CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next)); 229 CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt)); 230 CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt)); 231 #endif 232 233 /* 234 * Attach the cluster from *m to *n, set up m_ext in *n 235 * and bump the refcount of the cluster. 236 */ 237 void 238 mb_dupcl(struct mbuf *n, struct mbuf *m) 239 { 240 volatile u_int *refcnt; 241 242 KASSERT(m->m_flags & (M_EXT | M_EXTPG), 243 ("%s: M_EXT | M_EXTPG not set on %p", __func__, m)); 244 KASSERT(!(n->m_flags & (M_EXT | M_EXTPG)), 245 ("%s: M_EXT | M_EXTPG set on %p", __func__, n)); 246 247 /* 248 * Cache access optimization. 249 * 250 * o Regular M_EXT storage doesn't need full copy of m_ext, since 251 * the holder of the 'ext_count' is responsible to carry the free 252 * routine and its arguments. 253 * o M_EXTPG data is split between main part of mbuf and m_ext, the 254 * main part is copied in full, the m_ext part is similar to M_EXT. 255 * o EXT_EXTREF, where 'ext_cnt' doesn't point into mbuf at all, is 256 * special - it needs full copy of m_ext into each mbuf, since any 257 * copy could end up as the last to free. 258 */ 259 if (m->m_flags & M_EXTPG) { 260 bcopy(&m->m_epg_startcopy, &n->m_epg_startcopy, 261 __rangeof(struct mbuf, m_epg_startcopy, m_epg_endcopy)); 262 bcopy(&m->m_ext, &n->m_ext, m_epg_ext_copylen); 263 } else if (m->m_ext.ext_type == EXT_EXTREF) 264 bcopy(&m->m_ext, &n->m_ext, sizeof(struct m_ext)); 265 else 266 bcopy(&m->m_ext, &n->m_ext, m_ext_copylen); 267 268 n->m_flags |= m->m_flags & (M_RDONLY | M_EXT | M_EXTPG); 269 270 /* See if this is the mbuf that holds the embedded refcount. */ 271 if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { 272 refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count; 273 n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF; 274 } else { 275 KASSERT(m->m_ext.ext_cnt != NULL, 276 ("%s: no refcounting pointer on %p", __func__, m)); 277 refcnt = m->m_ext.ext_cnt; 278 } 279 280 if (*refcnt == 1) 281 *refcnt += 1; 282 else 283 atomic_add_int(refcnt, 1); 284 } 285 286 void 287 m_demote_pkthdr(struct mbuf *m) 288 { 289 290 M_ASSERTPKTHDR(m); 291 M_ASSERT_NO_SND_TAG(m); 292 293 m_tag_delete_chain(m, NULL); 294 m->m_flags &= ~M_PKTHDR; 295 bzero(&m->m_pkthdr, sizeof(struct pkthdr)); 296 } 297 298 /* 299 * Clean up mbuf (chain) from any tags and packet headers. 300 * If "all" is set then the first mbuf in the chain will be 301 * cleaned too. 302 */ 303 void 304 m_demote(struct mbuf *m0, int all, int flags) 305 { 306 struct mbuf *m; 307 308 flags |= M_DEMOTEFLAGS; 309 310 for (m = all ? 
	    m0 : m0->m_next; m != NULL; m = m->m_next) {
		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p",
		    __func__, m, m0));
		if (m->m_flags & M_PKTHDR)
			m_demote_pkthdr(m);
		m->m_flags &= flags;
	}
}

/*
 * Sanity checks on an mbuf (chain) for use in KASSERT() and general
 * debugging.
 * Returns 1 when all tests pass; a failing test either panics (with
 * INVARIANTS) or prints a diagnostic.
 * The "sanitize" argument: 0 to run M_SANITY_ACTION, 1 to garble things
 * so they blow up later.
 */
int
m_sanity(struct mbuf *m0, int sanitize)
{
	struct mbuf *m;
	caddr_t a, b;
	int pktlen = 0;

#ifdef INVARIANTS
#define	M_SANITY_ACTION(s)	panic("mbuf %p: " s, m)
#else
#define	M_SANITY_ACTION(s)	printf("mbuf %p: " s, m)
#endif

	for (m = m0; m != NULL; m = m->m_next) {
		/*
		 * Basic pointer checks.  If any of these fails then some
		 * unrelated kernel memory before or after us is trashed.
		 * No way to recover from that.
		 */
		a = M_START(m);
		b = a + M_SIZE(m);
		if ((caddr_t)m->m_data < a)
			M_SANITY_ACTION("m_data outside mbuf data range left");
		if ((caddr_t)m->m_data > b)
			M_SANITY_ACTION("m_data outside mbuf data range right");
		if ((caddr_t)m->m_data + m->m_len > b)
			M_SANITY_ACTION("m_data + m_len exceeds mbuf space");

		/* m->m_nextpkt may only be set on first mbuf in chain. */
		if (m != m0 && m->m_nextpkt != NULL) {
			if (sanitize) {
				m_freem(m->m_nextpkt);
				m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
			} else
				M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
		}

		/* packet length (not mbuf length!) calculation */
		if (m0->m_flags & M_PKTHDR)
			pktlen += m->m_len;

		/* m_tags may only be attached to first mbuf in chain. */
		if (m != m0 && m->m_flags & M_PKTHDR &&
		    !SLIST_EMPTY(&m->m_pkthdr.tags)) {
			if (sanitize) {
				m_tag_delete_chain(m, NULL);
				/* put in 0xDEADC0DE perhaps? */
			} else
				M_SANITY_ACTION("m_tags on in-chain mbuf");
		}

		/* M_PKTHDR may only be set on first mbuf in chain */
		if (m != m0 && m->m_flags & M_PKTHDR) {
			if (sanitize) {
				bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
				m->m_flags &= ~M_PKTHDR;
				/* put in 0xDEADCODE and leave hdr flag in */
			} else
				M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
		}
	}
	m = m0;
	if (pktlen && pktlen != m->m_pkthdr.len) {
		if (sanitize)
			m->m_pkthdr.len = 0;
		else
			M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
	}
	return 1;

#undef	M_SANITY_ACTION
}

/*
 * Non-inlined part of m_init().
 */
int
m_pkthdr_init(struct mbuf *m, int how)
{
#ifdef MAC
	int error;
#endif
	m->m_data = m->m_pktdat;
	bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
#ifdef NUMA
	m->m_pkthdr.numa_domain = M_NODOM;
#endif
#ifdef MAC
	/* If the label init fails, fail the alloc */
	error = mac_mbuf_init(m, how);
	if (error)
		return (error);
#endif

	return (0);
}

/*
 * "Move" mbuf pkthdr from "from" to "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 */
void
m_move_pkthdr(struct mbuf *to, struct mbuf *from)
{

#if 0
	/* see below for why these are not enabled */
	M_ASSERTPKTHDR(to);
	/* Note: with MAC, this may not be a good assertion. */
	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
	    ("m_move_pkthdr: to has tags"));
#endif
#ifdef MAC
	/*
	 * XXXMAC: It could be this should also occur for non-MAC?
441 */ 442 if (to->m_flags & M_PKTHDR) 443 m_tag_delete_chain(to, NULL); 444 #endif 445 to->m_flags = (from->m_flags & M_COPYFLAGS) | 446 (to->m_flags & (M_EXT | M_EXTPG)); 447 if ((to->m_flags & M_EXT) == 0) 448 to->m_data = to->m_pktdat; 449 to->m_pkthdr = from->m_pkthdr; /* especially tags */ 450 SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ 451 from->m_flags &= ~M_PKTHDR; 452 if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) { 453 from->m_pkthdr.csum_flags &= ~CSUM_SND_TAG; 454 from->m_pkthdr.snd_tag = NULL; 455 } 456 } 457 458 /* 459 * Duplicate "from"'s mbuf pkthdr in "to". 460 * "from" must have M_PKTHDR set, and "to" must be empty. 461 * In particular, this does a deep copy of the packet tags. 462 */ 463 int 464 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how) 465 { 466 467 #if 0 468 /* 469 * The mbuf allocator only initializes the pkthdr 470 * when the mbuf is allocated with m_gethdr(). Many users 471 * (e.g. m_copy*, m_prepend) use m_get() and then 472 * smash the pkthdr as needed causing these 473 * assertions to trip. For now just disable them. 474 */ 475 M_ASSERTPKTHDR(to); 476 /* Note: with MAC, this may not be a good assertion. */ 477 KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags")); 478 #endif 479 MBUF_CHECKSLEEP(how); 480 #ifdef MAC 481 if (to->m_flags & M_PKTHDR) 482 m_tag_delete_chain(to, NULL); 483 #endif 484 to->m_flags = (from->m_flags & M_COPYFLAGS) | 485 (to->m_flags & (M_EXT | M_EXTPG)); 486 if ((to->m_flags & M_EXT) == 0) 487 to->m_data = to->m_pktdat; 488 to->m_pkthdr = from->m_pkthdr; 489 if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) 490 m_snd_tag_ref(from->m_pkthdr.snd_tag); 491 SLIST_INIT(&to->m_pkthdr.tags); 492 return (m_tag_copy_chain(to, from, how)); 493 } 494 495 /* 496 * Lesser-used path for M_PREPEND: 497 * allocate new mbuf to prepend to chain, 498 * copy junk along. 499 */ 500 struct mbuf * 501 m_prepend(struct mbuf *m, int len, int how) 502 { 503 struct mbuf *mn; 504 505 if (m->m_flags & M_PKTHDR) 506 mn = m_gethdr(how, m->m_type); 507 else 508 mn = m_get(how, m->m_type); 509 if (mn == NULL) { 510 m_freem(m); 511 return (NULL); 512 } 513 if (m->m_flags & M_PKTHDR) 514 m_move_pkthdr(mn, m); 515 mn->m_next = m; 516 m = mn; 517 if (len < M_SIZE(m)) 518 M_ALIGN(m, len); 519 m->m_len = len; 520 return (m); 521 } 522 523 /* 524 * Make a copy of an mbuf chain starting "off0" bytes from the beginning, 525 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. 526 * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller. 527 * Note that the copy is read-only, because clusters are not copied, 528 * only their reference counts are incremented. 
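 *
 * A minimal usage sketch (hypothetical caller; "m" is assumed to be a
 * packet obtained elsewhere).  Because clusters are shared rather than
 * copied, the result must be treated as read-only:
 *
 *	struct mbuf *copy;
 *
 *	copy = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 *	if (copy == NULL)
 *		return (ENOBUFS);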
529 */ 530 struct mbuf * 531 m_copym(struct mbuf *m, int off0, int len, int wait) 532 { 533 struct mbuf *n, **np; 534 int off = off0; 535 struct mbuf *top; 536 int copyhdr = 0; 537 538 KASSERT(off >= 0, ("m_copym, negative off %d", off)); 539 KASSERT(len >= 0, ("m_copym, negative len %d", len)); 540 MBUF_CHECKSLEEP(wait); 541 if (off == 0 && m->m_flags & M_PKTHDR) 542 copyhdr = 1; 543 while (off > 0) { 544 KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain")); 545 if (off < m->m_len) 546 break; 547 off -= m->m_len; 548 m = m->m_next; 549 } 550 np = ⊤ 551 top = NULL; 552 while (len > 0) { 553 if (m == NULL) { 554 KASSERT(len == M_COPYALL, 555 ("m_copym, length > size of mbuf chain")); 556 break; 557 } 558 if (copyhdr) 559 n = m_gethdr(wait, m->m_type); 560 else 561 n = m_get(wait, m->m_type); 562 *np = n; 563 if (n == NULL) 564 goto nospace; 565 if (copyhdr) { 566 if (!m_dup_pkthdr(n, m, wait)) 567 goto nospace; 568 if (len == M_COPYALL) 569 n->m_pkthdr.len -= off0; 570 else 571 n->m_pkthdr.len = len; 572 copyhdr = 0; 573 } 574 n->m_len = min(len, m->m_len - off); 575 if (m->m_flags & (M_EXT | M_EXTPG)) { 576 n->m_data = m->m_data + off; 577 mb_dupcl(n, m); 578 } else 579 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), 580 (u_int)n->m_len); 581 if (len != M_COPYALL) 582 len -= n->m_len; 583 off = 0; 584 m = m->m_next; 585 np = &n->m_next; 586 } 587 588 return (top); 589 nospace: 590 m_freem(top); 591 return (NULL); 592 } 593 594 /* 595 * Copy an entire packet, including header (which must be present). 596 * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. 597 * Note that the copy is read-only, because clusters are not copied, 598 * only their reference counts are incremented. 599 * Preserve alignment of the first mbuf so if the creator has left 600 * some room at the beginning (e.g. for inserting protocol headers) 601 * the copies still have the room available. 
602 */ 603 struct mbuf * 604 m_copypacket(struct mbuf *m, int how) 605 { 606 struct mbuf *top, *n, *o; 607 608 MBUF_CHECKSLEEP(how); 609 n = m_get(how, m->m_type); 610 top = n; 611 if (n == NULL) 612 goto nospace; 613 614 if (!m_dup_pkthdr(n, m, how)) 615 goto nospace; 616 n->m_len = m->m_len; 617 if (m->m_flags & (M_EXT | M_EXTPG)) { 618 n->m_data = m->m_data; 619 mb_dupcl(n, m); 620 } else { 621 n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat ); 622 bcopy(mtod(m, char *), mtod(n, char *), n->m_len); 623 } 624 625 m = m->m_next; 626 while (m) { 627 o = m_get(how, m->m_type); 628 if (o == NULL) 629 goto nospace; 630 631 n->m_next = o; 632 n = n->m_next; 633 634 n->m_len = m->m_len; 635 if (m->m_flags & (M_EXT | M_EXTPG)) { 636 n->m_data = m->m_data; 637 mb_dupcl(n, m); 638 } else { 639 bcopy(mtod(m, char *), mtod(n, char *), n->m_len); 640 } 641 642 m = m->m_next; 643 } 644 return top; 645 nospace: 646 m_freem(top); 647 return (NULL); 648 } 649 650 static void 651 m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp) 652 { 653 struct iovec iov; 654 struct uio uio; 655 int error __diagused; 656 657 KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off)); 658 KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len)); 659 KASSERT(off < m->m_len, 660 ("m_copyfromunmapped: len exceeds mbuf length")); 661 iov.iov_base = cp; 662 iov.iov_len = len; 663 uio.uio_resid = len; 664 uio.uio_iov = &iov; 665 uio.uio_segflg = UIO_SYSSPACE; 666 uio.uio_iovcnt = 1; 667 uio.uio_offset = 0; 668 uio.uio_rw = UIO_READ; 669 error = m_unmapped_uiomove(m, off, &uio, len); 670 KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off, 671 len)); 672 } 673 674 /* 675 * Copy data from an mbuf chain starting "off" bytes from the beginning, 676 * continuing for "len" bytes, into the indicated buffer. 677 */ 678 void 679 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp) 680 { 681 u_int count; 682 683 KASSERT(off >= 0, ("m_copydata, negative off %d", off)); 684 KASSERT(len >= 0, ("m_copydata, negative len %d", len)); 685 while (off > 0) { 686 KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain")); 687 if (off < m->m_len) 688 break; 689 off -= m->m_len; 690 m = m->m_next; 691 } 692 while (len > 0) { 693 KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); 694 count = min(m->m_len - off, len); 695 if ((m->m_flags & M_EXTPG) != 0) 696 m_copyfromunmapped(m, off, count, cp); 697 else 698 bcopy(mtod(m, caddr_t) + off, cp, count); 699 len -= count; 700 cp += count; 701 off = 0; 702 m = m->m_next; 703 } 704 } 705 706 /* 707 * Copy a packet header mbuf chain into a completely new chain, including 708 * copying any mbuf clusters. Use this instead of m_copypacket() when 709 * you need a writable copy of an mbuf chain. 
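 *
 * A hedged example (hypothetical caller): taking a private, writable copy
 * of a received packet before rewriting headers in place.  The use of
 * M_NOWAIT and the error handling here are assumptions, not requirements:
 *
 *	struct mbuf *wcopy;
 *
 *	wcopy = m_dup(m, M_NOWAIT);
 *	if (wcopy == NULL)
 *		return (ENOBUFS);
 *	m_freem(m);
 *	m = wcopy;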
710 */ 711 struct mbuf * 712 m_dup(const struct mbuf *m, int how) 713 { 714 struct mbuf **p, *top = NULL; 715 int remain, moff, nsize; 716 717 MBUF_CHECKSLEEP(how); 718 /* Sanity check */ 719 if (m == NULL) 720 return (NULL); 721 M_ASSERTPKTHDR(m); 722 723 /* While there's more data, get a new mbuf, tack it on, and fill it */ 724 remain = m->m_pkthdr.len; 725 moff = 0; 726 p = ⊤ 727 while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */ 728 struct mbuf *n; 729 730 /* Get the next new mbuf */ 731 if (remain >= MINCLSIZE) { 732 n = m_getcl(how, m->m_type, 0); 733 nsize = MCLBYTES; 734 } else { 735 n = m_get(how, m->m_type); 736 nsize = MLEN; 737 } 738 if (n == NULL) 739 goto nospace; 740 741 if (top == NULL) { /* First one, must be PKTHDR */ 742 if (!m_dup_pkthdr(n, m, how)) { 743 m_free(n); 744 goto nospace; 745 } 746 if ((n->m_flags & M_EXT) == 0) 747 nsize = MHLEN; 748 n->m_flags &= ~M_RDONLY; 749 } 750 n->m_len = 0; 751 752 /* Link it into the new chain */ 753 *p = n; 754 p = &n->m_next; 755 756 /* Copy data from original mbuf(s) into new mbuf */ 757 while (n->m_len < nsize && m != NULL) { 758 int chunk = min(nsize - n->m_len, m->m_len - moff); 759 760 m_copydata(m, moff, chunk, n->m_data + n->m_len); 761 moff += chunk; 762 n->m_len += chunk; 763 remain -= chunk; 764 if (moff == m->m_len) { 765 m = m->m_next; 766 moff = 0; 767 } 768 } 769 770 /* Check correct total mbuf length */ 771 KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL), 772 ("%s: bogus m_pkthdr.len", __func__)); 773 } 774 return (top); 775 776 nospace: 777 m_freem(top); 778 return (NULL); 779 } 780 781 /* 782 * Concatenate mbuf chain n to m. 783 * Both chains must be of the same type (e.g. MT_DATA). 784 * Any m_pkthdr is not updated. 785 */ 786 void 787 m_cat(struct mbuf *m, struct mbuf *n) 788 { 789 while (m->m_next) 790 m = m->m_next; 791 while (n) { 792 if (!M_WRITABLE(m) || 793 (n->m_flags & M_EXTPG) != 0 || 794 M_TRAILINGSPACE(m) < n->m_len) { 795 /* just join the two chains */ 796 m->m_next = n; 797 return; 798 } 799 /* splat the data from one into the other */ 800 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, 801 (u_int)n->m_len); 802 m->m_len += n->m_len; 803 n = m_free(n); 804 } 805 } 806 807 /* 808 * Concatenate two pkthdr mbuf chains. 809 */ 810 void 811 m_catpkt(struct mbuf *m, struct mbuf *n) 812 { 813 814 M_ASSERTPKTHDR(m); 815 M_ASSERTPKTHDR(n); 816 817 m->m_pkthdr.len += n->m_pkthdr.len; 818 m_demote(n, 1, 0); 819 820 m_cat(m, n); 821 } 822 823 void 824 m_adj(struct mbuf *mp, int req_len) 825 { 826 int len = req_len; 827 struct mbuf *m; 828 int count; 829 830 if ((m = mp) == NULL) 831 return; 832 if (len >= 0) { 833 /* 834 * Trim from head. 835 */ 836 while (m != NULL && len > 0) { 837 if (m->m_len <= len) { 838 len -= m->m_len; 839 m->m_len = 0; 840 m = m->m_next; 841 } else { 842 m->m_len -= len; 843 m->m_data += len; 844 len = 0; 845 } 846 } 847 if (mp->m_flags & M_PKTHDR) 848 mp->m_pkthdr.len -= (req_len - len); 849 } else { 850 /* 851 * Trim from tail. Scan the mbuf chain, 852 * calculating its length and finding the last mbuf. 853 * If the adjustment only affects this mbuf, then just 854 * adjust and return. Otherwise, rescan and truncate 855 * after the remaining size. 
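		 * As an illustrative example (numbers invented): trimming
		 * 8 bytes from the tail of a chain with mbuf lengths
		 * 10/20/5 cannot be satisfied by the last mbuf alone
		 * (5 < 8), so the chain is rescanned with a corrected
		 * total of 27 bytes; the second mbuf is clipped to 17
		 * bytes and the third mbuf is freed.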
		 */
		len = -len;
		count = 0;
		for (;;) {
			count += m->m_len;
			if (m->m_next == (struct mbuf *)0)
				break;
			m = m->m_next;
		}
		if (m->m_len >= len) {
			m->m_len -= len;
			if (mp->m_flags & M_PKTHDR)
				mp->m_pkthdr.len -= len;
			return;
		}
		count -= len;
		if (count < 0)
			count = 0;
		/*
		 * Correct length for chain is "count".
		 * Find the mbuf with last data, adjust its length,
		 * and toss data from remaining mbufs on chain.
		 */
		m = mp;
		if (m->m_flags & M_PKTHDR)
			m->m_pkthdr.len = count;
		for (; m; m = m->m_next) {
			if (m->m_len >= count) {
				m->m_len = count;
				if (m->m_next != NULL) {
					m_freem(m->m_next);
					m->m_next = NULL;
				}
				break;
			}
			count -= m->m_len;
		}
	}
}

void
m_adj_decap(struct mbuf *mp, int len)
{
	uint8_t rsstype;

	m_adj(mp, len);
	if ((mp->m_flags & M_PKTHDR) != 0) {
		/*
		 * If flowid was calculated by card from the inner
		 * headers, move flowid to the decapsulated mbuf
		 * chain, otherwise clear.  This depends on the
		 * internals of m_adj, which keeps pkthdr as is, in
		 * particular not changing rsstype and flowid.
		 */
		rsstype = mp->m_pkthdr.rsstype;
		if ((rsstype & M_HASHTYPE_INNER) != 0) {
			M_HASHTYPE_SET(mp, rsstype & ~M_HASHTYPE_INNER);
		} else {
			M_HASHTYPE_CLEAR(mp);
		}
	}
}

/*
 * Rearrange an mbuf chain so that len bytes are contiguous
 * and in the data area of an mbuf (so that mtod will work
 * for a structure of size len).  Returns the resulting
 * mbuf chain on success, frees it and returns NULL on failure.
 * If there is room, it will add up to max_protohdr-len extra bytes to the
 * contiguous region in an attempt to avoid being called next time.
 */
struct mbuf *
m_pullup(struct mbuf *n, int len)
{
	struct mbuf *m;
	int count;
	int space;

	KASSERT((n->m_flags & M_EXTPG) == 0,
	    ("%s: unmapped mbuf %p", __func__, n));

	/*
	 * If first mbuf has no cluster, and has room for len bytes
	 * without shifting current data, pullup into it,
	 * otherwise allocate a new mbuf to prepend to the chain.
	 */
	if ((n->m_flags & M_EXT) == 0 &&
	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
		if (n->m_len >= len)
			return (n);
		m = n;
		n = n->m_next;
		len -= m->m_len;
	} else {
		if (len > MHLEN)
			goto bad;
		m = m_get(M_NOWAIT, n->m_type);
		if (m == NULL)
			goto bad;
		if (n->m_flags & M_PKTHDR)
			m_move_pkthdr(m, n);
	}
	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	do {
		count = min(min(max(len, max_protohdr), space), n->m_len);
		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
		    (u_int)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len)
			n->m_data += count;
		else
			n = m_free(n);
	} while (len > 0 && n);
	if (len > 0) {
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return (m);
bad:
	m_freem(n);
	return (NULL);
}

/*
 * Like m_pullup(), except a new mbuf is always allocated, and we allow
 * the amount of empty space before the data in the new mbuf to be specified
 * (in the event that the caller expects to prepend later).
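 *
 * An illustrative sketch (hypothetical caller): pulling up an IP header
 * while reserving leading space for a link-layer header to be prepended
 * later.  The specific sizes are assumptions for illustration only:
 *
 *	m = m_copyup(m, sizeof(struct ip), ETHER_HDR_LEN);
 *	if (m == NULL)
 *		return (ENOBUFS);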
987 */ 988 struct mbuf * 989 m_copyup(struct mbuf *n, int len, int dstoff) 990 { 991 struct mbuf *m; 992 int count, space; 993 994 if (len > (MHLEN - dstoff)) 995 goto bad; 996 m = m_get(M_NOWAIT, n->m_type); 997 if (m == NULL) 998 goto bad; 999 if (n->m_flags & M_PKTHDR) 1000 m_move_pkthdr(m, n); 1001 m->m_data += dstoff; 1002 space = &m->m_dat[MLEN] - (m->m_data + m->m_len); 1003 do { 1004 count = min(min(max(len, max_protohdr), space), n->m_len); 1005 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t), 1006 (unsigned)count); 1007 len -= count; 1008 m->m_len += count; 1009 n->m_len -= count; 1010 space -= count; 1011 if (n->m_len) 1012 n->m_data += count; 1013 else 1014 n = m_free(n); 1015 } while (len > 0 && n); 1016 if (len > 0) { 1017 (void) m_free(m); 1018 goto bad; 1019 } 1020 m->m_next = n; 1021 return (m); 1022 bad: 1023 m_freem(n); 1024 return (NULL); 1025 } 1026 1027 /* 1028 * Partition an mbuf chain in two pieces, returning the tail -- 1029 * all but the first len0 bytes. In case of failure, it returns NULL and 1030 * attempts to restore the chain to its original state. 1031 * 1032 * Note that the resulting mbufs might be read-only, because the new 1033 * mbuf can end up sharing an mbuf cluster with the original mbuf if 1034 * the "breaking point" happens to lie within a cluster mbuf. Use the 1035 * M_WRITABLE() macro to check for this case. 1036 */ 1037 struct mbuf * 1038 m_split(struct mbuf *m0, int len0, int wait) 1039 { 1040 struct mbuf *m, *n; 1041 u_int len = len0, remain; 1042 1043 MBUF_CHECKSLEEP(wait); 1044 for (m = m0; m && len > m->m_len; m = m->m_next) 1045 len -= m->m_len; 1046 if (m == NULL) 1047 return (NULL); 1048 remain = m->m_len - len; 1049 if (m0->m_flags & M_PKTHDR && remain == 0) { 1050 n = m_gethdr(wait, m0->m_type); 1051 if (n == NULL) 1052 return (NULL); 1053 n->m_next = m->m_next; 1054 m->m_next = NULL; 1055 if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) { 1056 n->m_pkthdr.snd_tag = 1057 m_snd_tag_ref(m0->m_pkthdr.snd_tag); 1058 n->m_pkthdr.csum_flags |= CSUM_SND_TAG; 1059 } else 1060 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; 1061 n->m_pkthdr.len = m0->m_pkthdr.len - len0; 1062 m0->m_pkthdr.len = len0; 1063 return (n); 1064 } else if (m0->m_flags & M_PKTHDR) { 1065 n = m_gethdr(wait, m0->m_type); 1066 if (n == NULL) 1067 return (NULL); 1068 if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) { 1069 n->m_pkthdr.snd_tag = 1070 m_snd_tag_ref(m0->m_pkthdr.snd_tag); 1071 n->m_pkthdr.csum_flags |= CSUM_SND_TAG; 1072 } else 1073 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; 1074 n->m_pkthdr.len = m0->m_pkthdr.len - len0; 1075 m0->m_pkthdr.len = len0; 1076 if (m->m_flags & (M_EXT | M_EXTPG)) 1077 goto extpacket; 1078 if (remain > MHLEN) { 1079 /* m can't be the lead packet */ 1080 M_ALIGN(n, 0); 1081 n->m_next = m_split(m, len, wait); 1082 if (n->m_next == NULL) { 1083 (void) m_free(n); 1084 return (NULL); 1085 } else { 1086 n->m_len = 0; 1087 return (n); 1088 } 1089 } else 1090 M_ALIGN(n, remain); 1091 } else if (remain == 0) { 1092 n = m->m_next; 1093 m->m_next = NULL; 1094 return (n); 1095 } else { 1096 n = m_get(wait, m->m_type); 1097 if (n == NULL) 1098 return (NULL); 1099 M_ALIGN(n, remain); 1100 } 1101 extpacket: 1102 if (m->m_flags & (M_EXT | M_EXTPG)) { 1103 n->m_data = m->m_data + len; 1104 mb_dupcl(n, m); 1105 } else { 1106 bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); 1107 } 1108 n->m_len = remain; 1109 m->m_len = len; 1110 n->m_next = m->m_next; 1111 m->m_next = NULL; 1112 return (n); 1113 } 1114 1115 /* 1116 * Partition mchain in two pieces, keeping 
len0 bytes in the head and transferring the
 * remainder to the tail.  In case of failure, both chains are left untouched.
 * M_EOR is observed correctly.
 * Resulting mbufs might be read-only.
 */
int
mc_split(struct mchain *head, struct mchain *tail, u_int len0, int wait)
{
	struct mbuf *m, *n;
	u_int len, mlen, remain;

	MPASS(!(mc_first(head)->m_flags & M_PKTHDR));
	MBUF_CHECKSLEEP(wait);

	mlen = 0;
	len = len0;
	STAILQ_FOREACH(m, &head->mc_q, m_stailq) {
		mlen += MSIZE;
		if (m->m_flags & M_EXT)
			mlen += m->m_ext.ext_size;
		if (len > m->m_len)
			len -= m->m_len;
		else
			break;
	}
	if (__predict_false(m == NULL)) {
		*tail = MCHAIN_INITIALIZER(tail);
		return (0);
	}
	remain = m->m_len - len;
	if (remain > 0) {
		if (__predict_false((n = m_get(wait, m->m_type)) == NULL))
			return (ENOMEM);
		m_align(n, remain);
		if (m->m_flags & M_EXT) {
			n->m_data = m->m_data + len;
			mb_dupcl(n, m);
		} else
			bcopy(mtod(m, char *) + len, mtod(n, char *), remain);
	}

	/* XXXGL: need STAILQ_SPLIT */
	STAILQ_FIRST(&tail->mc_q) = STAILQ_NEXT(m, m_stailq);
	tail->mc_q.stqh_last = head->mc_q.stqh_last;
	tail->mc_len = head->mc_len - len0;
	tail->mc_mlen = head->mc_mlen - mlen;
	if (remain > 0) {
		MPASS(n->m_len == 0);
		mc_prepend(tail, n);
		n->m_len = remain;
		m->m_len -= remain;
		if (m->m_flags & M_EOR) {
			m->m_flags &= ~M_EOR;
			n->m_flags |= M_EOR;
		}
	}
	head->mc_q.stqh_last = &STAILQ_NEXT(m, m_stailq);
	STAILQ_NEXT(m, m_stailq) = NULL;
	head->mc_len = len0;
	head->mc_mlen = mlen;

	return (0);
}

/*
 * Routine to copy from device local memory into mbufs.
 * Note that the `off' argument is the offset into the first mbuf of the
 * target chain at which to begin placing the copied data.
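 *
 * A hedged example (hypothetical legacy driver; "sc" and its fields are
 * invented names): copying a received frame out of a memory-mapped receive
 * buffer into a fresh chain, using the default bcopy when no copy routine
 * is supplied:
 *
 *	m = m_devget(sc->rx_buf_va, frame_len, 0, sc->ifp, NULL);
 *	if (m == NULL)
 *		if_inc_counter(sc->ifp, IFCOUNTER_IQDROPS, 1);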
1184 */ 1185 struct mbuf * 1186 m_devget(char *buf, int totlen, int off, struct ifnet *ifp, 1187 void (*copy)(char *from, caddr_t to, u_int len)) 1188 { 1189 struct mbuf *m; 1190 struct mbuf *top = NULL, **mp = ⊤ 1191 int len; 1192 1193 if (off < 0 || off > MHLEN) 1194 return (NULL); 1195 1196 while (totlen > 0) { 1197 if (top == NULL) { /* First one, must be PKTHDR */ 1198 if (totlen + off >= MINCLSIZE) { 1199 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 1200 len = MCLBYTES; 1201 } else { 1202 m = m_gethdr(M_NOWAIT, MT_DATA); 1203 len = MHLEN; 1204 1205 /* Place initial small packet/header at end of mbuf */ 1206 if (m && totlen + off + max_linkhdr <= MHLEN) { 1207 m->m_data += max_linkhdr; 1208 len -= max_linkhdr; 1209 } 1210 } 1211 if (m == NULL) 1212 return NULL; 1213 m->m_pkthdr.rcvif = ifp; 1214 m->m_pkthdr.len = totlen; 1215 } else { 1216 if (totlen + off >= MINCLSIZE) { 1217 m = m_getcl(M_NOWAIT, MT_DATA, 0); 1218 len = MCLBYTES; 1219 } else { 1220 m = m_get(M_NOWAIT, MT_DATA); 1221 len = MLEN; 1222 } 1223 if (m == NULL) { 1224 m_freem(top); 1225 return NULL; 1226 } 1227 } 1228 if (off) { 1229 m->m_data += off; 1230 len -= off; 1231 off = 0; 1232 } 1233 m->m_len = len = min(totlen, len); 1234 if (copy) 1235 copy(buf, mtod(m, caddr_t), (u_int)len); 1236 else 1237 bcopy(buf, mtod(m, caddr_t), (u_int)len); 1238 buf += len; 1239 *mp = m; 1240 mp = &m->m_next; 1241 totlen -= len; 1242 } 1243 return (top); 1244 } 1245 1246 static void 1247 m_copytounmapped(const struct mbuf *m, int off, int len, c_caddr_t cp) 1248 { 1249 struct iovec iov; 1250 struct uio uio; 1251 int error __diagused; 1252 1253 KASSERT(off >= 0, ("m_copytounmapped: negative off %d", off)); 1254 KASSERT(len >= 0, ("m_copytounmapped: negative len %d", len)); 1255 KASSERT(off < m->m_len, ("m_copytounmapped: len exceeds mbuf length")); 1256 iov.iov_base = __DECONST(caddr_t, cp); 1257 iov.iov_len = len; 1258 uio.uio_resid = len; 1259 uio.uio_iov = &iov; 1260 uio.uio_segflg = UIO_SYSSPACE; 1261 uio.uio_iovcnt = 1; 1262 uio.uio_offset = 0; 1263 uio.uio_rw = UIO_WRITE; 1264 error = m_unmapped_uiomove(m, off, &uio, len); 1265 KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off, 1266 len)); 1267 } 1268 1269 /* 1270 * Copy data from a buffer back into the indicated mbuf chain, 1271 * starting "off" bytes from the beginning, extending the mbuf 1272 * chain if necessary. 
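 *
 * A minimal sketch (hypothetical caller): overwriting a 16-bit checksum
 * field at a known offset "csum_off" inside an existing chain.  Both names
 * are assumptions used only for illustration:
 *
 *	uint16_t csum = 0;
 *
 *	m_copyback(m, csum_off, sizeof(csum), (c_caddr_t)&csum);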
1273 */ 1274 void 1275 m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp) 1276 { 1277 int mlen; 1278 struct mbuf *m = m0, *n; 1279 int totlen = 0; 1280 1281 if (m0 == NULL) 1282 return; 1283 while (off > (mlen = m->m_len)) { 1284 off -= mlen; 1285 totlen += mlen; 1286 if (m->m_next == NULL) { 1287 n = m_get(M_NOWAIT, m->m_type); 1288 if (n == NULL) 1289 goto out; 1290 bzero(mtod(n, caddr_t), MLEN); 1291 n->m_len = min(MLEN, len + off); 1292 m->m_next = n; 1293 } 1294 m = m->m_next; 1295 } 1296 while (len > 0) { 1297 if (m->m_next == NULL && (len > m->m_len - off)) { 1298 m->m_len += min(len - (m->m_len - off), 1299 M_TRAILINGSPACE(m)); 1300 } 1301 mlen = min (m->m_len - off, len); 1302 if ((m->m_flags & M_EXTPG) != 0) 1303 m_copytounmapped(m, off, mlen, cp); 1304 else 1305 bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen); 1306 cp += mlen; 1307 len -= mlen; 1308 mlen += off; 1309 off = 0; 1310 totlen += mlen; 1311 if (len == 0) 1312 break; 1313 if (m->m_next == NULL) { 1314 n = m_get(M_NOWAIT, m->m_type); 1315 if (n == NULL) 1316 break; 1317 n->m_len = min(MLEN, len); 1318 m->m_next = n; 1319 } 1320 m = m->m_next; 1321 } 1322 out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) 1323 m->m_pkthdr.len = totlen; 1324 } 1325 1326 /* 1327 * Append the specified data to the indicated mbuf chain, 1328 * Extend the mbuf chain if the new data does not fit in 1329 * existing space. 1330 * 1331 * Return 1 if able to complete the job; otherwise 0. 1332 */ 1333 int 1334 m_append(struct mbuf *m0, int len, c_caddr_t cp) 1335 { 1336 struct mbuf *m, *n; 1337 int remainder, space; 1338 1339 for (m = m0; m->m_next != NULL; m = m->m_next) 1340 ; 1341 remainder = len; 1342 space = M_TRAILINGSPACE(m); 1343 if (space > 0) { 1344 /* 1345 * Copy into available space. 1346 */ 1347 if (space > remainder) 1348 space = remainder; 1349 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 1350 m->m_len += space; 1351 cp += space, remainder -= space; 1352 } 1353 while (remainder > 0) { 1354 /* 1355 * Allocate a new mbuf; could check space 1356 * and allocate a cluster instead. 
1357 */ 1358 n = m_get(M_NOWAIT, m->m_type); 1359 if (n == NULL) 1360 break; 1361 n->m_len = min(MLEN, remainder); 1362 bcopy(cp, mtod(n, caddr_t), n->m_len); 1363 cp += n->m_len, remainder -= n->m_len; 1364 m->m_next = n; 1365 m = n; 1366 } 1367 if (m0->m_flags & M_PKTHDR) 1368 m0->m_pkthdr.len += len - remainder; 1369 return (remainder == 0); 1370 } 1371 1372 static int 1373 m_apply_extpg_one(struct mbuf *m, int off, int len, 1374 int (*f)(void *, void *, u_int), void *arg) 1375 { 1376 void *p; 1377 u_int i, count, pgoff, pglen; 1378 int rval; 1379 1380 KASSERT(PMAP_HAS_DMAP, 1381 ("m_apply_extpg_one does not support unmapped mbufs")); 1382 off += mtod(m, vm_offset_t); 1383 if (off < m->m_epg_hdrlen) { 1384 count = min(m->m_epg_hdrlen - off, len); 1385 rval = f(arg, m->m_epg_hdr + off, count); 1386 if (rval) 1387 return (rval); 1388 len -= count; 1389 off = 0; 1390 } else 1391 off -= m->m_epg_hdrlen; 1392 pgoff = m->m_epg_1st_off; 1393 for (i = 0; i < m->m_epg_npgs && len > 0; i++) { 1394 pglen = m_epg_pagelen(m, i, pgoff); 1395 if (off < pglen) { 1396 count = min(pglen - off, len); 1397 p = (void *)PHYS_TO_DMAP(m->m_epg_pa[i] + pgoff + off); 1398 rval = f(arg, p, count); 1399 if (rval) 1400 return (rval); 1401 len -= count; 1402 off = 0; 1403 } else 1404 off -= pglen; 1405 pgoff = 0; 1406 } 1407 if (len > 0) { 1408 KASSERT(off < m->m_epg_trllen, 1409 ("m_apply_extpg_one: offset beyond trailer")); 1410 KASSERT(len <= m->m_epg_trllen - off, 1411 ("m_apply_extpg_one: length beyond trailer")); 1412 return (f(arg, m->m_epg_trail + off, len)); 1413 } 1414 return (0); 1415 } 1416 1417 /* Apply function f to the data in a single mbuf. */ 1418 static int 1419 m_apply_one(struct mbuf *m, int off, int len, 1420 int (*f)(void *, void *, u_int), void *arg) 1421 { 1422 if ((m->m_flags & M_EXTPG) != 0) 1423 return (m_apply_extpg_one(m, off, len, f, arg)); 1424 else 1425 return (f(arg, mtod(m, caddr_t) + off, len)); 1426 } 1427 1428 /* 1429 * Apply function f to the data in an mbuf chain starting "off" bytes from 1430 * the beginning, continuing for "len" bytes. 1431 */ 1432 int 1433 m_apply(struct mbuf *m, int off, int len, 1434 int (*f)(void *, void *, u_int), void *arg) 1435 { 1436 u_int count; 1437 int rval; 1438 1439 KASSERT(off >= 0, ("m_apply, negative off %d", off)); 1440 KASSERT(len >= 0, ("m_apply, negative len %d", len)); 1441 while (off > 0) { 1442 KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain " 1443 "(%d extra)", off)); 1444 if (off < m->m_len) 1445 break; 1446 off -= m->m_len; 1447 m = m->m_next; 1448 } 1449 while (len > 0) { 1450 KASSERT(m != NULL, ("m_apply, length > size of mbuf chain " 1451 "(%d extra)", len)); 1452 count = min(m->m_len - off, len); 1453 rval = m_apply_one(m, off, count, f, arg); 1454 if (rval) 1455 return (rval); 1456 len -= count; 1457 off = 0; 1458 m = m->m_next; 1459 } 1460 return (0); 1461 } 1462 1463 /* 1464 * Return a pointer to mbuf/offset of location in mbuf chain. 1465 */ 1466 struct mbuf * 1467 m_getptr(struct mbuf *m, int loc, int *off) 1468 { 1469 1470 while (loc >= 0) { 1471 /* Normal end of search. */ 1472 if (m->m_len > loc) { 1473 *off = loc; 1474 return (m); 1475 } else { 1476 loc -= m->m_len; 1477 if (m->m_next == NULL) { 1478 if (loc == 0) { 1479 /* Point at the end of valid data. 
*/ 1480 *off = m->m_len; 1481 return (m); 1482 } 1483 return (NULL); 1484 } 1485 m = m->m_next; 1486 } 1487 } 1488 return (NULL); 1489 } 1490 1491 void 1492 m_print(const struct mbuf *m, int maxlen) 1493 { 1494 int len; 1495 int pdata; 1496 const struct mbuf *m2; 1497 1498 if (m == NULL) { 1499 printf("mbuf: %p\n", m); 1500 return; 1501 } 1502 1503 if (m->m_flags & M_PKTHDR) 1504 len = m->m_pkthdr.len; 1505 else 1506 len = -1; 1507 m2 = m; 1508 while (m2 != NULL && (len == -1 || len)) { 1509 pdata = m2->m_len; 1510 if (maxlen != -1 && pdata > maxlen) 1511 pdata = maxlen; 1512 printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len, 1513 m2->m_next, m2->m_flags, "\20\20freelist\17skipfw" 1514 "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly" 1515 "\3eor\2pkthdr\1ext", pdata ? "" : "\n"); 1516 if (pdata) 1517 printf(", %*D\n", pdata, (u_char *)m2->m_data, "-"); 1518 if (len != -1) 1519 len -= m2->m_len; 1520 m2 = m2->m_next; 1521 } 1522 if (len > 0) 1523 printf("%d bytes unaccounted for.\n", len); 1524 return; 1525 } 1526 1527 u_int 1528 m_fixhdr(struct mbuf *m0) 1529 { 1530 u_int len; 1531 1532 len = m_length(m0, NULL); 1533 m0->m_pkthdr.len = len; 1534 return (len); 1535 } 1536 1537 u_int 1538 m_length(struct mbuf *m0, struct mbuf **last) 1539 { 1540 struct mbuf *m; 1541 u_int len; 1542 1543 len = 0; 1544 for (m = m0; m != NULL; m = m->m_next) { 1545 len += m->m_len; 1546 if (m->m_next == NULL) 1547 break; 1548 } 1549 if (last != NULL) 1550 *last = m; 1551 return (len); 1552 } 1553 1554 /* 1555 * Defragment a mbuf chain, returning the shortest possible 1556 * chain of mbufs and clusters. If allocation fails and 1557 * this cannot be completed, NULL will be returned, but 1558 * the passed in chain will be unchanged. Upon success, 1559 * the original chain will be freed, and the new chain 1560 * will be returned. 1561 * 1562 * If a non-packet header is passed in, the original 1563 * mbuf (chain?) will be returned unharmed. 
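 *
 * A hedged usage sketch (hypothetical transmit path): compacting a chain
 * that exceeds a device's scatter/gather limit before retrying the DMA
 * mapping.  The surrounding error handling is an assumption:
 *
 *	n = m_defrag(m, M_NOWAIT);
 *	if (n == NULL) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 *	m = n;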
1564 */ 1565 struct mbuf * 1566 m_defrag(struct mbuf *m0, int how) 1567 { 1568 struct mbuf *m_new = NULL, *m_final = NULL; 1569 int progress = 0, length; 1570 1571 MBUF_CHECKSLEEP(how); 1572 if (!(m0->m_flags & M_PKTHDR)) 1573 return (m0); 1574 1575 m_fixhdr(m0); /* Needed sanity check */ 1576 1577 #ifdef MBUF_STRESS_TEST 1578 if (m_defragrandomfailures) { 1579 int temp = arc4random() & 0xff; 1580 if (temp == 0xba) 1581 goto nospace; 1582 } 1583 #endif 1584 1585 if (m0->m_pkthdr.len > MHLEN) 1586 m_final = m_getcl(how, MT_DATA, M_PKTHDR); 1587 else 1588 m_final = m_gethdr(how, MT_DATA); 1589 1590 if (m_final == NULL) 1591 goto nospace; 1592 1593 if (m_dup_pkthdr(m_final, m0, how) == 0) 1594 goto nospace; 1595 1596 m_new = m_final; 1597 1598 while (progress < m0->m_pkthdr.len) { 1599 length = m0->m_pkthdr.len - progress; 1600 if (length > MCLBYTES) 1601 length = MCLBYTES; 1602 1603 if (m_new == NULL) { 1604 if (length > MLEN) 1605 m_new = m_getcl(how, MT_DATA, 0); 1606 else 1607 m_new = m_get(how, MT_DATA); 1608 if (m_new == NULL) 1609 goto nospace; 1610 } 1611 1612 m_copydata(m0, progress, length, mtod(m_new, caddr_t)); 1613 progress += length; 1614 m_new->m_len = length; 1615 if (m_new != m_final) 1616 m_cat(m_final, m_new); 1617 m_new = NULL; 1618 } 1619 #ifdef MBUF_STRESS_TEST 1620 if (m0->m_next == NULL) 1621 m_defraguseless++; 1622 #endif 1623 m_freem(m0); 1624 m0 = m_final; 1625 #ifdef MBUF_STRESS_TEST 1626 m_defragpackets++; 1627 m_defragbytes += m0->m_pkthdr.len; 1628 #endif 1629 return (m0); 1630 nospace: 1631 #ifdef MBUF_STRESS_TEST 1632 m_defragfailure++; 1633 #endif 1634 if (m_final) 1635 m_freem(m_final); 1636 return (NULL); 1637 } 1638 1639 /* 1640 * Return the number of fragments an mbuf will use. This is usually 1641 * used as a proxy for the number of scatter/gather elements needed by 1642 * a DMA engine to access an mbuf. In general mapped mbufs are 1643 * assumed to be backed by physically contiguous buffers that only 1644 * need a single fragment. Unmapped mbufs, on the other hand, can 1645 * span disjoint physical pages. 1646 */ 1647 static int 1648 frags_per_mbuf(struct mbuf *m) 1649 { 1650 int frags; 1651 1652 if ((m->m_flags & M_EXTPG) == 0) 1653 return (1); 1654 1655 /* 1656 * The header and trailer are counted as a single fragment 1657 * each when present. 1658 * 1659 * XXX: This overestimates the number of fragments by assuming 1660 * all the backing physical pages are disjoint. 1661 */ 1662 frags = 0; 1663 if (m->m_epg_hdrlen != 0) 1664 frags++; 1665 frags += m->m_epg_npgs; 1666 if (m->m_epg_trllen != 0) 1667 frags++; 1668 1669 return (frags); 1670 } 1671 1672 /* 1673 * Defragment an mbuf chain, returning at most maxfrags separate 1674 * mbufs+clusters. If this is not possible NULL is returned and 1675 * the original mbuf chain is left in its present (potentially 1676 * modified) state. We use two techniques: collapsing consecutive 1677 * mbufs and replacing consecutive mbufs by a cluster. 1678 * 1679 * NB: this should really be named m_defrag but that name is taken 1680 */ 1681 struct mbuf * 1682 m_collapse(struct mbuf *m0, int how, int maxfrags) 1683 { 1684 struct mbuf *m, *n, *n2, **prev; 1685 u_int curfrags; 1686 1687 /* 1688 * Calculate the current number of frags. 1689 */ 1690 curfrags = 0; 1691 for (m = m0; m != NULL; m = m->m_next) 1692 curfrags += frags_per_mbuf(m); 1693 /* 1694 * First, try to collapse mbufs. Note that we always collapse 1695 * towards the front so we don't need to deal with moving the 1696 * pkthdr. 
This may be suboptimal if the first mbuf has much 1697 * less data than the following. 1698 */ 1699 m = m0; 1700 again: 1701 for (;;) { 1702 n = m->m_next; 1703 if (n == NULL) 1704 break; 1705 if (M_WRITABLE(m) && 1706 n->m_len < M_TRAILINGSPACE(m)) { 1707 m_copydata(n, 0, n->m_len, 1708 mtod(m, char *) + m->m_len); 1709 m->m_len += n->m_len; 1710 m->m_next = n->m_next; 1711 curfrags -= frags_per_mbuf(n); 1712 m_free(n); 1713 if (curfrags <= maxfrags) 1714 return m0; 1715 } else 1716 m = n; 1717 } 1718 KASSERT(maxfrags > 1, 1719 ("maxfrags %u, but normal collapse failed", maxfrags)); 1720 /* 1721 * Collapse consecutive mbufs to a cluster. 1722 */ 1723 prev = &m0->m_next; /* NB: not the first mbuf */ 1724 while ((n = *prev) != NULL) { 1725 if ((n2 = n->m_next) != NULL && 1726 n->m_len + n2->m_len < MCLBYTES) { 1727 m = m_getcl(how, MT_DATA, 0); 1728 if (m == NULL) 1729 goto bad; 1730 m_copydata(n, 0, n->m_len, mtod(m, char *)); 1731 m_copydata(n2, 0, n2->m_len, 1732 mtod(m, char *) + n->m_len); 1733 m->m_len = n->m_len + n2->m_len; 1734 m->m_next = n2->m_next; 1735 *prev = m; 1736 curfrags += 1; /* For the new cluster */ 1737 curfrags -= frags_per_mbuf(n); 1738 curfrags -= frags_per_mbuf(n2); 1739 m_free(n); 1740 m_free(n2); 1741 if (curfrags <= maxfrags) 1742 return m0; 1743 /* 1744 * Still not there, try the normal collapse 1745 * again before we allocate another cluster. 1746 */ 1747 goto again; 1748 } 1749 prev = &n->m_next; 1750 } 1751 /* 1752 * No place where we can collapse to a cluster; punt. 1753 * This can occur if, for example, you request 2 frags 1754 * but the packet requires that both be clusters (we 1755 * never reallocate the first mbuf to avoid moving the 1756 * packet header). 1757 */ 1758 bad: 1759 return NULL; 1760 } 1761 1762 #ifdef MBUF_STRESS_TEST 1763 1764 /* 1765 * Fragment an mbuf chain. There's no reason you'd ever want to do 1766 * this in normal usage, but it's great for stress testing various 1767 * mbuf consumers. 1768 * 1769 * If fragmentation is not possible, the original chain will be 1770 * returned. 
1771 * 1772 * Possible length values: 1773 * 0 no fragmentation will occur 1774 * > 0 each fragment will be of the specified length 1775 * -1 each fragment will be the same random value in length 1776 * -2 each fragment's length will be entirely random 1777 * (Random values range from 1 to 256) 1778 */ 1779 struct mbuf * 1780 m_fragment(struct mbuf *m0, int how, int length) 1781 { 1782 struct mbuf *m_first, *m_last; 1783 int divisor = 255, progress = 0, fraglen; 1784 1785 if (!(m0->m_flags & M_PKTHDR)) 1786 return (m0); 1787 1788 if (length == 0 || length < -2) 1789 return (m0); 1790 if (length > MCLBYTES) 1791 length = MCLBYTES; 1792 if (length < 0 && divisor > MCLBYTES) 1793 divisor = MCLBYTES; 1794 if (length == -1) 1795 length = 1 + (arc4random() % divisor); 1796 if (length > 0) 1797 fraglen = length; 1798 1799 m_fixhdr(m0); /* Needed sanity check */ 1800 1801 m_first = m_getcl(how, MT_DATA, M_PKTHDR); 1802 if (m_first == NULL) 1803 goto nospace; 1804 1805 if (m_dup_pkthdr(m_first, m0, how) == 0) 1806 goto nospace; 1807 1808 m_last = m_first; 1809 1810 while (progress < m0->m_pkthdr.len) { 1811 if (length == -2) 1812 fraglen = 1 + (arc4random() % divisor); 1813 if (fraglen > m0->m_pkthdr.len - progress) 1814 fraglen = m0->m_pkthdr.len - progress; 1815 1816 if (progress != 0) { 1817 struct mbuf *m_new = m_getcl(how, MT_DATA, 0); 1818 if (m_new == NULL) 1819 goto nospace; 1820 1821 m_last->m_next = m_new; 1822 m_last = m_new; 1823 } 1824 1825 m_copydata(m0, progress, fraglen, mtod(m_last, caddr_t)); 1826 progress += fraglen; 1827 m_last->m_len = fraglen; 1828 } 1829 m_freem(m0); 1830 m0 = m_first; 1831 return (m0); 1832 nospace: 1833 if (m_first) 1834 m_freem(m_first); 1835 /* Return the original chain on failure */ 1836 return (m0); 1837 } 1838 1839 #endif 1840 1841 /* 1842 * Free pages from mbuf_ext_pgs, assuming they were allocated via 1843 * vm_page_alloc() and aren't associated with any object. Complement 1844 * to allocator from m_uiotombuf_nomap(). 1845 */ 1846 void 1847 mb_free_mext_pgs(struct mbuf *m) 1848 { 1849 vm_page_t pg; 1850 1851 M_ASSERTEXTPG(m); 1852 for (int i = 0; i < m->m_epg_npgs; i++) { 1853 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); 1854 vm_page_unwire_noq(pg); 1855 vm_page_free(pg); 1856 } 1857 } 1858 1859 static struct mbuf * 1860 m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags) 1861 { 1862 struct mbuf *m, *mb, *prev; 1863 vm_page_t pg_array[MBUF_PEXT_MAX_PGS]; 1864 int error, length, i, needed; 1865 ssize_t total; 1866 int pflags = malloc2vm_flags(how) | VM_ALLOC_NODUMP | VM_ALLOC_WIRED; 1867 1868 MPASS((flags & M_PKTHDR) == 0); 1869 MPASS((how & M_ZERO) == 0); 1870 1871 /* 1872 * len can be zero or an arbitrary large value bound by 1873 * the total data supplied by the uio. 1874 */ 1875 if (len > 0) 1876 total = MIN(uio->uio_resid, len); 1877 else 1878 total = uio->uio_resid; 1879 1880 if (maxseg == 0) 1881 maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE; 1882 1883 /* 1884 * If total is zero, return an empty mbuf. This can occur 1885 * for TLS 1.0 connections which send empty fragments as 1886 * a countermeasure against the known-IV weakness in CBC 1887 * ciphersuites. 
	 */
	if (__predict_false(total == 0)) {
		mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs, 0);
		if (mb == NULL)
			return (NULL);
		mb->m_epg_flags = EPG_FLAG_ANON;
		return (mb);
	}

	/*
	 * Allocate the pages
	 */
	m = NULL;
	while (total > 0) {
		mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs, 0);
		if (mb == NULL)
			goto failed;
		if (m == NULL)
			m = mb;
		else
			prev->m_next = mb;
		prev = mb;
		mb->m_epg_flags = EPG_FLAG_ANON;
		needed = length = MIN(maxseg, total);
		for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) {
retry_page:
			pg_array[i] = vm_page_alloc_noobj(pflags);
			if (pg_array[i] == NULL) {
				if (how & M_NOWAIT) {
					goto failed;
				} else {
					vm_wait(NULL);
					goto retry_page;
				}
			}
			mb->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg_array[i]);
			mb->m_epg_npgs++;
		}
		mb->m_epg_last_len = length - PAGE_SIZE * (mb->m_epg_npgs - 1);
		MBUF_EXT_PGS_ASSERT_SANITY(mb);
		total -= length;
		error = uiomove_fromphys(pg_array, 0, length, uio);
		if (error != 0)
			goto failed;
		mb->m_len = length;
		mb->m_ext.ext_size += PAGE_SIZE * mb->m_epg_npgs;
		if (flags & M_PKTHDR)
			m->m_pkthdr.len += length;
	}
	return (m);

failed:
	m_freem(m);
	return (NULL);
}

/*
 * Copy the contents of uio into a properly sized mbuf chain.
 * A compat KPI.  Users are encouraged to call the backing functions
 * directly instead.
 */
struct mbuf *
m_uiotombuf(struct uio *uio, int how, int len, int lspace, int flags)
{

	if (flags & M_EXTPG) {
		/* XXX: 'lspace' magically becomes maxseg! */
		return (m_uiotombuf_nomap(uio, how, len, lspace, flags));
	} else if (__predict_false(uio->uio_resid == 0)) {
		struct mbuf *m;

		/*
		 * m_uiotombuf() is known to return a zero-length buffer
		 * in this case; keep that compatibility.  mc_uiotomc()
		 * won't do that.
		 */
		if (flags & M_PKTHDR) {
			m = m_gethdr(how, MT_DATA);
			m->m_pkthdr.memlen = MSIZE;
		} else
			m = m_get(how, MT_DATA);
		if (m != NULL)
			m->m_data += lspace;
		return (m);
	} else {
		struct mchain mc;
		int error;

		error = mc_uiotomc(&mc, uio, len, lspace, how, flags);
		if (__predict_true(error == 0)) {
			if (flags & M_PKTHDR) {
				mc_first(&mc)->m_pkthdr.len = mc.mc_len;
				mc_first(&mc)->m_pkthdr.memlen = mc.mc_mlen;
			}
			return (mc_first(&mc));
		} else
			return (NULL);
	}
}

/*
 * Copy the contents of uio into a properly sized mbuf chain.
 * In case of failure the state of the mchain is inconsistent.
 * @param length Limit copyout length.  If 0 entire uio_resid is copied.
 * @param lspace Provide leading space in the first mbuf in the chain.
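 *
 * A minimal sketch (hypothetical socket-layer caller): copying at most
 * "space" bytes of user data into a fresh chain while leaving room for a
 * link-layer header in front.  Everything except mc_uiotomc() itself is an
 * assumption here:
 *
 *	struct mchain mc;
 *	int error;
 *
 *	error = mc_uiotomc(&mc, uio, space, max_linkhdr, M_WAITOK, 0);
 *	if (error != 0)
 *		return (error);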
1992 */ 1993 int 1994 mc_uiotomc(struct mchain *mc, struct uio *uio, u_int length, u_int lspace, 1995 int how, int flags) 1996 { 1997 struct mbuf *mb; 1998 u_int total; 1999 int error; 2000 2001 MPASS(lspace < MHLEN); 2002 MPASS(UINT_MAX - lspace >= length); 2003 MPASS(uio->uio_rw == UIO_WRITE); 2004 MPASS(uio->uio_resid >= 0); 2005 2006 if (length > 0) { 2007 if (uio->uio_resid > length) { 2008 total = length; 2009 flags &= ~M_EOR; 2010 } else 2011 total = uio->uio_resid; 2012 } else if (__predict_false(uio->uio_resid + lspace > UINT_MAX)) 2013 return (EOVERFLOW); 2014 else 2015 total = uio->uio_resid; 2016 2017 if (__predict_false(total + lspace == 0)) { 2018 *mc = MCHAIN_INITIALIZER(mc); 2019 return (0); 2020 } 2021 2022 error = mc_get(mc, total + lspace, how, MT_DATA, flags); 2023 if (__predict_false(error)) 2024 return (error); 2025 mc_first(mc)->m_data += lspace; 2026 2027 /* Fill all mbufs with uio data and update header information. */ 2028 STAILQ_FOREACH(mb, &mc->mc_q, m_stailq) { 2029 u_int mlen; 2030 2031 mlen = min(M_TRAILINGSPACE(mb), total - mc->mc_len); 2032 error = uiomove(mtod(mb, void *), mlen, uio); 2033 if (__predict_false(error)) { 2034 mc_freem(mc); 2035 return (error); 2036 } 2037 mb->m_len = mlen; 2038 mc->mc_len += mlen; 2039 } 2040 MPASS(mc->mc_len == total); 2041 2042 return (0); 2043 } 2044 2045 /* 2046 * Copy data to/from an unmapped mbuf into a uio limited by len if set. 2047 */ 2048 int 2049 m_unmapped_uiomove(const struct mbuf *m, int m_off, struct uio *uio, int len) 2050 { 2051 vm_page_t pg; 2052 int error, i, off, pglen, pgoff, seglen, segoff; 2053 2054 M_ASSERTEXTPG(m); 2055 error = 0; 2056 2057 /* Skip over any data removed from the front. */ 2058 off = mtod(m, vm_offset_t); 2059 2060 off += m_off; 2061 if (m->m_epg_hdrlen != 0) { 2062 if (off >= m->m_epg_hdrlen) { 2063 off -= m->m_epg_hdrlen; 2064 } else { 2065 seglen = m->m_epg_hdrlen - off; 2066 segoff = off; 2067 seglen = min(seglen, len); 2068 off = 0; 2069 len -= seglen; 2070 error = uiomove(__DECONST(void *, 2071 &m->m_epg_hdr[segoff]), seglen, uio); 2072 } 2073 } 2074 pgoff = m->m_epg_1st_off; 2075 for (i = 0; i < m->m_epg_npgs && error == 0 && len > 0; i++) { 2076 pglen = m_epg_pagelen(m, i, pgoff); 2077 if (off >= pglen) { 2078 off -= pglen; 2079 pgoff = 0; 2080 continue; 2081 } 2082 seglen = pglen - off; 2083 segoff = pgoff + off; 2084 off = 0; 2085 seglen = min(seglen, len); 2086 len -= seglen; 2087 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); 2088 error = uiomove_fromphys(&pg, segoff, seglen, uio); 2089 pgoff = 0; 2090 }; 2091 if (len != 0 && error == 0) { 2092 KASSERT((off + len) <= m->m_epg_trllen, 2093 ("off + len > trail (%d + %d > %d, m_off = %d)", off, len, 2094 m->m_epg_trllen, m_off)); 2095 error = uiomove(__DECONST(void *, &m->m_epg_trail[off]), 2096 len, uio); 2097 } 2098 return (error); 2099 } 2100 2101 /* 2102 * Copy an mbuf chain into a uio limited by len if set. 2103 */ 2104 int 2105 m_mbuftouio(struct uio *uio, const struct mbuf *m, int len) 2106 { 2107 int error, length, total; 2108 int progress = 0; 2109 2110 if (len > 0) 2111 total = min(uio->uio_resid, len); 2112 else 2113 total = uio->uio_resid; 2114 2115 /* Fill the uio with data from the mbufs. 
/*
 * Copy an mbuf chain into a uio limited by len if set.
 * An illustrative usage sketch follows the function.
 */
int
m_mbuftouio(struct uio *uio, const struct mbuf *m, int len)
{
        int error, length, total;
        int progress = 0;

        if (len > 0)
                total = min(uio->uio_resid, len);
        else
                total = uio->uio_resid;

        /* Fill the uio with data from the mbufs. */
        for (; m != NULL; m = m->m_next) {
                length = min(m->m_len, total - progress);

                if ((m->m_flags & M_EXTPG) != 0)
                        error = m_unmapped_uiomove(m, 0, uio, length);
                else
                        error = uiomove(mtod(m, void *), length, uio);
                if (error)
                        return (error);

                progress += length;
        }

        return (0);
}
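/*
 * Illustrative sketch only: a hypothetical receive path draining a chain
 * "m" into a caller-supplied uio.  Passing len == 0 copies as much as
 * uio_resid allows; the error handling and the final m_freem() reflect the
 * example's assumption that the caller owns the chain.
 *
 *	error = m_mbuftouio(uio, m, 0);
 *	if (error != 0)
 *		return (error);
 *	m_freem(m);
 */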
/*
 * Create a writable copy of the mbuf chain.  While doing this
 * we compact the chain with a goal of producing a chain with
 * at most two mbufs.  The second mbuf in this chain is likely
 * to be a cluster.  The primary purpose of this work is to create
 * a writable packet for encryption, compression, etc.  The
 * secondary goal is to linearize the data so the data can be
 * passed to crypto hardware in the most efficient manner possible.
 * An illustrative usage sketch follows the function.
 */
struct mbuf *
m_unshare(struct mbuf *m0, int how)
{
        struct mbuf *m, *mprev;
        struct mbuf *n, *mfirst, *mlast;
        int len, off;

        mprev = NULL;
        for (m = m0; m != NULL; m = mprev->m_next) {
                /*
                 * Regular mbufs are ignored unless there's a cluster
                 * in front of them that we can use to coalesce.  We do
                 * the latter mainly so later clusters can be coalesced
                 * also without having to handle them specially (i.e.
                 * convert mbuf+cluster -> cluster).  This optimization
                 * is heavily influenced by the assumption that we're
                 * running over Ethernet where MCLBYTES is large enough
                 * that the max packet size will permit lots of coalescing
                 * into a single cluster.  This in turn permits efficient
                 * crypto operations, especially when using hardware.
                 */
                if ((m->m_flags & M_EXT) == 0) {
                        if (mprev && (mprev->m_flags & M_EXT) &&
                            m->m_len <= M_TRAILINGSPACE(mprev)) {
                                /* XXX: this ignores mbuf types */
                                memcpy(mtod(mprev, caddr_t) + mprev->m_len,
                                    mtod(m, caddr_t), m->m_len);
                                mprev->m_len += m->m_len;
                                mprev->m_next = m->m_next;	/* unlink from chain */
                                m_free(m);			/* reclaim mbuf */
                        } else {
                                mprev = m;
                        }
                        continue;
                }
                /*
                 * Writable mbufs are left alone (for now).
                 */
                if (M_WRITABLE(m)) {
                        mprev = m;
                        continue;
                }

                /*
                 * Not writable, replace with a copy or coalesce with
                 * the previous mbuf if possible (since we have to copy
                 * it anyway, we try to reduce the number of mbufs and
                 * clusters so that future work is easier).
                 */
                KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
                /* NB: we only coalesce into a cluster or larger */
                if (mprev != NULL && (mprev->m_flags & M_EXT) &&
                    m->m_len <= M_TRAILINGSPACE(mprev)) {
                        /* XXX: this ignores mbuf types */
                        memcpy(mtod(mprev, caddr_t) + mprev->m_len,
                            mtod(m, caddr_t), m->m_len);
                        mprev->m_len += m->m_len;
                        mprev->m_next = m->m_next;	/* unlink from chain */
                        m_free(m);			/* reclaim mbuf */
                        continue;
                }

                /*
                 * Allocate new space to hold the copy and copy the data.
                 * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by
                 * splitting them into clusters.  We could just malloc a
                 * buffer and make it external, but too many device drivers
                 * don't know how to break up the non-contiguous memory when
                 * doing DMA.
                 */
                n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
                if (n == NULL) {
                        m_freem(m0);
                        return (NULL);
                }
                if (m->m_flags & M_PKTHDR) {
                        KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR",
                            __func__, m0, m));
                        m_move_pkthdr(n, m);
                }
                len = m->m_len;
                off = 0;
                mfirst = n;
                mlast = NULL;
                for (;;) {
                        int cc = min(len, MCLBYTES);
                        memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
                        n->m_len = cc;
                        if (mlast != NULL)
                                mlast->m_next = n;
                        mlast = n;
#if 0
                        newipsecstat.ips_clcopied++;
#endif

                        len -= cc;
                        if (len <= 0)
                                break;
                        off += cc;

                        n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
                        if (n == NULL) {
                                m_freem(mfirst);
                                m_freem(m0);
                                return (NULL);
                        }
                }
                n->m_next = m->m_next;
                if (mprev == NULL)
                        m0 = mfirst;		/* new head of chain */
                else
                        mprev->m_next = mfirst;	/* replace old mbuf */
                m_free(m);			/* release old mbuf */
                mprev = mfirst;
        }
        return (m0);
}
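/*
 * Illustrative sketch only: a hypothetical encryption path must not write
 * into shared or read-only external buffers, so it replaces the chain with
 * a writable (and mostly linearized) copy before operating in place.  The
 * error value is the example's choice.
 *
 *	m = m_unshare(m, M_NOWAIT);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	(every mbuf in the resulting chain now satisfies M_WRITABLE())
 */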
#ifdef MBUF_PROFILING

#define MP_BUCKETS 32 /* Don't just change this, as things may overflow. */
struct mbufprofile {
        uintmax_t wasted[MP_BUCKETS];
        uintmax_t used[MP_BUCKETS];
        uintmax_t segments[MP_BUCKETS];
} mbprof;

void
m_profile(struct mbuf *m)
{
        int segments = 0;
        int used = 0;
        int wasted = 0;

        while (m) {
                segments++;
                used += m->m_len;
                if (m->m_flags & M_EXT) {
                        wasted += MHLEN - sizeof(m->m_ext) +
                            m->m_ext.ext_size - m->m_len;
                } else {
                        if (m->m_flags & M_PKTHDR)
                                wasted += MHLEN - m->m_len;
                        else
                                wasted += MLEN - m->m_len;
                }
                m = m->m_next;
        }
        /* Be paranoid; it helps. */
        if (segments > MP_BUCKETS - 1)
                segments = MP_BUCKETS - 1;
        if (used > 100000)
                used = 100000;
        if (wasted > 100000)
                wasted = 100000;
        /* Store in the appropriate bucket. */
        /* Don't bother locking; if it's slightly off, so what? */
        mbprof.segments[segments]++;
        mbprof.used[fls(used)]++;
        mbprof.wasted[fls(wasted)]++;
}

static int
mbprof_handler(SYSCTL_HANDLER_ARGS)
{
        char buf[256];
        struct sbuf sb;
        int error;
        uint64_t *p;

        sbuf_new_for_sysctl(&sb, buf, sizeof(buf), req);

        p = &mbprof.wasted[0];
        sbuf_printf(&sb,
            "wasted:\n"
            "%ju %ju %ju %ju %ju %ju %ju %ju "
            "%ju %ju %ju %ju %ju %ju %ju %ju\n",
            p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
            p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
#ifdef BIG_ARRAY
        p = &mbprof.wasted[16];
        sbuf_printf(&sb,
            "%ju %ju %ju %ju %ju %ju %ju %ju "
            "%ju %ju %ju %ju %ju %ju %ju %ju\n",
            p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
            p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
#endif
        p = &mbprof.used[0];
        sbuf_printf(&sb,
            "used:\n"
            "%ju %ju %ju %ju %ju %ju %ju %ju "
            "%ju %ju %ju %ju %ju %ju %ju %ju\n",
            p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
            p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
#ifdef BIG_ARRAY
        p = &mbprof.used[16];
        sbuf_printf(&sb,
            "%ju %ju %ju %ju %ju %ju %ju %ju "
            "%ju %ju %ju %ju %ju %ju %ju %ju\n",
            p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
            p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
#endif
        p = &mbprof.segments[0];
        sbuf_printf(&sb,
            "segments:\n"
            "%ju %ju %ju %ju %ju %ju %ju %ju "
            "%ju %ju %ju %ju %ju %ju %ju %ju\n",
            p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
            p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
#ifdef BIG_ARRAY
        p = &mbprof.segments[16];
        sbuf_printf(&sb,
            "%ju %ju %ju %ju %ju %ju %ju %ju "
            "%ju %ju %ju %ju %ju %ju %ju %ju",
            p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
            p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
#endif

        error = sbuf_finish(&sb);
        sbuf_delete(&sb);
        return (error);
}

static int
mbprof_clr_handler(SYSCTL_HANDLER_ARGS)
{
        int clear, error;

        clear = 0;
        error = sysctl_handle_int(oidp, &clear, 0, req);
        if (error || !req->newptr)
                return (error);

        if (clear) {
                bzero(&mbprof, sizeof(mbprof));
        }

        return (error);
}

SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    mbprof_handler, "A",
    "mbuf profiling statistics");

SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    mbprof_clr_handler, "I",
    "clear mbuf profiling statistics");
#endif
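/*
 * Illustrative userland sketch only (not part of this file): when the
 * kernel is built with MBUF_PROFILING, the histogram above can be read
 * through sysctl(3).  The buffer size and error handling are the
 * example's assumptions.
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	char buf[1024];
 *	size_t len = sizeof(buf);
 *
 *	if (sysctlbyname("kern.ipc.mbufprofile", buf, &len, NULL, 0) == 0)
 *		printf("%.*s\n", (int)len, buf);
 */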