/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_param.h"
#include "opt_mbuf_stress_test.h"
#include "opt_mbuf_profiling.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/sysctl.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/uio.h>
#include <sys/vmmeter.h>
#include <sys/sbuf.h>
#include <sys/sdt.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>

SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init,
    "struct mbuf *", "mbufinfo_t *",
    "uint32_t", "uint32_t",
    "uint16_t", "uint16_t",
    "uint32_t", "uint32_t",
    "uint32_t", "uint32_t");

SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr_raw,
    "uint32_t", "uint32_t",
    "uint16_t", "uint16_t",
    "struct mbuf *", "mbufinfo_t *");

SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr,
    "uint32_t", "uint32_t",
    "uint16_t", "uint16_t",
    "struct mbuf *", "mbufinfo_t *");

SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get_raw,
    "uint32_t", "uint32_t",
    "uint16_t", "uint16_t",
    "struct mbuf *", "mbufinfo_t *");

SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get,
    "uint32_t", "uint32_t",
    "uint16_t", "uint16_t",
    "struct mbuf *", "mbufinfo_t *");

SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl,
    "uint32_t", "uint32_t",
    "uint16_t", "uint16_t",
    "uint32_t", "uint32_t",
    "struct mbuf *", "mbufinfo_t *");

SDT_PROBE_DEFINE5_XLATE(sdt, , , m__getjcl,
    "uint32_t", "uint32_t",
    "uint16_t", "uint16_t",
    "uint32_t", "uint32_t",
    "uint32_t", "uint32_t",
    "struct mbuf *", "mbufinfo_t *");

SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget,
    "struct mbuf *", "mbufinfo_t *",
    "uint32_t", "uint32_t",
    "uint32_t", "uint32_t");

SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget,
    "struct mbuf *", "mbufinfo_t *",
    "uint32_t", "uint32_t",
    "uint32_t", "uint32_t",
    "void*", "void*");

SDT_PROBE_DEFINE(sdt, , , m__cljset);

SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free,
    "struct mbuf *", "mbufinfo_t *");

SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem,
    "struct mbuf *", "mbufinfo_t *");

SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freemp,
    "struct mbuf *", "mbufinfo_t *");

#include <security/mac/mac_framework.h>

/*
 * Provide minimum possible defaults for link and protocol header space,
 * assuming IPv4 over Ethernet.  Enabling IPv6, IEEE802.11 or some other
 * protocol may grow these values.
 */
u_int max_linkhdr = 16;
u_int max_protohdr = 40;
u_int max_hdr = 16 + 40;
SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD,
    &max_linkhdr, 16, "Size of largest link layer header");
SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD,
    &max_protohdr, 40, "Size of largest protocol layer header");
SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD,
    &max_hdr, 16 + 40, "Size of largest link plus protocol header");

static void
max_hdr_grow(void)
{

	max_hdr = max_linkhdr + max_protohdr;
	MPASS(max_hdr <= MHLEN);
}

void
max_linkhdr_grow(u_int new)
{

	if (new > max_linkhdr) {
		max_linkhdr = new;
		max_hdr_grow();
	}
}

void
max_protohdr_grow(u_int new)
{

	if (new > max_protohdr) {
		max_protohdr = new;
		max_hdr_grow();
	}
}

#ifdef MBUF_STRESS_TEST
int m_defragpackets;
int m_defragbytes;
int m_defraguseless;
int m_defragfailure;
int m_defragrandomfailures;

SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
    &m_defragpackets, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
    &m_defragbytes, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
    &m_defraguseless, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
    &m_defragfailure, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
    &m_defragrandomfailures, 0, "");
#endif

/*
 * Ensure the correct size of various mbuf parameters.  It could be off due
 * to compiler-induced padding and alignment artifacts.
 */
CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN);
CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN);

/*
 * mbuf data storage should be 64-bit aligned regardless of architectural
 * pointer size; check this is the case with and without a packet header.
 */
CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0);
CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0);

/*
 * While the specific values here don't matter too much (i.e., +/- a few
 * words), we do want to ensure that changes to these values are carefully
 * reasoned about and properly documented.  This is especially the case as
 * network-protocol and device-driver modules encode these layouts, and must
 * be recompiled if the structures change.  Check these values at compile time
 * against the ones documented in comments in mbuf.h.
 *
 * NB: Possibly they should be documented there via #define's and not just
 * comments.
 */
#if defined(__LP64__)
CTASSERT(offsetof(struct mbuf, m_dat) == 32);
CTASSERT(sizeof(struct pkthdr) == 64);
CTASSERT(sizeof(struct m_ext) == 160);
#else
CTASSERT(offsetof(struct mbuf, m_dat) == 24);
CTASSERT(sizeof(struct pkthdr) == 56);
#if defined(__powerpc__) && defined(BOOKE)
/* PowerPC booke has 64-bit physical pointers. */
CTASSERT(sizeof(struct m_ext) == 176);
#else
CTASSERT(sizeof(struct m_ext) == 172);
#endif
#endif

/*
 * Assert that the queue(3) macros produce code of the same size as an old
 * plain pointer does.
 */
#ifdef INVARIANTS
static struct mbuf __used m_assertbuf;
CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next));
CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next));
CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt));
CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt));
#endif

/*
 * Attach the cluster from *m to *n, set up m_ext in *n
 * and bump the refcount of the cluster.
 */
void
mb_dupcl(struct mbuf *n, struct mbuf *m)
{
	volatile u_int *refcnt;

	KASSERT(m->m_flags & (M_EXT | M_EXTPG),
	    ("%s: M_EXT | M_EXTPG not set on %p", __func__, m));
	KASSERT(!(n->m_flags & (M_EXT | M_EXTPG)),
	    ("%s: M_EXT | M_EXTPG set on %p", __func__, n));

	/*
	 * Cache access optimization.
	 *
	 * o Regular M_EXT storage doesn't need full copy of m_ext, since
	 *   the holder of the 'ext_count' is responsible to carry the free
	 *   routine and its arguments.
	 * o M_EXTPG data is split between main part of mbuf and m_ext, the
	 *   main part is copied in full, the m_ext part is similar to M_EXT.
	 * o EXT_EXTREF, where 'ext_cnt' doesn't point into mbuf at all, is
	 *   special - it needs full copy of m_ext into each mbuf, since any
	 *   copy could end up as the last to free.
	 */
	if (m->m_flags & M_EXTPG) {
		bcopy(&m->m_epg_startcopy, &n->m_epg_startcopy,
		    __rangeof(struct mbuf, m_epg_startcopy, m_epg_endcopy));
		bcopy(&m->m_ext, &n->m_ext, m_epg_ext_copylen);
	} else if (m->m_ext.ext_type == EXT_EXTREF)
		bcopy(&m->m_ext, &n->m_ext, sizeof(struct m_ext));
	else
		bcopy(&m->m_ext, &n->m_ext, m_ext_copylen);

	n->m_flags |= m->m_flags & (M_RDONLY | M_EXT | M_EXTPG);

	/* See if this is the mbuf that holds the embedded refcount. */
	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
		refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count;
		n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF;
	} else {
		KASSERT(m->m_ext.ext_cnt != NULL,
		    ("%s: no refcounting pointer on %p", __func__, m));
		refcnt = m->m_ext.ext_cnt;
	}

	if (*refcnt == 1)
		*refcnt += 1;
	else
		atomic_add_int(refcnt, 1);
}

void
m_demote_pkthdr(struct mbuf *m)
{

	M_ASSERTPKTHDR(m);
	M_ASSERT_NO_SND_TAG(m);

	m_tag_delete_chain(m, NULL);
	m->m_flags &= ~M_PKTHDR;
	bzero(&m->m_pkthdr, sizeof(struct pkthdr));
}

/*
 * Clean up mbuf (chain) from any tags and packet headers.
 * If "all" is set then the first mbuf in the chain will be
 * cleaned too.
 */
void
m_demote(struct mbuf *m0, int all, int flags)
{
	struct mbuf *m;

	flags |= M_DEMOTEFLAGS;

	for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) {
		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p",
		    __func__, m, m0));
		if (m->m_flags & M_PKTHDR)
			m_demote_pkthdr(m);
		m->m_flags &= flags;
	}
}

/*
 * Sanity checks on an mbuf (chain) for use in KASSERT() and general
 * debugging.
 * Returns 1 if all tests pass; when a test fails, M_SANITY_ACTION
 * either panics (with INVARIANTS) or prints a diagnostic.
 * Sanitize: 0 to run M_SANITY_ACTION on failure, 1 to repair the chain
 * and garble the offending fields so they blow up later.
 */
int
m_sanity(struct mbuf *m0, int sanitize)
{
	struct mbuf *m;
	caddr_t a, b;
	int pktlen = 0;

#ifdef INVARIANTS
#define	M_SANITY_ACTION(s)	panic("mbuf %p: " s, m)
#else
#define	M_SANITY_ACTION(s)	printf("mbuf %p: " s, m)
#endif

	for (m = m0; m != NULL; m = m->m_next) {
		/*
		 * Basic pointer checks.  If any of these fails then some
		 * unrelated kernel memory before or after us is trashed.
		 * No way to recover from that.
		 */
		a = M_START(m);
		b = a + M_SIZE(m);
		if ((caddr_t)m->m_data < a)
			M_SANITY_ACTION("m_data outside mbuf data range left");
		if ((caddr_t)m->m_data > b)
			M_SANITY_ACTION("m_data outside mbuf data range right");
		if ((caddr_t)m->m_data + m->m_len > b)
			M_SANITY_ACTION("m_data + m_len exceeds mbuf space");

		/* m->m_nextpkt may only be set on first mbuf in chain. */
		if (m != m0 && m->m_nextpkt != NULL) {
			if (sanitize) {
				m_freem(m->m_nextpkt);
				m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
			} else
				M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
		}

		/* packet length (not mbuf length!) calculation */
		if (m0->m_flags & M_PKTHDR)
			pktlen += m->m_len;

		/* m_tags may only be attached to first mbuf in chain. */
		if (m != m0 && m->m_flags & M_PKTHDR &&
		    !SLIST_EMPTY(&m->m_pkthdr.tags)) {
			if (sanitize) {
				m_tag_delete_chain(m, NULL);
				/* put in 0xDEADC0DE perhaps? */
			} else
				M_SANITY_ACTION("m_tags on in-chain mbuf");
		}

		/* M_PKTHDR may only be set on first mbuf in chain */
		if (m != m0 && m->m_flags & M_PKTHDR) {
			if (sanitize) {
				bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
				m->m_flags &= ~M_PKTHDR;
				/* put in 0xDEADCODE and leave hdr flag in */
			} else
				M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
		}
	}
	m = m0;
	if (pktlen && pktlen != m->m_pkthdr.len) {
		if (sanitize)
			m->m_pkthdr.len = 0;
		else
			M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
	}
	return (1);

#undef	M_SANITY_ACTION
}

/*
 * Non-inlined part of m_init().
 */
int
m_pkthdr_init(struct mbuf *m, int how)
{
#ifdef MAC
	int error;
#endif
	m->m_data = m->m_pktdat;
	bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
#ifdef NUMA
	m->m_pkthdr.numa_domain = M_NODOM;
#endif
#ifdef MAC
	/* If the label init fails, fail the alloc */
	error = mac_mbuf_init(m, how);
	if (error)
		return (error);
#endif

	return (0);
}

/*
 * "Move" mbuf pkthdr from "from" to "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 */
void
m_move_pkthdr(struct mbuf *to, struct mbuf *from)
{

#if 0
	/* see below for why these are not enabled */
	M_ASSERTPKTHDR(to);
	/* Note: with MAC, this may not be a good assertion. */
	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
	    ("m_move_pkthdr: to has tags"));
#endif
#ifdef MAC
	/*
	 * XXXMAC: It could be this should also occur for non-MAC?
	 */
	if (to->m_flags & M_PKTHDR)
		m_tag_delete_chain(to, NULL);
#endif
	to->m_flags = (from->m_flags & M_COPYFLAGS) |
	    (to->m_flags & (M_EXT | M_EXTPG));
	if ((to->m_flags & M_EXT) == 0)
		to->m_data = to->m_pktdat;
	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
	from->m_flags &= ~M_PKTHDR;
	if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) {
		from->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
		from->m_pkthdr.snd_tag = NULL;
	}
}

/*
 * Duplicate "from"'s mbuf pkthdr in "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 * In particular, this does a deep copy of the packet tags.
 */
int
m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
{

#if 0
	/*
	 * The mbuf allocator only initializes the pkthdr
	 * when the mbuf is allocated with m_gethdr().  Many users
	 * (e.g. m_copy*, m_prepend) use m_get() and then
	 * smash the pkthdr as needed causing these
	 * assertions to trip.  For now just disable them.
	 */
	M_ASSERTPKTHDR(to);
	/* Note: with MAC, this may not be a good assertion. */
	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags"));
#endif
	MBUF_CHECKSLEEP(how);
#ifdef MAC
	if (to->m_flags & M_PKTHDR)
		m_tag_delete_chain(to, NULL);
#endif
	to->m_flags = (from->m_flags & M_COPYFLAGS) |
	    (to->m_flags & (M_EXT | M_EXTPG));
	if ((to->m_flags & M_EXT) == 0)
		to->m_data = to->m_pktdat;
	to->m_pkthdr = from->m_pkthdr;
	if (from->m_pkthdr.csum_flags & CSUM_SND_TAG)
		m_snd_tag_ref(from->m_pkthdr.snd_tag);
	SLIST_INIT(&to->m_pkthdr.tags);
	return (m_tag_copy_chain(to, from, how));
}

/*
 * Lesser-used path for M_PREPEND:
 * allocate new mbuf to prepend to chain,
 * copy junk along.
 */
struct mbuf *
m_prepend(struct mbuf *m, int len, int how)
{
	struct mbuf *mn;

	if (m->m_flags & M_PKTHDR)
		mn = m_gethdr(how, m->m_type);
	else
		mn = m_get(how, m->m_type);
	if (mn == NULL) {
		m_freem(m);
		return (NULL);
	}
	if (m->m_flags & M_PKTHDR)
		m_move_pkthdr(mn, m);
	mn->m_next = m;
	m = mn;
	if (len < M_SIZE(m))
		M_ALIGN(m, len);
	m->m_len = len;
	return (m);
}

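/*
 * Illustrative sketch (not part of this file): callers normally reach
 * m_prepend() through the M_PREPEND() macro, which falls back here only
 * when no leading space is available.  Note that the original chain is
 * freed on allocation failure; "struct hypothetical_hdr" below is a
 * made-up name for illustration:
 *
 *	M_PREPEND(m, sizeof(struct hypothetical_hdr), M_NOWAIT);
 *	if (m == NULL)
 *		return (ENOBUFS);	// chain was already freed for us
 */
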
/*
 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
 * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
 * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller.
 * Note that the copy is read-only, because clusters are not copied,
 * only their reference counts are incremented.
 */
struct mbuf *
m_copym(struct mbuf *m, int off0, int len, int wait)
{
	struct mbuf *n, **np;
	int off = off0;
	struct mbuf *top;
	int copyhdr = 0;

	KASSERT(off >= 0, ("m_copym, negative off %d", off));
	KASSERT(len >= 0, ("m_copym, negative len %d", len));
	MBUF_CHECKSLEEP(wait);
	if (off == 0 && m->m_flags & M_PKTHDR)
		copyhdr = 1;
	while (off > 0) {
		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	np = &top;
	top = NULL;
	while (len > 0) {
		if (m == NULL) {
			KASSERT(len == M_COPYALL,
			    ("m_copym, length > size of mbuf chain"));
			break;
		}
		if (copyhdr)
			n = m_gethdr(wait, m->m_type);
		else
			n = m_get(wait, m->m_type);
		*np = n;
		if (n == NULL)
			goto nospace;
		if (copyhdr) {
			if (!m_dup_pkthdr(n, m, wait))
				goto nospace;
			if (len == M_COPYALL)
				n->m_pkthdr.len -= off0;
			else
				n->m_pkthdr.len = len;
			copyhdr = 0;
		}
		n->m_len = min(len, m->m_len - off);
		if (m->m_flags & (M_EXT | M_EXTPG)) {
			n->m_data = m->m_data + off;
			mb_dupcl(n, m);
		} else
			bcopy(mtod(m, caddr_t) + off, mtod(n, caddr_t),
			    (u_int)n->m_len);
		if (len != M_COPYALL)
			len -= n->m_len;
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}

	return (top);
nospace:
	m_freem(top);
	return (NULL);
}

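/*
 * Illustrative sketch (not part of this file): taking a reference-counted,
 * read-only copy of a whole packet, e.g. before handing the original to a
 * consumer that will free it:
 *
 *	struct mbuf *copy;
 *
 *	copy = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 *	if (copy == NULL)
 *		return (ENOBUFS);	// "m" itself is left untouched
 *	// Cluster data is shared; check M_WRITABLE() before modifying.
 */
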
/*
 * Copy an entire packet, including header (which must be present).
 * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
 * Note that the copy is read-only, because clusters are not copied,
 * only their reference counts are incremented.
 * Preserve alignment of the first mbuf so if the creator has left
 * some room at the beginning (e.g. for inserting protocol headers)
 * the copies still have the room available.
 */
struct mbuf *
m_copypacket(struct mbuf *m, int how)
{
	struct mbuf *top, *n, *o;

	MBUF_CHECKSLEEP(how);
	n = m_get(how, m->m_type);
	top = n;
	if (n == NULL)
		goto nospace;

	if (!m_dup_pkthdr(n, m, how))
		goto nospace;
	n->m_len = m->m_len;
	if (m->m_flags & (M_EXT | M_EXTPG)) {
		n->m_data = m->m_data;
		mb_dupcl(n, m);
	} else {
		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat);
		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
	}

	m = m->m_next;
	while (m) {
		o = m_get(how, m->m_type);
		if (o == NULL)
			goto nospace;

		n->m_next = o;
		n = n->m_next;

		n->m_len = m->m_len;
		if (m->m_flags & (M_EXT | M_EXTPG)) {
			n->m_data = m->m_data;
			mb_dupcl(n, m);
		} else {
			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
		}

		m = m->m_next;
	}
	return (top);
nospace:
	m_freem(top);
	return (NULL);
}

static void
m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp)
{
	struct iovec iov;
	struct uio uio;
	int error __diagused;

	KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off));
	KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len));
	KASSERT(off < m->m_len,
	    ("m_copyfromunmapped: len exceeds mbuf length"));
	iov.iov_base = cp;
	iov.iov_len = len;
	uio.uio_resid = len;
	uio.uio_iov = &iov;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_rw = UIO_READ;
	error = m_unmapped_uiomove(m, off, &uio, len);
	KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off,
	    len));
}

/*
 * Copy data from an mbuf chain starting "off" bytes from the beginning,
 * continuing for "len" bytes, into the indicated buffer.
 */
void
m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
{
	u_int count;

	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
	while (off > 0) {
		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	while (len > 0) {
		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
		count = min(m->m_len - off, len);
		if ((m->m_flags & M_EXTPG) != 0)
			m_copyfromunmapped(m, off, count, cp);
		else
			bcopy(mtod(m, caddr_t) + off, cp, count);
		len -= count;
		cp += count;
		off = 0;
		m = m->m_next;
	}
}

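/*
 * Illustrative sketch (not part of this file): pulling a fixed-size
 * header out of a possibly fragmented chain into a stack buffer, which
 * is often cheaper than forcing contiguity with m_pullup().  The header
 * type name is made up for the example:
 *
 *	struct hypothetical_hdr hdr;
 *
 *	if (m->m_pkthdr.len < sizeof(hdr))
 *		return (EINVAL);
 *	m_copydata(m, 0, sizeof(hdr), (caddr_t)&hdr);
 */
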
/*
 * Copy a packet header mbuf chain into a completely new chain, including
 * copying any mbuf clusters.  Use this instead of m_copypacket() when
 * you need a writable copy of an mbuf chain.
 */
struct mbuf *
m_dup(const struct mbuf *m, int how)
{
	struct mbuf **p, *top = NULL;
	int remain, moff, nsize;

	MBUF_CHECKSLEEP(how);
	/* Sanity check */
	if (m == NULL)
		return (NULL);
	M_ASSERTPKTHDR(m);

	/* While there's more data, get a new mbuf, tack it on, and fill it */
	remain = m->m_pkthdr.len;
	moff = 0;
	p = &top;
	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
		struct mbuf *n;

		/* Get the next new mbuf */
		if (remain >= MINCLSIZE) {
			n = m_getcl(how, m->m_type, 0);
			nsize = MCLBYTES;
		} else {
			n = m_get(how, m->m_type);
			nsize = MLEN;
		}
		if (n == NULL)
			goto nospace;

		if (top == NULL) {		/* First one, must be PKTHDR */
			if (!m_dup_pkthdr(n, m, how)) {
				m_free(n);
				goto nospace;
			}
			if ((n->m_flags & M_EXT) == 0)
				nsize = MHLEN;
			n->m_flags &= ~M_RDONLY;
		}
		n->m_len = 0;

		/* Link it into the new chain */
		*p = n;
		p = &n->m_next;

		/* Copy data from original mbuf(s) into new mbuf */
		while (n->m_len < nsize && m != NULL) {
			int chunk = min(nsize - n->m_len, m->m_len - moff);

			m_copydata(m, moff, chunk, n->m_data + n->m_len);
			moff += chunk;
			n->m_len += chunk;
			remain -= chunk;
			if (moff == m->m_len) {
				m = m->m_next;
				moff = 0;
			}
		}

		/* Check correct total mbuf length */
		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
		    ("%s: bogus m_pkthdr.len", __func__));
	}
	return (top);

nospace:
	m_freem(top);
	return (NULL);
}

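/*
 * Illustrative sketch (not part of this file): obtaining a writable deep
 * copy when the original may share clusters (M_RDONLY):
 *
 *	struct mbuf *w;
 *
 *	w = m_dup(m, M_NOWAIT);
 *	if (w == NULL)
 *		return (ENOBUFS);	// "m" is left untouched
 *	m_freem(m);			// caller still owns the original
 *	m = w;				// every byte of "m" is now writable
 */
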
/*
 * Concatenate mbuf chain n to m.
 * Both chains must be of the same type (e.g. MT_DATA).
 * Any m_pkthdr is not updated.
 */
void
m_cat(struct mbuf *m, struct mbuf *n)
{
	while (m->m_next)
		m = m->m_next;
	while (n) {
		if (!M_WRITABLE(m) ||
		    (n->m_flags & M_EXTPG) != 0 ||
		    M_TRAILINGSPACE(m) < n->m_len) {
			/* just join the two chains */
			m->m_next = n;
			return;
		}
		/* splat the data from one into the other */
		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
		    (u_int)n->m_len);
		m->m_len += n->m_len;
		n = m_free(n);
	}
}

/*
 * Concatenate two pkthdr mbuf chains.
 */
void
m_catpkt(struct mbuf *m, struct mbuf *n)
{

	M_ASSERTPKTHDR(m);
	M_ASSERTPKTHDR(n);

	m->m_pkthdr.len += n->m_pkthdr.len;
	m_demote(n, 1, 0);

	m_cat(m, n);
}

void
m_adj(struct mbuf *mp, int req_len)
{
	int len = req_len;
	struct mbuf *m;
	int count;

	if ((m = mp) == NULL)
		return;
	if (len >= 0) {
		/*
		 * Trim from head.
		 */
		while (m != NULL && len > 0) {
			if (m->m_len <= len) {
				len -= m->m_len;
				m->m_len = 0;
				m = m->m_next;
			} else {
				m->m_len -= len;
				m->m_data += len;
				len = 0;
			}
		}
		if (mp->m_flags & M_PKTHDR)
			mp->m_pkthdr.len -= (req_len - len);
	} else {
		/*
		 * Trim from tail.  Scan the mbuf chain,
		 * calculating its length and finding the last mbuf.
		 * If the adjustment only affects this mbuf, then just
		 * adjust and return.  Otherwise, rescan and truncate
		 * after the remaining size.
		 */
		len = -len;
		count = 0;
		for (;;) {
			count += m->m_len;
			if (m->m_next == NULL)
				break;
			m = m->m_next;
		}
		if (m->m_len >= len) {
			m->m_len -= len;
			if (mp->m_flags & M_PKTHDR)
				mp->m_pkthdr.len -= len;
			return;
		}
		count -= len;
		if (count < 0)
			count = 0;
		/*
		 * Correct length for chain is "count".
		 * Find the mbuf with last data, adjust its length,
		 * and toss data from remaining mbufs on chain.
		 */
		m = mp;
		if (m->m_flags & M_PKTHDR)
			m->m_pkthdr.len = count;
		for (; m; m = m->m_next) {
			if (m->m_len >= count) {
				m->m_len = count;
				if (m->m_next != NULL) {
					m_freem(m->m_next);
					m->m_next = NULL;
				}
				break;
			}
			count -= m->m_len;
		}
	}
}

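/*
 * Illustrative sketch (not part of this file): m_adj() trims from the
 * front with a positive count and from the tail with a negative one.
 * Assuming the usual net/ethernet.h constants:
 *
 *	m_adj(m, ETHER_HDR_LEN);	// strip a leading Ethernet header
 *	m_adj(m, -ETHER_CRC_LEN);	// strip a trailing CRC
 */
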
void
m_adj_decap(struct mbuf *mp, int len)
{
	uint8_t rsstype;

	m_adj(mp, len);
	if ((mp->m_flags & M_PKTHDR) != 0) {
		/*
		 * If flowid was calculated by card from the inner
		 * headers, move flowid to the decapsulated mbuf
		 * chain, otherwise clear.  This depends on the
		 * internals of m_adj, which keeps pkthdr as is, in
		 * particular not changing rsstype and flowid.
		 */
		rsstype = mp->m_pkthdr.rsstype;
		if ((rsstype & M_HASHTYPE_INNER) != 0) {
			M_HASHTYPE_SET(mp, rsstype & ~M_HASHTYPE_INNER);
		} else {
			M_HASHTYPE_CLEAR(mp);
		}
	}
}

/*
 * Rearrange an mbuf chain so that len bytes are contiguous
 * and in the data area of an mbuf (so that mtod will work
 * for a structure of size len).  Returns the resulting
 * mbuf chain on success, frees it and returns NULL on failure.
 * If there is room, it will add up to max_protohdr-len extra bytes to the
 * contiguous region in an attempt to avoid being called next time.
 */
struct mbuf *
m_pullup(struct mbuf *n, int len)
{
	struct mbuf *m;
	int count;
	int space;

	KASSERT((n->m_flags & M_EXTPG) == 0,
	    ("%s: unmapped mbuf %p", __func__, n));

	/*
	 * If first mbuf has no cluster, and has room for len bytes
	 * without shifting current data, pullup into it,
	 * otherwise allocate a new mbuf to prepend to the chain.
	 */
	if ((n->m_flags & M_EXT) == 0 &&
	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
		if (n->m_len >= len)
			return (n);
		m = n;
		n = n->m_next;
		len -= m->m_len;
	} else {
		if (len > MHLEN)
			goto bad;
		m = m_get(M_NOWAIT, n->m_type);
		if (m == NULL)
			goto bad;
		if (n->m_flags & M_PKTHDR)
			m_move_pkthdr(m, n);
	}
	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	do {
		count = min(min(max(len, max_protohdr), space), n->m_len);
		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
		    (u_int)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len)
			n->m_data += count;
		else
			n = m_free(n);
	} while (len > 0 && n);
	if (len > 0) {
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return (m);
bad:
	m_freem(n);
	return (NULL);
}

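/*
 * Illustrative sketch (not part of this file): the classic use is to make
 * a protocol header contiguous before casting, e.g. in an input path
 * (assuming a netinet/ip.h-style header):
 *
 *	if (m->m_len < sizeof(struct ip) &&
 *	    (m = m_pullup(m, sizeof(struct ip))) == NULL)
 *		return;			// chain already freed by m_pullup()
 *	ip = mtod(m, struct ip *);
 */
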
/*
 * Like m_pullup(), except a new mbuf is always allocated, and we allow
 * the amount of empty space before the data in the new mbuf to be specified
 * (in the event that the caller expects to prepend later).
 */
struct mbuf *
m_copyup(struct mbuf *n, int len, int dstoff)
{
	struct mbuf *m;
	int count, space;

	if (len > (MHLEN - dstoff))
		goto bad;
	m = m_get(M_NOWAIT, n->m_type);
	if (m == NULL)
		goto bad;
	if (n->m_flags & M_PKTHDR)
		m_move_pkthdr(m, n);
	m->m_data += dstoff;
	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	do {
		count = min(min(max(len, max_protohdr), space), n->m_len);
		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
		    (unsigned)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len)
			n->m_data += count;
		else
			n = m_free(n);
	} while (len > 0 && n);
	if (len > 0) {
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return (m);
bad:
	m_freem(n);
	return (NULL);
}

/*
 * Partition an mbuf chain in two pieces, returning the tail --
 * all but the first len0 bytes.  In case of failure, it returns NULL and
 * attempts to restore the chain to its original state.
 *
 * Note that the resulting mbufs might be read-only, because the new
 * mbuf can end up sharing an mbuf cluster with the original mbuf if
 * the "breaking point" happens to lie within a cluster mbuf.  Use the
 * M_WRITABLE() macro to check for this case.
 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
	struct mbuf *m, *n;
	u_int len = len0, remain;

	MBUF_CHECKSLEEP(wait);
	for (m = m0; m && len > m->m_len; m = m->m_next)
		len -= m->m_len;
	if (m == NULL)
		return (NULL);
	remain = m->m_len - len;
	if (m0->m_flags & M_PKTHDR && remain == 0) {
		n = m_gethdr(wait, m0->m_type);
		if (n == NULL)
			return (NULL);
		n->m_next = m->m_next;
		m->m_next = NULL;
		if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) {
			n->m_pkthdr.snd_tag =
			    m_snd_tag_ref(m0->m_pkthdr.snd_tag);
			n->m_pkthdr.csum_flags |= CSUM_SND_TAG;
		} else
			n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;
		return (n);
	} else if (m0->m_flags & M_PKTHDR) {
		n = m_gethdr(wait, m0->m_type);
		if (n == NULL)
			return (NULL);
		if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) {
			n->m_pkthdr.snd_tag =
			    m_snd_tag_ref(m0->m_pkthdr.snd_tag);
			n->m_pkthdr.csum_flags |= CSUM_SND_TAG;
		} else
			n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;
		if (m->m_flags & (M_EXT | M_EXTPG))
			goto extpacket;
		if (remain > MHLEN) {
			/* m can't be the lead packet */
			M_ALIGN(n, 0);
			n->m_next = m_split(m, len, wait);
			if (n->m_next == NULL) {
				(void) m_free(n);
				return (NULL);
			} else {
				n->m_len = 0;
				return (n);
			}
		} else
			M_ALIGN(n, remain);
	} else if (remain == 0) {
		n = m->m_next;
		m->m_next = NULL;
		return (n);
	} else {
		n = m_get(wait, m->m_type);
		if (n == NULL)
			return (NULL);
		M_ALIGN(n, remain);
	}
extpacket:
	if (m->m_flags & (M_EXT | M_EXTPG)) {
		n->m_data = m->m_data + len;
		mb_dupcl(n, m);
	} else {
		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
	}
	n->m_len = remain;
	m->m_len = len;
	n->m_next = m->m_next;
	m->m_next = NULL;
	return (n);
}

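/*
 * Illustrative sketch (not part of this file): splitting a packet after
 * its first len0 bytes, e.g. to separate a record header from payload:
 *
 *	struct mbuf *tail;
 *
 *	tail = m_split(m, len0, M_NOWAIT);
 *	if (tail == NULL)
 *		return (ENOBUFS);	// "m" restored to its original state
 *	// "m" now holds len0 bytes; "tail" holds the remainder and may
 *	// share a cluster with "m" (check M_WRITABLE()).
 */
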
/*
 * Partition mchain in two pieces, keeping len0 bytes in head and transferring
 * remainder to tail.  In case of failure, both chains are left untouched.
 * M_EOR is observed correctly.
 * Resulting mbufs might be read-only.
 */
int
mc_split(struct mchain *head, struct mchain *tail, u_int len0, int wait)
{
	struct mbuf *m, *n;
	u_int len, mlen, remain;

	MPASS(!(mc_first(head)->m_flags & M_PKTHDR));
	MBUF_CHECKSLEEP(wait);

	mlen = 0;
	len = len0;
	STAILQ_FOREACH(m, &head->mc_q, m_stailq) {
		mlen += MSIZE;
		if (m->m_flags & M_EXT)
			mlen += m->m_ext.ext_size;
		if (len > m->m_len)
			len -= m->m_len;
		else
			break;
	}
	if (__predict_false(m == NULL)) {
		*tail = MCHAIN_INITIALIZER(tail);
		return (0);
	}
	remain = m->m_len - len;
	if (remain > 0) {
		if (__predict_false((n = m_get(wait, m->m_type)) == NULL))
			return (ENOMEM);
		m_align(n, remain);
		if (m->m_flags & M_EXT) {
			n->m_data = m->m_data + len;
			mb_dupcl(n, m);
		} else
			bcopy(mtod(m, char *) + len, mtod(n, char *), remain);
	}

	/* XXXGL: need STAILQ_SPLIT */
	STAILQ_FIRST(&tail->mc_q) = STAILQ_NEXT(m, m_stailq);
	tail->mc_q.stqh_last = head->mc_q.stqh_last;
	tail->mc_len = head->mc_len - len0;
	tail->mc_mlen = head->mc_mlen - mlen;
	if (remain > 0) {
		MPASS(n->m_len == 0);
		mc_prepend(tail, n);
		n->m_len = remain;
		m->m_len -= remain;
		if (m->m_flags & M_EOR) {
			m->m_flags &= ~M_EOR;
			n->m_flags |= M_EOR;
		}
	}
	head->mc_q.stqh_last = &STAILQ_NEXT(m, m_stailq);
	STAILQ_NEXT(m, m_stailq) = NULL;
	head->mc_len = len0;
	head->mc_mlen = mlen;

	return (0);
}

/*
 * Routine to copy from device local memory into mbufs.
 * Note that the `off' argument is the offset into the first mbuf of the
 * target chain at which to begin placing the data.
 */
struct mbuf *
m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
    void (*copy)(char *from, caddr_t to, u_int len))
{
	struct mbuf *m;
	struct mbuf *top = NULL, **mp = &top;
	int len;

	if (off < 0 || off > MHLEN)
		return (NULL);

	while (totlen > 0) {
		if (top == NULL) {	/* First one, must be PKTHDR */
			if (totlen + off >= MINCLSIZE) {
				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
				len = MCLBYTES;
			} else {
				m = m_gethdr(M_NOWAIT, MT_DATA);
				len = MHLEN;

				/* Place initial small packet/header at end of mbuf */
				if (m && totlen + off + max_linkhdr <= MHLEN) {
					m->m_data += max_linkhdr;
					len -= max_linkhdr;
				}
			}
			if (m == NULL)
				return (NULL);
			m->m_pkthdr.rcvif = ifp;
			m->m_pkthdr.len = totlen;
		} else {
			if (totlen + off >= MINCLSIZE) {
				m = m_getcl(M_NOWAIT, MT_DATA, 0);
				len = MCLBYTES;
			} else {
				m = m_get(M_NOWAIT, MT_DATA);
				len = MLEN;
			}
			if (m == NULL) {
				m_freem(top);
				return (NULL);
			}
		}
		if (off) {
			m->m_data += off;
			len -= off;
			off = 0;
		}
		m->m_len = len = min(totlen, len);
		if (copy)
			copy(buf, mtod(m, caddr_t), (u_int)len);
		else
			bcopy(buf, mtod(m, caddr_t), (u_int)len);
		buf += len;
		*mp = m;
		mp = &m->m_next;
		totlen -= len;
	}
	return (top);
}

static void
m_copytounmapped(const struct mbuf *m, int off, int len, c_caddr_t cp)
{
	struct iovec iov;
	struct uio uio;
	int error __diagused;

	KASSERT(off >= 0, ("m_copytounmapped: negative off %d", off));
	KASSERT(len >= 0, ("m_copytounmapped: negative len %d", len));
	KASSERT(off < m->m_len, ("m_copytounmapped: len exceeds mbuf length"));
	iov.iov_base = __DECONST(caddr_t, cp);
	iov.iov_len = len;
	uio.uio_resid = len;
	uio.uio_iov = &iov;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_rw = UIO_WRITE;
	error = m_unmapped_uiomove(m, off, &uio, len);
	KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off,
	    len));
}

/*
 * Copy data from a buffer back into the indicated mbuf chain,
 * starting "off" bytes from the beginning, extending the mbuf
 * chain if necessary.
 */
void
m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
{
	int mlen;
	struct mbuf *m = m0, *n;
	int totlen = 0;

	if (m0 == NULL)
		return;
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			n = m_get(M_NOWAIT, m->m_type);
			if (n == NULL)
				goto out;
			bzero(mtod(n, caddr_t), MLEN);
			n->m_len = min(MLEN, len + off);
			m->m_next = n;
		}
		m = m->m_next;
	}
	while (len > 0) {
		if (m->m_next == NULL && (len > m->m_len - off)) {
			m->m_len += min(len - (m->m_len - off),
			    M_TRAILINGSPACE(m));
		}
		mlen = min(m->m_len - off, len);
		if ((m->m_flags & M_EXTPG) != 0)
			m_copytounmapped(m, off, mlen, cp);
		else
			bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen);
		cp += mlen;
		len -= mlen;
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0)
			break;
		if (m->m_next == NULL) {
			n = m_get(M_NOWAIT, m->m_type);
			if (n == NULL)
				break;
			n->m_len = min(MLEN, len);
			m->m_next = n;
		}
		m = m->m_next;
	}
out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
		m->m_pkthdr.len = totlen;
}

/*
 * Append the specified data to the indicated mbuf chain.
 * Extend the mbuf chain if the new data does not fit in
 * existing space.
 *
 * Return 1 if able to complete the job; otherwise 0.
 */
int
m_append(struct mbuf *m0, int len, c_caddr_t cp)
{
	struct mbuf *m, *n;
	int remainder, space;

	for (m = m0; m->m_next != NULL; m = m->m_next)
		;
	remainder = len;
	space = M_TRAILINGSPACE(m);
	if (space > 0) {
		/*
		 * Copy into available space.
		 */
		if (space > remainder)
			space = remainder;
		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
		m->m_len += space;
		cp += space, remainder -= space;
	}
	while (remainder > 0) {
		/*
		 * Allocate a new mbuf; could check space
		 * and allocate a cluster instead.
		 */
		n = m_get(M_NOWAIT, m->m_type);
		if (n == NULL)
			break;
		n->m_len = min(MLEN, remainder);
		bcopy(cp, mtod(n, caddr_t), n->m_len);
		cp += n->m_len, remainder -= n->m_len;
		m->m_next = n;
		m = n;
	}
	if (m0->m_flags & M_PKTHDR)
		m0->m_pkthdr.len += len - remainder;
	return (remainder == 0);
}

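/*
 * Illustrative sketch (not part of this file): appending a small,
 * hypothetical "trailer" structure to a packet.  On failure the chain
 * keeps whatever portion did fit, so callers usually just free it:
 *
 *	if (!m_append(m, sizeof(trailer), (c_caddr_t)&trailer)) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 */
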
static int
m_apply_extpg_one(struct mbuf *m, int off, int len,
    int (*f)(void *, void *, u_int), void *arg)
{
	void *p;
	u_int i, count, pgoff, pglen;
	int rval;

	KASSERT(PMAP_HAS_DMAP,
	    ("m_apply_extpg_one does not support unmapped mbufs"));
	off += mtod(m, vm_offset_t);
	if (off < m->m_epg_hdrlen) {
		count = min(m->m_epg_hdrlen - off, len);
		rval = f(arg, m->m_epg_hdr + off, count);
		if (rval)
			return (rval);
		len -= count;
		off = 0;
	} else
		off -= m->m_epg_hdrlen;
	pgoff = m->m_epg_1st_off;
	for (i = 0; i < m->m_epg_npgs && len > 0; i++) {
		pglen = m_epg_pagelen(m, i, pgoff);
		if (off < pglen) {
			count = min(pglen - off, len);
			p = (void *)PHYS_TO_DMAP(m->m_epg_pa[i] + pgoff + off);
			rval = f(arg, p, count);
			if (rval)
				return (rval);
			len -= count;
			off = 0;
		} else
			off -= pglen;
		pgoff = 0;
	}
	if (len > 0) {
		KASSERT(off < m->m_epg_trllen,
		    ("m_apply_extpg_one: offset beyond trailer"));
		KASSERT(len <= m->m_epg_trllen - off,
		    ("m_apply_extpg_one: length beyond trailer"));
		return (f(arg, m->m_epg_trail + off, len));
	}
	return (0);
}

/* Apply function f to the data in a single mbuf. */
static int
m_apply_one(struct mbuf *m, int off, int len,
    int (*f)(void *, void *, u_int), void *arg)
{
	if ((m->m_flags & M_EXTPG) != 0)
		return (m_apply_extpg_one(m, off, len, f, arg));
	else
		return (f(arg, mtod(m, caddr_t) + off, len));
}

/*
 * Apply function f to the data in an mbuf chain starting "off" bytes from
 * the beginning, continuing for "len" bytes.
 */
int
m_apply(struct mbuf *m, int off, int len,
    int (*f)(void *, void *, u_int), void *arg)
{
	u_int count;
	int rval;

	KASSERT(off >= 0, ("m_apply, negative off %d", off));
	KASSERT(len >= 0, ("m_apply, negative len %d", len));
	while (off > 0) {
		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain "
		    "(%d extra)", off));
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	while (len > 0) {
		KASSERT(m != NULL, ("m_apply, length > size of mbuf chain "
		    "(%d extra)", len));
		count = min(m->m_len - off, len);
		rval = m_apply_one(m, off, count, f, arg);
		if (rval)
			return (rval);
		len -= count;
		off = 0;
		m = m->m_next;
	}
	return (0);
}

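/*
 * Illustrative sketch (not part of this file): m_apply() visits each
 * contiguous region exactly once, which suits incremental digests.  The
 * callback below is hypothetical:
 *
 *	static int
 *	sum_cb(void *arg, void *data, u_int len)
 *	{
 *		uint32_t *sum = arg;
 *		u_char *p = data;
 *
 *		while (len-- > 0)
 *			*sum += *p++;
 *		return (0);		// nonzero would abort the walk
 *	}
 *
 *	uint32_t sum = 0;
 *	error = m_apply(m, 0, m->m_pkthdr.len, sum_cb, &sum);
 */
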
/*
 * Return a pointer to mbuf/offset of location in mbuf chain.
 */
struct mbuf *
m_getptr(struct mbuf *m, int loc, int *off)
{

	while (loc >= 0) {
		/* Normal end of search. */
		if (m->m_len > loc) {
			*off = loc;
			return (m);
		} else {
			loc -= m->m_len;
			if (m->m_next == NULL) {
				if (loc == 0) {
					/* Point at the end of valid data. */
					*off = m->m_len;
					return (m);
				}
				return (NULL);
			}
			m = m->m_next;
		}
	}
	return (NULL);
}

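/*
 * Illustrative sketch (not part of this file): translating a byte offset
 * within a chain into an (mbuf, offset) pair; "loc" and "byte" are
 * placeholders:
 *
 *	int off;
 *	struct mbuf *n;
 *
 *	n = m_getptr(m, loc, &off);
 *	if (n == NULL)
 *		return (EINVAL);	// "loc" is beyond the chain
 *	byte = mtod(n, u_char *)[off];
 */
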
void
m_print(const struct mbuf *m, int maxlen)
{
	int len;
	int pdata;
	const struct mbuf *m2;

	if (m == NULL) {
		printf("mbuf: %p\n", m);
		return;
	}

	if (m->m_flags & M_PKTHDR)
		len = m->m_pkthdr.len;
	else
		len = -1;
	m2 = m;
	while (m2 != NULL && (len == -1 || len)) {
		pdata = m2->m_len;
		if (maxlen != -1 && pdata > maxlen)
			pdata = maxlen;
		printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len,
		    m2->m_next, m2->m_flags, "\20\20freelist\17skipfw"
		    "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly"
		    "\3eor\2pkthdr\1ext", pdata ? "" : "\n");
		if (pdata)
			printf(", %*D\n", pdata, (u_char *)m2->m_data, "-");
		if (len != -1)
			len -= m2->m_len;
		m2 = m2->m_next;
	}
	if (len > 0)
		printf("%d bytes unaccounted for.\n", len);
	return;
}

u_int
m_fixhdr(struct mbuf *m0)
{
	u_int len;

	len = m_length(m0, NULL);
	m0->m_pkthdr.len = len;
	return (len);
}

u_int
m_length(struct mbuf *m0, struct mbuf **last)
{
	struct mbuf *m;
	u_int len;

	len = 0;
	for (m = m0; m != NULL; m = m->m_next) {
		len += m->m_len;
		if (m->m_next == NULL)
			break;
	}
	if (last != NULL)
		*last = m;
	return (len);
}

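/*
 * Illustrative sketch (not part of this file): m_length() walks the whole
 * chain, so cache the result; m_fixhdr() uses it to repair a stale
 * m_pkthdr.len after ad-hoc edits:
 *
 *	struct mbuf *last;
 *	u_int len;
 *
 *	len = m_length(m, &last);	// "last" points at the tail mbuf
 *	if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.len != len)
 *		m_fixhdr(m);
 */
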
/*
 * Defragment an mbuf chain, returning the shortest possible
 * chain of mbufs and clusters.  If allocation fails and
 * this cannot be completed, NULL will be returned, but
 * the passed in chain will be unchanged.  Upon success,
 * the original chain will be freed, and the new chain
 * will be returned.
 *
 * If a non-packet header is passed in, the original
 * mbuf (chain?) will be returned unharmed.
 */
struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
	struct mbuf *m_new = NULL, *m_final = NULL;
	int progress = 0, length;

	MBUF_CHECKSLEEP(how);
	if (!(m0->m_flags & M_PKTHDR))
		return (m0);

	m_fixhdr(m0); /* Needed sanity check */

#ifdef MBUF_STRESS_TEST
	if (m_defragrandomfailures) {
		int temp = arc4random() & 0xff;
		if (temp == 0xba)
			goto nospace;
	}
#endif

	if (m0->m_pkthdr.len > MHLEN)
		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
	else
		m_final = m_gethdr(how, MT_DATA);

	if (m_final == NULL)
		goto nospace;

	if (m_dup_pkthdr(m_final, m0, how) == 0)
		goto nospace;

	m_new = m_final;

	while (progress < m0->m_pkthdr.len) {
		length = m0->m_pkthdr.len - progress;
		if (length > MCLBYTES)
			length = MCLBYTES;

		if (m_new == NULL) {
			if (length > MLEN)
				m_new = m_getcl(how, MT_DATA, 0);
			else
				m_new = m_get(how, MT_DATA);
			if (m_new == NULL)
				goto nospace;
		}

		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
		progress += length;
		m_new->m_len = length;
		if (m_new != m_final)
			m_cat(m_final, m_new);
		m_new = NULL;
	}
#ifdef MBUF_STRESS_TEST
	if (m0->m_next == NULL)
		m_defraguseless++;
#endif
	m_freem(m0);
	m0 = m_final;
#ifdef MBUF_STRESS_TEST
	m_defragpackets++;
	m_defragbytes += m0->m_pkthdr.len;
#endif
	return (m0);
nospace:
#ifdef MBUF_STRESS_TEST
	m_defragfailure++;
#endif
	if (m_final)
		m_freem(m_final);
	return (NULL);
}

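/*
 * Illustrative sketch (not part of this file): a common driver transmit
 * pattern when a chain has more segments than the DMA engine allows:
 *
 *	struct mbuf *d;
 *
 *	d = m_defrag(m, M_NOWAIT);
 *	if (d == NULL) {
 *		m_freem(m);	// policy choice; "m" itself is unchanged
 *		return (ENOBUFS);
 *	}
 *	m = d;			// old chain already freed by m_defrag()
 */
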
/*
 * Return the number of fragments an mbuf will use.  This is usually
 * used as a proxy for the number of scatter/gather elements needed by
 * a DMA engine to access an mbuf.  In general mapped mbufs are
 * assumed to be backed by physically contiguous buffers that only
 * need a single fragment.  Unmapped mbufs, on the other hand, can
 * span disjoint physical pages.
 */
static int
frags_per_mbuf(struct mbuf *m)
{
	int frags;

	if ((m->m_flags & M_EXTPG) == 0)
		return (1);

	/*
	 * The header and trailer are counted as a single fragment
	 * each when present.
	 *
	 * XXX: This overestimates the number of fragments by assuming
	 * all the backing physical pages are disjoint.
	 */
	frags = 0;
	if (m->m_epg_hdrlen != 0)
		frags++;
	frags += m->m_epg_npgs;
	if (m->m_epg_trllen != 0)
		frags++;

	return (frags);
}

/*
 * Defragment an mbuf chain, returning at most maxfrags separate
 * mbufs+clusters.  If this is not possible NULL is returned and
 * the original mbuf chain is left in its present (potentially
 * modified) state.  We use two techniques: collapsing consecutive
 * mbufs and replacing consecutive mbufs by a cluster.
 *
 * NB: this should really be named m_defrag but that name is taken
 */
struct mbuf *
m_collapse(struct mbuf *m0, int how, int maxfrags)
{
	struct mbuf *m, *n, *n2, **prev;
	u_int curfrags;

	/*
	 * Calculate the current number of frags.
	 */
	curfrags = 0;
	for (m = m0; m != NULL; m = m->m_next)
		curfrags += frags_per_mbuf(m);
	/*
	 * First, try to collapse mbufs.  Note that we always collapse
	 * towards the front so we don't need to deal with moving the
	 * pkthdr.  This may be suboptimal if the first mbuf has much
	 * less data than the following.
	 */
	m = m0;
again:
	for (;;) {
		n = m->m_next;
		if (n == NULL)
			break;
		if (M_WRITABLE(m) &&
		    n->m_len < M_TRAILINGSPACE(m)) {
			m_copydata(n, 0, n->m_len,
			    mtod(m, char *) + m->m_len);
			m->m_len += n->m_len;
			m->m_next = n->m_next;
			curfrags -= frags_per_mbuf(n);
			m_free(n);
			if (curfrags <= maxfrags)
				return (m0);
		} else
			m = n;
	}
	KASSERT(maxfrags > 1,
	    ("maxfrags %u, but normal collapse failed", maxfrags));
	/*
	 * Collapse consecutive mbufs to a cluster.
	 */
	prev = &m0->m_next;		/* NB: not the first mbuf */
	while ((n = *prev) != NULL) {
		if ((n2 = n->m_next) != NULL &&
		    n->m_len + n2->m_len < MCLBYTES) {
			m = m_getcl(how, MT_DATA, 0);
			if (m == NULL)
				goto bad;
			m_copydata(n, 0, n->m_len, mtod(m, char *));
			m_copydata(n2, 0, n2->m_len,
			    mtod(m, char *) + n->m_len);
			m->m_len = n->m_len + n2->m_len;
			m->m_next = n2->m_next;
			*prev = m;
			curfrags += 1;	/* For the new cluster */
			curfrags -= frags_per_mbuf(n);
			curfrags -= frags_per_mbuf(n2);
			m_free(n);
			m_free(n2);
			if (curfrags <= maxfrags)
				return (m0);
			/*
			 * Still not there, try the normal collapse
			 * again before we allocate another cluster.
			 */
			goto again;
		}
		prev = &n->m_next;
	}
	/*
	 * No place where we can collapse to a cluster; punt.
	 * This can occur if, for example, you request 2 frags
	 * but the packet requires that both be clusters (we
	 * never reallocate the first mbuf to avoid moving the
	 * packet header).
	 */
bad:
	return (NULL);
}

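/*
 * Illustrative sketch (not part of this file): unlike m_defrag(),
 * m_collapse() stops as soon as the chain fits in "maxfrags" segments;
 * HYPOTHETICAL_NSEGS stands in for a driver's DMA segment limit:
 *
 *	m2 = m_collapse(m, M_NOWAIT, HYPOTHETICAL_NSEGS);
 *	if (m2 == NULL)
 *		// "m" may have been partially collapsed but is still valid
 *		return (EFBIG);
 *	m = m2;
 */
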
#ifdef MBUF_STRESS_TEST

/*
 * Fragment an mbuf chain.  There's no reason you'd ever want to do
 * this in normal usage, but it's great for stress testing various
 * mbuf consumers.
 *
 * If fragmentation is not possible, the original chain will be
 * returned.
 *
 * Possible length values:
 * 0	no fragmentation will occur
 * > 0	each fragment will be of the specified length
 * -1	each fragment will be the same random value in length
 * -2	each fragment's length will be entirely random
 * (Random values range from 1 to 256)
 */
struct mbuf *
m_fragment(struct mbuf *m0, int how, int length)
{
	struct mbuf *m_first, *m_last;
	int divisor = 255, progress = 0, fraglen;

	if (!(m0->m_flags & M_PKTHDR))
		return (m0);

	if (length == 0 || length < -2)
		return (m0);
	if (length > MCLBYTES)
		length = MCLBYTES;
	if (length < 0 && divisor > MCLBYTES)
		divisor = MCLBYTES;
	if (length == -1)
		length = 1 + (arc4random() % divisor);
	if (length > 0)
		fraglen = length;

	m_fixhdr(m0); /* Needed sanity check */

	m_first = m_getcl(how, MT_DATA, M_PKTHDR);
	if (m_first == NULL)
		goto nospace;

	if (m_dup_pkthdr(m_first, m0, how) == 0)
		goto nospace;

	m_last = m_first;

	while (progress < m0->m_pkthdr.len) {
		if (length == -2)
			fraglen = 1 + (arc4random() % divisor);
		if (fraglen > m0->m_pkthdr.len - progress)
			fraglen = m0->m_pkthdr.len - progress;

		if (progress != 0) {
			struct mbuf *m_new = m_getcl(how, MT_DATA, 0);
			if (m_new == NULL)
				goto nospace;

			m_last->m_next = m_new;
			m_last = m_new;
		}

		m_copydata(m0, progress, fraglen, mtod(m_last, caddr_t));
		progress += fraglen;
		m_last->m_len = fraglen;
	}
	m_freem(m0);
	m0 = m_first;
	return (m0);
nospace:
	if (m_first)
		m_freem(m_first);
	/* Return the original chain on failure */
	return (m0);
}

#endif

/*
 * Free pages from mbuf_ext_pgs, assuming they were allocated via
 * vm_page_alloc() and aren't associated with any object.  Complement
 * to allocator from m_uiotombuf_nomap().
 */
void
mb_free_mext_pgs(struct mbuf *m)
{
	vm_page_t pg;

	M_ASSERTEXTPG(m);
	for (int i = 0; i < m->m_epg_npgs; i++) {
		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
		vm_page_unwire_noq(pg);
		vm_page_free(pg);
	}
}

static struct mbuf *
m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags)
{
	struct mbuf *m, *mb, *prev;
	vm_page_t pg_array[MBUF_PEXT_MAX_PGS];
	int error, length, i, needed;
	ssize_t total;
	int pflags = malloc2vm_flags(how) | VM_ALLOC_NODUMP | VM_ALLOC_WIRED;

	MPASS((flags & M_PKTHDR) == 0);
	MPASS((how & M_ZERO) == 0);

	/*
	 * len can be zero or an arbitrary large value bound by
	 * the total data supplied by the uio.
	 */
	if (len > 0)
		total = MIN(uio->uio_resid, len);
	else
		total = uio->uio_resid;

	if (maxseg == 0)
		maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE;

	/*
	 * If total is zero, return an empty mbuf.  This can occur
	 * for TLS 1.0 connections which send empty fragments as
	 * a countermeasure against the known-IV weakness in CBC
	 * ciphersuites.
	 */
	if (__predict_false(total == 0)) {
		mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs, 0);
		if (mb == NULL)
			return (NULL);
		mb->m_epg_flags = EPG_FLAG_ANON;
		return (mb);
	}

	/*
	 * Allocate the pages
	 */
	m = NULL;
	while (total > 0) {
		mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs, 0);
		if (mb == NULL)
			goto failed;
		if (m == NULL)
			m = mb;
		else
			prev->m_next = mb;
		prev = mb;
		mb->m_epg_flags = EPG_FLAG_ANON;
		needed = length = MIN(maxseg, total);
		for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) {
retry_page:
			pg_array[i] = vm_page_alloc_noobj(pflags);
			if (pg_array[i] == NULL) {
				if (how & M_NOWAIT) {
					goto failed;
				} else {
					vm_wait(NULL);
					goto retry_page;
				}
			}
			mb->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg_array[i]);
			mb->m_epg_npgs++;
		}
		mb->m_epg_last_len = length - PAGE_SIZE * (mb->m_epg_npgs - 1);
		MBUF_EXT_PGS_ASSERT_SANITY(mb);
		total -= length;
		error = uiomove_fromphys(pg_array, 0, length, uio);
		if (error != 0)
			goto failed;
		mb->m_len = length;
		mb->m_ext.ext_size += PAGE_SIZE * mb->m_epg_npgs;
		if (flags & M_PKTHDR)
			m->m_pkthdr.len += length;
	}
	return (m);

failed:
	m_freem(m);
	return (NULL);
}

/*
 * Copy the contents of uio into a properly sized mbuf chain.
 * This is a compatibility KPI; users are encouraged to call the backing
 * functions directly.
 */
struct mbuf *
m_uiotombuf(struct uio *uio, int how, int len, int lspace, int flags)
{

	if (flags & M_EXTPG) {
		/* XXX: 'lspace' magically becomes maxseg! */
		return (m_uiotombuf_nomap(uio, how, len, lspace, flags));
	} else if (__predict_false(uio->uio_resid == 0)) {
		struct mbuf *m;

		/*
		 * m_uiotombuf() is known to return a zero-length buffer;
		 * keep this compatibility.  mc_uiotomc() won't do that.
		 */
		if (flags & M_PKTHDR) {
			m = m_gethdr(how, MT_DATA);
			if (m != NULL)
				m->m_pkthdr.memlen = MSIZE;
		} else
			m = m_get(how, MT_DATA);
		if (m != NULL)
			m->m_data += lspace;
		return (m);
	} else {
		struct mchain mc;
		int error;

		error = mc_uiotomc(&mc, uio, len, lspace, how, flags);
		if (__predict_true(error == 0)) {
			if (flags & M_PKTHDR) {
				mc_first(&mc)->m_pkthdr.len = mc.mc_len;
				mc_first(&mc)->m_pkthdr.memlen = mc.mc_mlen;
			}
			return (mc_first(&mc));
		} else
			return (NULL);
	}
}

1987 /*
1988 * Copy the contents of uio into a properly sized mbuf chain.
1989 * In case of failure state of mchain is inconsistent.
1990 * @param length Limit copyout length. If 0 entire uio_resid is copied.
1991 * @param lspace Provide leading space in the first mbuf in the chain.
1992 */
1993 int
mc_uiotomc(struct mchain * mc,struct uio * uio,u_int length,u_int lspace,int how,int flags)1994 mc_uiotomc(struct mchain *mc, struct uio *uio, u_int length, u_int lspace,
1995 int how, int flags)
1996 {
1997 struct mbuf *mb;
1998 u_int total;
1999 int error;
2000
2001 MPASS(lspace < MHLEN);
2002 MPASS(UINT_MAX - lspace >= length);
2003 MPASS(uio->uio_rw == UIO_WRITE);
2004 MPASS(uio->uio_resid >= 0);
2005
2006 if (length > 0) {
2007 if (uio->uio_resid > length) {
2008 total = length;
2009 flags &= ~M_EOR;
2010 } else
2011 total = uio->uio_resid;
2012 } else if (__predict_false(uio->uio_resid + lspace > UINT_MAX))
2013 return (EOVERFLOW);
2014 else
2015 total = uio->uio_resid;
2016
2017 if (__predict_false(total + lspace == 0)) {
2018 *mc = MCHAIN_INITIALIZER(mc);
2019 return (0);
2020 }
2021
2022 error = mc_get(mc, total + lspace, how, MT_DATA, flags);
2023 if (__predict_false(error))
2024 return (error);
2025 mc_first(mc)->m_data += lspace;
2026
2027 /* Fill all mbufs with uio data and update header information. */
2028 STAILQ_FOREACH(mb, &mc->mc_q, m_stailq) {
2029 u_int mlen;
2030
2031 mlen = min(M_TRAILINGSPACE(mb), total - mc->mc_len);
2032 error = uiomove(mtod(mb, void *), mlen, uio);
2033 if (__predict_false(error)) {
2034 mc_freem(mc);
2035 return (error);
2036 }
2037 mb->m_len = mlen;
2038 mc->mc_len += mlen;
2039 }
2040 MPASS(mc->mc_len == total);
2041
2042 return (0);
2043 }
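
/*
 * A minimal sketch of calling the backing function directly, as
 * recommended in the comment on m_uiotombuf() above.  On success the
 * head of the chain and the number of bytes copied are available from
 * the mchain; no packet header is set up here:
 *
 *	struct mchain mc;
 *	struct mbuf *m;
 *	int error;
 *
 *	error = mc_uiotomc(&mc, uio, 0, 0, M_WAITOK, 0);
 *	if (error != 0)
 *		return (error);
 *	m = mc_first(&mc);	(mc.mc_len bytes were copied)
 */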
2044
2045 /*
2046 * Copy data between an unmapped mbuf and a uio, limited by len if set.
2047 */
2048 int
2049 m_unmapped_uiomove(const struct mbuf *m, int m_off, struct uio *uio, int len)
2050 {
2051 vm_page_t pg;
2052 int error, i, off, pglen, pgoff, seglen, segoff;
2053
2054 M_ASSERTEXTPG(m);
2055 error = 0;
2056
2057 /* Skip over any data removed from the front. */
2058 off = mtod(m, vm_offset_t);
2059
2060 off += m_off;
2061 if (m->m_epg_hdrlen != 0) {
2062 if (off >= m->m_epg_hdrlen) {
2063 off -= m->m_epg_hdrlen;
2064 } else {
2065 seglen = m->m_epg_hdrlen - off;
2066 segoff = off;
2067 seglen = min(seglen, len);
2068 off = 0;
2069 len -= seglen;
2070 error = uiomove(__DECONST(void *,
2071 &m->m_epg_hdr[segoff]), seglen, uio);
2072 }
2073 }
2074 pgoff = m->m_epg_1st_off;
2075 for (i = 0; i < m->m_epg_npgs && error == 0 && len > 0; i++) {
2076 pglen = m_epg_pagelen(m, i, pgoff);
2077 if (off >= pglen) {
2078 off -= pglen;
2079 pgoff = 0;
2080 continue;
2081 }
2082 seglen = pglen - off;
2083 segoff = pgoff + off;
2084 off = 0;
2085 seglen = min(seglen, len);
2086 len -= seglen;
2087 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
2088 error = uiomove_fromphys(&pg, segoff, seglen, uio);
2089 pgoff = 0;
2090	}
2091 if (len != 0 && error == 0) {
2092 KASSERT((off + len) <= m->m_epg_trllen,
2093 ("off + len > trail (%d + %d > %d, m_off = %d)", off, len,
2094 m->m_epg_trllen, m_off));
2095 error = uiomove(__DECONST(void *, &m->m_epg_trail[off]),
2096 len, uio);
2097 }
2098 return (error);
2099 }
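
/*
 * A minimal usage sketch, mirroring how m_mbuftouio() below calls this
 * function: copy an entire unmapped mbuf, starting at offset 0, into a
 * uio describing the destination buffer:
 *
 *	M_ASSERTEXTPG(m);
 *	error = m_unmapped_uiomove(m, 0, uio, m->m_len);
 */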
2100
2101 /*
2102 * Copy an mbuf chain into a uio limited by len if set.
2103 */
2104 int
2105 m_mbuftouio(struct uio *uio, const struct mbuf *m, int len)
2106 {
2107 int error, length, total;
2108 int progress = 0;
2109
2110 if (len > 0)
2111 total = min(uio->uio_resid, len);
2112 else
2113 total = uio->uio_resid;
2114
2115 /* Fill the uio with data from the mbufs. */
2116 for (; m != NULL; m = m->m_next) {
2117 length = min(m->m_len, total - progress);
2118
2119 if ((m->m_flags & M_EXTPG) != 0)
2120 error = m_unmapped_uiomove(m, 0, uio, length);
2121 else
2122 error = uiomove(mtod(m, void *), length, uio);
2123 if (error)
2124 return (error);
2125
2126 progress += length;
2127 }
2128
2129 return (0);
2130 }
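
/*
 * A minimal usage sketch, assuming a receive-style caller draining an
 * mbuf chain into a user buffer; len == 0 copies up to uio_resid:
 *
 *	error = m_mbuftouio(uio, m, 0);
 *	if (error != 0)
 *		return (error);
 */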
2131
2132 /*
2133 * Create a writable copy of the mbuf chain. While doing this
2134 * we compact the chain with a goal of producing a chain with
2135 * at most two mbufs. The second mbuf in this chain is likely
2136 * to be a cluster. The primary purpose of this work is to create
2137 * a writable packet for encryption, compression, etc. The
2138 * secondary goal is to linearize the data so the data can be
2139 * passed to crypto hardware in the most efficient manner possible.
2140 */
2141 struct mbuf *
2142 m_unshare(struct mbuf *m0, int how)
2143 {
2144 struct mbuf *m, *mprev;
2145 struct mbuf *n, *mfirst, *mlast;
2146 int len, off;
2147
2148 mprev = NULL;
2149 for (m = m0; m != NULL; m = mprev->m_next) {
2150 /*
2151		 * Regular mbufs are ignored unless there's a cluster
2152		 * in front of them that we can use to coalesce. We do
2153 * the latter mainly so later clusters can be coalesced
2154 * also w/o having to handle them specially (i.e. convert
2155 * mbuf+cluster -> cluster). This optimization is heavily
2156 * influenced by the assumption that we're running over
2157 * Ethernet where MCLBYTES is large enough that the max
2158 * packet size will permit lots of coalescing into a
2159 * single cluster. This in turn permits efficient
2160 * crypto operations, especially when using hardware.
2161 */
2162 if ((m->m_flags & M_EXT) == 0) {
2163 if (mprev && (mprev->m_flags & M_EXT) &&
2164 m->m_len <= M_TRAILINGSPACE(mprev)) {
2165 /* XXX: this ignores mbuf types */
2166 memcpy(mtod(mprev, caddr_t) + mprev->m_len,
2167 mtod(m, caddr_t), m->m_len);
2168 mprev->m_len += m->m_len;
2169 mprev->m_next = m->m_next; /* unlink from chain */
2170 m_free(m); /* reclaim mbuf */
2171 } else {
2172 mprev = m;
2173 }
2174 continue;
2175 }
2176 /*
2177 * Writable mbufs are left alone (for now).
2178 */
2179 if (M_WRITABLE(m)) {
2180 mprev = m;
2181 continue;
2182 }
2183
2184 /*
2185 * Not writable, replace with a copy or coalesce with
2186 * the previous mbuf if possible (since we have to copy
2187 * it anyway, we try to reduce the number of mbufs and
2188 * clusters so that future work is easier).
2189 */
2190 KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
2191 /* NB: we only coalesce into a cluster or larger */
2192 if (mprev != NULL && (mprev->m_flags & M_EXT) &&
2193 m->m_len <= M_TRAILINGSPACE(mprev)) {
2194 /* XXX: this ignores mbuf types */
2195 memcpy(mtod(mprev, caddr_t) + mprev->m_len,
2196 mtod(m, caddr_t), m->m_len);
2197 mprev->m_len += m->m_len;
2198 mprev->m_next = m->m_next; /* unlink from chain */
2199 m_free(m); /* reclaim mbuf */
2200 continue;
2201 }
2202
2203 /*
2204 * Allocate new space to hold the copy and copy the data.
2205 * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by
2206 * splitting them into clusters. We could just malloc a
2207 * buffer and make it external but too many device drivers
2208 * don't know how to break up the non-contiguous memory when
2209 * doing DMA.
2210 */
2211 n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
2212 if (n == NULL) {
2213 m_freem(m0);
2214 return (NULL);
2215 }
2216 if (m->m_flags & M_PKTHDR) {
2217 KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR",
2218 __func__, m0, m));
2219 m_move_pkthdr(n, m);
2220 }
2221 len = m->m_len;
2222 off = 0;
2223 mfirst = n;
2224 mlast = NULL;
2225 for (;;) {
2226 int cc = min(len, MCLBYTES);
2227 memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
2228 n->m_len = cc;
2229 if (mlast != NULL)
2230 mlast->m_next = n;
2231 mlast = n;
2232 #if 0
2233 newipsecstat.ips_clcopied++;
2234 #endif
2235
2236 len -= cc;
2237 if (len <= 0)
2238 break;
2239 off += cc;
2240
2241 n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
2242 if (n == NULL) {
2243 m_freem(mfirst);
2244 m_freem(m0);
2245 return (NULL);
2246 }
2247 }
2248 n->m_next = m->m_next;
2249 if (mprev == NULL)
2250 m0 = mfirst; /* new head of chain */
2251 else
2252 mprev->m_next = mfirst; /* replace old mbuf */
2253 m_free(m); /* release old mbuf */
2254 mprev = mfirst;
2255 }
2256 return (m0);
2257 }
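
/*
 * A minimal usage sketch, assuming an output path that must hand a
 * writable, compacted chain to a transform such as encryption.  Note
 * that on failure m_unshare() has already freed the original chain:
 *
 *	m = m_unshare(m, M_NOWAIT);
 *	if (m == NULL)
 *		return (ENOBUFS);
 */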
2258
2259 #ifdef MBUF_PROFILING
2260
2261 #define MP_BUCKETS 32 /* don't just change this as things may overflow. */
2262 struct mbufprofile {
2263 uintmax_t wasted[MP_BUCKETS];
2264 uintmax_t used[MP_BUCKETS];
2265 uintmax_t segments[MP_BUCKETS];
2266 } mbprof;
2267
2268 void
2269 m_profile(struct mbuf *m)
2270 {
2271 int segments = 0;
2272 int used = 0;
2273 int wasted = 0;
2274
2275 while (m) {
2276 segments++;
2277 used += m->m_len;
2278 if (m->m_flags & M_EXT) {
2279 wasted += MHLEN - sizeof(m->m_ext) +
2280 m->m_ext.ext_size - m->m_len;
2281 } else {
2282 if (m->m_flags & M_PKTHDR)
2283 wasted += MHLEN - m->m_len;
2284 else
2285 wasted += MLEN - m->m_len;
2286 }
2287 m = m->m_next;
2288 }
2289 /* be paranoid.. it helps */
2290 if (segments > MP_BUCKETS - 1)
2291 segments = MP_BUCKETS - 1;
2292 if (used > 100000)
2293 used = 100000;
2294 if (wasted > 100000)
2295 wasted = 100000;
2296 /* store in the appropriate bucket */
2297 /* don't bother locking. if it's slightly off, so what? */
2298 mbprof.segments[segments]++;
2299 mbprof.used[fls(used)]++;
2300 mbprof.wasted[fls(wasted)]++;
2301 }
2302
2303 static int
2304 mbprof_handler(SYSCTL_HANDLER_ARGS)
2305 {
2306 char buf[256];
2307 struct sbuf sb;
2308 int error;
2309 uint64_t *p;
2310
2311 sbuf_new_for_sysctl(&sb, buf, sizeof(buf), req);
2312
2313 p = &mbprof.wasted[0];
2314 sbuf_printf(&sb,
2315 "wasted:\n"
2316 "%ju %ju %ju %ju %ju %ju %ju %ju "
2317 "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2318 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2319 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2320 #ifdef BIG_ARRAY
2321 p = &mbprof.wasted[16];
2322 sbuf_printf(&sb,
2323 "%ju %ju %ju %ju %ju %ju %ju %ju "
2324 "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2325 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2326 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2327 #endif
2328 p = &mbprof.used[0];
2329 sbuf_printf(&sb,
2330 "used:\n"
2331 "%ju %ju %ju %ju %ju %ju %ju %ju "
2332 "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2333 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2334 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2335 #ifdef BIG_ARRAY
2336 p = &mbprof.used[16];
2337 sbuf_printf(&sb,
2338 "%ju %ju %ju %ju %ju %ju %ju %ju "
2339 "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2340 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2341 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2342 #endif
2343 p = &mbprof.segments[0];
2344 sbuf_printf(&sb,
2345 "segments:\n"
2346 "%ju %ju %ju %ju %ju %ju %ju %ju "
2347 "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2348 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2349 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2350 #ifdef BIG_ARRAY
2351 p = &mbprof.segments[16];
2352 sbuf_printf(&sb,
2353 "%ju %ju %ju %ju %ju %ju %ju %ju "
2354 "%ju %ju %ju %ju %ju %ju %ju %jju",
2355 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2356 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2357 #endif
2358
2359 error = sbuf_finish(&sb);
2360 sbuf_delete(&sb);
2361 return (error);
2362 }
2363
2364 static int
2365 mbprof_clr_handler(SYSCTL_HANDLER_ARGS)
2366 {
2367 int clear, error;
2368
2369 clear = 0;
2370 error = sysctl_handle_int(oidp, &clear, 0, req);
2371 if (error || !req->newptr)
2372 return (error);
2373
2374 if (clear) {
2375 bzero(&mbprof, sizeof(mbprof));
2376 }
2377
2378 return (error);
2379 }
2380
2381 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile,
2382 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
2383 mbprof_handler, "A",
2384 "mbuf profiling statistics");
2385
2386 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr,
2387 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
2388 mbprof_clr_handler, "I",
2389 "clear mbuf profiling statistics");
2390 #endif
2391