xref: /freebsd/sys/kern/uipc_mbuf.c (revision 963f5dc7a30624e95d72fb7f87b8892651164e46)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include "opt_param.h"
38 #include "opt_mbuf_stress_test.h"
39 #include "opt_mbuf_profiling.h"
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/limits.h>
45 #include <sys/lock.h>
46 #include <sys/malloc.h>
47 #include <sys/mbuf.h>
48 #include <sys/sysctl.h>
49 #include <sys/domain.h>
50 #include <sys/protosw.h>
51 #include <sys/uio.h>
52 #include <sys/vmmeter.h>
53 #include <sys/sbuf.h>
54 #include <sys/sdt.h>
55 #include <vm/vm.h>
56 #include <vm/vm_pageout.h>
57 #include <vm/vm_page.h>
58 
59 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init,
60     "struct mbuf *", "mbufinfo_t *",
61     "uint32_t", "uint32_t",
62     "uint16_t", "uint16_t",
63     "uint32_t", "uint32_t",
64     "uint32_t", "uint32_t");
65 
66 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr_raw,
67     "uint32_t", "uint32_t",
68     "uint16_t", "uint16_t",
69     "struct mbuf *", "mbufinfo_t *");
70 
71 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr,
72     "uint32_t", "uint32_t",
73     "uint16_t", "uint16_t",
74     "struct mbuf *", "mbufinfo_t *");
75 
76 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get_raw,
77     "uint32_t", "uint32_t",
78     "uint16_t", "uint16_t",
79     "struct mbuf *", "mbufinfo_t *");
80 
81 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get,
82     "uint32_t", "uint32_t",
83     "uint16_t", "uint16_t",
84     "struct mbuf *", "mbufinfo_t *");
85 
86 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl,
87     "uint32_t", "uint32_t",
88     "uint16_t", "uint16_t",
89     "uint32_t", "uint32_t",
90     "struct mbuf *", "mbufinfo_t *");
91 
92 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__getjcl,
93     "uint32_t", "uint32_t",
94     "uint16_t", "uint16_t",
95     "uint32_t", "uint32_t",
96     "uint32_t", "uint32_t",
97     "struct mbuf *", "mbufinfo_t *");
98 
99 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget,
100     "struct mbuf *", "mbufinfo_t *",
101     "uint32_t", "uint32_t",
102     "uint32_t", "uint32_t");
103 
104 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget,
105     "struct mbuf *", "mbufinfo_t *",
106     "uint32_t", "uint32_t",
107     "uint32_t", "uint32_t",
108     "void*", "void*");
109 
110 SDT_PROBE_DEFINE(sdt, , , m__cljset);
111 
112 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free,
113         "struct mbuf *", "mbufinfo_t *");
114 
115 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem,
116     "struct mbuf *", "mbufinfo_t *");
117 
118 #include <security/mac/mac_framework.h>
119 
120 int	max_linkhdr;
121 int	max_protohdr;
122 int	max_hdr;
123 int	max_datalen;
124 #ifdef MBUF_STRESS_TEST
125 int	m_defragpackets;
126 int	m_defragbytes;
127 int	m_defraguseless;
128 int	m_defragfailure;
129 int	m_defragrandomfailures;
130 #endif
131 
132 /*
133  * sysctl(8) exported objects
134  */
135 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD,
136 	   &max_linkhdr, 0, "Size of largest link layer header");
137 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD,
138 	   &max_protohdr, 0, "Size of largest protocol layer header");
139 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD,
140 	   &max_hdr, 0, "Size of largest link plus protocol header");
141 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RD,
142 	   &max_datalen, 0, "Minimum space left in mbuf after max_hdr");
143 #ifdef MBUF_STRESS_TEST
144 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
145 	   &m_defragpackets, 0, "");
146 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
147 	   &m_defragbytes, 0, "");
148 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
149 	   &m_defraguseless, 0, "");
150 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
151 	   &m_defragfailure, 0, "");
152 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
153 	   &m_defragrandomfailures, 0, "");
154 #endif
155 
156 /*
157  * Ensure the correct size of various mbuf parameters.  It could be off due
158  * to compiler-induced padding and alignment artifacts.
159  */
160 CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN);
161 CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN);
162 
163 /*
164  * mbuf data storage should be 64-bit aligned regardless of architectural
165  * pointer size; check this is the case with and without a packet header.
166  */
167 CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0);
168 CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0);
169 
170 /*
171  * While the specific values here don't matter too much (i.e., +/- a few
172  * words), we do want to ensure that changes to these values are carefully
173  * reasoned about and properly documented.  This is especially the case as
174  * network-protocol and device-driver modules encode these layouts, and must
175  * be recompiled if the structures change.  Check these values at compile time
176  * against the ones documented in comments in mbuf.h.
177  *
178  * NB: Possibly they should be documented there via #define's and not just
179  * comments.
180  */
181 #if defined(__LP64__)
182 CTASSERT(offsetof(struct mbuf, m_dat) == 32);
183 CTASSERT(sizeof(struct pkthdr) == 56);
184 CTASSERT(sizeof(struct m_ext) == 160);
185 #else
186 CTASSERT(offsetof(struct mbuf, m_dat) == 24);
187 CTASSERT(sizeof(struct pkthdr) == 48);
188 #if defined(__powerpc__) && defined(BOOKE)
189 /* PowerPC booke has 64-bit physical pointers. */
190 CTASSERT(sizeof(struct m_ext) == 184);
191 #else
192 CTASSERT(sizeof(struct m_ext) == 180);
193 #endif
194 #endif
195 
196 /*
197  * Assert that the queue(3) linkage fields in struct mbuf are the same
198  * size as the plain pointers they replace.
199  */
200 #ifdef INVARIANTS
201 static struct mbuf __used m_assertbuf;
202 CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next));
203 CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next));
204 CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt));
205 CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt));
206 #endif
207 
208 /*
209  * Attach the cluster from *m to *n, set up m_ext in *n
210  * and bump the refcount of the cluster.
211  */
212 void
213 mb_dupcl(struct mbuf *n, struct mbuf *m)
214 {
215 	volatile u_int *refcnt;
216 
217 	KASSERT(m->m_flags & (M_EXT|M_EXTPG),
218 	    ("%s: M_EXT|M_EXTPG not set on %p", __func__, m));
219 	KASSERT(!(n->m_flags & (M_EXT|M_EXTPG)),
220 	    ("%s: M_EXT|M_EXTPG set on %p", __func__, n));
221 
222 	/*
223 	 * Cache access optimization.
224 	 *
225 	 * o Regular M_EXT storage doesn't need a full copy of m_ext, since
226 	 *   the holder of 'ext_count' is responsible for carrying the free
227 	 *   routine and its arguments.
228 	 * o M_EXTPG data is split between the main part of the mbuf and m_ext;
229 	 *   the main part is copied in full, the m_ext part is handled as M_EXT.
230 	 * o EXT_EXTREF, where 'ext_cnt' doesn't point into the mbuf at all, is
231 	 *   special - it needs a full copy of m_ext into each mbuf, since any
232 	 *   copy could end up being the last one to free.
233 	 */
234 	if (m->m_flags & M_EXTPG) {
235 		bcopy(&m->m_epg_startcopy, &n->m_epg_startcopy,
236 		    __rangeof(struct mbuf, m_epg_startcopy, m_epg_endcopy));
237 		bcopy(&m->m_ext, &n->m_ext, m_epg_ext_copylen);
238 	} else if (m->m_ext.ext_type == EXT_EXTREF)
239 		bcopy(&m->m_ext, &n->m_ext, sizeof(struct m_ext));
240 	else
241 		bcopy(&m->m_ext, &n->m_ext, m_ext_copylen);
242 
243 	n->m_flags |= m->m_flags & (M_RDONLY | M_EXT | M_EXTPG);
244 
245 	/* See if this is the mbuf that holds the embedded refcount. */
246 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
247 		refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count;
248 		n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF;
249 	} else {
250 		KASSERT(m->m_ext.ext_cnt != NULL,
251 		    ("%s: no refcounting pointer on %p", __func__, m));
252 		refcnt = m->m_ext.ext_cnt;
253 	}
254 
255 	if (*refcnt == 1)
256 		*refcnt += 1;
257 	else
258 		atomic_add_int(refcnt, 1);
259 }
260 
261 void
262 m_demote_pkthdr(struct mbuf *m)
263 {
264 
265 	M_ASSERTPKTHDR(m);
266 
267 	m_tag_delete_chain(m, NULL);
268 	m->m_flags &= ~M_PKTHDR;
269 	bzero(&m->m_pkthdr, sizeof(struct pkthdr));
270 }
271 
272 /*
273  * Clean up an mbuf (chain), removing any tags and packet headers.
274  * If "all" is set then the first mbuf in the chain will be
275  * cleaned too.
276  */
277 void
278 m_demote(struct mbuf *m0, int all, int flags)
279 {
280 	struct mbuf *m;
281 
282 	flags |= M_DEMOTEFLAGS;
283 
284 	for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) {
285 		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p",
286 		    __func__, m, m0));
287 		if (m->m_flags & M_PKTHDR)
288 			m_demote_pkthdr(m);
289 		m->m_flags &= flags;
290 	}
291 }
292 
293 /*
294  * Sanity checks on mbuf (chain) for use in KASSERT() and general
295  * debugging.
296  * Returns 0 or panics when bad and 1 on all tests passed.
297  * Sanitize, 0 to run M_SANITY_ACTION, 1 to garble things so they
298  * blow up later.
299  */
300 int
301 m_sanity(struct mbuf *m0, int sanitize)
302 {
303 	struct mbuf *m;
304 	caddr_t a, b;
305 	int pktlen = 0;
306 
307 #ifdef INVARIANTS
308 #define	M_SANITY_ACTION(s)	panic("mbuf %p: " s, m)
309 #else
310 #define	M_SANITY_ACTION(s)	printf("mbuf %p: " s, m)
311 #endif
312 
313 	for (m = m0; m != NULL; m = m->m_next) {
314 		/*
315 		 * Basic pointer checks.  If any of these fails then some
316 		 * unrelated kernel memory before or after us is trashed.
317 		 * No way to recover from that.
318 		 */
319 		a = M_START(m);
320 		b = a + M_SIZE(m);
321 		if ((caddr_t)m->m_data < a)
322 			M_SANITY_ACTION("m_data outside mbuf data range left");
323 		if ((caddr_t)m->m_data > b)
324 			M_SANITY_ACTION("m_data outside mbuf data range right");
325 		if ((caddr_t)m->m_data + m->m_len > b)
326 			M_SANITY_ACTION("m_data + m_len exceeds mbuf space");
327 
328 		/* m->m_nextpkt may only be set on first mbuf in chain. */
329 		if (m != m0 && m->m_nextpkt != NULL) {
330 			if (sanitize) {
331 				m_freem(m->m_nextpkt);
332 				m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
333 			} else
334 				M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
335 		}
336 
337 		/* packet length (not mbuf length!) calculation */
338 		if (m0->m_flags & M_PKTHDR)
339 			pktlen += m->m_len;
340 
341 		/* m_tags may only be attached to first mbuf in chain. */
342 		if (m != m0 && m->m_flags & M_PKTHDR &&
343 		    !SLIST_EMPTY(&m->m_pkthdr.tags)) {
344 			if (sanitize) {
345 				m_tag_delete_chain(m, NULL);
346 				/* put in 0xDEADC0DE perhaps? */
347 			} else
348 				M_SANITY_ACTION("m_tags on in-chain mbuf");
349 		}
350 
351 		/* M_PKTHDR may only be set on first mbuf in chain */
352 		if (m != m0 && m->m_flags & M_PKTHDR) {
353 			if (sanitize) {
354 				bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
355 				m->m_flags &= ~M_PKTHDR;
356 				/* put in 0xDEADC0DE and leave hdr flag in */
357 			} else
358 				M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
359 		}
360 	}
361 	m = m0;
362 	if (pktlen && pktlen != m->m_pkthdr.len) {
363 		if (sanitize)
364 			m->m_pkthdr.len = 0;
365 		else
366 			M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
367 	}
368 	return 1;
369 
370 #undef	M_SANITY_ACTION
371 }
372 
373 /*
374  * Non-inlined part of m_init().
375  */
376 int
377 m_pkthdr_init(struct mbuf *m, int how)
378 {
379 #ifdef MAC
380 	int error;
381 #endif
382 	m->m_data = m->m_pktdat;
383 	bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
384 #ifdef NUMA
385 	m->m_pkthdr.numa_domain = M_NODOM;
386 #endif
387 #ifdef MAC
388 	/* If the label init fails, fail the alloc */
389 	error = mac_mbuf_init(m, how);
390 	if (error)
391 		return (error);
392 #endif
393 
394 	return (0);
395 }
396 
397 /*
398  * "Move" mbuf pkthdr from "from" to "to".
399  * "from" must have M_PKTHDR set, and "to" must be empty.
400  */
401 void
402 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
403 {
404 
405 #if 0
406 	/* see below for why these are not enabled */
407 	M_ASSERTPKTHDR(to);
408 	/* Note: with MAC, this may not be a good assertion. */
409 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
410 	    ("m_move_pkthdr: to has tags"));
411 #endif
412 #ifdef MAC
413 	/*
414 	 * XXXMAC: It could be this should also occur for non-MAC?
415 	 */
416 	if (to->m_flags & M_PKTHDR)
417 		m_tag_delete_chain(to, NULL);
418 #endif
419 	to->m_flags = (from->m_flags & M_COPYFLAGS) |
420 	    (to->m_flags & (M_EXT | M_EXTPG));
421 	if ((to->m_flags & M_EXT) == 0)
422 		to->m_data = to->m_pktdat;
423 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
424 	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
425 	from->m_flags &= ~M_PKTHDR;
426 	if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) {
427 		from->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
428 		from->m_pkthdr.snd_tag = NULL;
429 	}
430 }
431 
432 /*
433  * Duplicate "from"'s mbuf pkthdr in "to".
434  * "from" must have M_PKTHDR set, and "to" must be empty.
435  * In particular, this does a deep copy of the packet tags.
436  */
437 int
438 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
439 {
440 
441 #if 0
442 	/*
443 	 * The mbuf allocator only initializes the pkthdr
444 	 * when the mbuf is allocated with m_gethdr(). Many users
445 	 * (e.g. m_copy*, m_prepend) use m_get() and then
446 	 * smash the pkthdr as needed causing these
447 	 * assertions to trip.  For now just disable them.
448 	 */
449 	M_ASSERTPKTHDR(to);
450 	/* Note: with MAC, this may not be a good assertion. */
451 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags"));
452 #endif
453 	MBUF_CHECKSLEEP(how);
454 #ifdef MAC
455 	if (to->m_flags & M_PKTHDR)
456 		m_tag_delete_chain(to, NULL);
457 #endif
458 	to->m_flags = (from->m_flags & M_COPYFLAGS) |
459 	    (to->m_flags & (M_EXT | M_EXTPG));
460 	if ((to->m_flags & M_EXT) == 0)
461 		to->m_data = to->m_pktdat;
462 	to->m_pkthdr = from->m_pkthdr;
463 	if (from->m_pkthdr.csum_flags & CSUM_SND_TAG)
464 		m_snd_tag_ref(from->m_pkthdr.snd_tag);
465 	SLIST_INIT(&to->m_pkthdr.tags);
466 	return (m_tag_copy_chain(to, from, how));
467 }
468 
469 /*
470  * Lesser-used path for M_PREPEND:
471  * allocate new mbuf to prepend to chain,
472  * copy junk along.
473  */
474 struct mbuf *
475 m_prepend(struct mbuf *m, int len, int how)
476 {
477 	struct mbuf *mn;
478 
479 	if (m->m_flags & M_PKTHDR)
480 		mn = m_gethdr(how, m->m_type);
481 	else
482 		mn = m_get(how, m->m_type);
483 	if (mn == NULL) {
484 		m_freem(m);
485 		return (NULL);
486 	}
487 	if (m->m_flags & M_PKTHDR)
488 		m_move_pkthdr(mn, m);
489 	mn->m_next = m;
490 	m = mn;
491 	if (len < M_SIZE(m))
492 		M_ALIGN(m, len);
493 	m->m_len = len;
494 	return (m);
495 }
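
/*
 * Illustrative sketch (not part of the original source): callers normally go
 * through the M_PREPEND() macro, which falls back to m_prepend() only when
 * the chain has no leading space.  The header length and error handling
 * below are assumptions for the example.
 *
 *	M_PREPEND(m, sizeof(struct ether_header), M_NOWAIT);
 *	if (m == NULL)
 *		return (ENOBUFS);	(the original chain was already freed)
 */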
496 
497 /*
498  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
499  * continuing for "len" bytes.  If len is M_COPYALL, copy to the end of
500  * the mbuf chain.  The wait parameter is M_WAITOK or M_NOWAIT from the caller.
501  * Note that the copy is read-only, because clusters are not copied,
502  * only their reference counts are incremented.
503  */
504 struct mbuf *
505 m_copym(struct mbuf *m, int off0, int len, int wait)
506 {
507 	struct mbuf *n, **np;
508 	int off = off0;
509 	struct mbuf *top;
510 	int copyhdr = 0;
511 
512 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
513 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
514 	MBUF_CHECKSLEEP(wait);
515 	if (off == 0 && m->m_flags & M_PKTHDR)
516 		copyhdr = 1;
517 	while (off > 0) {
518 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
519 		if (off < m->m_len)
520 			break;
521 		off -= m->m_len;
522 		m = m->m_next;
523 	}
524 	np = &top;
525 	top = NULL;
526 	while (len > 0) {
527 		if (m == NULL) {
528 			KASSERT(len == M_COPYALL,
529 			    ("m_copym, length > size of mbuf chain"));
530 			break;
531 		}
532 		if (copyhdr)
533 			n = m_gethdr(wait, m->m_type);
534 		else
535 			n = m_get(wait, m->m_type);
536 		*np = n;
537 		if (n == NULL)
538 			goto nospace;
539 		if (copyhdr) {
540 			if (!m_dup_pkthdr(n, m, wait))
541 				goto nospace;
542 			if (len == M_COPYALL)
543 				n->m_pkthdr.len -= off0;
544 			else
545 				n->m_pkthdr.len = len;
546 			copyhdr = 0;
547 		}
548 		n->m_len = min(len, m->m_len - off);
549 		if (m->m_flags & (M_EXT|M_EXTPG)) {
550 			n->m_data = m->m_data + off;
551 			mb_dupcl(n, m);
552 		} else
553 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
554 			    (u_int)n->m_len);
555 		if (len != M_COPYALL)
556 			len -= n->m_len;
557 		off = 0;
558 		m = m->m_next;
559 		np = &n->m_next;
560 	}
561 
562 	return (top);
563 nospace:
564 	m_freem(top);
565 	return (NULL);
566 }
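
/*
 * Illustrative sketch (not part of the original source): taking a read-only
 * reference copy of a whole packet, e.g. before handing it to a second
 * consumer.  The names "n" and "deliver_copy" are assumptions.
 *
 *	n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 *	if (n != NULL)
 *		deliver_copy(n);	(hypothetical consumer of the copy)
 */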
567 
568 /*
569  * Copy an entire packet, including header (which must be present).
570  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
571  * Note that the copy is read-only, because clusters are not copied,
572  * only their reference counts are incremented.
573  * Preserve alignment of the first mbuf so if the creator has left
574  * some room at the beginning (e.g. for inserting protocol headers)
575  * the copies still have the room available.
576  */
577 struct mbuf *
578 m_copypacket(struct mbuf *m, int how)
579 {
580 	struct mbuf *top, *n, *o;
581 
582 	MBUF_CHECKSLEEP(how);
583 	n = m_get(how, m->m_type);
584 	top = n;
585 	if (n == NULL)
586 		goto nospace;
587 
588 	if (!m_dup_pkthdr(n, m, how))
589 		goto nospace;
590 	n->m_len = m->m_len;
591 	if (m->m_flags & (M_EXT|M_EXTPG)) {
592 		n->m_data = m->m_data;
593 		mb_dupcl(n, m);
594 	} else {
595 		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat);
596 		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
597 	}
598 
599 	m = m->m_next;
600 	while (m) {
601 		o = m_get(how, m->m_type);
602 		if (o == NULL)
603 			goto nospace;
604 
605 		n->m_next = o;
606 		n = n->m_next;
607 
608 		n->m_len = m->m_len;
609 		if (m->m_flags & (M_EXT|M_EXTPG)) {
610 			n->m_data = m->m_data;
611 			mb_dupcl(n, m);
612 		} else {
613 			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
614 		}
615 
616 		m = m->m_next;
617 	}
618 	return top;
619 nospace:
620 	m_freem(top);
621 	return (NULL);
622 }
623 
624 static void
625 m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp)
626 {
627 	struct iovec iov;
628 	struct uio uio;
629 	int error __diagused;
630 
631 	KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off));
632 	KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len));
633 	KASSERT(off < m->m_len,
634 	    ("m_copyfromunmapped: offset exceeds mbuf length"));
635 	iov.iov_base = cp;
636 	iov.iov_len = len;
637 	uio.uio_resid = len;
638 	uio.uio_iov = &iov;
639 	uio.uio_segflg = UIO_SYSSPACE;
640 	uio.uio_iovcnt = 1;
641 	uio.uio_offset = 0;
642 	uio.uio_rw = UIO_READ;
643 	error = m_unmapped_uiomove(m, off, &uio, len);
644 	KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off,
645 	   len));
646 }
647 
648 /*
649  * Copy data from an mbuf chain starting "off" bytes from the beginning,
650  * continuing for "len" bytes, into the indicated buffer.
651  */
652 void
653 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
654 {
655 	u_int count;
656 
657 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
658 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
659 	while (off > 0) {
660 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
661 		if (off < m->m_len)
662 			break;
663 		off -= m->m_len;
664 		m = m->m_next;
665 	}
666 	while (len > 0) {
667 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
668 		count = min(m->m_len - off, len);
669 		if ((m->m_flags & M_EXTPG) != 0)
670 			m_copyfromunmapped(m, off, count, cp);
671 		else
672 			bcopy(mtod(m, caddr_t) + off, cp, count);
673 		len -= count;
674 		cp += count;
675 		off = 0;
676 		m = m->m_next;
677 	}
678 }
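
/*
 * Illustrative sketch (not part of the original source): copying a possibly
 * fragmented header into a local buffer without touching the chain.  The
 * buffer size is an assumption for the example.
 *
 *	char hdr[64];
 *
 *	if (m->m_pkthdr.len >= sizeof(hdr))
 *		m_copydata(m, 0, sizeof(hdr), hdr);
 */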
679 
680 /*
681  * Copy a packet header mbuf chain into a completely new chain, including
682  * copying any mbuf clusters.  Use this instead of m_copypacket() when
683  * you need a writable copy of an mbuf chain.
684  */
685 struct mbuf *
686 m_dup(const struct mbuf *m, int how)
687 {
688 	struct mbuf **p, *top = NULL;
689 	int remain, moff, nsize;
690 
691 	MBUF_CHECKSLEEP(how);
692 	/* Sanity check */
693 	if (m == NULL)
694 		return (NULL);
695 	M_ASSERTPKTHDR(m);
696 
697 	/* While there's more data, get a new mbuf, tack it on, and fill it */
698 	remain = m->m_pkthdr.len;
699 	moff = 0;
700 	p = &top;
701 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
702 		struct mbuf *n;
703 
704 		/* Get the next new mbuf */
705 		if (remain >= MINCLSIZE) {
706 			n = m_getcl(how, m->m_type, 0);
707 			nsize = MCLBYTES;
708 		} else {
709 			n = m_get(how, m->m_type);
710 			nsize = MLEN;
711 		}
712 		if (n == NULL)
713 			goto nospace;
714 
715 		if (top == NULL) {		/* First one, must be PKTHDR */
716 			if (!m_dup_pkthdr(n, m, how)) {
717 				m_free(n);
718 				goto nospace;
719 			}
720 			if ((n->m_flags & M_EXT) == 0)
721 				nsize = MHLEN;
722 			n->m_flags &= ~M_RDONLY;
723 		}
724 		n->m_len = 0;
725 
726 		/* Link it into the new chain */
727 		*p = n;
728 		p = &n->m_next;
729 
730 		/* Copy data from original mbuf(s) into new mbuf */
731 		while (n->m_len < nsize && m != NULL) {
732 			int chunk = min(nsize - n->m_len, m->m_len - moff);
733 
734 			m_copydata(m, moff, chunk, n->m_data + n->m_len);
735 			moff += chunk;
736 			n->m_len += chunk;
737 			remain -= chunk;
738 			if (moff == m->m_len) {
739 				m = m->m_next;
740 				moff = 0;
741 			}
742 		}
743 
744 		/* Check correct total mbuf length */
745 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
746 		    ("%s: bogus m_pkthdr.len", __func__));
747 	}
748 	return (top);
749 
750 nospace:
751 	m_freem(top);
752 	return (NULL);
753 }
754 
755 /*
756  * Concatenate mbuf chain n to m.
757  * Both chains must be of the same type (e.g. MT_DATA).
758  * Any m_pkthdr is not updated.
759  */
760 void
761 m_cat(struct mbuf *m, struct mbuf *n)
762 {
763 	while (m->m_next)
764 		m = m->m_next;
765 	while (n) {
766 		if (!M_WRITABLE(m) ||
767 		    (n->m_flags & M_EXTPG) != 0 ||
768 		    M_TRAILINGSPACE(m) < n->m_len) {
769 			/* just join the two chains */
770 			m->m_next = n;
771 			return;
772 		}
773 		/* splat the data from one into the other */
774 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
775 		    (u_int)n->m_len);
776 		m->m_len += n->m_len;
777 		n = m_free(n);
778 	}
779 }
780 
781 /*
782  * Concatenate two pkthdr mbuf chains.
783  */
784 void
785 m_catpkt(struct mbuf *m, struct mbuf *n)
786 {
787 
788 	M_ASSERTPKTHDR(m);
789 	M_ASSERTPKTHDR(n);
790 
791 	m->m_pkthdr.len += n->m_pkthdr.len;
792 	m_demote(n, 1, 0);
793 
794 	m_cat(m, n);
795 }
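
/*
 * Illustrative sketch (not part of the original source): gluing a second
 * packet onto the first, as a reassembly path might.  Both chains must
 * carry M_PKTHDR; afterwards "n" is owned by the "m" chain.
 *
 *	m_catpkt(m, n);
 *	n = NULL;
 */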
796 
797 void
798 m_adj(struct mbuf *mp, int req_len)
799 {
800 	int len = req_len;
801 	struct mbuf *m;
802 	int count;
803 
804 	if ((m = mp) == NULL)
805 		return;
806 	if (len >= 0) {
807 		/*
808 		 * Trim from head.
809 		 */
810 		while (m != NULL && len > 0) {
811 			if (m->m_len <= len) {
812 				len -= m->m_len;
813 				m->m_len = 0;
814 				m = m->m_next;
815 			} else {
816 				m->m_len -= len;
817 				m->m_data += len;
818 				len = 0;
819 			}
820 		}
821 		if (mp->m_flags & M_PKTHDR)
822 			mp->m_pkthdr.len -= (req_len - len);
823 	} else {
824 		/*
825 		 * Trim from tail.  Scan the mbuf chain,
826 		 * calculating its length and finding the last mbuf.
827 		 * If the adjustment only affects this mbuf, then just
828 		 * adjust and return.  Otherwise, rescan and truncate
829 		 * after the remaining size.
830 		 */
831 		len = -len;
832 		count = 0;
833 		for (;;) {
834 			count += m->m_len;
835 			if (m->m_next == (struct mbuf *)0)
836 				break;
837 			m = m->m_next;
838 		}
839 		if (m->m_len >= len) {
840 			m->m_len -= len;
841 			if (mp->m_flags & M_PKTHDR)
842 				mp->m_pkthdr.len -= len;
843 			return;
844 		}
845 		count -= len;
846 		if (count < 0)
847 			count = 0;
848 		/*
849 		 * Correct length for chain is "count".
850 		 * Find the mbuf with last data, adjust its length,
851 		 * and toss data from remaining mbufs on chain.
852 		 */
853 		m = mp;
854 		if (m->m_flags & M_PKTHDR)
855 			m->m_pkthdr.len = count;
856 		for (; m; m = m->m_next) {
857 			if (m->m_len >= count) {
858 				m->m_len = count;
859 				if (m->m_next != NULL) {
860 					m_freem(m->m_next);
861 					m->m_next = NULL;
862 				}
863 				break;
864 			}
865 			count -= m->m_len;
866 		}
867 	}
868 }
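
/*
 * Illustrative sketch (not part of the original source): stripping an
 * Ethernet header from the front and the FCS from the back of a received
 * frame.  The lengths are assumptions for the example.
 *
 *	m_adj(m, ETHER_HDR_LEN);	(positive length trims from the head)
 *	m_adj(m, -ETHER_CRC_LEN);	(negative length trims from the tail)
 */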
869 
870 void
871 m_adj_decap(struct mbuf *mp, int len)
872 {
873 	uint8_t rsstype;
874 
875 	m_adj(mp, len);
876 	if ((mp->m_flags & M_PKTHDR) != 0) {
877 		/*
878 		 * If flowid was calculated by card from the inner
879 		 * headers, move flowid to the decapsulated mbuf
880 		 * chain, otherwise clear.  This depends on the
881 		 * internals of m_adj, which keeps pkthdr as is, in
882 		 * particular not changing rsstype and flowid.
883 		 */
884 		rsstype = mp->m_pkthdr.rsstype;
885 		if ((rsstype & M_HASHTYPE_INNER) != 0) {
886 			M_HASHTYPE_SET(mp, rsstype & ~M_HASHTYPE_INNER);
887 		} else {
888 			M_HASHTYPE_CLEAR(mp);
889 		}
890 	}
891 }
892 
893 /*
894  * Rearrange an mbuf chain so that len bytes are contiguous
895  * and in the data area of an mbuf (so that mtod will work
896  * for a structure of size len).  Returns the resulting
897  * mbuf chain on success, frees it and returns NULL on failure.
898  * If there is room, it will add up to max_protohdr-len extra bytes to the
899  * contiguous region in an attempt to avoid being called next time.
900  */
901 struct mbuf *
902 m_pullup(struct mbuf *n, int len)
903 {
904 	struct mbuf *m;
905 	int count;
906 	int space;
907 
908 	KASSERT((n->m_flags & M_EXTPG) == 0,
909 	    ("%s: unmapped mbuf %p", __func__, n));
910 
911 	/*
912 	 * If first mbuf has no cluster, and has room for len bytes
913 	 * without shifting current data, pullup into it,
914 	 * otherwise allocate a new mbuf to prepend to the chain.
915 	 */
916 	if ((n->m_flags & M_EXT) == 0 &&
917 	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
918 		if (n->m_len >= len)
919 			return (n);
920 		m = n;
921 		n = n->m_next;
922 		len -= m->m_len;
923 	} else {
924 		if (len > MHLEN)
925 			goto bad;
926 		m = m_get(M_NOWAIT, n->m_type);
927 		if (m == NULL)
928 			goto bad;
929 		if (n->m_flags & M_PKTHDR)
930 			m_move_pkthdr(m, n);
931 	}
932 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
933 	do {
934 		count = min(min(max(len, max_protohdr), space), n->m_len);
935 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
936 		  (u_int)count);
937 		len -= count;
938 		m->m_len += count;
939 		n->m_len -= count;
940 		space -= count;
941 		if (n->m_len)
942 			n->m_data += count;
943 		else
944 			n = m_free(n);
945 	} while (len > 0 && n);
946 	if (len > 0) {
947 		(void) m_free(m);
948 		goto bad;
949 	}
950 	m->m_next = n;
951 	return (m);
952 bad:
953 	m_freem(n);
954 	return (NULL);
955 }
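
/*
 * Illustrative sketch (not part of the original source): the classic pattern
 * of making a protocol header contiguous before dereferencing it via mtod(9).
 * "struct ip" is used only as an example header type.
 *
 *	if (m->m_len < sizeof(struct ip) &&
 *	    (m = m_pullup(m, sizeof(struct ip))) == NULL)
 *		return;			(the chain was freed by m_pullup())
 *	ip = mtod(m, struct ip *);
 */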
956 
957 /*
958  * Like m_pullup(), except a new mbuf is always allocated, and we allow
959  * the amount of empty space before the data in the new mbuf to be specified
960  * (in the event that the caller expects to prepend later).
961  */
962 struct mbuf *
963 m_copyup(struct mbuf *n, int len, int dstoff)
964 {
965 	struct mbuf *m;
966 	int count, space;
967 
968 	if (len > (MHLEN - dstoff))
969 		goto bad;
970 	m = m_get(M_NOWAIT, n->m_type);
971 	if (m == NULL)
972 		goto bad;
973 	if (n->m_flags & M_PKTHDR)
974 		m_move_pkthdr(m, n);
975 	m->m_data += dstoff;
976 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
977 	do {
978 		count = min(min(max(len, max_protohdr), space), n->m_len);
979 		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
980 		    (unsigned)count);
981 		len -= count;
982 		m->m_len += count;
983 		n->m_len -= count;
984 		space -= count;
985 		if (n->m_len)
986 			n->m_data += count;
987 		else
988 			n = m_free(n);
989 	} while (len > 0 && n);
990 	if (len > 0) {
991 		(void) m_free(m);
992 		goto bad;
993 	}
994 	m->m_next = n;
995 	return (m);
996  bad:
997 	m_freem(n);
998 	return (NULL);
999 }
1000 
1001 /*
1002  * Partition an mbuf chain into two pieces, returning the tail --
1003  * all but the first len0 bytes.  In case of failure, it returns NULL and
1004  * attempts to restore the chain to its original state.
1005  *
1006  * Note that the resulting mbufs might be read-only, because the new
1007  * mbuf can end up sharing an mbuf cluster with the original mbuf if
1008  * the "breaking point" happens to lie within a cluster mbuf. Use the
1009  * M_WRITABLE() macro to check for this case.
1010  */
1011 struct mbuf *
1012 m_split(struct mbuf *m0, int len0, int wait)
1013 {
1014 	struct mbuf *m, *n;
1015 	u_int len = len0, remain;
1016 
1017 	MBUF_CHECKSLEEP(wait);
1018 	for (m = m0; m && len > m->m_len; m = m->m_next)
1019 		len -= m->m_len;
1020 	if (m == NULL)
1021 		return (NULL);
1022 	remain = m->m_len - len;
1023 	if (m0->m_flags & M_PKTHDR && remain == 0) {
1024 		n = m_gethdr(wait, m0->m_type);
1025 		if (n == NULL)
1026 			return (NULL);
1027 		n->m_next = m->m_next;
1028 		m->m_next = NULL;
1029 		if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) {
1030 			n->m_pkthdr.snd_tag =
1031 			    m_snd_tag_ref(m0->m_pkthdr.snd_tag);
1032 			n->m_pkthdr.csum_flags |= CSUM_SND_TAG;
1033 		} else
1034 			n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1035 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1036 		m0->m_pkthdr.len = len0;
1037 		return (n);
1038 	} else if (m0->m_flags & M_PKTHDR) {
1039 		n = m_gethdr(wait, m0->m_type);
1040 		if (n == NULL)
1041 			return (NULL);
1042 		if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) {
1043 			n->m_pkthdr.snd_tag =
1044 			    m_snd_tag_ref(m0->m_pkthdr.snd_tag);
1045 			n->m_pkthdr.csum_flags |= CSUM_SND_TAG;
1046 		} else
1047 			n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1048 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1049 		m0->m_pkthdr.len = len0;
1050 		if (m->m_flags & (M_EXT|M_EXTPG))
1051 			goto extpacket;
1052 		if (remain > MHLEN) {
1053 			/* m can't be the lead packet */
1054 			M_ALIGN(n, 0);
1055 			n->m_next = m_split(m, len, wait);
1056 			if (n->m_next == NULL) {
1057 				(void) m_free(n);
1058 				return (NULL);
1059 			} else {
1060 				n->m_len = 0;
1061 				return (n);
1062 			}
1063 		} else
1064 			M_ALIGN(n, remain);
1065 	} else if (remain == 0) {
1066 		n = m->m_next;
1067 		m->m_next = NULL;
1068 		return (n);
1069 	} else {
1070 		n = m_get(wait, m->m_type);
1071 		if (n == NULL)
1072 			return (NULL);
1073 		M_ALIGN(n, remain);
1074 	}
1075 extpacket:
1076 	if (m->m_flags & (M_EXT|M_EXTPG)) {
1077 		n->m_data = m->m_data + len;
1078 		mb_dupcl(n, m);
1079 	} else {
1080 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
1081 	}
1082 	n->m_len = remain;
1083 	m->m_len = len;
1084 	n->m_next = m->m_next;
1085 	m->m_next = NULL;
1086 	return (n);
1087 }
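
/*
 * Illustrative sketch (not part of the original source): splitting a packet
 * at an assumed fragment boundary.  On success "tail" holds everything past
 * "fragsize"; on failure m_split() tries to leave "m" intact.
 *
 *	tail = m_split(m, fragsize, M_NOWAIT);
 *	if (tail == NULL)
 *		return (ENOBUFS);
 */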
1088 /*
1089  * Routine to copy from device local memory into mbufs.
1090  * Note that the `off' argument is the offset into the first mbuf of the
1091  * target chain at which to begin copying the data.
1092  */
1093 struct mbuf *
1094 m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
1095     void (*copy)(char *from, caddr_t to, u_int len))
1096 {
1097 	struct mbuf *m;
1098 	struct mbuf *top = NULL, **mp = &top;
1099 	int len;
1100 
1101 	if (off < 0 || off > MHLEN)
1102 		return (NULL);
1103 
1104 	while (totlen > 0) {
1105 		if (top == NULL) {	/* First one, must be PKTHDR */
1106 			if (totlen + off >= MINCLSIZE) {
1107 				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
1108 				len = MCLBYTES;
1109 			} else {
1110 				m = m_gethdr(M_NOWAIT, MT_DATA);
1111 				len = MHLEN;
1112 
1113 				/* Place initial small packet/header at end of mbuf */
1114 				if (m && totlen + off + max_linkhdr <= MHLEN) {
1115 					m->m_data += max_linkhdr;
1116 					len -= max_linkhdr;
1117 				}
1118 			}
1119 			if (m == NULL)
1120 				return NULL;
1121 			m->m_pkthdr.rcvif = ifp;
1122 			m->m_pkthdr.len = totlen;
1123 		} else {
1124 			if (totlen + off >= MINCLSIZE) {
1125 				m = m_getcl(M_NOWAIT, MT_DATA, 0);
1126 				len = MCLBYTES;
1127 			} else {
1128 				m = m_get(M_NOWAIT, MT_DATA);
1129 				len = MLEN;
1130 			}
1131 			if (m == NULL) {
1132 				m_freem(top);
1133 				return NULL;
1134 			}
1135 		}
1136 		if (off) {
1137 			m->m_data += off;
1138 			len -= off;
1139 			off = 0;
1140 		}
1141 		m->m_len = len = min(totlen, len);
1142 		if (copy)
1143 			copy(buf, mtod(m, caddr_t), (u_int)len);
1144 		else
1145 			bcopy(buf, mtod(m, caddr_t), (u_int)len);
1146 		buf += len;
1147 		*mp = m;
1148 		mp = &m->m_next;
1149 		totlen -= len;
1150 	}
1151 	return (top);
1152 }
1153 
1154 static void
1155 m_copytounmapped(const struct mbuf *m, int off, int len, c_caddr_t cp)
1156 {
1157 	struct iovec iov;
1158 	struct uio uio;
1159 	int error __diagused;
1160 
1161 	KASSERT(off >= 0, ("m_copytounmapped: negative off %d", off));
1162 	KASSERT(len >= 0, ("m_copytounmapped: negative len %d", len));
1163 	KASSERT(off < m->m_len, ("m_copytounmapped: offset exceeds mbuf length"));
1164 	iov.iov_base = __DECONST(caddr_t, cp);
1165 	iov.iov_len = len;
1166 	uio.uio_resid = len;
1167 	uio.uio_iov = &iov;
1168 	uio.uio_segflg = UIO_SYSSPACE;
1169 	uio.uio_iovcnt = 1;
1170 	uio.uio_offset = 0;
1171 	uio.uio_rw = UIO_WRITE;
1172 	error = m_unmapped_uiomove(m, off, &uio, len);
1173 	KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off,
1174 	   len));
1175 }
1176 
1177 /*
1178  * Copy data from a buffer back into the indicated mbuf chain,
1179  * starting "off" bytes from the beginning, extending the mbuf
1180  * chain if necessary.
1181  */
1182 void
1183 m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
1184 {
1185 	int mlen;
1186 	struct mbuf *m = m0, *n;
1187 	int totlen = 0;
1188 
1189 	if (m0 == NULL)
1190 		return;
1191 	while (off > (mlen = m->m_len)) {
1192 		off -= mlen;
1193 		totlen += mlen;
1194 		if (m->m_next == NULL) {
1195 			n = m_get(M_NOWAIT, m->m_type);
1196 			if (n == NULL)
1197 				goto out;
1198 			bzero(mtod(n, caddr_t), MLEN);
1199 			n->m_len = min(MLEN, len + off);
1200 			m->m_next = n;
1201 		}
1202 		m = m->m_next;
1203 	}
1204 	while (len > 0) {
1205 		if (m->m_next == NULL && (len > m->m_len - off)) {
1206 			m->m_len += min(len - (m->m_len - off),
1207 			    M_TRAILINGSPACE(m));
1208 		}
1209 		mlen = min(m->m_len - off, len);
1210 		if ((m->m_flags & M_EXTPG) != 0)
1211 			m_copytounmapped(m, off, mlen, cp);
1212 		else
1213 			bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen);
1214 		cp += mlen;
1215 		len -= mlen;
1216 		mlen += off;
1217 		off = 0;
1218 		totlen += mlen;
1219 		if (len == 0)
1220 			break;
1221 		if (m->m_next == NULL) {
1222 			n = m_get(M_NOWAIT, m->m_type);
1223 			if (n == NULL)
1224 				break;
1225 			n->m_len = min(MLEN, len);
1226 			m->m_next = n;
1227 		}
1228 		m = m->m_next;
1229 	}
1230 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
1231 		m->m_pkthdr.len = totlen;
1232 }
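
/*
 * Illustrative sketch (not part of the original source): overwriting a
 * two-byte field at an assumed offset inside an existing chain.
 *
 *	uint16_t val = htons(0x1234);
 *
 *	m_copyback(m, offsetof(struct ip, ip_sum), sizeof(val),
 *	    (c_caddr_t)&val);
 */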
1233 
1234 /*
1235  * Append the specified data to the indicated mbuf chain,
1236  * extending the mbuf chain if the new data does not fit in
1237  * existing space.
1238  *
1239  * Return 1 if able to complete the job; otherwise 0.
1240  */
1241 int
1242 m_append(struct mbuf *m0, int len, c_caddr_t cp)
1243 {
1244 	struct mbuf *m, *n;
1245 	int remainder, space;
1246 
1247 	for (m = m0; m->m_next != NULL; m = m->m_next)
1248 		;
1249 	remainder = len;
1250 	space = M_TRAILINGSPACE(m);
1251 	if (space > 0) {
1252 		/*
1253 		 * Copy into available space.
1254 		 */
1255 		if (space > remainder)
1256 			space = remainder;
1257 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
1258 		m->m_len += space;
1259 		cp += space, remainder -= space;
1260 	}
1261 	while (remainder > 0) {
1262 		/*
1263 		 * Allocate a new mbuf; could check space
1264 		 * and allocate a cluster instead.
1265 		 */
1266 		n = m_get(M_NOWAIT, m->m_type);
1267 		if (n == NULL)
1268 			break;
1269 		n->m_len = min(MLEN, remainder);
1270 		bcopy(cp, mtod(n, caddr_t), n->m_len);
1271 		cp += n->m_len, remainder -= n->m_len;
1272 		m->m_next = n;
1273 		m = n;
1274 	}
1275 	if (m0->m_flags & M_PKTHDR)
1276 		m0->m_pkthdr.len += len - remainder;
1277 	return (remainder == 0);
1278 }
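
/*
 * Illustrative sketch (not part of the original source): appending a small
 * trailer to a packet and checking the 0/1 result.  "trailer" is an assumed
 * local variable.
 *
 *	if (!m_append(m, sizeof(trailer), (c_caddr_t)&trailer)) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 */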
1279 
1280 static int
1281 m_apply_extpg_one(struct mbuf *m, int off, int len,
1282     int (*f)(void *, void *, u_int), void *arg)
1283 {
1284 	void *p;
1285 	u_int i, count, pgoff, pglen;
1286 	int rval;
1287 
1288 	KASSERT(PMAP_HAS_DMAP,
1289 	    ("m_apply_extpg_one does not support unmapped mbufs"));
1290 	off += mtod(m, vm_offset_t);
1291 	if (off < m->m_epg_hdrlen) {
1292 		count = min(m->m_epg_hdrlen - off, len);
1293 		rval = f(arg, m->m_epg_hdr + off, count);
1294 		if (rval)
1295 			return (rval);
1296 		len -= count;
1297 		off = 0;
1298 	} else
1299 		off -= m->m_epg_hdrlen;
1300 	pgoff = m->m_epg_1st_off;
1301 	for (i = 0; i < m->m_epg_npgs && len > 0; i++) {
1302 		pglen = m_epg_pagelen(m, i, pgoff);
1303 		if (off < pglen) {
1304 			count = min(pglen - off, len);
1305 			p = (void *)PHYS_TO_DMAP(m->m_epg_pa[i] + pgoff + off);
1306 			rval = f(arg, p, count);
1307 			if (rval)
1308 				return (rval);
1309 			len -= count;
1310 			off = 0;
1311 		} else
1312 			off -= pglen;
1313 		pgoff = 0;
1314 	}
1315 	if (len > 0) {
1316 		KASSERT(off < m->m_epg_trllen,
1317 		    ("m_apply_extpg_one: offset beyond trailer"));
1318 		KASSERT(len <= m->m_epg_trllen - off,
1319 		    ("m_apply_extpg_one: length beyond trailer"));
1320 		return (f(arg, m->m_epg_trail + off, len));
1321 	}
1322 	return (0);
1323 }
1324 
1325 /* Apply function f to the data in a single mbuf. */
1326 static int
1327 m_apply_one(struct mbuf *m, int off, int len,
1328     int (*f)(void *, void *, u_int), void *arg)
1329 {
1330 	if ((m->m_flags & M_EXTPG) != 0)
1331 		return (m_apply_extpg_one(m, off, len, f, arg));
1332 	else
1333 		return (f(arg, mtod(m, caddr_t) + off, len));
1334 }
1335 
1336 /*
1337  * Apply function f to the data in an mbuf chain starting "off" bytes from
1338  * the beginning, continuing for "len" bytes.
1339  */
1340 int
1341 m_apply(struct mbuf *m, int off, int len,
1342     int (*f)(void *, void *, u_int), void *arg)
1343 {
1344 	u_int count;
1345 	int rval;
1346 
1347 	KASSERT(off >= 0, ("m_apply, negative off %d", off));
1348 	KASSERT(len >= 0, ("m_apply, negative len %d", len));
1349 	while (off > 0) {
1350 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
1351 		if (off < m->m_len)
1352 			break;
1353 		off -= m->m_len;
1354 		m = m->m_next;
1355 	}
1356 	while (len > 0) {
1357 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
1358 		count = min(m->m_len - off, len);
1359 		rval = m_apply_one(m, off, count, f, arg);
1360 		if (rval)
1361 			return (rval);
1362 		len -= count;
1363 		off = 0;
1364 		m = m->m_next;
1365 	}
1366 	return (0);
1367 }
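
/*
 * Illustrative sketch (not part of the original source): a callback that sums
 * bytes, applied over a whole packet.  The helper name and its use are
 * assumptions for the example.
 *
 *	static int
 *	sum_bytes(void *arg, void *data, u_int len)
 *	{
 *		uint32_t *sump = arg;
 *		u_char *p = data;
 *
 *		while (len-- > 0)
 *			*sump += *p++;
 *		return (0);		(non-zero would abort the walk)
 *	}
 *
 *	uint32_t sum = 0;
 *	(void)m_apply(m, 0, m->m_pkthdr.len, sum_bytes, &sum);
 */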
1368 
1369 /*
1370  * Return a pointer to mbuf/offset of location in mbuf chain.
1371  */
1372 struct mbuf *
1373 m_getptr(struct mbuf *m, int loc, int *off)
1374 {
1375 
1376 	while (loc >= 0) {
1377 		/* Normal end of search. */
1378 		if (m->m_len > loc) {
1379 			*off = loc;
1380 			return (m);
1381 		} else {
1382 			loc -= m->m_len;
1383 			if (m->m_next == NULL) {
1384 				if (loc == 0) {
1385 					/* Point at the end of valid data. */
1386 					*off = m->m_len;
1387 					return (m);
1388 				}
1389 				return (NULL);
1390 			}
1391 			m = m->m_next;
1392 		}
1393 	}
1394 	return (NULL);
1395 }
1396 
1397 void
1398 m_print(const struct mbuf *m, int maxlen)
1399 {
1400 	int len;
1401 	int pdata;
1402 	const struct mbuf *m2;
1403 
1404 	if (m == NULL) {
1405 		printf("mbuf: %p\n", m);
1406 		return;
1407 	}
1408 
1409 	if (m->m_flags & M_PKTHDR)
1410 		len = m->m_pkthdr.len;
1411 	else
1412 		len = -1;
1413 	m2 = m;
1414 	while (m2 != NULL && (len == -1 || len)) {
1415 		pdata = m2->m_len;
1416 		if (maxlen != -1 && pdata > maxlen)
1417 			pdata = maxlen;
1418 		printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len,
1419 		    m2->m_next, m2->m_flags, "\20\20freelist\17skipfw"
1420 		    "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly"
1421 		    "\3eor\2pkthdr\1ext", pdata ? "" : "\n");
1422 		if (pdata)
1423 			printf(", %*D\n", pdata, (u_char *)m2->m_data, "-");
1424 		if (len != -1)
1425 			len -= m2->m_len;
1426 		m2 = m2->m_next;
1427 	}
1428 	if (len > 0)
1429 		printf("%d bytes unaccounted for.\n", len);
1430 	return;
1431 }
1432 
1433 u_int
1434 m_fixhdr(struct mbuf *m0)
1435 {
1436 	u_int len;
1437 
1438 	len = m_length(m0, NULL);
1439 	m0->m_pkthdr.len = len;
1440 	return (len);
1441 }
1442 
1443 u_int
1444 m_length(struct mbuf *m0, struct mbuf **last)
1445 {
1446 	struct mbuf *m;
1447 	u_int len;
1448 
1449 	len = 0;
1450 	for (m = m0; m != NULL; m = m->m_next) {
1451 		len += m->m_len;
1452 		if (m->m_next == NULL)
1453 			break;
1454 	}
1455 	if (last != NULL)
1456 		*last = m;
1457 	return (len);
1458 }
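
/*
 * Illustrative sketch (not part of the original source): recomputing the
 * pkthdr length after editing a chain by hand and remembering the last mbuf
 * for a later append.
 *
 *	struct mbuf *last;
 *
 *	m->m_pkthdr.len = m_length(m, &last);	(equivalent to m_fixhdr(m))
 */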
1459 
1460 /*
1461  * Defragment an mbuf chain, returning the shortest possible
1462  * chain of mbufs and clusters.  If allocation fails and
1463  * this cannot be completed, NULL will be returned, but
1464  * the passed in chain will be unchanged.  Upon success,
1465  * the original chain will be freed, and the new chain
1466  * will be returned.
1467  *
1468  * If a chain without a packet header is passed in, the original
1469  * mbuf chain will be returned unharmed.
1470  */
1471 struct mbuf *
1472 m_defrag(struct mbuf *m0, int how)
1473 {
1474 	struct mbuf *m_new = NULL, *m_final = NULL;
1475 	int progress = 0, length;
1476 
1477 	MBUF_CHECKSLEEP(how);
1478 	if (!(m0->m_flags & M_PKTHDR))
1479 		return (m0);
1480 
1481 	m_fixhdr(m0); /* Needed sanity check */
1482 
1483 #ifdef MBUF_STRESS_TEST
1484 	if (m_defragrandomfailures) {
1485 		int temp = arc4random() & 0xff;
1486 		if (temp == 0xba)
1487 			goto nospace;
1488 	}
1489 #endif
1490 
1491 	if (m0->m_pkthdr.len > MHLEN)
1492 		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
1493 	else
1494 		m_final = m_gethdr(how, MT_DATA);
1495 
1496 	if (m_final == NULL)
1497 		goto nospace;
1498 
1499 	if (m_dup_pkthdr(m_final, m0, how) == 0)
1500 		goto nospace;
1501 
1502 	m_new = m_final;
1503 
1504 	while (progress < m0->m_pkthdr.len) {
1505 		length = m0->m_pkthdr.len - progress;
1506 		if (length > MCLBYTES)
1507 			length = MCLBYTES;
1508 
1509 		if (m_new == NULL) {
1510 			if (length > MLEN)
1511 				m_new = m_getcl(how, MT_DATA, 0);
1512 			else
1513 				m_new = m_get(how, MT_DATA);
1514 			if (m_new == NULL)
1515 				goto nospace;
1516 		}
1517 
1518 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
1519 		progress += length;
1520 		m_new->m_len = length;
1521 		if (m_new != m_final)
1522 			m_cat(m_final, m_new);
1523 		m_new = NULL;
1524 	}
1525 #ifdef MBUF_STRESS_TEST
1526 	if (m0->m_next == NULL)
1527 		m_defraguseless++;
1528 #endif
1529 	m_freem(m0);
1530 	m0 = m_final;
1531 #ifdef MBUF_STRESS_TEST
1532 	m_defragpackets++;
1533 	m_defragbytes += m0->m_pkthdr.len;
1534 #endif
1535 	return (m0);
1536 nospace:
1537 #ifdef MBUF_STRESS_TEST
1538 	m_defragfailure++;
1539 #endif
1540 	if (m_final)
1541 		m_freem(m_final);
1542 	return (NULL);
1543 }
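
/*
 * Illustrative sketch (not part of the original source): the usual driver
 * pattern when a DMA mapping fails because the chain has too many segments.
 * The surrounding transmit logic is an assumption for the example.
 *
 *	n = m_defrag(m, M_NOWAIT);
 *	if (n == NULL) {
 *		m_freem(m);		(original chain is intact; drop it)
 *		return (ENOBUFS);
 *	}
 *	m = n;				(original chain was freed on success)
 */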
1544 
1545 /*
1546  * Return the number of fragments an mbuf will use.  This is usually
1547  * used as a proxy for the number of scatter/gather elements needed by
1548  * a DMA engine to access an mbuf.  In general mapped mbufs are
1549  * assumed to be backed by physically contiguous buffers that only
1550  * need a single fragment.  Unmapped mbufs, on the other hand, can
1551  * span disjoint physical pages.
1552  */
1553 static int
1554 frags_per_mbuf(struct mbuf *m)
1555 {
1556 	int frags;
1557 
1558 	if ((m->m_flags & M_EXTPG) == 0)
1559 		return (1);
1560 
1561 	/*
1562 	 * The header and trailer are counted as a single fragment
1563 	 * each when present.
1564 	 *
1565 	 * XXX: This overestimates the number of fragments by assuming
1566 	 * all the backing physical pages are disjoint.
1567 	 */
1568 	frags = 0;
1569 	if (m->m_epg_hdrlen != 0)
1570 		frags++;
1571 	frags += m->m_epg_npgs;
1572 	if (m->m_epg_trllen != 0)
1573 		frags++;
1574 
1575 	return (frags);
1576 }
1577 
1578 /*
1579  * Defragment an mbuf chain, returning at most maxfrags separate
1580  * mbufs+clusters.  If this is not possible NULL is returned and
1581  * the original mbuf chain is left in its present (potentially
1582  * modified) state.  We use two techniques: collapsing consecutive
1583  * mbufs and replacing consecutive mbufs by a cluster.
1584  *
1585  * NB: this should really be named m_defrag but that name is taken
1586  */
1587 struct mbuf *
1588 m_collapse(struct mbuf *m0, int how, int maxfrags)
1589 {
1590 	struct mbuf *m, *n, *n2, **prev;
1591 	u_int curfrags;
1592 
1593 	/*
1594 	 * Calculate the current number of frags.
1595 	 */
1596 	curfrags = 0;
1597 	for (m = m0; m != NULL; m = m->m_next)
1598 		curfrags += frags_per_mbuf(m);
1599 	/*
1600 	 * First, try to collapse mbufs.  Note that we always collapse
1601 	 * towards the front so we don't need to deal with moving the
1602 	 * pkthdr.  This may be suboptimal if the first mbuf has much
1603 	 * less data than the following.
1604 	 */
1605 	m = m0;
1606 again:
1607 	for (;;) {
1608 		n = m->m_next;
1609 		if (n == NULL)
1610 			break;
1611 		if (M_WRITABLE(m) &&
1612 		    n->m_len < M_TRAILINGSPACE(m)) {
1613 			m_copydata(n, 0, n->m_len,
1614 			    mtod(m, char *) + m->m_len);
1615 			m->m_len += n->m_len;
1616 			m->m_next = n->m_next;
1617 			curfrags -= frags_per_mbuf(n);
1618 			m_free(n);
1619 			if (curfrags <= maxfrags)
1620 				return m0;
1621 		} else
1622 			m = n;
1623 	}
1624 	KASSERT(maxfrags > 1,
1625 		("maxfrags %u, but normal collapse failed", maxfrags));
1626 	/*
1627 	 * Collapse consecutive mbufs to a cluster.
1628 	 */
1629 	prev = &m0->m_next;		/* NB: not the first mbuf */
1630 	while ((n = *prev) != NULL) {
1631 		if ((n2 = n->m_next) != NULL &&
1632 		    n->m_len + n2->m_len < MCLBYTES) {
1633 			m = m_getcl(how, MT_DATA, 0);
1634 			if (m == NULL)
1635 				goto bad;
1636 			m_copydata(n, 0, n->m_len, mtod(m, char *));
1637 			m_copydata(n2, 0, n2->m_len,
1638 			    mtod(m, char *) + n->m_len);
1639 			m->m_len = n->m_len + n2->m_len;
1640 			m->m_next = n2->m_next;
1641 			*prev = m;
1642 			curfrags += 1;  /* For the new cluster */
1643 			curfrags -= frags_per_mbuf(n);
1644 			curfrags -= frags_per_mbuf(n2);
1645 			m_free(n);
1646 			m_free(n2);
1647 			if (curfrags <= maxfrags)
1648 				return m0;
1649 			/*
1650 			 * Still not there, try the normal collapse
1651 			 * again before we allocate another cluster.
1652 			 */
1653 			goto again;
1654 		}
1655 		prev = &n->m_next;
1656 	}
1657 	/*
1658 	 * No place where we can collapse to a cluster; punt.
1659 	 * This can occur if, for example, you request 2 frags
1660 	 * but the packet requires that both be clusters (we
1661 	 * never reallocate the first mbuf to avoid moving the
1662 	 * packet header).
1663 	 */
1664 bad:
1665 	return NULL;
1666 }
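
/*
 * Illustrative sketch (not part of the original source): limiting a chain to
 * the number of scatter/gather segments a hypothetical controller supports.
 * NIC_MAX_SEGS is an assumed constant.
 *
 *	n = m_collapse(m, M_NOWAIT, NIC_MAX_SEGS);
 *	if (n == NULL) {
 *		m_freem(m);		(chain may have been partially modified)
 *		return (ENOBUFS);
 *	}
 *	m = n;
 */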
1667 
1668 #ifdef MBUF_STRESS_TEST
1669 
1670 /*
1671  * Fragment an mbuf chain.  There's no reason you'd ever want to do
1672  * this in normal usage, but it's great for stress testing various
1673  * mbuf consumers.
1674  *
1675  * If fragmentation is not possible, the original chain will be
1676  * returned.
1677  *
1678  * Possible length values:
1679  * 0	 no fragmentation will occur
1680  * > 0	each fragment will be of the specified length
1681  * -1	each fragment will be the same random value in length
1682  * -2	each fragment's length will be entirely random
1683  * (Random values range from 1 to 256)
1684  */
1685 struct mbuf *
1686 m_fragment(struct mbuf *m0, int how, int length)
1687 {
1688 	struct mbuf *m_first, *m_last;
1689 	int divisor = 255, progress = 0, fraglen;
1690 
1691 	if (!(m0->m_flags & M_PKTHDR))
1692 		return (m0);
1693 
1694 	if (length == 0 || length < -2)
1695 		return (m0);
1696 	if (length > MCLBYTES)
1697 		length = MCLBYTES;
1698 	if (length < 0 && divisor > MCLBYTES)
1699 		divisor = MCLBYTES;
1700 	if (length == -1)
1701 		length = 1 + (arc4random() % divisor);
1702 	if (length > 0)
1703 		fraglen = length;
1704 
1705 	m_fixhdr(m0); /* Needed sanity check */
1706 
1707 	m_first = m_getcl(how, MT_DATA, M_PKTHDR);
1708 	if (m_first == NULL)
1709 		goto nospace;
1710 
1711 	if (m_dup_pkthdr(m_first, m0, how) == 0)
1712 		goto nospace;
1713 
1714 	m_last = m_first;
1715 
1716 	while (progress < m0->m_pkthdr.len) {
1717 		if (length == -2)
1718 			fraglen = 1 + (arc4random() % divisor);
1719 		if (fraglen > m0->m_pkthdr.len - progress)
1720 			fraglen = m0->m_pkthdr.len - progress;
1721 
1722 		if (progress != 0) {
1723 			struct mbuf *m_new = m_getcl(how, MT_DATA, 0);
1724 			if (m_new == NULL)
1725 				goto nospace;
1726 
1727 			m_last->m_next = m_new;
1728 			m_last = m_new;
1729 		}
1730 
1731 		m_copydata(m0, progress, fraglen, mtod(m_last, caddr_t));
1732 		progress += fraglen;
1733 		m_last->m_len = fraglen;
1734 	}
1735 	m_freem(m0);
1736 	m0 = m_first;
1737 	return (m0);
1738 nospace:
1739 	if (m_first)
1740 		m_freem(m_first);
1741 	/* Return the original chain on failure */
1742 	return (m0);
1743 }
1744 
1745 #endif
1746 
1747 /*
1748  * Free pages from mbuf_ext_pgs, assuming they were allocated via
1749  * vm_page_alloc() and aren't associated with any object.  Complement
1750  * to allocator from m_uiotombuf_nomap().
1751  */
1752 void
1753 mb_free_mext_pgs(struct mbuf *m)
1754 {
1755 	vm_page_t pg;
1756 
1757 	M_ASSERTEXTPG(m);
1758 	for (int i = 0; i < m->m_epg_npgs; i++) {
1759 		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
1760 		vm_page_unwire_noq(pg);
1761 		vm_page_free(pg);
1762 	}
1763 }
1764 
1765 static struct mbuf *
1766 m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags)
1767 {
1768 	struct mbuf *m, *mb, *prev;
1769 	vm_page_t pg_array[MBUF_PEXT_MAX_PGS];
1770 	int error, length, i, needed;
1771 	ssize_t total;
1772 	int pflags = malloc2vm_flags(how) | VM_ALLOC_NODUMP | VM_ALLOC_WIRED;
1773 
1774 	MPASS((flags & M_PKTHDR) == 0);
1775 	MPASS((how & M_ZERO) == 0);
1776 
1777 	/*
1778 	 * len can be zero or an arbitrarily large value bounded by
1779 	 * the total data supplied by the uio.
1780 	 */
1781 	if (len > 0)
1782 		total = MIN(uio->uio_resid, len);
1783 	else
1784 		total = uio->uio_resid;
1785 
1786 	if (maxseg == 0)
1787 		maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE;
1788 
1789 	/*
1790 	 * If total is zero, return an empty mbuf.  This can occur
1791 	 * for TLS 1.0 connections which send empty fragments as
1792 	 * a countermeasure against the known-IV weakness in CBC
1793 	 * ciphersuites.
1794 	 */
1795 	if (__predict_false(total == 0)) {
1796 		mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs);
1797 		if (mb == NULL)
1798 			return (NULL);
1799 		mb->m_epg_flags = EPG_FLAG_ANON;
1800 		return (mb);
1801 	}
1802 
1803 	/*
1804 	 * Allocate the pages
1805 	 */
1806 	m = NULL;
1807 	while (total > 0) {
1808 		mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs);
1809 		if (mb == NULL)
1810 			goto failed;
1811 		if (m == NULL)
1812 			m = mb;
1813 		else
1814 			prev->m_next = mb;
1815 		prev = mb;
1816 		mb->m_epg_flags = EPG_FLAG_ANON;
1817 		needed = length = MIN(maxseg, total);
1818 		for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) {
1819 retry_page:
1820 			pg_array[i] = vm_page_alloc_noobj(pflags);
1821 			if (pg_array[i] == NULL) {
1822 				if (how & M_NOWAIT) {
1823 					goto failed;
1824 				} else {
1825 					vm_wait(NULL);
1826 					goto retry_page;
1827 				}
1828 			}
1829 			mb->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg_array[i]);
1830 			mb->m_epg_npgs++;
1831 		}
1832 		mb->m_epg_last_len = length - PAGE_SIZE * (mb->m_epg_npgs - 1);
1833 		MBUF_EXT_PGS_ASSERT_SANITY(mb);
1834 		total -= length;
1835 		error = uiomove_fromphys(pg_array, 0, length, uio);
1836 		if (error != 0)
1837 			goto failed;
1838 		mb->m_len = length;
1839 		mb->m_ext.ext_size += PAGE_SIZE * mb->m_epg_npgs;
1840 		if (flags & M_PKTHDR)
1841 			m->m_pkthdr.len += length;
1842 	}
1843 	return (m);
1844 
1845 failed:
1846 	m_freem(m);
1847 	return (NULL);
1848 }
1849 
1850 /*
1851  * Copy the contents of uio into a properly sized mbuf chain.
1852  */
1853 struct mbuf *
1854 m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
1855 {
1856 	struct mbuf *m, *mb;
1857 	int error, length;
1858 	ssize_t total;
1859 	int progress = 0;
1860 
1861 	if (flags & M_EXTPG)
1862 		return (m_uiotombuf_nomap(uio, how, len, align, flags));
1863 
1864 	/*
1865 	 * len can be zero or an arbitrarily large value bounded by
1866 	 * the total data supplied by the uio.
1867 	 */
1868 	if (len > 0)
1869 		total = (uio->uio_resid < len) ? uio->uio_resid : len;
1870 	else
1871 		total = uio->uio_resid;
1872 
1873 	/*
1874 	 * The smallest unit returned by m_getm2() is a single mbuf
1875 	 * with pkthdr.  We can't align past it.
1876 	 */
1877 	if (align >= MHLEN)
1878 		return (NULL);
1879 
1880 	/*
1881 	 * Give us the full allocation or nothing.
1882 	 * If len is zero return the smallest empty mbuf.
1883 	 */
1884 	m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags);
1885 	if (m == NULL)
1886 		return (NULL);
1887 	m->m_data += align;
1888 
1889 	/* Fill all mbufs with uio data and update header information. */
1890 	for (mb = m; mb != NULL; mb = mb->m_next) {
1891 		length = min(M_TRAILINGSPACE(mb), total - progress);
1892 
1893 		error = uiomove(mtod(mb, void *), length, uio);
1894 		if (error) {
1895 			m_freem(m);
1896 			return (NULL);
1897 		}
1898 
1899 		mb->m_len = length;
1900 		progress += length;
1901 		if (flags & M_PKTHDR)
1902 			m->m_pkthdr.len += length;
1903 	}
1904 	KASSERT(progress == total, ("%s: progress != total", __func__));
1905 
1906 	return (m);
1907 }
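
/*
 * Illustrative sketch (not part of the original source): turning user data
 * described by a uio into a pkthdr chain with room reserved up front for a
 * link-layer header.  The flag choices are assumptions for the example.
 *
 *	m = m_uiotombuf(uio, M_WAITOK, 0, max_linkhdr, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 */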
1908 
1909 /*
1910  * Copy data between an unmapped mbuf and a uio, limited by len if set.
1911  */
1912 int
1913 m_unmapped_uiomove(const struct mbuf *m, int m_off, struct uio *uio, int len)
1914 {
1915 	vm_page_t pg;
1916 	int error, i, off, pglen, pgoff, seglen, segoff;
1917 
1918 	M_ASSERTEXTPG(m);
1919 	error = 0;
1920 
1921 	/* Skip over any data removed from the front. */
1922 	off = mtod(m, vm_offset_t);
1923 
1924 	off += m_off;
1925 	if (m->m_epg_hdrlen != 0) {
1926 		if (off >= m->m_epg_hdrlen) {
1927 			off -= m->m_epg_hdrlen;
1928 		} else {
1929 			seglen = m->m_epg_hdrlen - off;
1930 			segoff = off;
1931 			seglen = min(seglen, len);
1932 			off = 0;
1933 			len -= seglen;
1934 			error = uiomove(__DECONST(void *,
1935 			    &m->m_epg_hdr[segoff]), seglen, uio);
1936 		}
1937 	}
1938 	pgoff = m->m_epg_1st_off;
1939 	for (i = 0; i < m->m_epg_npgs && error == 0 && len > 0; i++) {
1940 		pglen = m_epg_pagelen(m, i, pgoff);
1941 		if (off >= pglen) {
1942 			off -= pglen;
1943 			pgoff = 0;
1944 			continue;
1945 		}
1946 		seglen = pglen - off;
1947 		segoff = pgoff + off;
1948 		off = 0;
1949 		seglen = min(seglen, len);
1950 		len -= seglen;
1951 		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
1952 		error = uiomove_fromphys(&pg, segoff, seglen, uio);
1953 		pgoff = 0;
1954 	}
1955 	if (len != 0 && error == 0) {
1956 		KASSERT((off + len) <= m->m_epg_trllen,
1957 		    ("off + len > trail (%d + %d > %d, m_off = %d)", off, len,
1958 		    m->m_epg_trllen, m_off));
1959 		error = uiomove(__DECONST(void *, &m->m_epg_trail[off]),
1960 		    len, uio);
1961 	}
1962 	return (error);
1963 }
1964 
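/*
 * Example (illustrative only): how an offset into an M_EXTPG mbuf maps onto
 * the three regions walked by m_unmapped_uiomove() above: m_epg_hdrlen bytes
 * of header, m_epg_npgs pages (the first starting at m_epg_1st_off), then
 * m_epg_trllen bytes of trailer.  example_epg_region() is a made-up helper
 * that mirrors the same arithmetic.
 */
#if 0
static int
example_epg_region(const struct mbuf *m, int off)
{
	int i, pglen, pgoff;

	M_ASSERTEXTPG(m);
	if (off < m->m_epg_hdrlen)
		return (0);			/* falls in the header */
	off -= m->m_epg_hdrlen;
	pgoff = m->m_epg_1st_off;
	for (i = 0; i < m->m_epg_npgs; i++) {
		pglen = m_epg_pagelen(m, i, pgoff);
		if (off < pglen)
			return (1 + i);		/* falls in page i */
		off -= pglen;
		pgoff = 0;
	}
	return (-1);				/* falls in the trailer */
}
#endif
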
1965 /*
1966  * Copy an mbuf chain into a uio limited by len if set.
1967  */
1968 int
1969 m_mbuftouio(struct uio *uio, const struct mbuf *m, int len)
1970 {
1971 	int error, length, total;
1972 	int progress = 0;
1973 
1974 	if (len > 0)
1975 		total = min(uio->uio_resid, len);
1976 	else
1977 		total = uio->uio_resid;
1978 
1979 	/* Fill the uio with data from the mbufs. */
1980 	for (; m != NULL; m = m->m_next) {
1981 		length = min(m->m_len, total - progress);
1982 
1983 		if ((m->m_flags & M_EXTPG) != 0)
1984 			error = m_unmapped_uiomove(m, 0, uio, length);
1985 		else
1986 			error = uiomove(mtod(m, void *), length, uio);
1987 		if (error)
1988 			return (error);
1989 
1990 		progress += length;
1991 	}
1992 
1993 	return (0);
1994 }
1995 
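/*
 * Example (illustrative only): a hypothetical receive path that copies a
 * queued mbuf chain out to the caller's uio with m_mbuftouio().  The routine
 * handles both mapped and unmapped (M_EXTPG) mbufs, so the caller does not
 * need to distinguish them.  example_receive() and example_dequeue() are
 * made-up names.
 */
#if 0
static int
example_receive(struct uio *uio)
{
	struct mbuf *m;
	int error;

	m = example_dequeue();		/* hypothetical: next queued chain */
	if (m == NULL)
		return (EWOULDBLOCK);

	/* Copy at most uio_resid bytes; len == 0 means "no extra limit". */
	error = m_mbuftouio(uio, m, 0);
	m_freem(m);
	return (error);
}
#endif
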
1996 /*
1997  * Create a writable copy of the mbuf chain.  While doing this
1998  * we compact the chain with a goal of producing a chain with
1999  * at most two mbufs.  The second mbuf in this chain is likely
2000  * to be a cluster.  The primary purpose of this work is to create
2001  * a writable packet for encryption, compression, etc.  The
2002  * secondary goal is to linearize the data so it can be
2003  * passed to crypto hardware in the most efficient manner possible.
2004  */
2005 struct mbuf *
2006 m_unshare(struct mbuf *m0, int how)
2007 {
2008 	struct mbuf *m, *mprev;
2009 	struct mbuf *n, *mfirst, *mlast;
2010 	int len, off;
2011 
2012 	mprev = NULL;
2013 	for (m = m0; m != NULL; m = mprev->m_next) {
2014 		/*
2015 		 * Regular mbufs are ignored unless there's a cluster
2016 		 * in front of it that we can use to coalesce.  We do
2017 		 * the latter mainly so later clusters can be coalesced
2018 		 * also w/o having to handle them specially (i.e. convert
2019 		 * mbuf+cluster -> cluster).  This optimization is heavily
2020 		 * influenced by the assumption that we're running over
2021 		 * Ethernet where MCLBYTES is large enough that the max
2022 		 * packet size will permit lots of coalescing into a
2023 		 * single cluster.  This in turn permits efficient
2024 		 * crypto operations, especially when using hardware.
2025 		 */
2026 		if ((m->m_flags & M_EXT) == 0) {
2027 			if (mprev && (mprev->m_flags & M_EXT) &&
2028 			    m->m_len <= M_TRAILINGSPACE(mprev)) {
2029 				/* XXX: this ignores mbuf types */
2030 				memcpy(mtod(mprev, caddr_t) + mprev->m_len,
2031 				    mtod(m, caddr_t), m->m_len);
2032 				mprev->m_len += m->m_len;
2033 				mprev->m_next = m->m_next;	/* unlink from chain */
2034 				m_free(m);			/* reclaim mbuf */
2035 			} else {
2036 				mprev = m;
2037 			}
2038 			continue;
2039 		}
2040 		/*
2041 		 * Writable mbufs are left alone (for now).
2042 		 */
2043 		if (M_WRITABLE(m)) {
2044 			mprev = m;
2045 			continue;
2046 		}
2047 
2048 		/*
2049 		 * Not writable, replace with a copy or coalesce with
2050 		 * the previous mbuf if possible (since we have to copy
2051 		 * it anyway, we try to reduce the number of mbufs and
2052 		 * clusters so that future work is easier).
2053 		 */
2054 		KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
2055 		/* NB: we only coalesce into a cluster or larger */
2056 		if (mprev != NULL && (mprev->m_flags & M_EXT) &&
2057 		    m->m_len <= M_TRAILINGSPACE(mprev)) {
2058 			/* XXX: this ignores mbuf types */
2059 			memcpy(mtod(mprev, caddr_t) + mprev->m_len,
2060 			    mtod(m, caddr_t), m->m_len);
2061 			mprev->m_len += m->m_len;
2062 			mprev->m_next = m->m_next;	/* unlink from chain */
2063 			m_free(m);			/* reclaim mbuf */
2064 			continue;
2065 		}
2066 
2067 		/*
2068 		 * Allocate new space to hold the copy and copy the data.
2069 		 * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by
2070 		 * splitting them into clusters.  We could just malloc a
2071 		 * buffer and make it external but too many device drivers
2072 		 * don't know how to break up the non-contiguous memory when
2073 		 * doing DMA.
2074 		 */
2075 		n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
2076 		if (n == NULL) {
2077 			m_freem(m0);
2078 			return (NULL);
2079 		}
2080 		if (m->m_flags & M_PKTHDR) {
2081 			KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR",
2082 			    __func__, m0, m));
2083 			m_move_pkthdr(n, m);
2084 		}
2085 		len = m->m_len;
2086 		off = 0;
2087 		mfirst = n;
2088 		mlast = NULL;
2089 		for (;;) {
2090 			int cc = min(len, MCLBYTES);
2091 			memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
2092 			n->m_len = cc;
2093 			if (mlast != NULL)
2094 				mlast->m_next = n;
2095 			mlast = n;
2096 #if 0
2097 			newipsecstat.ips_clcopied++;
2098 #endif
2099 
2100 			len -= cc;
2101 			if (len <= 0)
2102 				break;
2103 			off += cc;
2104 
2105 			n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
2106 			if (n == NULL) {
2107 				m_freem(mfirst);
2108 				m_freem(m0);
2109 				return (NULL);
2110 			}
2111 		}
2112 		n->m_next = m->m_next;
2113 		if (mprev == NULL)
2114 			m0 = mfirst;		/* new head of chain */
2115 		else
2116 			mprev->m_next = mfirst;	/* replace old mbuf */
2117 		m_free(m);			/* release old mbuf */
2118 		mprev = mfirst;
2119 	}
2120 	return (m0);
2121 }
2122 
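/*
 * Example (illustrative only): a hypothetical IPsec-style output path that
 * uses m_unshare() to obtain a writable, compacted chain before modifying it
 * in place.  example_protect() and example_encrypt() are made-up names for a
 * caller and a crypto submission routine.
 */
#if 0
static int
example_protect(struct mbuf **mp)
{
	struct mbuf *m;

	/* May replace the chain; the old chain is freed on failure. */
	m = m_unshare(*mp, M_NOWAIT);
	if (m == NULL)
		return (ENOBUFS);
	*mp = m;

	/* The chain is now safe to modify (encrypt, compress, ...) in place. */
	return (example_encrypt(m));
}
#endif
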
2123 #ifdef MBUF_PROFILING
2124 
2125 #define MP_BUCKETS 32 /* don't just change this as things may overflow. */
2126 struct mbufprofile {
2127 	uintmax_t wasted[MP_BUCKETS];
2128 	uintmax_t used[MP_BUCKETS];
2129 	uintmax_t segments[MP_BUCKETS];
2130 } mbprof;
2131 
2132 void
2133 m_profile(struct mbuf *m)
2134 {
2135 	int segments = 0;
2136 	int used = 0;
2137 	int wasted = 0;
2138 
2139 	while (m) {
2140 		segments++;
2141 		used += m->m_len;
2142 		if (m->m_flags & M_EXT) {
2143 			wasted += MHLEN - sizeof(m->m_ext) +
2144 			    m->m_ext.ext_size - m->m_len;
2145 		} else {
2146 			if (m->m_flags & M_PKTHDR)
2147 				wasted += MHLEN - m->m_len;
2148 			else
2149 				wasted += MLEN - m->m_len;
2150 		}
2151 		m = m->m_next;
2152 	}
2153 	/* be paranoid; it helps */
2154 	if (segments > MP_BUCKETS - 1)
2155 		segments = MP_BUCKETS - 1;
2156 	if (used > 100000)
2157 		used = 100000;
2158 	if (wasted > 100000)
2159 		wasted = 100000;
2160 	/* store in the appropriate bucket */
2161 	/* don't bother locking. if it's slightly off, so what? */
2162 	mbprof.segments[segments]++;
2163 	mbprof.used[fls(used)]++;
2164 	mbprof.wasted[fls(wasted)]++;
2165 }
2166 
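/*
 * Example (illustrative only): where a driver or protocol might call
 * m_profile() when MBUF_PROFILING is enabled, so every transmitted chain is
 * accounted in the histograms above.  example_transmit() and
 * example_enqueue() are made-up names; the only real interface used is
 * m_profile().
 */
#if 0
static int
example_transmit(struct ifnet *ifp, struct mbuf *m)
{
#ifdef MBUF_PROFILING
	/* Record segment count, used bytes and wasted bytes for this chain. */
	m_profile(m);
#endif
	return (example_enqueue(ifp, m));	/* hypothetical enqueue helper */
}
#endif
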
2167 static int
2168 mbprof_handler(SYSCTL_HANDLER_ARGS)
2169 {
2170 	char buf[256];
2171 	struct sbuf sb;
2172 	int error;
2173 	uint64_t *p;
2174 
2175 	sbuf_new_for_sysctl(&sb, buf, sizeof(buf), req);
2176 
2177 	p = &mbprof.wasted[0];
2178 	sbuf_printf(&sb,
2179 	    "wasted:\n"
2180 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
2181 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2182 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2183 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2184 #ifdef BIG_ARRAY
2185 	p = &mbprof.wasted[16];
2186 	sbuf_printf(&sb,
2187 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
2188 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2189 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2190 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2191 #endif
2192 	p = &mbprof.used[0];
2193 	sbuf_printf(&sb,
2194 	    "used:\n"
2195 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
2196 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2197 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2198 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2199 #ifdef BIG_ARRAY
2200 	p = &mbprof.used[16];
2201 	sbuf_printf(&sb,
2202 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
2203 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2204 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2205 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2206 #endif
2207 	p = &mbprof.segments[0];
2208 	sbuf_printf(&sb,
2209 	    "segments:\n"
2210 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
2211 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2212 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2213 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2214 #ifdef BIG_ARRAY
2215 	p = &mbprof.segments[16];
2216 	sbuf_printf(&sb,
2217 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
2218 	    "%ju %ju %ju %ju %ju %ju %ju %jju",
2219 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2220 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2221 #endif
2222 
2223 	error = sbuf_finish(&sb);
2224 	sbuf_delete(&sb);
2225 	return (error);
2226 }
2227 
2228 static int
2229 mbprof_clr_handler(SYSCTL_HANDLER_ARGS)
2230 {
2231 	int clear, error;
2232 
2233 	clear = 0;
2234 	error = sysctl_handle_int(oidp, &clear, 0, req);
2235 	if (error || !req->newptr)
2236 		return (error);
2237 
2238 	if (clear) {
2239 		bzero(&mbprof, sizeof(mbprof));
2240 	}
2241 
2242 	return (error);
2243 }
2244 
2245 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile,
2246     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
2247     mbprof_handler, "A",
2248     "mbuf profiling statistics");
2249 
2250 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr,
2251     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
2252     mbprof_clr_handler, "I",
2253     "clear mbuf profiling statistics");
2254 #endif
2255