xref: /freebsd/sys/kern/uipc_mbuf.c (revision 3e8eb5c7f4909209c042403ddee340b2ee7003a5)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include "opt_param.h"
38 #include "opt_mbuf_stress_test.h"
39 #include "opt_mbuf_profiling.h"
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/limits.h>
45 #include <sys/lock.h>
46 #include <sys/malloc.h>
47 #include <sys/mbuf.h>
48 #include <sys/sysctl.h>
49 #include <sys/domain.h>
50 #include <sys/protosw.h>
51 #include <sys/uio.h>
52 #include <sys/vmmeter.h>
53 #include <sys/sbuf.h>
54 #include <sys/sdt.h>
55 #include <vm/vm.h>
56 #include <vm/vm_pageout.h>
57 #include <vm/vm_page.h>
58 
59 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init,
60     "struct mbuf *", "mbufinfo_t *",
61     "uint32_t", "uint32_t",
62     "uint16_t", "uint16_t",
63     "uint32_t", "uint32_t",
64     "uint32_t", "uint32_t");
65 
66 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr_raw,
67     "uint32_t", "uint32_t",
68     "uint16_t", "uint16_t",
69     "struct mbuf *", "mbufinfo_t *");
70 
71 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr,
72     "uint32_t", "uint32_t",
73     "uint16_t", "uint16_t",
74     "struct mbuf *", "mbufinfo_t *");
75 
76 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get_raw,
77     "uint32_t", "uint32_t",
78     "uint16_t", "uint16_t",
79     "struct mbuf *", "mbufinfo_t *");
80 
81 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get,
82     "uint32_t", "uint32_t",
83     "uint16_t", "uint16_t",
84     "struct mbuf *", "mbufinfo_t *");
85 
86 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl,
87     "uint32_t", "uint32_t",
88     "uint16_t", "uint16_t",
89     "uint32_t", "uint32_t",
90     "struct mbuf *", "mbufinfo_t *");
91 
92 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__getjcl,
93     "uint32_t", "uint32_t",
94     "uint16_t", "uint16_t",
95     "uint32_t", "uint32_t",
96     "uint32_t", "uint32_t",
97     "struct mbuf *", "mbufinfo_t *");
98 
99 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget,
100     "struct mbuf *", "mbufinfo_t *",
101     "uint32_t", "uint32_t",
102     "uint32_t", "uint32_t");
103 
104 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget,
105     "struct mbuf *", "mbufinfo_t *",
106     "uint32_t", "uint32_t",
107     "uint32_t", "uint32_t",
108     "void*", "void*");
109 
110 SDT_PROBE_DEFINE(sdt, , , m__cljset);
111 
112 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free,
113         "struct mbuf *", "mbufinfo_t *");
114 
115 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem,
116     "struct mbuf *", "mbufinfo_t *");
117 
118 #include <security/mac/mac_framework.h>
119 
120 int	max_linkhdr;
121 int	max_protohdr;
122 int	max_hdr;
123 int	max_datalen;
124 #ifdef MBUF_STRESS_TEST
125 int	m_defragpackets;
126 int	m_defragbytes;
127 int	m_defraguseless;
128 int	m_defragfailure;
129 int	m_defragrandomfailures;
130 #endif
131 
132 /*
133  * sysctl(8) exported objects
134  */
135 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD,
136 	   &max_linkhdr, 0, "Size of largest link layer header");
137 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD,
138 	   &max_protohdr, 0, "Size of largest protocol layer header");
139 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD,
140 	   &max_hdr, 0, "Size of largest link plus protocol header");
141 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RD,
142 	   &max_datalen, 0, "Minimum space left in mbuf after max_hdr");
143 #ifdef MBUF_STRESS_TEST
144 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
145 	   &m_defragpackets, 0, "");
146 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
147 	   &m_defragbytes, 0, "");
148 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
149 	   &m_defraguseless, 0, "");
150 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
151 	   &m_defragfailure, 0, "");
152 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
153 	   &m_defragrandomfailures, 0, "");
154 #endif
155 
156 /*
157  * Ensure the correct size of various mbuf parameters.  It could be off due
158  * to compiler-induced padding and alignment artifacts.
159  */
160 CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN);
161 CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN);
162 
163 /*
164  * mbuf data storage should be 64-bit aligned regardless of architectural
165  * pointer size; check this is the case with and without a packet header.
166  */
167 CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0);
168 CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0);
169 
170 /*
171  * While the specific values here don't matter too much (i.e., +/- a few
172  * words), we do want to ensure that changes to these values are carefully
173  * reasoned about and properly documented.  This is especially the case as
174  * network-protocol and device-driver modules encode these layouts, and must
175  * be recompiled if the structures change.  Check these values at compile time
176  * against the ones documented in comments in mbuf.h.
177  *
178  * NB: Possibly they should be documented there via #define's and not just
179  * comments.
180  */
181 #if defined(__LP64__)
182 CTASSERT(offsetof(struct mbuf, m_dat) == 32);
183 CTASSERT(sizeof(struct pkthdr) == 64);
184 CTASSERT(sizeof(struct m_ext) == 160);
185 #else
186 CTASSERT(offsetof(struct mbuf, m_dat) == 24);
187 CTASSERT(sizeof(struct pkthdr) == 56);
188 #if defined(__powerpc__) && defined(BOOKE)
189 /* PowerPC booke has 64-bit physical pointers. */
190 CTASSERT(sizeof(struct m_ext) == 176);
191 #else
192 CTASSERT(sizeof(struct m_ext) == 172);
193 #endif
194 #endif
195 
196 /*
197  * Assert that the queue(3) entry members occupy the same space as the
198  * plain pointers they replace.
199  */
200 #ifdef INVARIANTS
201 static struct mbuf __used m_assertbuf;
202 CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next));
203 CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next));
204 CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt));
205 CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt));
206 #endif
207 
208 /*
209  * Attach the cluster from *m to *n, set up m_ext in *n
210  * and bump the refcount of the cluster.
211  */
212 void
213 mb_dupcl(struct mbuf *n, struct mbuf *m)
214 {
215 	volatile u_int *refcnt;
216 
217 	KASSERT(m->m_flags & (M_EXT|M_EXTPG),
218 	    ("%s: M_EXT|M_EXTPG not set on %p", __func__, m));
219 	KASSERT(!(n->m_flags & (M_EXT|M_EXTPG)),
220 	    ("%s: M_EXT|M_EXTPG set on %p", __func__, n));
221 
222 	/*
223 	 * Cache access optimization.
224 	 *
225 	 * o Regular M_EXT storage doesn't need full copy of m_ext, since
226 	 *   the holder of the 'ext_count' is responsible to carry the free
227 	 *   routine and its arguments.
228 	 * o M_EXTPG data is split between main part of mbuf and m_ext, the
229 	 *   main part is copied in full, the m_ext part is similar to M_EXT.
230 	 * o EXT_EXTREF, where 'ext_cnt' doesn't point into mbuf at all, is
231 	 *   special - it needs full copy of m_ext into each mbuf, since any
232 	 *   copy could end up as the last to free.
233 	 */
234 	if (m->m_flags & M_EXTPG) {
235 		bcopy(&m->m_epg_startcopy, &n->m_epg_startcopy,
236 		    __rangeof(struct mbuf, m_epg_startcopy, m_epg_endcopy));
237 		bcopy(&m->m_ext, &n->m_ext, m_epg_ext_copylen);
238 	} else if (m->m_ext.ext_type == EXT_EXTREF)
239 		bcopy(&m->m_ext, &n->m_ext, sizeof(struct m_ext));
240 	else
241 		bcopy(&m->m_ext, &n->m_ext, m_ext_copylen);
242 
243 	n->m_flags |= m->m_flags & (M_RDONLY | M_EXT | M_EXTPG);
244 
245 	/* See if this is the mbuf that holds the embedded refcount. */
246 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
247 		refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count;
248 		n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF;
249 	} else {
250 		KASSERT(m->m_ext.ext_cnt != NULL,
251 		    ("%s: no refcounting pointer on %p", __func__, m));
252 		refcnt = m->m_ext.ext_cnt;
253 	}
254 
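	/*
	 * A count of 1 means we hold the only reference, so the update
	 * cannot race with another thread and a plain increment suffices;
	 * otherwise take the atomic path.
	 */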
255 	if (*refcnt == 1)
256 		*refcnt += 1;
257 	else
258 		atomic_add_int(refcnt, 1);
259 }
260 
261 void
262 m_demote_pkthdr(struct mbuf *m)
263 {
264 
265 	M_ASSERTPKTHDR(m);
266 	M_ASSERT_NO_SND_TAG(m);
267 
268 	m_tag_delete_chain(m, NULL);
269 	m->m_flags &= ~M_PKTHDR;
270 	bzero(&m->m_pkthdr, sizeof(struct pkthdr));
271 }
272 
273 /*
274  * Clean up mbuf (chain) from any tags and packet headers.
275  * If "all" is set then the first mbuf in the chain will be
276  * cleaned too.
277  */
278 void
279 m_demote(struct mbuf *m0, int all, int flags)
280 {
281 	struct mbuf *m;
282 
283 	flags |= M_DEMOTEFLAGS;
284 
285 	for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) {
286 		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p",
287 		    __func__, m, m0));
288 		if (m->m_flags & M_PKTHDR)
289 			m_demote_pkthdr(m);
290 		m->m_flags &= flags;
291 	}
292 }
293 
294 /*
295  * Sanity checks on mbuf (chain) for use in KASSERT() and general
296  * debugging.
297  * Returns 0 or panics when bad and 1 on all tests passed.
298  * Sanitize, 0 to run M_SANITY_ACTION, 1 to garble things so they
299  * blow up later.
300  */
301 int
302 m_sanity(struct mbuf *m0, int sanitize)
303 {
304 	struct mbuf *m;
305 	caddr_t a, b;
306 	int pktlen = 0;
307 
308 #ifdef INVARIANTS
309 #define	M_SANITY_ACTION(s)	panic("mbuf %p: " s, m)
310 #else
311 #define	M_SANITY_ACTION(s)	printf("mbuf %p: " s, m)
312 #endif
313 
314 	for (m = m0; m != NULL; m = m->m_next) {
315 		/*
316 		 * Basic pointer checks.  If any of these fails then some
317 		 * unrelated kernel memory before or after us is trashed.
318 		 * No way to recover from that.
319 		 */
320 		a = M_START(m);
321 		b = a + M_SIZE(m);
322 		if ((caddr_t)m->m_data < a)
323 			M_SANITY_ACTION("m_data outside mbuf data range left");
324 		if ((caddr_t)m->m_data > b)
325 			M_SANITY_ACTION("m_data outside mbuf data range right");
326 		if ((caddr_t)m->m_data + m->m_len > b)
327 			M_SANITY_ACTION("m_data + m_len exceeds mbuf space");
328 
329 		/* m->m_nextpkt may only be set on first mbuf in chain. */
330 		if (m != m0 && m->m_nextpkt != NULL) {
331 			if (sanitize) {
332 				m_freem(m->m_nextpkt);
333 				m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
334 			} else
335 				M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
336 		}
337 
338 		/* packet length (not mbuf length!) calculation */
339 		if (m0->m_flags & M_PKTHDR)
340 			pktlen += m->m_len;
341 
342 		/* m_tags may only be attached to first mbuf in chain. */
343 		if (m != m0 && m->m_flags & M_PKTHDR &&
344 		    !SLIST_EMPTY(&m->m_pkthdr.tags)) {
345 			if (sanitize) {
346 				m_tag_delete_chain(m, NULL);
347 				/* put in 0xDEADC0DE perhaps? */
348 			} else
349 				M_SANITY_ACTION("m_tags on in-chain mbuf");
350 		}
351 
352 		/* M_PKTHDR may only be set on first mbuf in chain */
353 		if (m != m0 && m->m_flags & M_PKTHDR) {
354 			if (sanitize) {
355 				bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
356 				m->m_flags &= ~M_PKTHDR;
357 				/* put in 0xDEADC0DE and leave hdr flag in */
358 			} else
359 				M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
360 		}
361 	}
362 	m = m0;
363 	if (pktlen && pktlen != m->m_pkthdr.len) {
364 		if (sanitize)
365 			m->m_pkthdr.len = 0;
366 		else
367 			M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
368 	}
369 	return 1;
370 
371 #undef	M_SANITY_ACTION
372 }
373 
374 /*
375  * Non-inlined part of m_init().
376  */
377 int
378 m_pkthdr_init(struct mbuf *m, int how)
379 {
380 #ifdef MAC
381 	int error;
382 #endif
383 	m->m_data = m->m_pktdat;
384 	bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
385 #ifdef NUMA
386 	m->m_pkthdr.numa_domain = M_NODOM;
387 #endif
388 #ifdef MAC
389 	/* If the label init fails, fail the alloc */
390 	error = mac_mbuf_init(m, how);
391 	if (error)
392 		return (error);
393 #endif
394 
395 	return (0);
396 }
397 
398 /*
399  * "Move" mbuf pkthdr from "from" to "to".
400  * "from" must have M_PKTHDR set, and "to" must be empty.
401  */
402 void
403 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
404 {
405 
406 #if 0
407 	/* see below for why these are not enabled */
408 	M_ASSERTPKTHDR(to);
409 	/* Note: with MAC, this may not be a good assertion. */
410 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
411 	    ("m_move_pkthdr: to has tags"));
412 #endif
413 #ifdef MAC
414 	/*
415 	 * XXXMAC: It could be that this should also occur for non-MAC?
416 	 */
417 	if (to->m_flags & M_PKTHDR)
418 		m_tag_delete_chain(to, NULL);
419 #endif
420 	to->m_flags = (from->m_flags & M_COPYFLAGS) |
421 	    (to->m_flags & (M_EXT | M_EXTPG));
422 	if ((to->m_flags & M_EXT) == 0)
423 		to->m_data = to->m_pktdat;
424 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
425 	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
426 	from->m_flags &= ~M_PKTHDR;
427 	if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) {
428 		from->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
429 		from->m_pkthdr.snd_tag = NULL;
430 	}
431 }
432 
433 /*
434  * Duplicate "from"'s mbuf pkthdr in "to".
435  * "from" must have M_PKTHDR set, and "to" must be empty.
436  * In particular, this does a deep copy of the packet tags.
437  */
438 int
439 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
440 {
441 
442 #if 0
443 	/*
444 	 * The mbuf allocator only initializes the pkthdr
445 	 * when the mbuf is allocated with m_gethdr(). Many users
446 	 * (e.g. m_copy*, m_prepend) use m_get() and then
447 	 * smash the pkthdr as needed causing these
448 	 * assertions to trip.  For now just disable them.
449 	 */
450 	M_ASSERTPKTHDR(to);
451 	/* Note: with MAC, this may not be a good assertion. */
452 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags"));
453 #endif
454 	MBUF_CHECKSLEEP(how);
455 #ifdef MAC
456 	if (to->m_flags & M_PKTHDR)
457 		m_tag_delete_chain(to, NULL);
458 #endif
459 	to->m_flags = (from->m_flags & M_COPYFLAGS) |
460 	    (to->m_flags & (M_EXT | M_EXTPG));
461 	if ((to->m_flags & M_EXT) == 0)
462 		to->m_data = to->m_pktdat;
463 	to->m_pkthdr = from->m_pkthdr;
464 	if (from->m_pkthdr.csum_flags & CSUM_SND_TAG)
465 		m_snd_tag_ref(from->m_pkthdr.snd_tag);
466 	SLIST_INIT(&to->m_pkthdr.tags);
467 	return (m_tag_copy_chain(to, from, how));
468 }
469 
470 /*
471  * Lesser-used path for M_PREPEND:
472  * allocate new mbuf to prepend to chain,
473  * copy junk along.
474  */
475 struct mbuf *
476 m_prepend(struct mbuf *m, int len, int how)
477 {
478 	struct mbuf *mn;
479 
480 	if (m->m_flags & M_PKTHDR)
481 		mn = m_gethdr(how, m->m_type);
482 	else
483 		mn = m_get(how, m->m_type);
484 	if (mn == NULL) {
485 		m_freem(m);
486 		return (NULL);
487 	}
488 	if (m->m_flags & M_PKTHDR)
489 		m_move_pkthdr(mn, m);
490 	mn->m_next = m;
491 	m = mn;
492 	if (len < M_SIZE(m))
493 		M_ALIGN(m, len);
494 	m->m_len = len;
495 	return (m);
496 }
497 
498 /*
499  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
500  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf chain.
501  * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller.
502  * Note that the copy is read-only, because clusters are not copied,
503  * only their reference counts are incremented.
504  */
505 struct mbuf *
506 m_copym(struct mbuf *m, int off0, int len, int wait)
507 {
508 	struct mbuf *n, **np;
509 	int off = off0;
510 	struct mbuf *top;
511 	int copyhdr = 0;
512 
513 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
514 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
515 	MBUF_CHECKSLEEP(wait);
516 	if (off == 0 && m->m_flags & M_PKTHDR)
517 		copyhdr = 1;
518 	while (off > 0) {
519 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
520 		if (off < m->m_len)
521 			break;
522 		off -= m->m_len;
523 		m = m->m_next;
524 	}
525 	np = &top;
526 	top = NULL;
527 	while (len > 0) {
528 		if (m == NULL) {
529 			KASSERT(len == M_COPYALL,
530 			    ("m_copym, length > size of mbuf chain"));
531 			break;
532 		}
533 		if (copyhdr)
534 			n = m_gethdr(wait, m->m_type);
535 		else
536 			n = m_get(wait, m->m_type);
537 		*np = n;
538 		if (n == NULL)
539 			goto nospace;
540 		if (copyhdr) {
541 			if (!m_dup_pkthdr(n, m, wait))
542 				goto nospace;
543 			if (len == M_COPYALL)
544 				n->m_pkthdr.len -= off0;
545 			else
546 				n->m_pkthdr.len = len;
547 			copyhdr = 0;
548 		}
549 		n->m_len = min(len, m->m_len - off);
550 		if (m->m_flags & (M_EXT|M_EXTPG)) {
551 			n->m_data = m->m_data + off;
552 			mb_dupcl(n, m);
553 		} else
554 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
555 			    (u_int)n->m_len);
556 		if (len != M_COPYALL)
557 			len -= n->m_len;
558 		off = 0;
559 		m = m->m_next;
560 		np = &n->m_next;
561 	}
562 
563 	return (top);
564 nospace:
565 	m_freem(top);
566 	return (NULL);
567 }
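
/*
 * Usage sketch for m_copym(): a caller that wants a reference-counted,
 * read-only snapshot of a whole packet might do something like
 *
 *	n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 *	if (n == NULL)
 *		return (ENOBUFS);
 *
 * Since clusters are shared rather than copied, the result must not be
 * written to unless M_WRITABLE() confirms it is safe.
 */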
568 
569 /*
570  * Copy an entire packet, including header (which must be present).
571  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
572  * Note that the copy is read-only, because clusters are not copied,
573  * only their reference counts are incremented.
574  * Preserve alignment of the first mbuf so if the creator has left
575  * some room at the beginning (e.g. for inserting protocol headers)
576  * the copies still have the room available.
577  */
578 struct mbuf *
579 m_copypacket(struct mbuf *m, int how)
580 {
581 	struct mbuf *top, *n, *o;
582 
583 	MBUF_CHECKSLEEP(how);
584 	n = m_get(how, m->m_type);
585 	top = n;
586 	if (n == NULL)
587 		goto nospace;
588 
589 	if (!m_dup_pkthdr(n, m, how))
590 		goto nospace;
591 	n->m_len = m->m_len;
592 	if (m->m_flags & (M_EXT|M_EXTPG)) {
593 		n->m_data = m->m_data;
594 		mb_dupcl(n, m);
595 	} else {
596 		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat);
597 		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
598 	}
599 
600 	m = m->m_next;
601 	while (m) {
602 		o = m_get(how, m->m_type);
603 		if (o == NULL)
604 			goto nospace;
605 
606 		n->m_next = o;
607 		n = n->m_next;
608 
609 		n->m_len = m->m_len;
610 		if (m->m_flags & (M_EXT|M_EXTPG)) {
611 			n->m_data = m->m_data;
612 			mb_dupcl(n, m);
613 		} else {
614 			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
615 		}
616 
617 		m = m->m_next;
618 	}
619 	return top;
620 nospace:
621 	m_freem(top);
622 	return (NULL);
623 }
624 
625 static void
626 m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp)
627 {
628 	struct iovec iov;
629 	struct uio uio;
630 	int error __diagused;
631 
632 	KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off));
633 	KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len));
634 	KASSERT(off < m->m_len,
635 	    ("m_copyfromunmapped: off exceeds mbuf length"));
636 	iov.iov_base = cp;
637 	iov.iov_len = len;
638 	uio.uio_resid = len;
639 	uio.uio_iov = &iov;
640 	uio.uio_segflg = UIO_SYSSPACE;
641 	uio.uio_iovcnt = 1;
642 	uio.uio_offset = 0;
643 	uio.uio_rw = UIO_READ;
644 	error = m_unmapped_uiomove(m, off, &uio, len);
645 	KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off,
646 	   len));
647 }
648 
649 /*
650  * Copy data from an mbuf chain starting "off" bytes from the beginning,
651  * continuing for "len" bytes, into the indicated buffer.
652  */
653 void
654 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
655 {
656 	u_int count;
657 
658 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
659 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
660 	while (off > 0) {
661 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
662 		if (off < m->m_len)
663 			break;
664 		off -= m->m_len;
665 		m = m->m_next;
666 	}
667 	while (len > 0) {
668 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
669 		count = min(m->m_len - off, len);
670 		if ((m->m_flags & M_EXTPG) != 0)
671 			m_copyfromunmapped(m, off, count, cp);
672 		else
673 			bcopy(mtod(m, caddr_t) + off, cp, count);
674 		len -= count;
675 		cp += count;
676 		off = 0;
677 		m = m->m_next;
678 	}
679 }
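
/*
 * Sketch: m_copydata() is the usual way to pull a fixed-size header into
 * caller storage regardless of how the chain is fragmented, e.g.
 *
 *	struct ip iphdr;
 *
 *	if (m->m_pkthdr.len < sizeof(iphdr))
 *		return (EINVAL);
 *	m_copydata(m, 0, sizeof(iphdr), (caddr_t)&iphdr);
 */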
680 
681 /*
682  * Copy a packet header mbuf chain into a completely new chain, including
683  * copying any mbuf clusters.  Use this instead of m_copypacket() when
684  * you need a writable copy of an mbuf chain.
685  */
686 struct mbuf *
687 m_dup(const struct mbuf *m, int how)
688 {
689 	struct mbuf **p, *top = NULL;
690 	int remain, moff, nsize;
691 
692 	MBUF_CHECKSLEEP(how);
693 	/* Sanity check */
694 	if (m == NULL)
695 		return (NULL);
696 	M_ASSERTPKTHDR(m);
697 
698 	/* While there's more data, get a new mbuf, tack it on, and fill it */
699 	remain = m->m_pkthdr.len;
700 	moff = 0;
701 	p = &top;
702 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
703 		struct mbuf *n;
704 
705 		/* Get the next new mbuf */
706 		if (remain >= MINCLSIZE) {
707 			n = m_getcl(how, m->m_type, 0);
708 			nsize = MCLBYTES;
709 		} else {
710 			n = m_get(how, m->m_type);
711 			nsize = MLEN;
712 		}
713 		if (n == NULL)
714 			goto nospace;
715 
716 		if (top == NULL) {		/* First one, must be PKTHDR */
717 			if (!m_dup_pkthdr(n, m, how)) {
718 				m_free(n);
719 				goto nospace;
720 			}
721 			if ((n->m_flags & M_EXT) == 0)
722 				nsize = MHLEN;
723 			n->m_flags &= ~M_RDONLY;
724 		}
725 		n->m_len = 0;
726 
727 		/* Link it into the new chain */
728 		*p = n;
729 		p = &n->m_next;
730 
731 		/* Copy data from original mbuf(s) into new mbuf */
732 		while (n->m_len < nsize && m != NULL) {
733 			int chunk = min(nsize - n->m_len, m->m_len - moff);
734 
735 			m_copydata(m, moff, chunk, n->m_data + n->m_len);
736 			moff += chunk;
737 			n->m_len += chunk;
738 			remain -= chunk;
739 			if (moff == m->m_len) {
740 				m = m->m_next;
741 				moff = 0;
742 			}
743 		}
744 
745 		/* Check correct total mbuf length */
746 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
747 		    	("%s: bogus m_pkthdr.len", __func__));
748 	}
749 	return (top);
750 
751 nospace:
752 	m_freem(top);
753 	return (NULL);
754 }
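
/*
 * Sketch: when the copy will be modified (headers rewritten, payload
 * transformed), m_dup() rather than m_copypacket() is the appropriate
 * call, since it produces a deep, writable chain:
 *
 *	n = m_dup(m, M_NOWAIT);
 *	if (n == NULL)
 *		return (ENOBUFS);
 *
 * On success n is writable and the original chain m is left untouched.
 */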
755 
756 /*
757  * Concatenate mbuf chain n to m.
758  * Both chains must be of the same type (e.g. MT_DATA).
759  * Any m_pkthdr is not updated.
760  */
761 void
762 m_cat(struct mbuf *m, struct mbuf *n)
763 {
764 	while (m->m_next)
765 		m = m->m_next;
766 	while (n) {
767 		if (!M_WRITABLE(m) ||
768 		    (n->m_flags & M_EXTPG) != 0 ||
769 		    M_TRAILINGSPACE(m) < n->m_len) {
770 			/* just join the two chains */
771 			m->m_next = n;
772 			return;
773 		}
774 		/* splat the data from one into the other */
775 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
776 		    (u_int)n->m_len);
777 		m->m_len += n->m_len;
778 		n = m_free(n);
779 	}
780 }
781 
782 /*
783  * Concatenate two pkthdr mbuf chains.
784  */
785 void
786 m_catpkt(struct mbuf *m, struct mbuf *n)
787 {
788 
789 	M_ASSERTPKTHDR(m);
790 	M_ASSERTPKTHDR(n);
791 
792 	m->m_pkthdr.len += n->m_pkthdr.len;
793 	m_demote(n, 1, 0);
794 
795 	m_cat(m, n);
796 }
797 
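/*
 * Trim "req_len" bytes from the mbuf chain: a positive req_len removes that
 * many bytes from the head of the chain, a negative one removes -req_len
 * bytes from the tail.  When trimming from the head, emptied mbufs are kept
 * in the chain with m_len set to zero; when trimming from the tail, mbufs
 * past the new end are freed.  m_pkthdr.len is updated if a packet header
 * is present.
 */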
798 void
799 m_adj(struct mbuf *mp, int req_len)
800 {
801 	int len = req_len;
802 	struct mbuf *m;
803 	int count;
804 
805 	if ((m = mp) == NULL)
806 		return;
807 	if (len >= 0) {
808 		/*
809 		 * Trim from head.
810 		 */
811 		while (m != NULL && len > 0) {
812 			if (m->m_len <= len) {
813 				len -= m->m_len;
814 				m->m_len = 0;
815 				m = m->m_next;
816 			} else {
817 				m->m_len -= len;
818 				m->m_data += len;
819 				len = 0;
820 			}
821 		}
822 		if (mp->m_flags & M_PKTHDR)
823 			mp->m_pkthdr.len -= (req_len - len);
824 	} else {
825 		/*
826 		 * Trim from tail.  Scan the mbuf chain,
827 		 * calculating its length and finding the last mbuf.
828 		 * If the adjustment only affects this mbuf, then just
829 		 * adjust and return.  Otherwise, rescan and truncate
830 		 * after the remaining size.
831 		 */
832 		len = -len;
833 		count = 0;
834 		for (;;) {
835 			count += m->m_len;
836 			if (m->m_next == (struct mbuf *)0)
837 				break;
838 			m = m->m_next;
839 		}
840 		if (m->m_len >= len) {
841 			m->m_len -= len;
842 			if (mp->m_flags & M_PKTHDR)
843 				mp->m_pkthdr.len -= len;
844 			return;
845 		}
846 		count -= len;
847 		if (count < 0)
848 			count = 0;
849 		/*
850 		 * Correct length for chain is "count".
851 		 * Find the mbuf with last data, adjust its length,
852 		 * and toss data from remaining mbufs on chain.
853 		 */
854 		m = mp;
855 		if (m->m_flags & M_PKTHDR)
856 			m->m_pkthdr.len = count;
857 		for (; m; m = m->m_next) {
858 			if (m->m_len >= count) {
859 				m->m_len = count;
860 				if (m->m_next != NULL) {
861 					m_freem(m->m_next);
862 					m->m_next = NULL;
863 				}
864 				break;
865 			}
866 			count -= m->m_len;
867 		}
868 	}
869 }
870 
871 void
872 m_adj_decap(struct mbuf *mp, int len)
873 {
874 	uint8_t rsstype;
875 
876 	m_adj(mp, len);
877 	if ((mp->m_flags & M_PKTHDR) != 0) {
878 		/*
879 		 * If flowid was calculated by card from the inner
880 		 * headers, move flowid to the decapsulated mbuf
881 		 * chain, otherwise clear.  This depends on the
882 		 * internals of m_adj, which keeps pkthdr as is, in
883 		 * particular not changing rsstype and flowid.
884 		 */
885 		rsstype = mp->m_pkthdr.rsstype;
886 		if ((rsstype & M_HASHTYPE_INNER) != 0) {
887 			M_HASHTYPE_SET(mp, rsstype & ~M_HASHTYPE_INNER);
888 		} else {
889 			M_HASHTYPE_CLEAR(mp);
890 		}
891 	}
892 }
893 
894 /*
895  * Rearrange an mbuf chain so that len bytes are contiguous
896  * and in the data area of an mbuf (so that mtod will work
897  * for a structure of size len).  Returns the resulting
898  * mbuf chain on success, frees it and returns NULL on failure.
899  * If there is room, it will add up to max_protohdr-len extra bytes to the
900  * contiguous region in an attempt to avoid being called next time.
901  */
902 struct mbuf *
903 m_pullup(struct mbuf *n, int len)
904 {
905 	struct mbuf *m;
906 	int count;
907 	int space;
908 
909 	KASSERT((n->m_flags & M_EXTPG) == 0,
910 	    ("%s: unmapped mbuf %p", __func__, n));
911 
912 	/*
913 	 * If first mbuf has no cluster, and has room for len bytes
914 	 * without shifting current data, pullup into it,
915 	 * otherwise allocate a new mbuf to prepend to the chain.
916 	 */
917 	if ((n->m_flags & M_EXT) == 0 &&
918 	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
919 		if (n->m_len >= len)
920 			return (n);
921 		m = n;
922 		n = n->m_next;
923 		len -= m->m_len;
924 	} else {
925 		if (len > MHLEN)
926 			goto bad;
927 		m = m_get(M_NOWAIT, n->m_type);
928 		if (m == NULL)
929 			goto bad;
930 		if (n->m_flags & M_PKTHDR)
931 			m_move_pkthdr(m, n);
932 	}
933 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
934 	do {
935 		count = min(min(max(len, max_protohdr), space), n->m_len);
936 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
937 		  (u_int)count);
938 		len -= count;
939 		m->m_len += count;
940 		n->m_len -= count;
941 		space -= count;
942 		if (n->m_len)
943 			n->m_data += count;
944 		else
945 			n = m_free(n);
946 	} while (len > 0 && n);
947 	if (len > 0) {
948 		(void) m_free(m);
949 		goto bad;
950 	}
951 	m->m_next = n;
952 	return (m);
953 bad:
954 	m_freem(n);
955 	return (NULL);
956 }
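
/*
 * Sketch: the canonical m_pullup() pattern in protocol input paths, making
 * sure a header is contiguous before taking a pointer to it (here "ip" is
 * a struct ip pointer declared by the caller):
 *
 *	if (m->m_len < sizeof(struct ip) &&
 *	    (m = m_pullup(m, sizeof(struct ip))) == NULL)
 *		return;
 *	ip = mtod(m, struct ip *);
 *
 * On failure m_pullup() has already freed the chain, so the caller must not
 * touch it again.
 */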
957 
958 /*
959  * Like m_pullup(), except a new mbuf is always allocated, and we allow
960  * the amount of empty space before the data in the new mbuf to be specified
961  * (in the event that the caller expects to prepend later).
962  */
963 struct mbuf *
964 m_copyup(struct mbuf *n, int len, int dstoff)
965 {
966 	struct mbuf *m;
967 	int count, space;
968 
969 	if (len > (MHLEN - dstoff))
970 		goto bad;
971 	m = m_get(M_NOWAIT, n->m_type);
972 	if (m == NULL)
973 		goto bad;
974 	if (n->m_flags & M_PKTHDR)
975 		m_move_pkthdr(m, n);
976 	m->m_data += dstoff;
977 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
978 	do {
979 		count = min(min(max(len, max_protohdr), space), n->m_len);
980 		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
981 		    (unsigned)count);
982 		len -= count;
983 		m->m_len += count;
984 		n->m_len -= count;
985 		space -= count;
986 		if (n->m_len)
987 			n->m_data += count;
988 		else
989 			n = m_free(n);
990 	} while (len > 0 && n);
991 	if (len > 0) {
992 		(void) m_free(m);
993 		goto bad;
994 	}
995 	m->m_next = n;
996 	return (m);
997  bad:
998 	m_freem(n);
999 	return (NULL);
1000 }
1001 
1002 /*
1003  * Partition an mbuf chain in two pieces, returning the tail --
1004  * all but the first len0 bytes.  In case of failure, it returns NULL and
1005  * attempts to restore the chain to its original state.
1006  *
1007  * Note that the resulting mbufs might be read-only, because the new
1008  * mbuf can end up sharing an mbuf cluster with the original mbuf if
1009  * the "breaking point" happens to lie within a cluster mbuf. Use the
1010  * M_WRITABLE() macro to check for this case.
1011  */
1012 struct mbuf *
1013 m_split(struct mbuf *m0, int len0, int wait)
1014 {
1015 	struct mbuf *m, *n;
1016 	u_int len = len0, remain;
1017 
1018 	MBUF_CHECKSLEEP(wait);
1019 	for (m = m0; m && len > m->m_len; m = m->m_next)
1020 		len -= m->m_len;
1021 	if (m == NULL)
1022 		return (NULL);
1023 	remain = m->m_len - len;
1024 	if (m0->m_flags & M_PKTHDR && remain == 0) {
1025 		n = m_gethdr(wait, m0->m_type);
1026 		if (n == NULL)
1027 			return (NULL);
1028 		n->m_next = m->m_next;
1029 		m->m_next = NULL;
1030 		if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) {
1031 			n->m_pkthdr.snd_tag =
1032 			    m_snd_tag_ref(m0->m_pkthdr.snd_tag);
1033 			n->m_pkthdr.csum_flags |= CSUM_SND_TAG;
1034 		} else
1035 			n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1036 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1037 		m0->m_pkthdr.len = len0;
1038 		return (n);
1039 	} else if (m0->m_flags & M_PKTHDR) {
1040 		n = m_gethdr(wait, m0->m_type);
1041 		if (n == NULL)
1042 			return (NULL);
1043 		if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) {
1044 			n->m_pkthdr.snd_tag =
1045 			    m_snd_tag_ref(m0->m_pkthdr.snd_tag);
1046 			n->m_pkthdr.csum_flags |= CSUM_SND_TAG;
1047 		} else
1048 			n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1049 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1050 		m0->m_pkthdr.len = len0;
1051 		if (m->m_flags & (M_EXT|M_EXTPG))
1052 			goto extpacket;
1053 		if (remain > MHLEN) {
1054 			/* m can't be the lead packet */
1055 			M_ALIGN(n, 0);
1056 			n->m_next = m_split(m, len, wait);
1057 			if (n->m_next == NULL) {
1058 				(void) m_free(n);
1059 				return (NULL);
1060 			} else {
1061 				n->m_len = 0;
1062 				return (n);
1063 			}
1064 		} else
1065 			M_ALIGN(n, remain);
1066 	} else if (remain == 0) {
1067 		n = m->m_next;
1068 		m->m_next = NULL;
1069 		return (n);
1070 	} else {
1071 		n = m_get(wait, m->m_type);
1072 		if (n == NULL)
1073 			return (NULL);
1074 		M_ALIGN(n, remain);
1075 	}
1076 extpacket:
1077 	if (m->m_flags & (M_EXT|M_EXTPG)) {
1078 		n->m_data = m->m_data + len;
1079 		mb_dupcl(n, m);
1080 	} else {
1081 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
1082 	}
1083 	n->m_len = remain;
1084 	m->m_len = len;
1085 	n->m_next = m->m_next;
1086 	m->m_next = NULL;
1087 	return (n);
1088 }
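
/*
 * Sketch: splitting a packet after a caller-chosen "hdrlen" bytes, e.g. to
 * separate a protocol header from its payload:
 *
 *	tail = m_split(m, hdrlen, M_NOWAIT);
 *	if (tail == NULL)
 *		return (ENOBUFS);
 *
 * On success "m" keeps the first hdrlen bytes and "tail" holds the rest;
 * either half may still share a cluster with the other, so check
 * M_WRITABLE() before modifying data in place.
 */
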
1089 /*
1090  * Routine to copy from device local memory into mbufs.
1091  * Note that the `off' argument is the offset into the first mbuf of the
1092  * target chain at which to begin placing the copied data.
1093  */
1094 struct mbuf *
1095 m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
1096     void (*copy)(char *from, caddr_t to, u_int len))
1097 {
1098 	struct mbuf *m;
1099 	struct mbuf *top = NULL, **mp = &top;
1100 	int len;
1101 
1102 	if (off < 0 || off > MHLEN)
1103 		return (NULL);
1104 
1105 	while (totlen > 0) {
1106 		if (top == NULL) {	/* First one, must be PKTHDR */
1107 			if (totlen + off >= MINCLSIZE) {
1108 				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
1109 				len = MCLBYTES;
1110 			} else {
1111 				m = m_gethdr(M_NOWAIT, MT_DATA);
1112 				len = MHLEN;
1113 
1114 				/* Place initial small packet/header at end of mbuf */
1115 				if (m && totlen + off + max_linkhdr <= MHLEN) {
1116 					m->m_data += max_linkhdr;
1117 					len -= max_linkhdr;
1118 				}
1119 			}
1120 			if (m == NULL)
1121 				return NULL;
1122 			m->m_pkthdr.rcvif = ifp;
1123 			m->m_pkthdr.len = totlen;
1124 		} else {
1125 			if (totlen + off >= MINCLSIZE) {
1126 				m = m_getcl(M_NOWAIT, MT_DATA, 0);
1127 				len = MCLBYTES;
1128 			} else {
1129 				m = m_get(M_NOWAIT, MT_DATA);
1130 				len = MLEN;
1131 			}
1132 			if (m == NULL) {
1133 				m_freem(top);
1134 				return NULL;
1135 			}
1136 		}
1137 		if (off) {
1138 			m->m_data += off;
1139 			len -= off;
1140 			off = 0;
1141 		}
1142 		m->m_len = len = min(totlen, len);
1143 		if (copy)
1144 			copy(buf, mtod(m, caddr_t), (u_int)len);
1145 		else
1146 			bcopy(buf, mtod(m, caddr_t), (u_int)len);
1147 		buf += len;
1148 		*mp = m;
1149 		mp = &m->m_next;
1150 		totlen -= len;
1151 	}
1152 	return (top);
1153 }
1154 
1155 static void
1156 m_copytounmapped(const struct mbuf *m, int off, int len, c_caddr_t cp)
1157 {
1158 	struct iovec iov;
1159 	struct uio uio;
1160 	int error __diagused;
1161 
1162 	KASSERT(off >= 0, ("m_copytounmapped: negative off %d", off));
1163 	KASSERT(len >= 0, ("m_copytounmapped: negative len %d", len));
1164 	KASSERT(off < m->m_len, ("m_copytounmapped: off exceeds mbuf length"));
1165 	iov.iov_base = __DECONST(caddr_t, cp);
1166 	iov.iov_len = len;
1167 	uio.uio_resid = len;
1168 	uio.uio_iov = &iov;
1169 	uio.uio_segflg = UIO_SYSSPACE;
1170 	uio.uio_iovcnt = 1;
1171 	uio.uio_offset = 0;
1172 	uio.uio_rw = UIO_WRITE;
1173 	error = m_unmapped_uiomove(m, off, &uio, len);
1174 	KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off,
1175 	   len));
1176 }
1177 
1178 /*
1179  * Copy data from a buffer back into the indicated mbuf chain,
1180  * starting "off" bytes from the beginning, extending the mbuf
1181  * chain if necessary.
1182  */
1183 void
1184 m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
1185 {
1186 	int mlen;
1187 	struct mbuf *m = m0, *n;
1188 	int totlen = 0;
1189 
1190 	if (m0 == NULL)
1191 		return;
1192 	while (off > (mlen = m->m_len)) {
1193 		off -= mlen;
1194 		totlen += mlen;
1195 		if (m->m_next == NULL) {
1196 			n = m_get(M_NOWAIT, m->m_type);
1197 			if (n == NULL)
1198 				goto out;
1199 			bzero(mtod(n, caddr_t), MLEN);
1200 			n->m_len = min(MLEN, len + off);
1201 			m->m_next = n;
1202 		}
1203 		m = m->m_next;
1204 	}
1205 	while (len > 0) {
1206 		if (m->m_next == NULL && (len > m->m_len - off)) {
1207 			m->m_len += min(len - (m->m_len - off),
1208 			    M_TRAILINGSPACE(m));
1209 		}
1210 		mlen = min(m->m_len - off, len);
1211 		if ((m->m_flags & M_EXTPG) != 0)
1212 			m_copytounmapped(m, off, mlen, cp);
1213 		else
1214 			bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen);
1215 		cp += mlen;
1216 		len -= mlen;
1217 		mlen += off;
1218 		off = 0;
1219 		totlen += mlen;
1220 		if (len == 0)
1221 			break;
1222 		if (m->m_next == NULL) {
1223 			n = m_get(M_NOWAIT, m->m_type);
1224 			if (n == NULL)
1225 				break;
1226 			n->m_len = min(MLEN, len);
1227 			m->m_next = n;
1228 		}
1229 		m = m->m_next;
1230 	}
1231 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
1232 		m->m_pkthdr.len = totlen;
1233 }
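
/*
 * Sketch: patching a small field in place at a caller-known offset, e.g.
 * zeroing a 16-bit checksum before recomputing it:
 *
 *	uint16_t sum = 0;
 *
 *	m_copyback(m, csum_off, sizeof(sum), (c_caddr_t)&sum);
 *
 * Note that when the chain has to be extended the M_NOWAIT allocations may
 * fail silently, so callers that depend on the full length being written
 * should verify m_length() afterwards.
 */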
1234 
1235 /*
1236  * Append the specified data to the indicated mbuf chain.
1237  * Extend the mbuf chain if the new data does not fit in
1238  * existing space.
1239  *
1240  * Return 1 if able to complete the job; otherwise 0.
1241  */
1242 int
1243 m_append(struct mbuf *m0, int len, c_caddr_t cp)
1244 {
1245 	struct mbuf *m, *n;
1246 	int remainder, space;
1247 
1248 	for (m = m0; m->m_next != NULL; m = m->m_next)
1249 		;
1250 	remainder = len;
1251 	space = M_TRAILINGSPACE(m);
1252 	if (space > 0) {
1253 		/*
1254 		 * Copy into available space.
1255 		 */
1256 		if (space > remainder)
1257 			space = remainder;
1258 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
1259 		m->m_len += space;
1260 		cp += space, remainder -= space;
1261 	}
1262 	while (remainder > 0) {
1263 		/*
1264 		 * Allocate a new mbuf; could check space
1265 		 * and allocate a cluster instead.
1266 		 */
1267 		n = m_get(M_NOWAIT, m->m_type);
1268 		if (n == NULL)
1269 			break;
1270 		n->m_len = min(MLEN, remainder);
1271 		bcopy(cp, mtod(n, caddr_t), n->m_len);
1272 		cp += n->m_len, remainder -= n->m_len;
1273 		m->m_next = n;
1274 		m = n;
1275 	}
1276 	if (m0->m_flags & M_PKTHDR)
1277 		m0->m_pkthdr.len += len - remainder;
1278 	return (remainder == 0);
1279 }
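
/*
 * Sketch: unlike m_copyback(), m_append() reports whether it completed, so
 * a caller appending a trailer can propagate the failure (here "tlr" is a
 * caller-defined trailer structure):
 *
 *	if (!m_append(m, sizeof(tlr), (c_caddr_t)&tlr))
 *		return (ENOBUFS);
 */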
1280 
1281 static int
1282 m_apply_extpg_one(struct mbuf *m, int off, int len,
1283     int (*f)(void *, void *, u_int), void *arg)
1284 {
1285 	void *p;
1286 	u_int i, count, pgoff, pglen;
1287 	int rval;
1288 
1289 	KASSERT(PMAP_HAS_DMAP,
1290 	    ("m_apply_extpg_one does not support unmapped mbufs"));
1291 	off += mtod(m, vm_offset_t);
1292 	if (off < m->m_epg_hdrlen) {
1293 		count = min(m->m_epg_hdrlen - off, len);
1294 		rval = f(arg, m->m_epg_hdr + off, count);
1295 		if (rval)
1296 			return (rval);
1297 		len -= count;
1298 		off = 0;
1299 	} else
1300 		off -= m->m_epg_hdrlen;
1301 	pgoff = m->m_epg_1st_off;
1302 	for (i = 0; i < m->m_epg_npgs && len > 0; i++) {
1303 		pglen = m_epg_pagelen(m, i, pgoff);
1304 		if (off < pglen) {
1305 			count = min(pglen - off, len);
1306 			p = (void *)PHYS_TO_DMAP(m->m_epg_pa[i] + pgoff + off);
1307 			rval = f(arg, p, count);
1308 			if (rval)
1309 				return (rval);
1310 			len -= count;
1311 			off = 0;
1312 		} else
1313 			off -= pglen;
1314 		pgoff = 0;
1315 	}
1316 	if (len > 0) {
1317 		KASSERT(off < m->m_epg_trllen,
1318 		    ("m_apply_extpg_one: offset beyond trailer"));
1319 		KASSERT(len <= m->m_epg_trllen - off,
1320 		    ("m_apply_extpg_one: length beyond trailer"));
1321 		return (f(arg, m->m_epg_trail + off, len));
1322 	}
1323 	return (0);
1324 }
1325 
1326 /* Apply function f to the data in a single mbuf. */
1327 static int
1328 m_apply_one(struct mbuf *m, int off, int len,
1329     int (*f)(void *, void *, u_int), void *arg)
1330 {
1331 	if ((m->m_flags & M_EXTPG) != 0)
1332 		return (m_apply_extpg_one(m, off, len, f, arg));
1333 	else
1334 		return (f(arg, mtod(m, caddr_t) + off, len));
1335 }
1336 
1337 /*
1338  * Apply function f to the data in an mbuf chain starting "off" bytes from
1339  * the beginning, continuing for "len" bytes.
1340  */
1341 int
1342 m_apply(struct mbuf *m, int off, int len,
1343     int (*f)(void *, void *, u_int), void *arg)
1344 {
1345 	u_int count;
1346 	int rval;
1347 
1348 	KASSERT(off >= 0, ("m_apply, negative off %d", off));
1349 	KASSERT(len >= 0, ("m_apply, negative len %d", len));
1350 	while (off > 0) {
1351 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
1352 		if (off < m->m_len)
1353 			break;
1354 		off -= m->m_len;
1355 		m = m->m_next;
1356 	}
1357 	while (len > 0) {
1358 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
1359 		count = min(m->m_len - off, len);
1360 		rval = m_apply_one(m, off, count, f, arg);
1361 		if (rval)
1362 			return (rval);
1363 		len -= count;
1364 		off = 0;
1365 		m = m->m_next;
1366 	}
1367 	return (0);
1368 }
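
/*
 * Sketch: m_apply() is typically used to run a checksum or digest over a
 * byte range without linearizing the chain.  Given a callback along the
 * lines of
 *
 *	static int
 *	sum_cb(void *arg, void *data, u_int len)
 *	{
 *		...accumulate the len bytes at data into the state at arg...
 *		return (0);
 *	}
 *
 * a caller invokes m_apply(m, off, len, sum_cb, &state) and treats any
 * non-zero return value as an abort propagated from the callback.
 */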
1369 
1370 /*
1371  * Return a pointer to mbuf/offset of location in mbuf chain.
1372  */
1373 struct mbuf *
1374 m_getptr(struct mbuf *m, int loc, int *off)
1375 {
1376 
1377 	while (loc >= 0) {
1378 		/* Normal end of search. */
1379 		if (m->m_len > loc) {
1380 			*off = loc;
1381 			return (m);
1382 		} else {
1383 			loc -= m->m_len;
1384 			if (m->m_next == NULL) {
1385 				if (loc == 0) {
1386 					/* Point at the end of valid data. */
1387 					*off = m->m_len;
1388 					return (m);
1389 				}
1390 				return (NULL);
1391 			}
1392 			m = m->m_next;
1393 		}
1394 	}
1395 	return (NULL);
1396 }
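
/*
 * Sketch: locating the mbuf and offset that hold byte "loc" of a chain
 * before examining that byte in place:
 *
 *	n = m_getptr(m, loc, &off);
 *	if (n == NULL)
 *		return (EINVAL);
 *	cp = mtod(n, char *) + off;
 */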
1397 
1398 void
1399 m_print(const struct mbuf *m, int maxlen)
1400 {
1401 	int len;
1402 	int pdata;
1403 	const struct mbuf *m2;
1404 
1405 	if (m == NULL) {
1406 		printf("mbuf: %p\n", m);
1407 		return;
1408 	}
1409 
1410 	if (m->m_flags & M_PKTHDR)
1411 		len = m->m_pkthdr.len;
1412 	else
1413 		len = -1;
1414 	m2 = m;
1415 	while (m2 != NULL && (len == -1 || len)) {
1416 		pdata = m2->m_len;
1417 		if (maxlen != -1 && pdata > maxlen)
1418 			pdata = maxlen;
1419 		printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len,
1420 		    m2->m_next, m2->m_flags, "\20\20freelist\17skipfw"
1421 		    "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly"
1422 		    "\3eor\2pkthdr\1ext", pdata ? "" : "\n");
1423 		if (pdata)
1424 			printf(", %*D\n", pdata, (u_char *)m2->m_data, "-");
1425 		if (len != -1)
1426 			len -= m2->m_len;
1427 		m2 = m2->m_next;
1428 	}
1429 	if (len > 0)
1430 		printf("%d bytes unaccounted for.\n", len);
1431 	return;
1432 }
1433 
1434 u_int
1435 m_fixhdr(struct mbuf *m0)
1436 {
1437 	u_int len;
1438 
1439 	len = m_length(m0, NULL);
1440 	m0->m_pkthdr.len = len;
1441 	return (len);
1442 }
1443 
1444 u_int
1445 m_length(struct mbuf *m0, struct mbuf **last)
1446 {
1447 	struct mbuf *m;
1448 	u_int len;
1449 
1450 	len = 0;
1451 	for (m = m0; m != NULL; m = m->m_next) {
1452 		len += m->m_len;
1453 		if (m->m_next == NULL)
1454 			break;
1455 	}
1456 	if (last != NULL)
1457 		*last = m;
1458 	return (len);
1459 }
1460 
1461 /*
1462  * Defragment an mbuf chain, returning the shortest possible
1463  * chain of mbufs and clusters.  If allocation fails and
1464  * this cannot be completed, NULL will be returned, but
1465  * the passed in chain will be unchanged.  Upon success,
1466  * the original chain will be freed, and the new chain
1467  * will be returned.
1468  *
1469  * If an mbuf without a packet header is passed in, the original
1470  * chain will be returned unharmed.
1471  */
1472 struct mbuf *
1473 m_defrag(struct mbuf *m0, int how)
1474 {
1475 	struct mbuf *m_new = NULL, *m_final = NULL;
1476 	int progress = 0, length;
1477 
1478 	MBUF_CHECKSLEEP(how);
1479 	if (!(m0->m_flags & M_PKTHDR))
1480 		return (m0);
1481 
1482 	m_fixhdr(m0); /* Needed sanity check */
1483 
1484 #ifdef MBUF_STRESS_TEST
1485 	if (m_defragrandomfailures) {
1486 		int temp = arc4random() & 0xff;
1487 		if (temp == 0xba)
1488 			goto nospace;
1489 	}
1490 #endif
1491 
1492 	if (m0->m_pkthdr.len > MHLEN)
1493 		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
1494 	else
1495 		m_final = m_gethdr(how, MT_DATA);
1496 
1497 	if (m_final == NULL)
1498 		goto nospace;
1499 
1500 	if (m_dup_pkthdr(m_final, m0, how) == 0)
1501 		goto nospace;
1502 
1503 	m_new = m_final;
1504 
1505 	while (progress < m0->m_pkthdr.len) {
1506 		length = m0->m_pkthdr.len - progress;
1507 		if (length > MCLBYTES)
1508 			length = MCLBYTES;
1509 
1510 		if (m_new == NULL) {
1511 			if (length > MLEN)
1512 				m_new = m_getcl(how, MT_DATA, 0);
1513 			else
1514 				m_new = m_get(how, MT_DATA);
1515 			if (m_new == NULL)
1516 				goto nospace;
1517 		}
1518 
1519 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
1520 		progress += length;
1521 		m_new->m_len = length;
1522 		if (m_new != m_final)
1523 			m_cat(m_final, m_new);
1524 		m_new = NULL;
1525 	}
1526 #ifdef MBUF_STRESS_TEST
1527 	if (m0->m_next == NULL)
1528 		m_defraguseless++;
1529 #endif
1530 	m_freem(m0);
1531 	m0 = m_final;
1532 #ifdef MBUF_STRESS_TEST
1533 	m_defragpackets++;
1534 	m_defragbytes += m0->m_pkthdr.len;
1535 #endif
1536 	return (m0);
1537 nospace:
1538 #ifdef MBUF_STRESS_TEST
1539 	m_defragfailure++;
1540 #endif
1541 	if (m_final)
1542 		m_freem(m_final);
1543 	return (NULL);
1544 }
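
/*
 * Sketch: drivers commonly fall back to m_defrag() when a DMA mapping
 * attempt reports too many segments (EFBIG):
 *
 *	n = m_defrag(m, M_NOWAIT);
 *	if (n == NULL) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 *	m = n;
 *
 * and then retry the mapping with the compacted chain.  On failure the
 * original chain is unchanged, so it remains safe to free or requeue.
 */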
1545 
1546 /*
1547  * Return the number of fragments an mbuf will use.  This is usually
1548  * used as a proxy for the number of scatter/gather elements needed by
1549  * a DMA engine to access an mbuf.  In general mapped mbufs are
1550  * assumed to be backed by physically contiguous buffers that only
1551  * need a single fragment.  Unmapped mbufs, on the other hand, can
1552  * span disjoint physical pages.
1553  */
1554 static int
1555 frags_per_mbuf(struct mbuf *m)
1556 {
1557 	int frags;
1558 
1559 	if ((m->m_flags & M_EXTPG) == 0)
1560 		return (1);
1561 
1562 	/*
1563 	 * The header and trailer are counted as a single fragment
1564 	 * each when present.
1565 	 *
1566 	 * XXX: This overestimates the number of fragments by assuming
1567 	 * all the backing physical pages are disjoint.
1568 	 */
1569 	frags = 0;
1570 	if (m->m_epg_hdrlen != 0)
1571 		frags++;
1572 	frags += m->m_epg_npgs;
1573 	if (m->m_epg_trllen != 0)
1574 		frags++;
1575 
1576 	return (frags);
1577 }
1578 
1579 /*
1580  * Defragment an mbuf chain, returning at most maxfrags separate
1581  * mbufs+clusters.  If this is not possible NULL is returned and
1582  * the original mbuf chain is left in its present (potentially
1583  * modified) state.  We use two techniques: collapsing consecutive
1584  * mbufs and replacing consecutive mbufs by a cluster.
1585  *
1586  * NB: this should really be named m_defrag but that name is taken
1587  */
1588 struct mbuf *
1589 m_collapse(struct mbuf *m0, int how, int maxfrags)
1590 {
1591 	struct mbuf *m, *n, *n2, **prev;
1592 	u_int curfrags;
1593 
1594 	/*
1595 	 * Calculate the current number of frags.
1596 	 */
1597 	curfrags = 0;
1598 	for (m = m0; m != NULL; m = m->m_next)
1599 		curfrags += frags_per_mbuf(m);
1600 	/*
1601 	 * First, try to collapse mbufs.  Note that we always collapse
1602 	 * towards the front so we don't need to deal with moving the
1603 	 * pkthdr.  This may be suboptimal if the first mbuf has much
1604 	 * less data than the following.
1605 	 */
1606 	m = m0;
1607 again:
1608 	for (;;) {
1609 		n = m->m_next;
1610 		if (n == NULL)
1611 			break;
1612 		if (M_WRITABLE(m) &&
1613 		    n->m_len < M_TRAILINGSPACE(m)) {
1614 			m_copydata(n, 0, n->m_len,
1615 			    mtod(m, char *) + m->m_len);
1616 			m->m_len += n->m_len;
1617 			m->m_next = n->m_next;
1618 			curfrags -= frags_per_mbuf(n);
1619 			m_free(n);
1620 			if (curfrags <= maxfrags)
1621 				return m0;
1622 		} else
1623 			m = n;
1624 	}
1625 	KASSERT(maxfrags > 1,
1626 		("maxfrags %u, but normal collapse failed", maxfrags));
1627 	/*
1628 	 * Collapse consecutive mbufs to a cluster.
1629 	 */
1630 	prev = &m0->m_next;		/* NB: not the first mbuf */
1631 	while ((n = *prev) != NULL) {
1632 		if ((n2 = n->m_next) != NULL &&
1633 		    n->m_len + n2->m_len < MCLBYTES) {
1634 			m = m_getcl(how, MT_DATA, 0);
1635 			if (m == NULL)
1636 				goto bad;
1637 			m_copydata(n, 0,  n->m_len, mtod(m, char *));
1638 			m_copydata(n2, 0,  n2->m_len,
1639 			    mtod(m, char *) + n->m_len);
1640 			m->m_len = n->m_len + n2->m_len;
1641 			m->m_next = n2->m_next;
1642 			*prev = m;
1643 			curfrags += 1;  /* For the new cluster */
1644 			curfrags -= frags_per_mbuf(n);
1645 			curfrags -= frags_per_mbuf(n2);
1646 			m_free(n);
1647 			m_free(n2);
1648 			if (curfrags <= maxfrags)
1649 				return m0;
1650 			/*
1651 			 * Still not there, try the normal collapse
1652 			 * again before we allocate another cluster.
1653 			 */
1654 			goto again;
1655 		}
1656 		prev = &n->m_next;
1657 	}
1658 	/*
1659 	 * No place where we can collapse to a cluster; punt.
1660 	 * This can occur if, for example, you request 2 frags
1661 	 * but the packet requires that both be clusters (we
1662 	 * never reallocate the first mbuf to avoid moving the
1663 	 * packet header).
1664 	 */
1665 bad:
1666 	return NULL;
1667 }
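
/*
 * Sketch: m_collapse() is the lighter-weight alternative when a driver only
 * needs to get under a specific segment count ("maxsegs" being the hardware
 * limit):
 *
 *	n = m_collapse(m, M_NOWAIT, maxsegs);
 *	if (n == NULL) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 *	m = n;
 *
 * Unlike m_defrag(), a failed m_collapse() may leave the original chain
 * modified, though still valid and owned by the caller.
 */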
1668 
1669 #ifdef MBUF_STRESS_TEST
1670 
1671 /*
1672  * Fragment an mbuf chain.  There's no reason you'd ever want to do
1673  * this in normal usage, but it's great for stress testing various
1674  * mbuf consumers.
1675  *
1676  * If fragmentation is not possible, the original chain will be
1677  * returned.
1678  *
1679  * Possible length values:
1680  * 0	 no fragmentation will occur
1681  * > 0	each fragment will be of the specified length
1682  * -1	each fragment will be the same random value in length
1683  * -2	each fragment's length will be entirely random
1684  * (Random values range from 1 to 255)
1685  */
1686 struct mbuf *
1687 m_fragment(struct mbuf *m0, int how, int length)
1688 {
1689 	struct mbuf *m_first, *m_last;
1690 	int divisor = 255, progress = 0, fraglen;
1691 
1692 	if (!(m0->m_flags & M_PKTHDR))
1693 		return (m0);
1694 
1695 	if (length == 0 || length < -2)
1696 		return (m0);
1697 	if (length > MCLBYTES)
1698 		length = MCLBYTES;
1699 	if (length < 0 && divisor > MCLBYTES)
1700 		divisor = MCLBYTES;
1701 	if (length == -1)
1702 		length = 1 + (arc4random() % divisor);
1703 	if (length > 0)
1704 		fraglen = length;
1705 
1706 	m_fixhdr(m0); /* Needed sanity check */
1707 
1708 	m_first = m_getcl(how, MT_DATA, M_PKTHDR);
1709 	if (m_first == NULL)
1710 		goto nospace;
1711 
1712 	if (m_dup_pkthdr(m_first, m0, how) == 0)
1713 		goto nospace;
1714 
1715 	m_last = m_first;
1716 
1717 	while (progress < m0->m_pkthdr.len) {
1718 		if (length == -2)
1719 			fraglen = 1 + (arc4random() % divisor);
1720 		if (fraglen > m0->m_pkthdr.len - progress)
1721 			fraglen = m0->m_pkthdr.len - progress;
1722 
1723 		if (progress != 0) {
1724 			struct mbuf *m_new = m_getcl(how, MT_DATA, 0);
1725 			if (m_new == NULL)
1726 				goto nospace;
1727 
1728 			m_last->m_next = m_new;
1729 			m_last = m_new;
1730 		}
1731 
1732 		m_copydata(m0, progress, fraglen, mtod(m_last, caddr_t));
1733 		progress += fraglen;
1734 		m_last->m_len = fraglen;
1735 	}
1736 	m_freem(m0);
1737 	m0 = m_first;
1738 	return (m0);
1739 nospace:
1740 	if (m_first)
1741 		m_freem(m_first);
1742 	/* Return the original chain on failure */
1743 	return (m0);
1744 }
1745 
1746 #endif
1747 
1748 /*
1749  * Free pages from mbuf_ext_pgs, assuming they were allocated via
1750  * vm_page_alloc() and aren't associated with any object.  Complement
1751  * to allocator from m_uiotombuf_nomap().
1752  */
1753 void
1754 mb_free_mext_pgs(struct mbuf *m)
1755 {
1756 	vm_page_t pg;
1757 
1758 	M_ASSERTEXTPG(m);
1759 	for (int i = 0; i < m->m_epg_npgs; i++) {
1760 		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
1761 		vm_page_unwire_noq(pg);
1762 		vm_page_free(pg);
1763 	}
1764 }
1765 
1766 static struct mbuf *
1767 m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags)
1768 {
1769 	struct mbuf *m, *mb, *prev;
1770 	vm_page_t pg_array[MBUF_PEXT_MAX_PGS];
1771 	int error, length, i, needed;
1772 	ssize_t total;
1773 	int pflags = malloc2vm_flags(how) | VM_ALLOC_NODUMP | VM_ALLOC_WIRED;
1774 
1775 	MPASS((flags & M_PKTHDR) == 0);
1776 	MPASS((how & M_ZERO) == 0);
1777 
1778 	/*
1779 	 * len can be zero or an arbitrarily large value bounded by
1780 	 * the total data supplied by the uio.
1781 	 */
1782 	if (len > 0)
1783 		total = MIN(uio->uio_resid, len);
1784 	else
1785 		total = uio->uio_resid;
1786 
1787 	if (maxseg == 0)
1788 		maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE;
1789 
1790 	/*
1791 	 * If total is zero, return an empty mbuf.  This can occur
1792 	 * for TLS 1.0 connections which send empty fragments as
1793 	 * a countermeasure against the known-IV weakness in CBC
1794 	 * ciphersuites.
1795 	 */
1796 	if (__predict_false(total == 0)) {
1797 		mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs);
1798 		if (mb == NULL)
1799 			return (NULL);
1800 		mb->m_epg_flags = EPG_FLAG_ANON;
1801 		return (mb);
1802 	}
1803 
1804 	/*
1805 	 * Allocate the pages
1806 	 */
1807 	m = NULL;
1808 	while (total > 0) {
1809 		mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs);
1810 		if (mb == NULL)
1811 			goto failed;
1812 		if (m == NULL)
1813 			m = mb;
1814 		else
1815 			prev->m_next = mb;
1816 		prev = mb;
1817 		mb->m_epg_flags = EPG_FLAG_ANON;
1818 		needed = length = MIN(maxseg, total);
1819 		for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) {
1820 retry_page:
1821 			pg_array[i] = vm_page_alloc_noobj(pflags);
1822 			if (pg_array[i] == NULL) {
1823 				if (how & M_NOWAIT) {
1824 					goto failed;
1825 				} else {
1826 					vm_wait(NULL);
1827 					goto retry_page;
1828 				}
1829 			}
1830 			mb->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg_array[i]);
1831 			mb->m_epg_npgs++;
1832 		}
1833 		mb->m_epg_last_len = length - PAGE_SIZE * (mb->m_epg_npgs - 1);
1834 		MBUF_EXT_PGS_ASSERT_SANITY(mb);
1835 		total -= length;
1836 		error = uiomove_fromphys(pg_array, 0, length, uio);
1837 		if (error != 0)
1838 			goto failed;
1839 		mb->m_len = length;
1840 		mb->m_ext.ext_size += PAGE_SIZE * mb->m_epg_npgs;
1841 		if (flags & M_PKTHDR)
1842 			m->m_pkthdr.len += length;
1843 	}
1844 	return (m);
1845 
1846 failed:
1847 	m_freem(m);
1848 	return (NULL);
1849 }
1850 
1851 /*
1852  * Copy the contents of uio into a properly sized mbuf chain.
1853  */
1854 struct mbuf *
1855 m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
1856 {
1857 	struct mbuf *m, *mb;
1858 	int error, length;
1859 	ssize_t total;
1860 	int progress = 0;
1861 
1862 	if (flags & M_EXTPG)
1863 		return (m_uiotombuf_nomap(uio, how, len, align, flags));
1864 
1865 	/*
1866 	 * len can be zero or an arbitrarily large value bounded by
1867 	 * the total data supplied by the uio.
1868 	 */
1869 	if (len > 0)
1870 		total = (uio->uio_resid < len) ? uio->uio_resid : len;
1871 	else
1872 		total = uio->uio_resid;
1873 
1874 	/*
1875 	 * The smallest unit returned by m_getm2() is a single mbuf
1876 	 * with pkthdr.  We can't align past it.
1877 	 */
1878 	if (align >= MHLEN)
1879 		return (NULL);
1880 
1881 	/*
1882 	 * Give us the full allocation or nothing.
1883 	 * If len is zero return the smallest empty mbuf.
1884 	 */
1885 	m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags);
1886 	if (m == NULL)
1887 		return (NULL);
1888 	m->m_data += align;
1889 
1890 	/* Fill all mbufs with uio data and update header information. */
1891 	for (mb = m; mb != NULL; mb = mb->m_next) {
1892 		length = min(M_TRAILINGSPACE(mb), total - progress);
1893 
1894 		error = uiomove(mtod(mb, void *), length, uio);
1895 		if (error) {
1896 			m_freem(m);
1897 			return (NULL);
1898 		}
1899 
1900 		mb->m_len = length;
1901 		progress += length;
1902 		if (flags & M_PKTHDR)
1903 			m->m_pkthdr.len += length;
1904 	}
1905 	KASSERT(progress == total, ("%s: progress != total", __func__));
1906 
1907 	return (m);
1908 }
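
/*
 * Sketch: a socket-layer style use of m_uiotombuf(), turning the remainder
 * of a user I/O request into a packet with room left for link headers:
 *
 *	m = m_uiotombuf(uio, M_WAITOK, 0, max_linkhdr, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *
 * A len of 0 means "take everything the uio has left"; align must stay
 * below MHLEN, as checked above.
 */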
1909 
1910 /*
1911  * Copy data to/from an unmapped mbuf into a uio limited by len if set.
1912  */
1913 int
1914 m_unmapped_uiomove(const struct mbuf *m, int m_off, struct uio *uio, int len)
1915 {
1916 	vm_page_t pg;
1917 	int error, i, off, pglen, pgoff, seglen, segoff;
1918 
1919 	M_ASSERTEXTPG(m);
1920 	error = 0;
1921 
1922 	/* Skip over any data removed from the front. */
1923 	off = mtod(m, vm_offset_t);
1924 
1925 	off += m_off;
1926 	if (m->m_epg_hdrlen != 0) {
1927 		if (off >= m->m_epg_hdrlen) {
1928 			off -= m->m_epg_hdrlen;
1929 		} else {
1930 			seglen = m->m_epg_hdrlen - off;
1931 			segoff = off;
1932 			seglen = min(seglen, len);
1933 			off = 0;
1934 			len -= seglen;
1935 			error = uiomove(__DECONST(void *,
1936 			    &m->m_epg_hdr[segoff]), seglen, uio);
1937 		}
1938 	}
1939 	pgoff = m->m_epg_1st_off;
1940 	for (i = 0; i < m->m_epg_npgs && error == 0 && len > 0; i++) {
1941 		pglen = m_epg_pagelen(m, i, pgoff);
1942 		if (off >= pglen) {
1943 			off -= pglen;
1944 			pgoff = 0;
1945 			continue;
1946 		}
1947 		seglen = pglen - off;
1948 		segoff = pgoff + off;
1949 		off = 0;
1950 		seglen = min(seglen, len);
1951 		len -= seglen;
1952 		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
1953 		error = uiomove_fromphys(&pg, segoff, seglen, uio);
1954 		pgoff = 0;
1955 	}
1956 	if (len != 0 && error == 0) {
1957 		KASSERT((off + len) <= m->m_epg_trllen,
1958 		    ("off + len > trail (%d + %d > %d, m_off = %d)", off, len,
1959 		    m->m_epg_trllen, m_off));
1960 		error = uiomove(__DECONST(void *, &m->m_epg_trail[off]),
1961 		    len, uio);
1962 	}
1963 	return (error);
1964 }
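
/*
 * A minimal usage sketch (hypothetical helper, kept disabled): copy
 * 'len' bytes starting at offset 'off' out of an mbuf, taking the
 * unmapped path only when the mbuf carries external pages.
 */
#if 0
static int
example_copyout_mbuf(const struct mbuf *m, int off, struct uio *uio, int len)
{

	if ((m->m_flags & M_EXTPG) != 0)
		return (m_unmapped_uiomove(m, off, uio, len));
	return (uiomove(mtod(m, char *) + off, len, uio));
}
#endif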
1965 
1966 /*
1967  * Copy an mbuf chain into a uio limited by len if set.
1968  */
1969 int
1970 m_mbuftouio(struct uio *uio, const struct mbuf *m, int len)
1971 {
1972 	int error, length, total;
1973 	int progress = 0;
1974 
1975 	if (len > 0)
1976 		total = min(uio->uio_resid, len);
1977 	else
1978 		total = uio->uio_resid;
1979 
1980 	/* Fill the uio with data from the mbufs. */
1981 	for (; m != NULL; m = m->m_next) {
1982 		length = min(m->m_len, total - progress);
1983 
1984 		if ((m->m_flags & M_EXTPG) != 0)
1985 			error = m_unmapped_uiomove(m, 0, uio, length);
1986 		else
1987 			error = uiomove(mtod(m, void *), length, uio);
1988 		if (error)
1989 			return (error);
1990 
1991 		progress += length;
1992 	}
1993 
1994 	return (0);
1995 }
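
/*
 * A minimal usage sketch (hypothetical caller, kept disabled): drain
 * an entire chain into a caller-supplied uio.  A len of 0 bounds the
 * copy by uio_resid alone, as handled above.
 */
#if 0
static int
example_drain_chain(struct uio *uio, const struct mbuf *m)
{

	return (m_mbuftouio(uio, m, 0));
}
#endif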
1996 
1997 /*
1998  * Create a writable copy of the mbuf chain.  While doing this
1999  * we compact the chain with a goal of producing a chain with
2000  * at most two mbufs.  The second mbuf in this chain is likely
2001  * to be a cluster.  The primary purpose of this work is to create
2002  * a writable packet for encryption, compression, etc.  The
2003  * secondary goal is to linearize the data so the data can be
2004  * passed to crypto hardware in the most efficient manner possible.
2005  */
2006 struct mbuf *
2007 m_unshare(struct mbuf *m0, int how)
2008 {
2009 	struct mbuf *m, *mprev;
2010 	struct mbuf *n, *mfirst, *mlast;
2011 	int len, off;
2012 
2013 	mprev = NULL;
2014 	for (m = m0; m != NULL; m = mprev->m_next) {
2015 		/*
2016 		 * Regular mbufs are ignored unless there's a cluster
2017 		 * in front of them that we can coalesce into.  We do
2018 		 * the latter mainly so later clusters can also be coalesced
2019 		 * without having to handle them specially (i.e. convert
2020 		 * mbuf+cluster -> cluster).  This optimization is heavily
2021 		 * influenced by the assumption that we're running over
2022 		 * Ethernet, where MCLBYTES is large enough that the max
2023 		 * packet size will permit lots of coalescing into a
2024 		 * single cluster.  This in turn permits efficient
2025 		 * crypto operations, especially when using hardware.
2026 		 */
2027 		if ((m->m_flags & M_EXT) == 0) {
2028 			if (mprev && (mprev->m_flags & M_EXT) &&
2029 			    m->m_len <= M_TRAILINGSPACE(mprev)) {
2030 				/* XXX: this ignores mbuf types */
2031 				memcpy(mtod(mprev, caddr_t) + mprev->m_len,
2032 				    mtod(m, caddr_t), m->m_len);
2033 				mprev->m_len += m->m_len;
2034 				mprev->m_next = m->m_next;	/* unlink from chain */
2035 				m_free(m);			/* reclaim mbuf */
2036 			} else {
2037 				mprev = m;
2038 			}
2039 			continue;
2040 		}
2041 		/*
2042 		 * Writable mbufs are left alone (for now).
2043 		 */
2044 		if (M_WRITABLE(m)) {
2045 			mprev = m;
2046 			continue;
2047 		}
2048 
2049 		/*
2050 		 * Not writable, replace with a copy or coalesce with
2051 		 * the previous mbuf if possible (since we have to copy
2052 		 * it anyway, we try to reduce the number of mbufs and
2053 		 * clusters so that future work is easier).
2054 		 */
2055 		KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
2056 		/* NB: we only coalesce into a cluster or larger */
2057 		if (mprev != NULL && (mprev->m_flags & M_EXT) &&
2058 		    m->m_len <= M_TRAILINGSPACE(mprev)) {
2059 			/* XXX: this ignores mbuf types */
2060 			memcpy(mtod(mprev, caddr_t) + mprev->m_len,
2061 			    mtod(m, caddr_t), m->m_len);
2062 			mprev->m_len += m->m_len;
2063 			mprev->m_next = m->m_next;	/* unlink from chain */
2064 			m_free(m);			/* reclaim mbuf */
2065 			continue;
2066 		}
2067 
2068 		/*
2069 		 * Allocate new space to hold the copy and copy the data.
2070 		 * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by
2071 		 * splitting them into clusters.  We could just malloc a
2072 		 * buffer and make it external but too many device drivers
2073 		 * don't know how to break up the non-contiguous memory when
2074 		 * doing DMA.
2075 		 */
2076 		n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
2077 		if (n == NULL) {
2078 			m_freem(m0);
2079 			return (NULL);
2080 		}
2081 		if (m->m_flags & M_PKTHDR) {
2082 			KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR",
2083 			    __func__, m0, m));
2084 			m_move_pkthdr(n, m);
2085 		}
2086 		len = m->m_len;
2087 		off = 0;
2088 		mfirst = n;
2089 		mlast = NULL;
2090 		for (;;) {
2091 			int cc = min(len, MCLBYTES);
2092 			memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
2093 			n->m_len = cc;
2094 			if (mlast != NULL)
2095 				mlast->m_next = n;
2096 			mlast = n;
2097 #if 0
2098 			newipsecstat.ips_clcopied++;
2099 #endif
2100 
2101 			len -= cc;
2102 			if (len <= 0)
2103 				break;
2104 			off += cc;
2105 
2106 			n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
2107 			if (n == NULL) {
2108 				m_freem(mfirst);
2109 				m_freem(m0);
2110 				return (NULL);
2111 			}
2112 		}
2113 		n->m_next = m->m_next;
2114 		if (mprev == NULL)
2115 			m0 = mfirst;		/* new head of chain */
2116 		else
2117 			mprev->m_next = mfirst;	/* replace old mbuf */
2118 		m_free(m);			/* release old mbuf */
2119 		mprev = mfirst;
2120 	}
2121 	return (m0);
2122 }
2123 
2124 #ifdef MBUF_PROFILING
2125 
2126 #define MP_BUCKETS 32 /* Don't just change this; things may overflow. */
2127 struct mbufprofile {
2128 	uintmax_t wasted[MP_BUCKETS];
2129 	uintmax_t used[MP_BUCKETS];
2130 	uintmax_t segments[MP_BUCKETS];
2131 } mbprof;
2132 
2133 void
2134 m_profile(struct mbuf *m)
2135 {
2136 	int segments = 0;
2137 	int used = 0;
2138 	int wasted = 0;
2139 
2140 	while (m) {
2141 		segments++;
2142 		used += m->m_len;
2143 		if (m->m_flags & M_EXT) {
2144 			wasted += MHLEN - sizeof(m->m_ext) +
2145 			    m->m_ext.ext_size - m->m_len;
2146 		} else {
2147 			if (m->m_flags & M_PKTHDR)
2148 				wasted += MHLEN - m->m_len;
2149 			else
2150 				wasted += MLEN - m->m_len;
2151 		}
2152 		m = m->m_next;
2153 	}
2154 	/* Be paranoid; it helps. */
2155 	if (segments > MP_BUCKETS - 1)
2156 		segments = MP_BUCKETS - 1;
2157 	if (used > 100000)
2158 		used = 100000;
2159 	if (wasted > 100000)
2160 		wasted = 100000;
2161 	/* Store in the appropriate bucket. */
2162 	/* Don't bother locking; if it's slightly off, so what? */
2163 	mbprof.segments[segments]++;
2164 	mbprof.used[fls(used)]++;
2165 	mbprof.wasted[fls(wasted)]++;
2166 }
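
/*
 * For illustration: the used and wasted byte totals are binned by
 * fls(), i.e. by the position of their highest set bit, so bucket N
 * covers roughly [2^(N-1), 2^N).  A chain using 1400 bytes, for
 * example, lands in used bucket fls(1400) == 11, while the segment
 * count indexes its histogram directly.
 */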
2167 
2168 static int
2169 mbprof_handler(SYSCTL_HANDLER_ARGS)
2170 {
2171 	char buf[256];
2172 	struct sbuf sb;
2173 	int error;
2174 	uint64_t *p;
2175 
2176 	sbuf_new_for_sysctl(&sb, buf, sizeof(buf), req);
2177 
2178 	p = &mbprof.wasted[0];
2179 	sbuf_printf(&sb,
2180 	    "wasted:\n"
2181 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
2182 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2183 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2184 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2185 #ifdef BIG_ARRAY
2186 	p = &mbprof.wasted[16];
2187 	sbuf_printf(&sb,
2188 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
2189 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2190 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2191 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2192 #endif
2193 	p = &mbprof.used[0];
2194 	sbuf_printf(&sb,
2195 	    "used:\n"
2196 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
2197 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2198 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2199 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2200 #ifdef BIG_ARRAY
2201 	p = &mbprof.used[16];
2202 	sbuf_printf(&sb,
2203 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
2204 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2205 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2206 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2207 #endif
2208 	p = &mbprof.segments[0];
2209 	sbuf_printf(&sb,
2210 	    "segments:\n"
2211 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
2212 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2213 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2214 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2215 #ifdef BIG_ARRAY
2216 	p = &mbprof.segments[16];
2217 	sbuf_printf(&sb,
2218 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
2219 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
2220 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
2221 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
2222 #endif
2223 
2224 	error = sbuf_finish(&sb);
2225 	sbuf_delete(&sb);
2226 	return (error);
2227 }
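
/*
 * For illustration: the handler above reports only buckets 0..15 of
 * each histogram; the remaining MP_BUCKETS entries are printed only
 * when BIG_ARRAY is defined.
 */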
2228 
2229 static int
2230 mbprof_clr_handler(SYSCTL_HANDLER_ARGS)
2231 {
2232 	int clear, error;
2233 
2234 	clear = 0;
2235 	error = sysctl_handle_int(oidp, &clear, 0, req);
2236 	if (error || !req->newptr)
2237 		return (error);
2238 
2239 	if (clear) {
2240 		bzero(&mbprof, sizeof(mbprof));
2241 	}
2242 
2243 	return (error);
2244 }
2245 
2246 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile,
2247     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
2248     mbprof_handler, "A",
2249     "mbuf profiling statistics");
2250 
2251 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr,
2252     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
2253     mbprof_clr_handler, "I",
2254     "clear mbuf profiling statistics");
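
/*
 * For illustration, on a kernel built with MBUF_PROFILING the
 * histograms can be read and reset from userland:
 *
 *	sysctl kern.ipc.mbufprofile		# dump the histograms
 *	sysctl kern.ipc.mbufprofileclr=1	# zero the counters
 */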
2255 #endif /* MBUF_PROFILING */
2256