xref: /freebsd/sys/kern/uipc_mbuf.c (revision 59c3cb81c1769fdb6c840c971df129b52f4a848d)
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include "opt_param.h"
36 #include "opt_mbuf_stress_test.h"
37 #include "opt_mbuf_profiling.h"
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/kernel.h>
42 #include <sys/limits.h>
43 #include <sys/lock.h>
44 #include <sys/malloc.h>
45 #include <sys/mbuf.h>
46 #include <sys/sysctl.h>
47 #include <sys/domain.h>
48 #include <sys/protosw.h>
49 #include <sys/uio.h>
50 #include <sys/sdt.h>
51 
52 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init,
53     "struct mbuf *", "mbufinfo_t *",
54     "uint32_t", "uint32_t",
55     "uint16_t", "uint16_t",
56     "uint32_t", "uint32_t",
57     "uint32_t", "uint32_t");
58 
59 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr,
60     "uint32_t", "uint32_t",
61     "uint16_t", "uint16_t",
62     "struct mbuf *", "mbufinfo_t *");
63 
64 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get,
65     "uint32_t", "uint32_t",
66     "uint16_t", "uint16_t",
67     "struct mbuf *", "mbufinfo_t *");
68 
69 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl,
70     "uint32_t", "uint32_t",
71     "uint16_t", "uint16_t",
72     "uint32_t", "uint32_t",
73     "struct mbuf *", "mbufinfo_t *");
74 
75 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget,
76     "struct mbuf *", "mbufinfo_t *",
77     "uint32_t", "uint32_t",
78     "uint32_t", "uint32_t");
79 
80 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget,
81     "struct mbuf *", "mbufinfo_t *",
82     "uint32_t", "uint32_t",
83     "uint32_t", "uint32_t",
84     "void*", "void*");
85 
86 SDT_PROBE_DEFINE(sdt, , , m__cljset);
87 
88 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free,
89         "struct mbuf *", "mbufinfo_t *");
90 
91 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem,
92     "struct mbuf *", "mbufinfo_t *");
93 
94 #include <security/mac/mac_framework.h>
95 
/* Header size limits; exported read-only under kern.ipc. */
int	max_linkhdr;		/* size of largest link layer header */
int	max_protohdr;		/* size of largest protocol layer header */
int	max_hdr;		/* size of largest link plus protocol header */
int	max_datalen;		/* minimum space left in mbuf after max_hdr */
#ifdef MBUF_STRESS_TEST
/* m_defrag* statistics and knobs, only built with MBUF_STRESS_TEST. */
int	m_defragpackets;
int	m_defragbytes;
int	m_defraguseless;
int	m_defragfailure;
int	m_defragrandomfailures;
#endif
107 
108 /*
109  * sysctl(8) exported objects
110  */
111 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD,
112 	   &max_linkhdr, 0, "Size of largest link layer header");
113 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD,
114 	   &max_protohdr, 0, "Size of largest protocol layer header");
115 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD,
116 	   &max_hdr, 0, "Size of largest link plus protocol header");
117 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RD,
118 	   &max_datalen, 0, "Minimum space left in mbuf after max_hdr");
119 #ifdef MBUF_STRESS_TEST
120 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
121 	   &m_defragpackets, 0, "");
122 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
123 	   &m_defragbytes, 0, "");
124 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
125 	   &m_defraguseless, 0, "");
126 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
127 	   &m_defragfailure, 0, "");
128 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
129 	   &m_defragrandomfailures, 0, "");
130 #endif
131 
132 /*
133  * Ensure the correct size of various mbuf parameters.  It could be off due
134  * to compiler-induced padding and alignment artifacts.
135  */
136 CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN);
137 CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN);
138 
139 /*
140  * mbuf data storage should be 64-bit aligned regardless of architectural
141  * pointer size; check this is the case with and without a packet header.
142  */
143 CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0);
144 CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0);
145 
146 /*
147  * While the specific values here don't matter too much (i.e., +/- a few
148  * words), we do want to ensure that changes to these values are carefully
149  * reasoned about and properly documented.  This is especially the case as
150  * network-protocol and device-driver modules encode these layouts, and must
151  * be recompiled if the structures change.  Check these values at compile time
152  * against the ones documented in comments in mbuf.h.
153  *
154  * NB: Possibly they should be documented there via #define's and not just
155  * comments.
156  */
157 #if defined(__LP64__)
158 CTASSERT(offsetof(struct mbuf, m_dat) == 32);
159 CTASSERT(sizeof(struct pkthdr) == 56);
160 CTASSERT(sizeof(struct m_ext) == 48);
161 #else
162 CTASSERT(offsetof(struct mbuf, m_dat) == 24);
163 CTASSERT(sizeof(struct pkthdr) == 48);
164 CTASSERT(sizeof(struct m_ext) == 28);
165 #endif
166 
167 /*
168  * Assert that the queue(3) macros produce code of the same size as an old
169  * plain pointer does.
170  */
171 #ifdef INVARIANTS
172 static struct mbuf m_assertbuf;
173 CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next));
174 CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next));
175 CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt));
176 CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt));
177 #endif
178 
179 /*
180  * Attach the cluster from *m to *n, set up m_ext in *n
181  * and bump the refcount of the cluster.
182  */
183 void
184 mb_dupcl(struct mbuf *n, struct mbuf *m)
185 {
186 	volatile u_int *refcnt;
187 
188 	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));
189 	KASSERT(!(n->m_flags & M_EXT), ("%s: M_EXT set on %p", __func__, n));
190 
191 	n->m_ext = m->m_ext;
192 	n->m_flags |= M_EXT;
193 	n->m_flags |= m->m_flags & M_RDONLY;
194 
195 	/* See if this is the mbuf that holds the embedded refcount. */
196 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
197 		refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count;
198 		n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF;
199 	} else {
200 		KASSERT(m->m_ext.ext_cnt != NULL,
201 		    ("%s: no refcounting pointer on %p", __func__, m));
202 		refcnt = m->m_ext.ext_cnt;
203 	}
204 
205 	if (*refcnt == 1)
206 		*refcnt += 1;
207 	else
208 		atomic_add_int(refcnt, 1);
209 }
210 
211 void
212 m_demote_pkthdr(struct mbuf *m)
213 {
214 
215 	M_ASSERTPKTHDR(m);
216 
217 	m_tag_delete_chain(m, NULL);
218 	m->m_flags &= ~M_PKTHDR;
219 	bzero(&m->m_pkthdr, sizeof(struct pkthdr));
220 }
221 
222 /*
223  * Clean up mbuf (chain) from any tags and packet headers.
224  * If "all" is set then the first mbuf in the chain will be
225  * cleaned too.
226  */
227 void
228 m_demote(struct mbuf *m0, int all, int flags)
229 {
230 	struct mbuf *m;
231 
232 	for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) {
233 		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p",
234 		    __func__, m, m0));
235 		if (m->m_flags & M_PKTHDR)
236 			m_demote_pkthdr(m);
237 		m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags);
238 	}
239 }
240 
241 /*
242  * Sanity checks on mbuf (chain) for use in KASSERT() and general
243  * debugging.
244  * Returns 0 or panics when bad and 1 on all tests passed.
245  * Sanitize, 0 to run M_SANITY_ACTION, 1 to garble things so they
246  * blow up later.
247  */
248 int
249 m_sanity(struct mbuf *m0, int sanitize)
250 {
251 	struct mbuf *m;
252 	caddr_t a, b;
253 	int pktlen = 0;
254 
255 #ifdef INVARIANTS
256 #define	M_SANITY_ACTION(s)	panic("mbuf %p: " s, m)
257 #else
258 #define	M_SANITY_ACTION(s)	printf("mbuf %p: " s, m)
259 #endif
260 
261 	for (m = m0; m != NULL; m = m->m_next) {
262 		/*
263 		 * Basic pointer checks.  If any of these fails then some
264 		 * unrelated kernel memory before or after us is trashed.
265 		 * No way to recover from that.
266 		 */
267 		a = M_START(m);
268 		b = a + M_SIZE(m);
269 		if ((caddr_t)m->m_data < a)
270 			M_SANITY_ACTION("m_data outside mbuf data range left");
271 		if ((caddr_t)m->m_data > b)
272 			M_SANITY_ACTION("m_data outside mbuf data range right");
273 		if ((caddr_t)m->m_data + m->m_len > b)
274 			M_SANITY_ACTION("m_data + m_len exeeds mbuf space");
275 
276 		/* m->m_nextpkt may only be set on first mbuf in chain. */
277 		if (m != m0 && m->m_nextpkt != NULL) {
278 			if (sanitize) {
279 				m_freem(m->m_nextpkt);
280 				m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
281 			} else
282 				M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
283 		}
284 
285 		/* packet length (not mbuf length!) calculation */
286 		if (m0->m_flags & M_PKTHDR)
287 			pktlen += m->m_len;
288 
289 		/* m_tags may only be attached to first mbuf in chain. */
290 		if (m != m0 && m->m_flags & M_PKTHDR &&
291 		    !SLIST_EMPTY(&m->m_pkthdr.tags)) {
292 			if (sanitize) {
293 				m_tag_delete_chain(m, NULL);
294 				/* put in 0xDEADC0DE perhaps? */
295 			} else
296 				M_SANITY_ACTION("m_tags on in-chain mbuf");
297 		}
298 
299 		/* M_PKTHDR may only be set on first mbuf in chain */
300 		if (m != m0 && m->m_flags & M_PKTHDR) {
301 			if (sanitize) {
302 				bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
303 				m->m_flags &= ~M_PKTHDR;
304 				/* put in 0xDEADCODE and leave hdr flag in */
305 			} else
306 				M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
307 		}
308 	}
309 	m = m0;
310 	if (pktlen && pktlen != m->m_pkthdr.len) {
311 		if (sanitize)
312 			m->m_pkthdr.len = 0;
313 		else
314 			M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
315 	}
316 	return 1;
317 
318 #undef	M_SANITY_ACTION
319 }
320 
321 /*
322  * Non-inlined part of m_init().
323  */
324 int
325 m_pkthdr_init(struct mbuf *m, int how)
326 {
327 #ifdef MAC
328 	int error;
329 #endif
330 	m->m_data = m->m_pktdat;
331 	bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
332 #ifdef MAC
333 	/* If the label init fails, fail the alloc */
334 	error = mac_mbuf_init(m, how);
335 	if (error)
336 		return (error);
337 #endif
338 
339 	return (0);
340 }
341 
342 /*
343  * "Move" mbuf pkthdr from "from" to "to".
344  * "from" must have M_PKTHDR set, and "to" must be empty.
345  */
346 void
347 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
348 {
349 
350 #if 0
351 	/* see below for why these are not enabled */
352 	M_ASSERTPKTHDR(to);
353 	/* Note: with MAC, this may not be a good assertion. */
354 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
355 	    ("m_move_pkthdr: to has tags"));
356 #endif
357 #ifdef MAC
358 	/*
359 	 * XXXMAC: It could be this should also occur for non-MAC?
360 	 */
361 	if (to->m_flags & M_PKTHDR)
362 		m_tag_delete_chain(to, NULL);
363 #endif
364 	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
365 	if ((to->m_flags & M_EXT) == 0)
366 		to->m_data = to->m_pktdat;
367 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
368 	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
369 	from->m_flags &= ~M_PKTHDR;
370 }
371 
372 /*
373  * Duplicate "from"'s mbuf pkthdr in "to".
374  * "from" must have M_PKTHDR set, and "to" must be empty.
375  * In particular, this does a deep copy of the packet tags.
376  */
377 int
378 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
379 {
380 
381 #if 0
382 	/*
383 	 * The mbuf allocator only initializes the pkthdr
384 	 * when the mbuf is allocated with m_gethdr(). Many users
385 	 * (e.g. m_copy*, m_prepend) use m_get() and then
386 	 * smash the pkthdr as needed causing these
387 	 * assertions to trip.  For now just disable them.
388 	 */
389 	M_ASSERTPKTHDR(to);
390 	/* Note: with MAC, this may not be a good assertion. */
391 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags"));
392 #endif
393 	MBUF_CHECKSLEEP(how);
394 #ifdef MAC
395 	if (to->m_flags & M_PKTHDR)
396 		m_tag_delete_chain(to, NULL);
397 #endif
398 	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
399 	if ((to->m_flags & M_EXT) == 0)
400 		to->m_data = to->m_pktdat;
401 	to->m_pkthdr = from->m_pkthdr;
402 	SLIST_INIT(&to->m_pkthdr.tags);
403 	return (m_tag_copy_chain(to, from, how));
404 }
405 
406 /*
407  * Lesser-used path for M_PREPEND:
408  * allocate new mbuf to prepend to chain,
409  * copy junk along.
410  */
411 struct mbuf *
412 m_prepend(struct mbuf *m, int len, int how)
413 {
414 	struct mbuf *mn;
415 
416 	if (m->m_flags & M_PKTHDR)
417 		mn = m_gethdr(how, m->m_type);
418 	else
419 		mn = m_get(how, m->m_type);
420 	if (mn == NULL) {
421 		m_freem(m);
422 		return (NULL);
423 	}
424 	if (m->m_flags & M_PKTHDR)
425 		m_move_pkthdr(mn, m);
426 	mn->m_next = m;
427 	m = mn;
428 	if (len < M_SIZE(m))
429 		M_ALIGN(m, len);
430 	m->m_len = len;
431 	return (m);
432 }
433 
434 /*
435  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
436  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
437  * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller.
438  * Note that the copy is read-only, because clusters are not copied,
439  * only their reference counts are incremented.
440  */
441 struct mbuf *
442 m_copym(struct mbuf *m, int off0, int len, int wait)
443 {
444 	struct mbuf *n, **np;
445 	int off = off0;
446 	struct mbuf *top;
447 	int copyhdr = 0;
448 
449 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
450 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
451 	MBUF_CHECKSLEEP(wait);
452 	if (off == 0 && m->m_flags & M_PKTHDR)
453 		copyhdr = 1;
454 	while (off > 0) {
455 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
456 		if (off < m->m_len)
457 			break;
458 		off -= m->m_len;
459 		m = m->m_next;
460 	}
461 	np = &top;
462 	top = 0;
463 	while (len > 0) {
464 		if (m == NULL) {
465 			KASSERT(len == M_COPYALL,
466 			    ("m_copym, length > size of mbuf chain"));
467 			break;
468 		}
469 		if (copyhdr)
470 			n = m_gethdr(wait, m->m_type);
471 		else
472 			n = m_get(wait, m->m_type);
473 		*np = n;
474 		if (n == NULL)
475 			goto nospace;
476 		if (copyhdr) {
477 			if (!m_dup_pkthdr(n, m, wait))
478 				goto nospace;
479 			if (len == M_COPYALL)
480 				n->m_pkthdr.len -= off0;
481 			else
482 				n->m_pkthdr.len = len;
483 			copyhdr = 0;
484 		}
485 		n->m_len = min(len, m->m_len - off);
486 		if (m->m_flags & M_EXT) {
487 			n->m_data = m->m_data + off;
488 			mb_dupcl(n, m);
489 		} else
490 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
491 			    (u_int)n->m_len);
492 		if (len != M_COPYALL)
493 			len -= n->m_len;
494 		off = 0;
495 		m = m->m_next;
496 		np = &n->m_next;
497 	}
498 
499 	return (top);
500 nospace:
501 	m_freem(top);
502 	return (NULL);
503 }
504 
505 /*
506  * Copy an entire packet, including header (which must be present).
507  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
508  * Note that the copy is read-only, because clusters are not copied,
509  * only their reference counts are incremented.
510  * Preserve alignment of the first mbuf so if the creator has left
511  * some room at the beginning (e.g. for inserting protocol headers)
512  * the copies still have the room available.
513  */
514 struct mbuf *
515 m_copypacket(struct mbuf *m, int how)
516 {
517 	struct mbuf *top, *n, *o;
518 
519 	MBUF_CHECKSLEEP(how);
520 	n = m_get(how, m->m_type);
521 	top = n;
522 	if (n == NULL)
523 		goto nospace;
524 
525 	if (!m_dup_pkthdr(n, m, how))
526 		goto nospace;
527 	n->m_len = m->m_len;
528 	if (m->m_flags & M_EXT) {
529 		n->m_data = m->m_data;
530 		mb_dupcl(n, m);
531 	} else {
532 		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
533 		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
534 	}
535 
536 	m = m->m_next;
537 	while (m) {
538 		o = m_get(how, m->m_type);
539 		if (o == NULL)
540 			goto nospace;
541 
542 		n->m_next = o;
543 		n = n->m_next;
544 
545 		n->m_len = m->m_len;
546 		if (m->m_flags & M_EXT) {
547 			n->m_data = m->m_data;
548 			mb_dupcl(n, m);
549 		} else {
550 			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
551 		}
552 
553 		m = m->m_next;
554 	}
555 	return top;
556 nospace:
557 	m_freem(top);
558 	return (NULL);
559 }
560 
561 /*
562  * Copy data from an mbuf chain starting "off" bytes from the beginning,
563  * continuing for "len" bytes, into the indicated buffer.
564  */
565 void
566 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
567 {
568 	u_int count;
569 
570 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
571 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
572 	while (off > 0) {
573 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
574 		if (off < m->m_len)
575 			break;
576 		off -= m->m_len;
577 		m = m->m_next;
578 	}
579 	while (len > 0) {
580 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
581 		count = min(m->m_len - off, len);
582 		bcopy(mtod(m, caddr_t) + off, cp, count);
583 		len -= count;
584 		cp += count;
585 		off = 0;
586 		m = m->m_next;
587 	}
588 }
589 
590 /*
591  * Copy a packet header mbuf chain into a completely new chain, including
592  * copying any mbuf clusters.  Use this instead of m_copypacket() when
593  * you need a writable copy of an mbuf chain.
594  */
595 struct mbuf *
596 m_dup(const struct mbuf *m, int how)
597 {
598 	struct mbuf **p, *top = NULL;
599 	int remain, moff, nsize;
600 
601 	MBUF_CHECKSLEEP(how);
602 	/* Sanity check */
603 	if (m == NULL)
604 		return (NULL);
605 	M_ASSERTPKTHDR(m);
606 
607 	/* While there's more data, get a new mbuf, tack it on, and fill it */
608 	remain = m->m_pkthdr.len;
609 	moff = 0;
610 	p = &top;
611 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
612 		struct mbuf *n;
613 
614 		/* Get the next new mbuf */
615 		if (remain >= MINCLSIZE) {
616 			n = m_getcl(how, m->m_type, 0);
617 			nsize = MCLBYTES;
618 		} else {
619 			n = m_get(how, m->m_type);
620 			nsize = MLEN;
621 		}
622 		if (n == NULL)
623 			goto nospace;
624 
625 		if (top == NULL) {		/* First one, must be PKTHDR */
626 			if (!m_dup_pkthdr(n, m, how)) {
627 				m_free(n);
628 				goto nospace;
629 			}
630 			if ((n->m_flags & M_EXT) == 0)
631 				nsize = MHLEN;
632 			n->m_flags &= ~M_RDONLY;
633 		}
634 		n->m_len = 0;
635 
636 		/* Link it into the new chain */
637 		*p = n;
638 		p = &n->m_next;
639 
640 		/* Copy data from original mbuf(s) into new mbuf */
641 		while (n->m_len < nsize && m != NULL) {
642 			int chunk = min(nsize - n->m_len, m->m_len - moff);
643 
644 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
645 			moff += chunk;
646 			n->m_len += chunk;
647 			remain -= chunk;
648 			if (moff == m->m_len) {
649 				m = m->m_next;
650 				moff = 0;
651 			}
652 		}
653 
654 		/* Check correct total mbuf length */
655 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
656 		    	("%s: bogus m_pkthdr.len", __func__));
657 	}
658 	return (top);
659 
660 nospace:
661 	m_freem(top);
662 	return (NULL);
663 }
664 
665 /*
666  * Concatenate mbuf chain n to m.
667  * Both chains must be of the same type (e.g. MT_DATA).
668  * Any m_pkthdr is not updated.
669  */
670 void
671 m_cat(struct mbuf *m, struct mbuf *n)
672 {
673 	while (m->m_next)
674 		m = m->m_next;
675 	while (n) {
676 		if (!M_WRITABLE(m) ||
677 		    M_TRAILINGSPACE(m) < n->m_len) {
678 			/* just join the two chains */
679 			m->m_next = n;
680 			return;
681 		}
682 		/* splat the data from one into the other */
683 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
684 		    (u_int)n->m_len);
685 		m->m_len += n->m_len;
686 		n = m_free(n);
687 	}
688 }
689 
690 /*
691  * Concatenate two pkthdr mbuf chains.
692  */
693 void
694 m_catpkt(struct mbuf *m, struct mbuf *n)
695 {
696 
697 	M_ASSERTPKTHDR(m);
698 	M_ASSERTPKTHDR(n);
699 
700 	m->m_pkthdr.len += n->m_pkthdr.len;
701 	m_demote(n, 1, 0);
702 
703 	m_cat(m, n);
704 }
705 
706 void
707 m_adj(struct mbuf *mp, int req_len)
708 {
709 	int len = req_len;
710 	struct mbuf *m;
711 	int count;
712 
713 	if ((m = mp) == NULL)
714 		return;
715 	if (len >= 0) {
716 		/*
717 		 * Trim from head.
718 		 */
719 		while (m != NULL && len > 0) {
720 			if (m->m_len <= len) {
721 				len -= m->m_len;
722 				m->m_len = 0;
723 				m = m->m_next;
724 			} else {
725 				m->m_len -= len;
726 				m->m_data += len;
727 				len = 0;
728 			}
729 		}
730 		if (mp->m_flags & M_PKTHDR)
731 			mp->m_pkthdr.len -= (req_len - len);
732 	} else {
733 		/*
734 		 * Trim from tail.  Scan the mbuf chain,
735 		 * calculating its length and finding the last mbuf.
736 		 * If the adjustment only affects this mbuf, then just
737 		 * adjust and return.  Otherwise, rescan and truncate
738 		 * after the remaining size.
739 		 */
740 		len = -len;
741 		count = 0;
742 		for (;;) {
743 			count += m->m_len;
744 			if (m->m_next == (struct mbuf *)0)
745 				break;
746 			m = m->m_next;
747 		}
748 		if (m->m_len >= len) {
749 			m->m_len -= len;
750 			if (mp->m_flags & M_PKTHDR)
751 				mp->m_pkthdr.len -= len;
752 			return;
753 		}
754 		count -= len;
755 		if (count < 0)
756 			count = 0;
757 		/*
758 		 * Correct length for chain is "count".
759 		 * Find the mbuf with last data, adjust its length,
760 		 * and toss data from remaining mbufs on chain.
761 		 */
762 		m = mp;
763 		if (m->m_flags & M_PKTHDR)
764 			m->m_pkthdr.len = count;
765 		for (; m; m = m->m_next) {
766 			if (m->m_len >= count) {
767 				m->m_len = count;
768 				if (m->m_next != NULL) {
769 					m_freem(m->m_next);
770 					m->m_next = NULL;
771 				}
772 				break;
773 			}
774 			count -= m->m_len;
775 		}
776 	}
777 }
778 
779 /*
780  * Rearange an mbuf chain so that len bytes are contiguous
781  * and in the data area of an mbuf (so that mtod will work
782  * for a structure of size len).  Returns the resulting
783  * mbuf chain on success, frees it and returns null on failure.
784  * If there is room, it will add up to max_protohdr-len extra bytes to the
785  * contiguous region in an attempt to avoid being called next time.
786  */
787 struct mbuf *
788 m_pullup(struct mbuf *n, int len)
789 {
790 	struct mbuf *m;
791 	int count;
792 	int space;
793 
794 	/*
795 	 * If first mbuf has no cluster, and has room for len bytes
796 	 * without shifting current data, pullup into it,
797 	 * otherwise allocate a new mbuf to prepend to the chain.
798 	 */
799 	if ((n->m_flags & M_EXT) == 0 &&
800 	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
801 		if (n->m_len >= len)
802 			return (n);
803 		m = n;
804 		n = n->m_next;
805 		len -= m->m_len;
806 	} else {
807 		if (len > MHLEN)
808 			goto bad;
809 		m = m_get(M_NOWAIT, n->m_type);
810 		if (m == NULL)
811 			goto bad;
812 		if (n->m_flags & M_PKTHDR)
813 			m_move_pkthdr(m, n);
814 	}
815 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
816 	do {
817 		count = min(min(max(len, max_protohdr), space), n->m_len);
818 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
819 		  (u_int)count);
820 		len -= count;
821 		m->m_len += count;
822 		n->m_len -= count;
823 		space -= count;
824 		if (n->m_len)
825 			n->m_data += count;
826 		else
827 			n = m_free(n);
828 	} while (len > 0 && n);
829 	if (len > 0) {
830 		(void) m_free(m);
831 		goto bad;
832 	}
833 	m->m_next = n;
834 	return (m);
835 bad:
836 	m_freem(n);
837 	return (NULL);
838 }
839 
840 /*
841  * Like m_pullup(), except a new mbuf is always allocated, and we allow
842  * the amount of empty space before the data in the new mbuf to be specified
843  * (in the event that the caller expects to prepend later).
844  */
845 struct mbuf *
846 m_copyup(struct mbuf *n, int len, int dstoff)
847 {
848 	struct mbuf *m;
849 	int count, space;
850 
851 	if (len > (MHLEN - dstoff))
852 		goto bad;
853 	m = m_get(M_NOWAIT, n->m_type);
854 	if (m == NULL)
855 		goto bad;
856 	if (n->m_flags & M_PKTHDR)
857 		m_move_pkthdr(m, n);
858 	m->m_data += dstoff;
859 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
860 	do {
861 		count = min(min(max(len, max_protohdr), space), n->m_len);
862 		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
863 		    (unsigned)count);
864 		len -= count;
865 		m->m_len += count;
866 		n->m_len -= count;
867 		space -= count;
868 		if (n->m_len)
869 			n->m_data += count;
870 		else
871 			n = m_free(n);
872 	} while (len > 0 && n);
873 	if (len > 0) {
874 		(void) m_free(m);
875 		goto bad;
876 	}
877 	m->m_next = n;
878 	return (m);
879  bad:
880 	m_freem(n);
881 	return (NULL);
882 }
883 
884 /*
885  * Partition an mbuf chain in two pieces, returning the tail --
886  * all but the first len0 bytes.  In case of failure, it returns NULL and
887  * attempts to restore the chain to its original state.
888  *
889  * Note that the resulting mbufs might be read-only, because the new
890  * mbuf can end up sharing an mbuf cluster with the original mbuf if
891  * the "breaking point" happens to lie within a cluster mbuf. Use the
892  * M_WRITABLE() macro to check for this case.
893  */
894 struct mbuf *
895 m_split(struct mbuf *m0, int len0, int wait)
896 {
897 	struct mbuf *m, *n;
898 	u_int len = len0, remain;
899 
900 	MBUF_CHECKSLEEP(wait);
901 	for (m = m0; m && len > m->m_len; m = m->m_next)
902 		len -= m->m_len;
903 	if (m == NULL)
904 		return (NULL);
905 	remain = m->m_len - len;
906 	if (m0->m_flags & M_PKTHDR && remain == 0) {
907 		n = m_gethdr(wait, m0->m_type);
908 		if (n == NULL)
909 			return (NULL);
910 		n->m_next = m->m_next;
911 		m->m_next = NULL;
912 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
913 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
914 		m0->m_pkthdr.len = len0;
915 		return (n);
916 	} else if (m0->m_flags & M_PKTHDR) {
917 		n = m_gethdr(wait, m0->m_type);
918 		if (n == NULL)
919 			return (NULL);
920 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
921 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
922 		m0->m_pkthdr.len = len0;
923 		if (m->m_flags & M_EXT)
924 			goto extpacket;
925 		if (remain > MHLEN) {
926 			/* m can't be the lead packet */
927 			M_ALIGN(n, 0);
928 			n->m_next = m_split(m, len, wait);
929 			if (n->m_next == NULL) {
930 				(void) m_free(n);
931 				return (NULL);
932 			} else {
933 				n->m_len = 0;
934 				return (n);
935 			}
936 		} else
937 			M_ALIGN(n, remain);
938 	} else if (remain == 0) {
939 		n = m->m_next;
940 		m->m_next = NULL;
941 		return (n);
942 	} else {
943 		n = m_get(wait, m->m_type);
944 		if (n == NULL)
945 			return (NULL);
946 		M_ALIGN(n, remain);
947 	}
948 extpacket:
949 	if (m->m_flags & M_EXT) {
950 		n->m_data = m->m_data + len;
951 		mb_dupcl(n, m);
952 	} else {
953 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
954 	}
955 	n->m_len = remain;
956 	m->m_len = len;
957 	n->m_next = m->m_next;
958 	m->m_next = NULL;
959 	return (n);
960 }
961 /*
962  * Routine to copy from device local memory into mbufs.
963  * Note that `off' argument is offset into first mbuf of target chain from
964  * which to begin copying the data to.
965  */
966 struct mbuf *
967 m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
968     void (*copy)(char *from, caddr_t to, u_int len))
969 {
970 	struct mbuf *m;
971 	struct mbuf *top = NULL, **mp = &top;
972 	int len;
973 
974 	if (off < 0 || off > MHLEN)
975 		return (NULL);
976 
977 	while (totlen > 0) {
978 		if (top == NULL) {	/* First one, must be PKTHDR */
979 			if (totlen + off >= MINCLSIZE) {
980 				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
981 				len = MCLBYTES;
982 			} else {
983 				m = m_gethdr(M_NOWAIT, MT_DATA);
984 				len = MHLEN;
985 
986 				/* Place initial small packet/header at end of mbuf */
987 				if (m && totlen + off + max_linkhdr <= MLEN) {
988 					m->m_data += max_linkhdr;
989 					len -= max_linkhdr;
990 				}
991 			}
992 			if (m == NULL)
993 				return NULL;
994 			m->m_pkthdr.rcvif = ifp;
995 			m->m_pkthdr.len = totlen;
996 		} else {
997 			if (totlen + off >= MINCLSIZE) {
998 				m = m_getcl(M_NOWAIT, MT_DATA, 0);
999 				len = MCLBYTES;
1000 			} else {
1001 				m = m_get(M_NOWAIT, MT_DATA);
1002 				len = MLEN;
1003 			}
1004 			if (m == NULL) {
1005 				m_freem(top);
1006 				return NULL;
1007 			}
1008 		}
1009 		if (off) {
1010 			m->m_data += off;
1011 			len -= off;
1012 			off = 0;
1013 		}
1014 		m->m_len = len = min(totlen, len);
1015 		if (copy)
1016 			copy(buf, mtod(m, caddr_t), (u_int)len);
1017 		else
1018 			bcopy(buf, mtod(m, caddr_t), (u_int)len);
1019 		buf += len;
1020 		*mp = m;
1021 		mp = &m->m_next;
1022 		totlen -= len;
1023 	}
1024 	return (top);
1025 }
1026 
1027 /*
1028  * Copy data from a buffer back into the indicated mbuf chain,
1029  * starting "off" bytes from the beginning, extending the mbuf
1030  * chain if necessary.
1031  */
1032 void
1033 m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
1034 {
1035 	int mlen;
1036 	struct mbuf *m = m0, *n;
1037 	int totlen = 0;
1038 
1039 	if (m0 == NULL)
1040 		return;
1041 	while (off > (mlen = m->m_len)) {
1042 		off -= mlen;
1043 		totlen += mlen;
1044 		if (m->m_next == NULL) {
1045 			n = m_get(M_NOWAIT, m->m_type);
1046 			if (n == NULL)
1047 				goto out;
1048 			bzero(mtod(n, caddr_t), MLEN);
1049 			n->m_len = min(MLEN, len + off);
1050 			m->m_next = n;
1051 		}
1052 		m = m->m_next;
1053 	}
1054 	while (len > 0) {
1055 		if (m->m_next == NULL && (len > m->m_len - off)) {
1056 			m->m_len += min(len - (m->m_len - off),
1057 			    M_TRAILINGSPACE(m));
1058 		}
1059 		mlen = min (m->m_len - off, len);
1060 		bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen);
1061 		cp += mlen;
1062 		len -= mlen;
1063 		mlen += off;
1064 		off = 0;
1065 		totlen += mlen;
1066 		if (len == 0)
1067 			break;
1068 		if (m->m_next == NULL) {
1069 			n = m_get(M_NOWAIT, m->m_type);
1070 			if (n == NULL)
1071 				break;
1072 			n->m_len = min(MLEN, len);
1073 			m->m_next = n;
1074 		}
1075 		m = m->m_next;
1076 	}
1077 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
1078 		m->m_pkthdr.len = totlen;
1079 }
1080 
1081 /*
1082  * Append the specified data to the indicated mbuf chain,
1083  * Extend the mbuf chain if the new data does not fit in
1084  * existing space.
1085  *
1086  * Return 1 if able to complete the job; otherwise 0.
1087  */
1088 int
1089 m_append(struct mbuf *m0, int len, c_caddr_t cp)
1090 {
1091 	struct mbuf *m, *n;
1092 	int remainder, space;
1093 
1094 	for (m = m0; m->m_next != NULL; m = m->m_next)
1095 		;
1096 	remainder = len;
1097 	space = M_TRAILINGSPACE(m);
1098 	if (space > 0) {
1099 		/*
1100 		 * Copy into available space.
1101 		 */
1102 		if (space > remainder)
1103 			space = remainder;
1104 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
1105 		m->m_len += space;
1106 		cp += space, remainder -= space;
1107 	}
1108 	while (remainder > 0) {
1109 		/*
1110 		 * Allocate a new mbuf; could check space
1111 		 * and allocate a cluster instead.
1112 		 */
1113 		n = m_get(M_NOWAIT, m->m_type);
1114 		if (n == NULL)
1115 			break;
1116 		n->m_len = min(MLEN, remainder);
1117 		bcopy(cp, mtod(n, caddr_t), n->m_len);
1118 		cp += n->m_len, remainder -= n->m_len;
1119 		m->m_next = n;
1120 		m = n;
1121 	}
1122 	if (m0->m_flags & M_PKTHDR)
1123 		m0->m_pkthdr.len += len - remainder;
1124 	return (remainder == 0);
1125 }
1126 
1127 /*
1128  * Apply function f to the data in an mbuf chain starting "off" bytes from
1129  * the beginning, continuing for "len" bytes.
1130  */
1131 int
1132 m_apply(struct mbuf *m, int off, int len,
1133     int (*f)(void *, void *, u_int), void *arg)
1134 {
1135 	u_int count;
1136 	int rval;
1137 
1138 	KASSERT(off >= 0, ("m_apply, negative off %d", off));
1139 	KASSERT(len >= 0, ("m_apply, negative len %d", len));
1140 	while (off > 0) {
1141 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
1142 		if (off < m->m_len)
1143 			break;
1144 		off -= m->m_len;
1145 		m = m->m_next;
1146 	}
1147 	while (len > 0) {
1148 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
1149 		count = min(m->m_len - off, len);
1150 		rval = (*f)(arg, mtod(m, caddr_t) + off, count);
1151 		if (rval)
1152 			return (rval);
1153 		len -= count;
1154 		off = 0;
1155 		m = m->m_next;
1156 	}
1157 	return (0);
1158 }
1159 
1160 /*
1161  * Return a pointer to mbuf/offset of location in mbuf chain.
1162  */
1163 struct mbuf *
1164 m_getptr(struct mbuf *m, int loc, int *off)
1165 {
1166 
1167 	while (loc >= 0) {
1168 		/* Normal end of search. */
1169 		if (m->m_len > loc) {
1170 			*off = loc;
1171 			return (m);
1172 		} else {
1173 			loc -= m->m_len;
1174 			if (m->m_next == NULL) {
1175 				if (loc == 0) {
1176 					/* Point at the end of valid data. */
1177 					*off = m->m_len;
1178 					return (m);
1179 				}
1180 				return (NULL);
1181 			}
1182 			m = m->m_next;
1183 		}
1184 	}
1185 	return (NULL);
1186 }
1187 
1188 void
1189 m_print(const struct mbuf *m, int maxlen)
1190 {
1191 	int len;
1192 	int pdata;
1193 	const struct mbuf *m2;
1194 
1195 	if (m == NULL) {
1196 		printf("mbuf: %p\n", m);
1197 		return;
1198 	}
1199 
1200 	if (m->m_flags & M_PKTHDR)
1201 		len = m->m_pkthdr.len;
1202 	else
1203 		len = -1;
1204 	m2 = m;
1205 	while (m2 != NULL && (len == -1 || len)) {
1206 		pdata = m2->m_len;
1207 		if (maxlen != -1 && pdata > maxlen)
1208 			pdata = maxlen;
1209 		printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len,
1210 		    m2->m_next, m2->m_flags, "\20\20freelist\17skipfw"
1211 		    "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly"
1212 		    "\3eor\2pkthdr\1ext", pdata ? "" : "\n");
1213 		if (pdata)
1214 			printf(", %*D\n", pdata, (u_char *)m2->m_data, "-");
1215 		if (len != -1)
1216 			len -= m2->m_len;
1217 		m2 = m2->m_next;
1218 	}
1219 	if (len > 0)
1220 		printf("%d bytes unaccounted for.\n", len);
1221 	return;
1222 }
1223 
1224 u_int
1225 m_fixhdr(struct mbuf *m0)
1226 {
1227 	u_int len;
1228 
1229 	len = m_length(m0, NULL);
1230 	m0->m_pkthdr.len = len;
1231 	return (len);
1232 }
1233 
1234 u_int
1235 m_length(struct mbuf *m0, struct mbuf **last)
1236 {
1237 	struct mbuf *m;
1238 	u_int len;
1239 
1240 	len = 0;
1241 	for (m = m0; m != NULL; m = m->m_next) {
1242 		len += m->m_len;
1243 		if (m->m_next == NULL)
1244 			break;
1245 	}
1246 	if (last != NULL)
1247 		*last = m;
1248 	return (len);
1249 }
1250 
1251 /*
1252  * Defragment a mbuf chain, returning the shortest possible
1253  * chain of mbufs and clusters.  If allocation fails and
1254  * this cannot be completed, NULL will be returned, but
1255  * the passed in chain will be unchanged.  Upon success,
1256  * the original chain will be freed, and the new chain
1257  * will be returned.
1258  *
1259  * If a non-packet header is passed in, the original
1260  * mbuf (chain?) will be returned unharmed.
1261  */
1262 struct mbuf *
1263 m_defrag(struct mbuf *m0, int how)
1264 {
1265 	struct mbuf *m_new = NULL, *m_final = NULL;
1266 	int progress = 0, length;
1267 
1268 	MBUF_CHECKSLEEP(how);
1269 	if (!(m0->m_flags & M_PKTHDR))
1270 		return (m0);
1271 
1272 	m_fixhdr(m0); /* Needed sanity check */
1273 
1274 #ifdef MBUF_STRESS_TEST
1275 	if (m_defragrandomfailures) {
1276 		int temp = arc4random() & 0xff;
1277 		if (temp == 0xba)
1278 			goto nospace;
1279 	}
1280 #endif
1281 
1282 	if (m0->m_pkthdr.len > MHLEN)
1283 		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
1284 	else
1285 		m_final = m_gethdr(how, MT_DATA);
1286 
1287 	if (m_final == NULL)
1288 		goto nospace;
1289 
1290 	if (m_dup_pkthdr(m_final, m0, how) == 0)
1291 		goto nospace;
1292 
1293 	m_new = m_final;
1294 
1295 	while (progress < m0->m_pkthdr.len) {
1296 		length = m0->m_pkthdr.len - progress;
1297 		if (length > MCLBYTES)
1298 			length = MCLBYTES;
1299 
1300 		if (m_new == NULL) {
1301 			if (length > MLEN)
1302 				m_new = m_getcl(how, MT_DATA, 0);
1303 			else
1304 				m_new = m_get(how, MT_DATA);
1305 			if (m_new == NULL)
1306 				goto nospace;
1307 		}
1308 
1309 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
1310 		progress += length;
1311 		m_new->m_len = length;
1312 		if (m_new != m_final)
1313 			m_cat(m_final, m_new);
1314 		m_new = NULL;
1315 	}
1316 #ifdef MBUF_STRESS_TEST
1317 	if (m0->m_next == NULL)
1318 		m_defraguseless++;
1319 #endif
1320 	m_freem(m0);
1321 	m0 = m_final;
1322 #ifdef MBUF_STRESS_TEST
1323 	m_defragpackets++;
1324 	m_defragbytes += m0->m_pkthdr.len;
1325 #endif
1326 	return (m0);
1327 nospace:
1328 #ifdef MBUF_STRESS_TEST
1329 	m_defragfailure++;
1330 #endif
1331 	if (m_final)
1332 		m_freem(m_final);
1333 	return (NULL);
1334 }
1335 
1336 /*
1337  * Defragment an mbuf chain, returning at most maxfrags separate
1338  * mbufs+clusters.  If this is not possible NULL is returned and
1339  * the original mbuf chain is left in it's present (potentially
1340  * modified) state.  We use two techniques: collapsing consecutive
1341  * mbufs and replacing consecutive mbufs by a cluster.
1342  *
1343  * NB: this should really be named m_defrag but that name is taken
1344  */
1345 struct mbuf *
1346 m_collapse(struct mbuf *m0, int how, int maxfrags)
1347 {
1348 	struct mbuf *m, *n, *n2, **prev;
1349 	u_int curfrags;
1350 
1351 	/*
1352 	 * Calculate the current number of frags.
1353 	 */
1354 	curfrags = 0;
1355 	for (m = m0; m != NULL; m = m->m_next)
1356 		curfrags++;
1357 	/*
1358 	 * First, try to collapse mbufs.  Note that we always collapse
1359 	 * towards the front so we don't need to deal with moving the
1360 	 * pkthdr.  This may be suboptimal if the first mbuf has much
1361 	 * less data than the following.
1362 	 */
1363 	m = m0;
1364 again:
1365 	for (;;) {
1366 		n = m->m_next;
1367 		if (n == NULL)
1368 			break;
1369 		if (M_WRITABLE(m) &&
1370 		    n->m_len < M_TRAILINGSPACE(m)) {
1371 			bcopy(mtod(n, void *), mtod(m, char *) + m->m_len,
1372 				n->m_len);
1373 			m->m_len += n->m_len;
1374 			m->m_next = n->m_next;
1375 			m_free(n);
1376 			if (--curfrags <= maxfrags)
1377 				return m0;
1378 		} else
1379 			m = n;
1380 	}
1381 	KASSERT(maxfrags > 1,
1382 		("maxfrags %u, but normal collapse failed", maxfrags));
1383 	/*
1384 	 * Collapse consecutive mbufs to a cluster.
1385 	 */
1386 	prev = &m0->m_next;		/* NB: not the first mbuf */
1387 	while ((n = *prev) != NULL) {
1388 		if ((n2 = n->m_next) != NULL &&
1389 		    n->m_len + n2->m_len < MCLBYTES) {
1390 			m = m_getcl(how, MT_DATA, 0);
1391 			if (m == NULL)
1392 				goto bad;
1393 			bcopy(mtod(n, void *), mtod(m, void *), n->m_len);
1394 			bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len,
1395 				n2->m_len);
1396 			m->m_len = n->m_len + n2->m_len;
1397 			m->m_next = n2->m_next;
1398 			*prev = m;
1399 			m_free(n);
1400 			m_free(n2);
1401 			if (--curfrags <= maxfrags)	/* +1 cl -2 mbufs */
1402 				return m0;
1403 			/*
1404 			 * Still not there, try the normal collapse
1405 			 * again before we allocate another cluster.
1406 			 */
1407 			goto again;
1408 		}
1409 		prev = &n->m_next;
1410 	}
1411 	/*
1412 	 * No place where we can collapse to a cluster; punt.
1413 	 * This can occur if, for example, you request 2 frags
1414 	 * but the packet requires that both be clusters (we
1415 	 * never reallocate the first mbuf to avoid moving the
1416 	 * packet header).
1417 	 */
1418 bad:
1419 	return NULL;
1420 }
1421 
1422 #ifdef MBUF_STRESS_TEST
1423 
1424 /*
1425  * Fragment an mbuf chain.  There's no reason you'd ever want to do
1426  * this in normal usage, but it's great for stress testing various
1427  * mbuf consumers.
1428  *
1429  * If fragmentation is not possible, the original chain will be
1430  * returned.
1431  *
1432  * Possible length values:
1433  * 0	 no fragmentation will occur
1434  * > 0	each fragment will be of the specified length
1435  * -1	each fragment will be the same random value in length
1436  * -2	each fragment's length will be entirely random
1437  * (Random values range from 1 to 256)
1438  */
1439 struct mbuf *
1440 m_fragment(struct mbuf *m0, int how, int length)
1441 {
1442 	struct mbuf *m_new = NULL, *m_final = NULL;
1443 	int progress = 0;
1444 
1445 	if (!(m0->m_flags & M_PKTHDR))
1446 		return (m0);
1447 
1448 	if ((length == 0) || (length < -2))
1449 		return (m0);
1450 
1451 	m_fixhdr(m0); /* Needed sanity check */
1452 
1453 	m_final = m_getcl(how, MT_DATA, M_PKTHDR);
1454 
1455 	if (m_final == NULL)
1456 		goto nospace;
1457 
1458 	if (m_dup_pkthdr(m_final, m0, how) == 0)
1459 		goto nospace;
1460 
1461 	m_new = m_final;
1462 
1463 	if (length == -1)
1464 		length = 1 + (arc4random() & 255);
1465 
1466 	while (progress < m0->m_pkthdr.len) {
1467 		int fraglen;
1468 
1469 		if (length > 0)
1470 			fraglen = length;
1471 		else
1472 			fraglen = 1 + (arc4random() & 255);
1473 		if (fraglen > m0->m_pkthdr.len - progress)
1474 			fraglen = m0->m_pkthdr.len - progress;
1475 
1476 		if (fraglen > MCLBYTES)
1477 			fraglen = MCLBYTES;
1478 
1479 		if (m_new == NULL) {
1480 			m_new = m_getcl(how, MT_DATA, 0);
1481 			if (m_new == NULL)
1482 				goto nospace;
1483 		}
1484 
1485 		m_copydata(m0, progress, fraglen, mtod(m_new, caddr_t));
1486 		progress += fraglen;
1487 		m_new->m_len = fraglen;
1488 		if (m_new != m_final)
1489 			m_cat(m_final, m_new);
1490 		m_new = NULL;
1491 	}
1492 	m_freem(m0);
1493 	m0 = m_final;
1494 	return (m0);
1495 nospace:
1496 	if (m_final)
1497 		m_freem(m_final);
1498 	/* Return the original chain on failure */
1499 	return (m0);
1500 }
1501 
1502 #endif
1503 
1504 /*
1505  * Copy the contents of uio into a properly sized mbuf chain.
1506  */
1507 struct mbuf *
1508 m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
1509 {
1510 	struct mbuf *m, *mb;
1511 	int error, length;
1512 	ssize_t total;
1513 	int progress = 0;
1514 
1515 	/*
1516 	 * len can be zero or an arbitrary large value bound by
1517 	 * the total data supplied by the uio.
1518 	 */
1519 	if (len > 0)
1520 		total = min(uio->uio_resid, len);
1521 	else
1522 		total = uio->uio_resid;
1523 
1524 	/*
1525 	 * The smallest unit returned by m_getm2() is a single mbuf
1526 	 * with pkthdr.  We can't align past it.
1527 	 */
1528 	if (align >= MHLEN)
1529 		return (NULL);
1530 
1531 	/*
1532 	 * Give us the full allocation or nothing.
1533 	 * If len is zero return the smallest empty mbuf.
1534 	 */
1535 	m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags);
1536 	if (m == NULL)
1537 		return (NULL);
1538 	m->m_data += align;
1539 
1540 	/* Fill all mbufs with uio data and update header information. */
1541 	for (mb = m; mb != NULL; mb = mb->m_next) {
1542 		length = min(M_TRAILINGSPACE(mb), total - progress);
1543 
1544 		error = uiomove(mtod(mb, void *), length, uio);
1545 		if (error) {
1546 			m_freem(m);
1547 			return (NULL);
1548 		}
1549 
1550 		mb->m_len = length;
1551 		progress += length;
1552 		if (flags & M_PKTHDR)
1553 			m->m_pkthdr.len += length;
1554 	}
1555 	KASSERT(progress == total, ("%s: progress != total", __func__));
1556 
1557 	return (m);
1558 }
1559 
1560 /*
1561  * Copy an mbuf chain into a uio limited by len if set.
1562  */
1563 int
1564 m_mbuftouio(struct uio *uio, struct mbuf *m, int len)
1565 {
1566 	int error, length, total;
1567 	int progress = 0;
1568 
1569 	if (len > 0)
1570 		total = min(uio->uio_resid, len);
1571 	else
1572 		total = uio->uio_resid;
1573 
1574 	/* Fill the uio with data from the mbufs. */
1575 	for (; m != NULL; m = m->m_next) {
1576 		length = min(m->m_len, total - progress);
1577 
1578 		error = uiomove(mtod(m, void *), length, uio);
1579 		if (error)
1580 			return (error);
1581 
1582 		progress += length;
1583 	}
1584 
1585 	return (0);
1586 }
1587 
1588 /*
1589  * Create a writable copy of the mbuf chain.  While doing this
1590  * we compact the chain with a goal of producing a chain with
1591  * at most two mbufs.  The second mbuf in this chain is likely
1592  * to be a cluster.  The primary purpose of this work is to create
1593  * a writable packet for encryption, compression, etc.  The
1594  * secondary goal is to linearize the data so the data can be
1595  * passed to crypto hardware in the most efficient manner possible.
1596  */
1597 struct mbuf *
1598 m_unshare(struct mbuf *m0, int how)
1599 {
1600 	struct mbuf *m, *mprev;
1601 	struct mbuf *n, *mfirst, *mlast;
1602 	int len, off;
1603 
1604 	mprev = NULL;
1605 	for (m = m0; m != NULL; m = mprev->m_next) {
1606 		/*
1607 		 * Regular mbufs are ignored unless there's a cluster
1608 		 * in front of it that we can use to coalesce.  We do
1609 		 * the latter mainly so later clusters can be coalesced
1610 		 * also w/o having to handle them specially (i.e. convert
1611 		 * mbuf+cluster -> cluster).  This optimization is heavily
1612 		 * influenced by the assumption that we're running over
1613 		 * Ethernet where MCLBYTES is large enough that the max
1614 		 * packet size will permit lots of coalescing into a
1615 		 * single cluster.  This in turn permits efficient
1616 		 * crypto operations, especially when using hardware.
1617 		 */
1618 		if ((m->m_flags & M_EXT) == 0) {
1619 			if (mprev && (mprev->m_flags & M_EXT) &&
1620 			    m->m_len <= M_TRAILINGSPACE(mprev)) {
1621 				/* XXX: this ignores mbuf types */
1622 				memcpy(mtod(mprev, caddr_t) + mprev->m_len,
1623 				    mtod(m, caddr_t), m->m_len);
1624 				mprev->m_len += m->m_len;
1625 				mprev->m_next = m->m_next;	/* unlink from chain */
1626 				m_free(m);			/* reclaim mbuf */
1627 #if 0
1628 				newipsecstat.ips_mbcoalesced++;
1629 #endif
1630 			} else {
1631 				mprev = m;
1632 			}
1633 			continue;
1634 		}
1635 		/*
1636 		 * Writable mbufs are left alone (for now).
1637 		 */
1638 		if (M_WRITABLE(m)) {
1639 			mprev = m;
1640 			continue;
1641 		}
1642 
1643 		/*
1644 		 * Not writable, replace with a copy or coalesce with
1645 		 * the previous mbuf if possible (since we have to copy
1646 		 * it anyway, we try to reduce the number of mbufs and
1647 		 * clusters so that future work is easier).
1648 		 */
1649 		KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
1650 		/* NB: we only coalesce into a cluster or larger */
1651 		if (mprev != NULL && (mprev->m_flags & M_EXT) &&
1652 		    m->m_len <= M_TRAILINGSPACE(mprev)) {
1653 			/* XXX: this ignores mbuf types */
1654 			memcpy(mtod(mprev, caddr_t) + mprev->m_len,
1655 			    mtod(m, caddr_t), m->m_len);
1656 			mprev->m_len += m->m_len;
1657 			mprev->m_next = m->m_next;	/* unlink from chain */
1658 			m_free(m);			/* reclaim mbuf */
1659 #if 0
1660 			newipsecstat.ips_clcoalesced++;
1661 #endif
1662 			continue;
1663 		}
1664 
1665 		/*
1666 		 * Allocate new space to hold the copy and copy the data.
1667 		 * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by
1668 		 * splitting them into clusters.  We could just malloc a
1669 		 * buffer and make it external but too many device drivers
1670 		 * don't know how to break up the non-contiguous memory when
1671 		 * doing DMA.
1672 		 */
1673 		n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
1674 		if (n == NULL) {
1675 			m_freem(m0);
1676 			return (NULL);
1677 		}
1678 		if (m->m_flags & M_PKTHDR) {
1679 			KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR",
1680 			    __func__, m0, m));
1681 			m_move_pkthdr(n, m);
1682 		}
1683 		len = m->m_len;
1684 		off = 0;
1685 		mfirst = n;
1686 		mlast = NULL;
1687 		for (;;) {
1688 			int cc = min(len, MCLBYTES);
1689 			memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
1690 			n->m_len = cc;
1691 			if (mlast != NULL)
1692 				mlast->m_next = n;
1693 			mlast = n;
1694 #if 0
1695 			newipsecstat.ips_clcopied++;
1696 #endif
1697 
1698 			len -= cc;
1699 			if (len <= 0)
1700 				break;
1701 			off += cc;
1702 
1703 			n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
1704 			if (n == NULL) {
1705 				m_freem(mfirst);
1706 				m_freem(m0);
1707 				return (NULL);
1708 			}
1709 		}
1710 		n->m_next = m->m_next;
1711 		if (mprev == NULL)
1712 			m0 = mfirst;		/* new head of chain */
1713 		else
1714 			mprev->m_next = mfirst;	/* replace old mbuf */
1715 		m_free(m);			/* release old mbuf */
1716 		mprev = mfirst;
1717 	}
1718 	return (m0);
1719 }
1720 
1721 #ifdef MBUF_PROFILING
1722 
/*
 * Histogram storage for mbuf chain profiling.  m_profile() bumps one
 * counter in each array for every chain it inspects: "wasted" and "used"
 * are indexed by fls() of the (capped) byte count, "segments" by the
 * (capped) number of mbufs in the chain.
 */
#define MP_BUCKETS 32 /* don't just change this as things may overflow.*/
struct mbufprofile {
	uintmax_t wasted[MP_BUCKETS];	/* unused buffer bytes per chain */
	uintmax_t used[MP_BUCKETS];	/* data bytes per chain */
	uintmax_t segments[MP_BUCKETS];	/* mbufs per chain */
} mbprof;

/*
 * Sizing of the text buffer that mbprof_textify() formats the counters
 * into.
 */
/* Digits allowed per counter; 20 suffice if uintmax_t is 64 bits wide. */
#define MP_MAXDIGITS 21
#define MP_NUMLINES 6		/* lines of counters (2 per array) */
#define MP_NUMSPERLINE 16	/* counters printed on each line */
#define MP_EXTRABYTES 64	/* > strlen("used:\nwasted:\nsegments:\n") */
/* work out max space needed and add a bit of spare space too */
#define MP_MAXLINE ((MP_MAXDIGITS+1) * MP_NUMSPERLINE)
#define MP_BUFSIZE ((MP_MAXLINE * MP_NUMLINES) + 1 + MP_EXTRABYTES)

/* Text rendering of mbprof, rebuilt by mbprof_textify(). */
char mbprofbuf[MP_BUFSIZE];
1739 
1740 void
1741 m_profile(struct mbuf *m)
1742 {
1743 	int segments = 0;
1744 	int used = 0;
1745 	int wasted = 0;
1746 
1747 	while (m) {
1748 		segments++;
1749 		used += m->m_len;
1750 		if (m->m_flags & M_EXT) {
1751 			wasted += MHLEN - sizeof(m->m_ext) +
1752 			    m->m_ext.ext_size - m->m_len;
1753 		} else {
1754 			if (m->m_flags & M_PKTHDR)
1755 				wasted += MHLEN - m->m_len;
1756 			else
1757 				wasted += MLEN - m->m_len;
1758 		}
1759 		m = m->m_next;
1760 	}
1761 	/* be paranoid.. it helps */
1762 	if (segments > MP_BUCKETS - 1)
1763 		segments = MP_BUCKETS - 1;
1764 	if (used > 100000)
1765 		used = 100000;
1766 	if (wasted > 100000)
1767 		wasted = 100000;
1768 	/* store in the appropriate bucket */
1769 	/* don't bother locking. if it's slightly off, so what? */
1770 	mbprof.segments[segments]++;
1771 	mbprof.used[fls(used)]++;
1772 	mbprof.wasted[fls(wasted)]++;
1773 }
1774 
1775 static void
1776 mbprof_textify(void)
1777 {
1778 	int offset;
1779 	char *c;
1780 	uint64_t *p;
1781 
1782 	p = &mbprof.wasted[0];
1783 	c = mbprofbuf;
1784 	offset = snprintf(c, MP_MAXLINE + 10,
1785 	    "wasted:\n"
1786 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1787 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
1788 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1789 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1790 #ifdef BIG_ARRAY
1791 	p = &mbprof.wasted[16];
1792 	c += offset;
1793 	offset = snprintf(c, MP_MAXLINE,
1794 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1795 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
1796 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1797 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1798 #endif
1799 	p = &mbprof.used[0];
1800 	c += offset;
1801 	offset = snprintf(c, MP_MAXLINE + 10,
1802 	    "used:\n"
1803 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1804 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
1805 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1806 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1807 #ifdef BIG_ARRAY
1808 	p = &mbprof.used[16];
1809 	c += offset;
1810 	offset = snprintf(c, MP_MAXLINE,
1811 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1812 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
1813 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1814 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1815 #endif
1816 	p = &mbprof.segments[0];
1817 	c += offset;
1818 	offset = snprintf(c, MP_MAXLINE + 10,
1819 	    "segments:\n"
1820 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1821 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
1822 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1823 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1824 #ifdef BIG_ARRAY
1825 	p = &mbprof.segments[16];
1826 	c += offset;
1827 	offset = snprintf(c, MP_MAXLINE,
1828 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1829 	    "%ju %ju %ju %ju %ju %ju %ju %jju",
1830 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1831 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1832 #endif
1833 }
1834 
1835 static int
1836 mbprof_handler(SYSCTL_HANDLER_ARGS)
1837 {
1838 	int error;
1839 
1840 	mbprof_textify();
1841 	error = SYSCTL_OUT(req, mbprofbuf, strlen(mbprofbuf) + 1);
1842 	return (error);
1843 }
1844 
1845 static int
1846 mbprof_clr_handler(SYSCTL_HANDLER_ARGS)
1847 {
1848 	int clear, error;
1849 
1850 	clear = 0;
1851 	error = sysctl_handle_int(oidp, &clear, 0, req);
1852 	if (error || !req->newptr)
1853 		return (error);
1854 
1855 	if (clear) {
1856 		bzero(&mbprof, sizeof(mbprof));
1857 	}
1858 
1859 	return (error);
1860 }
1861 
1862 
/*
 * Read-only string node "mbufprofile" under the _kern_ipc parent; reads
 * are served by mbprof_handler().
 */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile, CTLTYPE_STRING|CTLFLAG_RD,
	    NULL, 0, mbprof_handler, "A", "mbuf profiling statistics");

/*
 * Read-write integer node "mbufprofileclr" under the same parent;
 * accesses are served by mbprof_clr_handler().
 */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr, CTLTYPE_INT|CTLFLAG_RW,
	    NULL, 0, mbprof_clr_handler, "I", "clear mbuf profiling statistics");
1868 #endif
1869 
1870