xref: /freebsd/sys/kern/uipc_mbuf.c (revision acc1a9ef8333c798c210fa94be6af4d5fe2dd794)
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include "opt_param.h"
36 #include "opt_mbuf_stress_test.h"
37 #include "opt_mbuf_profiling.h"
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/kernel.h>
42 #include <sys/limits.h>
43 #include <sys/lock.h>
44 #include <sys/malloc.h>
45 #include <sys/mbuf.h>
46 #include <sys/sysctl.h>
47 #include <sys/domain.h>
48 #include <sys/protosw.h>
49 #include <sys/uio.h>
50 
51 #include <security/mac/mac_framework.h>
52 
53 int	max_linkhdr;
54 int	max_protohdr;
55 int	max_hdr;
56 int	max_datalen;
57 #ifdef MBUF_STRESS_TEST
58 int	m_defragpackets;
59 int	m_defragbytes;
60 int	m_defraguseless;
61 int	m_defragfailure;
62 int	m_defragrandomfailures;
63 #endif
64 
65 /*
66  * sysctl(8) exported objects
67  */
68 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD,
69 	   &max_linkhdr, 0, "Size of largest link layer header");
70 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD,
71 	   &max_protohdr, 0, "Size of largest protocol layer header");
72 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD,
73 	   &max_hdr, 0, "Size of largest link plus protocol header");
74 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RD,
75 	   &max_datalen, 0, "Minimum space left in mbuf after max_hdr");
76 #ifdef MBUF_STRESS_TEST
77 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
78 	   &m_defragpackets, 0, "");
79 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
80 	   &m_defragbytes, 0, "");
81 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
82 	   &m_defraguseless, 0, "");
83 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
84 	   &m_defragfailure, 0, "");
85 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
86 	   &m_defragrandomfailures, 0, "");
87 #endif
88 
89 /*
90  * Ensure the correct size of various mbuf parameters.  It could be off due
91  * to compiler-induced padding and alignment artifacts.
92  */
93 CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN);
94 CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN);
95 
96 /*
97  * mbuf data storage should be 64-bit aligned regardless of architectural
98  * pointer size; check this is the case with and without a packet header.
99  */
100 CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0);
101 CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0);
102 
103 /*
104  * While the specific values here don't matter too much (i.e., +/- a few
105  * words), we do want to ensure that changes to these values are carefully
106  * reasoned about and properly documented.  This is especially the case as
107  * network-protocol and device-driver modules encode these layouts, and must
108  * be recompiled if the structures change.  Check these values at compile time
109  * against the ones documented in comments in mbuf.h.
110  *
111  * NB: Possibly they should be documented there via #define's and not just
112  * comments.
113  */
114 #if defined(__LP64__)
115 CTASSERT(offsetof(struct mbuf, m_dat) == 32);
116 CTASSERT(sizeof(struct pkthdr) == 56);
117 CTASSERT(sizeof(struct m_ext) == 48);
118 #else
119 CTASSERT(offsetof(struct mbuf, m_dat) == 24);
120 CTASSERT(sizeof(struct pkthdr) == 48);
121 CTASSERT(sizeof(struct m_ext) == 28);
122 #endif
123 
124 /*
125  * Assert that the queue(3) macros produce code of the same size as an old
126  * plain pointer does.
127  */
128 #ifdef INVARIANTS
129 static struct mbuf m_assertbuf;
130 CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next));
131 CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next));
132 CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt));
133 CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt));
134 #endif
135 
136 /*
137  * Attach the cluster from *m to *n, set up m_ext in *n
138  * and bump the refcount of the cluster.
139  */
140 void
141 mb_dupcl(struct mbuf *n, struct mbuf *m)
142 {
143 	volatile u_int *refcnt;
144 
145 	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));
146 	KASSERT(!(n->m_flags & M_EXT), ("%s: M_EXT set on %p", __func__, n));
147 
148 	n->m_ext = m->m_ext;
149 	n->m_flags |= M_EXT;
150 	n->m_flags |= m->m_flags & M_RDONLY;
151 
152 	/* See if this is the mbuf that holds the embedded refcount. */
153 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
154 		refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count;
155 		n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF;
156 	} else {
157 		KASSERT(m->m_ext.ext_cnt != NULL,
158 		    ("%s: no refcounting pointer on %p", __func__, m));
159 		refcnt = m->m_ext.ext_cnt;
160 	}
161 
162 	if (*refcnt == 1)
163 		*refcnt += 1;
164 	else
165 		atomic_add_int(refcnt, 1);
166 }
167 
168 void
169 m_demote_pkthdr(struct mbuf *m)
170 {
171 
172 	M_ASSERTPKTHDR(m);
173 
174 	m_tag_delete_chain(m, NULL);
175 	m->m_flags &= ~M_PKTHDR;
176 	bzero(&m->m_pkthdr, sizeof(struct pkthdr));
177 }
178 
179 /*
180  * Clean up mbuf (chain) from any tags and packet headers.
181  * If "all" is set then the first mbuf in the chain will be
182  * cleaned too.
183  */
184 void
185 m_demote(struct mbuf *m0, int all, int flags)
186 {
187 	struct mbuf *m;
188 
189 	for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) {
190 		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p",
191 		    __func__, m, m0));
192 		if (m->m_flags & M_PKTHDR)
193 			m_demote_pkthdr(m);
194 		m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags);
195 	}
196 }
197 
198 /*
199  * Sanity checks on mbuf (chain) for use in KASSERT() and general
200  * debugging.
201  * Returns 0 or panics when bad and 1 on all tests passed.
202  * Sanitize, 0 to run M_SANITY_ACTION, 1 to garble things so they
203  * blow up later.
204  */
205 int
206 m_sanity(struct mbuf *m0, int sanitize)
207 {
208 	struct mbuf *m;
209 	caddr_t a, b;
210 	int pktlen = 0;
211 
212 #ifdef INVARIANTS
213 #define	M_SANITY_ACTION(s)	panic("mbuf %p: " s, m)
214 #else
215 #define	M_SANITY_ACTION(s)	printf("mbuf %p: " s, m)
216 #endif
217 
218 	for (m = m0; m != NULL; m = m->m_next) {
219 		/*
220 		 * Basic pointer checks.  If any of these fails then some
221 		 * unrelated kernel memory before or after us is trashed.
222 		 * No way to recover from that.
223 		 */
224 		a = M_START(m);
225 		b = a + M_SIZE(m);
226 		if ((caddr_t)m->m_data < a)
227 			M_SANITY_ACTION("m_data outside mbuf data range left");
228 		if ((caddr_t)m->m_data > b)
229 			M_SANITY_ACTION("m_data outside mbuf data range right");
230 		if ((caddr_t)m->m_data + m->m_len > b)
231 			M_SANITY_ACTION("m_data + m_len exeeds mbuf space");
232 
233 		/* m->m_nextpkt may only be set on first mbuf in chain. */
234 		if (m != m0 && m->m_nextpkt != NULL) {
235 			if (sanitize) {
236 				m_freem(m->m_nextpkt);
237 				m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
238 			} else
239 				M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
240 		}
241 
242 		/* packet length (not mbuf length!) calculation */
243 		if (m0->m_flags & M_PKTHDR)
244 			pktlen += m->m_len;
245 
246 		/* m_tags may only be attached to first mbuf in chain. */
247 		if (m != m0 && m->m_flags & M_PKTHDR &&
248 		    !SLIST_EMPTY(&m->m_pkthdr.tags)) {
249 			if (sanitize) {
250 				m_tag_delete_chain(m, NULL);
251 				/* put in 0xDEADC0DE perhaps? */
252 			} else
253 				M_SANITY_ACTION("m_tags on in-chain mbuf");
254 		}
255 
256 		/* M_PKTHDR may only be set on first mbuf in chain */
257 		if (m != m0 && m->m_flags & M_PKTHDR) {
258 			if (sanitize) {
259 				bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
260 				m->m_flags &= ~M_PKTHDR;
261 				/* put in 0xDEADCODE and leave hdr flag in */
262 			} else
263 				M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
264 		}
265 	}
266 	m = m0;
267 	if (pktlen && pktlen != m->m_pkthdr.len) {
268 		if (sanitize)
269 			m->m_pkthdr.len = 0;
270 		else
271 			M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
272 	}
273 	return 1;
274 
275 #undef	M_SANITY_ACTION
276 }
277 
278 /*
279  * Non-inlined part of m_init().
280  */
281 int
282 m_pkthdr_init(struct mbuf *m, int how)
283 {
284 #ifdef MAC
285 	int error;
286 #endif
287 	m->m_data = m->m_pktdat;
288 	bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
289 #ifdef MAC
290 	/* If the label init fails, fail the alloc */
291 	error = mac_mbuf_init(m, how);
292 	if (error)
293 		return (error);
294 #endif
295 
296 	return (0);
297 }
298 
299 /*
300  * "Move" mbuf pkthdr from "from" to "to".
301  * "from" must have M_PKTHDR set, and "to" must be empty.
302  */
303 void
304 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
305 {
306 
307 #if 0
308 	/* see below for why these are not enabled */
309 	M_ASSERTPKTHDR(to);
310 	/* Note: with MAC, this may not be a good assertion. */
311 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
312 	    ("m_move_pkthdr: to has tags"));
313 #endif
314 #ifdef MAC
315 	/*
316 	 * XXXMAC: It could be this should also occur for non-MAC?
317 	 */
318 	if (to->m_flags & M_PKTHDR)
319 		m_tag_delete_chain(to, NULL);
320 #endif
321 	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
322 	if ((to->m_flags & M_EXT) == 0)
323 		to->m_data = to->m_pktdat;
324 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
325 	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
326 	from->m_flags &= ~M_PKTHDR;
327 }
328 
329 /*
330  * Duplicate "from"'s mbuf pkthdr in "to".
331  * "from" must have M_PKTHDR set, and "to" must be empty.
332  * In particular, this does a deep copy of the packet tags.
333  */
334 int
335 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
336 {
337 
338 #if 0
339 	/*
340 	 * The mbuf allocator only initializes the pkthdr
341 	 * when the mbuf is allocated with m_gethdr(). Many users
342 	 * (e.g. m_copy*, m_prepend) use m_get() and then
343 	 * smash the pkthdr as needed causing these
344 	 * assertions to trip.  For now just disable them.
345 	 */
346 	M_ASSERTPKTHDR(to);
347 	/* Note: with MAC, this may not be a good assertion. */
348 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags"));
349 #endif
350 	MBUF_CHECKSLEEP(how);
351 #ifdef MAC
352 	if (to->m_flags & M_PKTHDR)
353 		m_tag_delete_chain(to, NULL);
354 #endif
355 	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
356 	if ((to->m_flags & M_EXT) == 0)
357 		to->m_data = to->m_pktdat;
358 	to->m_pkthdr = from->m_pkthdr;
359 	SLIST_INIT(&to->m_pkthdr.tags);
360 	return (m_tag_copy_chain(to, from, how));
361 }
362 
363 /*
364  * Lesser-used path for M_PREPEND:
365  * allocate new mbuf to prepend to chain,
366  * copy junk along.
367  */
368 struct mbuf *
369 m_prepend(struct mbuf *m, int len, int how)
370 {
371 	struct mbuf *mn;
372 
373 	if (m->m_flags & M_PKTHDR)
374 		mn = m_gethdr(how, m->m_type);
375 	else
376 		mn = m_get(how, m->m_type);
377 	if (mn == NULL) {
378 		m_freem(m);
379 		return (NULL);
380 	}
381 	if (m->m_flags & M_PKTHDR)
382 		m_move_pkthdr(mn, m);
383 	mn->m_next = m;
384 	m = mn;
385 	if (len < M_SIZE(m))
386 		M_ALIGN(m, len);
387 	m->m_len = len;
388 	return (m);
389 }
390 
391 /*
392  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
393  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
394  * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller.
395  * Note that the copy is read-only, because clusters are not copied,
396  * only their reference counts are incremented.
397  */
398 struct mbuf *
399 m_copym(struct mbuf *m, int off0, int len, int wait)
400 {
401 	struct mbuf *n, **np;
402 	int off = off0;
403 	struct mbuf *top;
404 	int copyhdr = 0;
405 
406 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
407 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
408 	MBUF_CHECKSLEEP(wait);
409 	if (off == 0 && m->m_flags & M_PKTHDR)
410 		copyhdr = 1;
411 	while (off > 0) {
412 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
413 		if (off < m->m_len)
414 			break;
415 		off -= m->m_len;
416 		m = m->m_next;
417 	}
418 	np = &top;
419 	top = 0;
420 	while (len > 0) {
421 		if (m == NULL) {
422 			KASSERT(len == M_COPYALL,
423 			    ("m_copym, length > size of mbuf chain"));
424 			break;
425 		}
426 		if (copyhdr)
427 			n = m_gethdr(wait, m->m_type);
428 		else
429 			n = m_get(wait, m->m_type);
430 		*np = n;
431 		if (n == NULL)
432 			goto nospace;
433 		if (copyhdr) {
434 			if (!m_dup_pkthdr(n, m, wait))
435 				goto nospace;
436 			if (len == M_COPYALL)
437 				n->m_pkthdr.len -= off0;
438 			else
439 				n->m_pkthdr.len = len;
440 			copyhdr = 0;
441 		}
442 		n->m_len = min(len, m->m_len - off);
443 		if (m->m_flags & M_EXT) {
444 			n->m_data = m->m_data + off;
445 			mb_dupcl(n, m);
446 		} else
447 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
448 			    (u_int)n->m_len);
449 		if (len != M_COPYALL)
450 			len -= n->m_len;
451 		off = 0;
452 		m = m->m_next;
453 		np = &n->m_next;
454 	}
455 
456 	return (top);
457 nospace:
458 	m_freem(top);
459 	return (NULL);
460 }
461 
462 /*
463  * Copy an entire packet, including header (which must be present).
464  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
465  * Note that the copy is read-only, because clusters are not copied,
466  * only their reference counts are incremented.
467  * Preserve alignment of the first mbuf so if the creator has left
468  * some room at the beginning (e.g. for inserting protocol headers)
469  * the copies still have the room available.
470  */
471 struct mbuf *
472 m_copypacket(struct mbuf *m, int how)
473 {
474 	struct mbuf *top, *n, *o;
475 
476 	MBUF_CHECKSLEEP(how);
477 	n = m_get(how, m->m_type);
478 	top = n;
479 	if (n == NULL)
480 		goto nospace;
481 
482 	if (!m_dup_pkthdr(n, m, how))
483 		goto nospace;
484 	n->m_len = m->m_len;
485 	if (m->m_flags & M_EXT) {
486 		n->m_data = m->m_data;
487 		mb_dupcl(n, m);
488 	} else {
489 		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
490 		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
491 	}
492 
493 	m = m->m_next;
494 	while (m) {
495 		o = m_get(how, m->m_type);
496 		if (o == NULL)
497 			goto nospace;
498 
499 		n->m_next = o;
500 		n = n->m_next;
501 
502 		n->m_len = m->m_len;
503 		if (m->m_flags & M_EXT) {
504 			n->m_data = m->m_data;
505 			mb_dupcl(n, m);
506 		} else {
507 			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
508 		}
509 
510 		m = m->m_next;
511 	}
512 	return top;
513 nospace:
514 	m_freem(top);
515 	return (NULL);
516 }
517 
518 /*
519  * Copy data from an mbuf chain starting "off" bytes from the beginning,
520  * continuing for "len" bytes, into the indicated buffer.
521  */
522 void
523 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
524 {
525 	u_int count;
526 
527 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
528 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
529 	while (off > 0) {
530 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
531 		if (off < m->m_len)
532 			break;
533 		off -= m->m_len;
534 		m = m->m_next;
535 	}
536 	while (len > 0) {
537 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
538 		count = min(m->m_len - off, len);
539 		bcopy(mtod(m, caddr_t) + off, cp, count);
540 		len -= count;
541 		cp += count;
542 		off = 0;
543 		m = m->m_next;
544 	}
545 }
546 
547 /*
548  * Copy a packet header mbuf chain into a completely new chain, including
549  * copying any mbuf clusters.  Use this instead of m_copypacket() when
550  * you need a writable copy of an mbuf chain.
551  */
552 struct mbuf *
553 m_dup(const struct mbuf *m, int how)
554 {
555 	struct mbuf **p, *top = NULL;
556 	int remain, moff, nsize;
557 
558 	MBUF_CHECKSLEEP(how);
559 	/* Sanity check */
560 	if (m == NULL)
561 		return (NULL);
562 	M_ASSERTPKTHDR(m);
563 
564 	/* While there's more data, get a new mbuf, tack it on, and fill it */
565 	remain = m->m_pkthdr.len;
566 	moff = 0;
567 	p = &top;
568 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
569 		struct mbuf *n;
570 
571 		/* Get the next new mbuf */
572 		if (remain >= MINCLSIZE) {
573 			n = m_getcl(how, m->m_type, 0);
574 			nsize = MCLBYTES;
575 		} else {
576 			n = m_get(how, m->m_type);
577 			nsize = MLEN;
578 		}
579 		if (n == NULL)
580 			goto nospace;
581 
582 		if (top == NULL) {		/* First one, must be PKTHDR */
583 			if (!m_dup_pkthdr(n, m, how)) {
584 				m_free(n);
585 				goto nospace;
586 			}
587 			if ((n->m_flags & M_EXT) == 0)
588 				nsize = MHLEN;
589 			n->m_flags &= ~M_RDONLY;
590 		}
591 		n->m_len = 0;
592 
593 		/* Link it into the new chain */
594 		*p = n;
595 		p = &n->m_next;
596 
597 		/* Copy data from original mbuf(s) into new mbuf */
598 		while (n->m_len < nsize && m != NULL) {
599 			int chunk = min(nsize - n->m_len, m->m_len - moff);
600 
601 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
602 			moff += chunk;
603 			n->m_len += chunk;
604 			remain -= chunk;
605 			if (moff == m->m_len) {
606 				m = m->m_next;
607 				moff = 0;
608 			}
609 		}
610 
611 		/* Check correct total mbuf length */
612 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
613 		    	("%s: bogus m_pkthdr.len", __func__));
614 	}
615 	return (top);
616 
617 nospace:
618 	m_freem(top);
619 	return (NULL);
620 }
621 
622 /*
623  * Concatenate mbuf chain n to m.
624  * Both chains must be of the same type (e.g. MT_DATA).
625  * Any m_pkthdr is not updated.
626  */
627 void
628 m_cat(struct mbuf *m, struct mbuf *n)
629 {
630 	while (m->m_next)
631 		m = m->m_next;
632 	while (n) {
633 		if (!M_WRITABLE(m) ||
634 		    M_TRAILINGSPACE(m) < n->m_len) {
635 			/* just join the two chains */
636 			m->m_next = n;
637 			return;
638 		}
639 		/* splat the data from one into the other */
640 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
641 		    (u_int)n->m_len);
642 		m->m_len += n->m_len;
643 		n = m_free(n);
644 	}
645 }
646 
647 /*
648  * Concatenate two pkthdr mbuf chains.
649  */
650 void
651 m_catpkt(struct mbuf *m, struct mbuf *n)
652 {
653 
654 	M_ASSERTPKTHDR(m);
655 	M_ASSERTPKTHDR(n);
656 
657 	m->m_pkthdr.len += n->m_pkthdr.len;
658 	m_demote(n, 1, 0);
659 
660 	m_cat(m, n);
661 }
662 
663 void
664 m_adj(struct mbuf *mp, int req_len)
665 {
666 	int len = req_len;
667 	struct mbuf *m;
668 	int count;
669 
670 	if ((m = mp) == NULL)
671 		return;
672 	if (len >= 0) {
673 		/*
674 		 * Trim from head.
675 		 */
676 		while (m != NULL && len > 0) {
677 			if (m->m_len <= len) {
678 				len -= m->m_len;
679 				m->m_len = 0;
680 				m = m->m_next;
681 			} else {
682 				m->m_len -= len;
683 				m->m_data += len;
684 				len = 0;
685 			}
686 		}
687 		if (mp->m_flags & M_PKTHDR)
688 			mp->m_pkthdr.len -= (req_len - len);
689 	} else {
690 		/*
691 		 * Trim from tail.  Scan the mbuf chain,
692 		 * calculating its length and finding the last mbuf.
693 		 * If the adjustment only affects this mbuf, then just
694 		 * adjust and return.  Otherwise, rescan and truncate
695 		 * after the remaining size.
696 		 */
697 		len = -len;
698 		count = 0;
699 		for (;;) {
700 			count += m->m_len;
701 			if (m->m_next == (struct mbuf *)0)
702 				break;
703 			m = m->m_next;
704 		}
705 		if (m->m_len >= len) {
706 			m->m_len -= len;
707 			if (mp->m_flags & M_PKTHDR)
708 				mp->m_pkthdr.len -= len;
709 			return;
710 		}
711 		count -= len;
712 		if (count < 0)
713 			count = 0;
714 		/*
715 		 * Correct length for chain is "count".
716 		 * Find the mbuf with last data, adjust its length,
717 		 * and toss data from remaining mbufs on chain.
718 		 */
719 		m = mp;
720 		if (m->m_flags & M_PKTHDR)
721 			m->m_pkthdr.len = count;
722 		for (; m; m = m->m_next) {
723 			if (m->m_len >= count) {
724 				m->m_len = count;
725 				if (m->m_next != NULL) {
726 					m_freem(m->m_next);
727 					m->m_next = NULL;
728 				}
729 				break;
730 			}
731 			count -= m->m_len;
732 		}
733 	}
734 }
735 
736 /*
737  * Rearange an mbuf chain so that len bytes are contiguous
738  * and in the data area of an mbuf (so that mtod will work
739  * for a structure of size len).  Returns the resulting
740  * mbuf chain on success, frees it and returns null on failure.
741  * If there is room, it will add up to max_protohdr-len extra bytes to the
742  * contiguous region in an attempt to avoid being called next time.
743  */
744 struct mbuf *
745 m_pullup(struct mbuf *n, int len)
746 {
747 	struct mbuf *m;
748 	int count;
749 	int space;
750 
751 	/*
752 	 * If first mbuf has no cluster, and has room for len bytes
753 	 * without shifting current data, pullup into it,
754 	 * otherwise allocate a new mbuf to prepend to the chain.
755 	 */
756 	if ((n->m_flags & M_EXT) == 0 &&
757 	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
758 		if (n->m_len >= len)
759 			return (n);
760 		m = n;
761 		n = n->m_next;
762 		len -= m->m_len;
763 	} else {
764 		if (len > MHLEN)
765 			goto bad;
766 		m = m_get(M_NOWAIT, n->m_type);
767 		if (m == NULL)
768 			goto bad;
769 		if (n->m_flags & M_PKTHDR)
770 			m_move_pkthdr(m, n);
771 	}
772 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
773 	do {
774 		count = min(min(max(len, max_protohdr), space), n->m_len);
775 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
776 		  (u_int)count);
777 		len -= count;
778 		m->m_len += count;
779 		n->m_len -= count;
780 		space -= count;
781 		if (n->m_len)
782 			n->m_data += count;
783 		else
784 			n = m_free(n);
785 	} while (len > 0 && n);
786 	if (len > 0) {
787 		(void) m_free(m);
788 		goto bad;
789 	}
790 	m->m_next = n;
791 	return (m);
792 bad:
793 	m_freem(n);
794 	return (NULL);
795 }
796 
797 /*
798  * Like m_pullup(), except a new mbuf is always allocated, and we allow
799  * the amount of empty space before the data in the new mbuf to be specified
800  * (in the event that the caller expects to prepend later).
801  */
802 struct mbuf *
803 m_copyup(struct mbuf *n, int len, int dstoff)
804 {
805 	struct mbuf *m;
806 	int count, space;
807 
808 	if (len > (MHLEN - dstoff))
809 		goto bad;
810 	m = m_get(M_NOWAIT, n->m_type);
811 	if (m == NULL)
812 		goto bad;
813 	if (n->m_flags & M_PKTHDR)
814 		m_move_pkthdr(m, n);
815 	m->m_data += dstoff;
816 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
817 	do {
818 		count = min(min(max(len, max_protohdr), space), n->m_len);
819 		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
820 		    (unsigned)count);
821 		len -= count;
822 		m->m_len += count;
823 		n->m_len -= count;
824 		space -= count;
825 		if (n->m_len)
826 			n->m_data += count;
827 		else
828 			n = m_free(n);
829 	} while (len > 0 && n);
830 	if (len > 0) {
831 		(void) m_free(m);
832 		goto bad;
833 	}
834 	m->m_next = n;
835 	return (m);
836  bad:
837 	m_freem(n);
838 	return (NULL);
839 }
840 
841 /*
842  * Partition an mbuf chain in two pieces, returning the tail --
843  * all but the first len0 bytes.  In case of failure, it returns NULL and
844  * attempts to restore the chain to its original state.
845  *
846  * Note that the resulting mbufs might be read-only, because the new
847  * mbuf can end up sharing an mbuf cluster with the original mbuf if
848  * the "breaking point" happens to lie within a cluster mbuf. Use the
849  * M_WRITABLE() macro to check for this case.
850  */
851 struct mbuf *
852 m_split(struct mbuf *m0, int len0, int wait)
853 {
854 	struct mbuf *m, *n;
855 	u_int len = len0, remain;
856 
857 	MBUF_CHECKSLEEP(wait);
858 	for (m = m0; m && len > m->m_len; m = m->m_next)
859 		len -= m->m_len;
860 	if (m == NULL)
861 		return (NULL);
862 	remain = m->m_len - len;
863 	if (m0->m_flags & M_PKTHDR && remain == 0) {
864 		n = m_gethdr(wait, m0->m_type);
865 		if (n == NULL)
866 			return (NULL);
867 		n->m_next = m->m_next;
868 		m->m_next = NULL;
869 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
870 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
871 		m0->m_pkthdr.len = len0;
872 		return (n);
873 	} else if (m0->m_flags & M_PKTHDR) {
874 		n = m_gethdr(wait, m0->m_type);
875 		if (n == NULL)
876 			return (NULL);
877 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
878 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
879 		m0->m_pkthdr.len = len0;
880 		if (m->m_flags & M_EXT)
881 			goto extpacket;
882 		if (remain > MHLEN) {
883 			/* m can't be the lead packet */
884 			M_ALIGN(n, 0);
885 			n->m_next = m_split(m, len, wait);
886 			if (n->m_next == NULL) {
887 				(void) m_free(n);
888 				return (NULL);
889 			} else {
890 				n->m_len = 0;
891 				return (n);
892 			}
893 		} else
894 			M_ALIGN(n, remain);
895 	} else if (remain == 0) {
896 		n = m->m_next;
897 		m->m_next = NULL;
898 		return (n);
899 	} else {
900 		n = m_get(wait, m->m_type);
901 		if (n == NULL)
902 			return (NULL);
903 		M_ALIGN(n, remain);
904 	}
905 extpacket:
906 	if (m->m_flags & M_EXT) {
907 		n->m_data = m->m_data + len;
908 		mb_dupcl(n, m);
909 	} else {
910 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
911 	}
912 	n->m_len = remain;
913 	m->m_len = len;
914 	n->m_next = m->m_next;
915 	m->m_next = NULL;
916 	return (n);
917 }
918 /*
919  * Routine to copy from device local memory into mbufs.
920  * Note that `off' argument is offset into first mbuf of target chain from
921  * which to begin copying the data to.
922  */
923 struct mbuf *
924 m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
925     void (*copy)(char *from, caddr_t to, u_int len))
926 {
927 	struct mbuf *m;
928 	struct mbuf *top = NULL, **mp = &top;
929 	int len;
930 
931 	if (off < 0 || off > MHLEN)
932 		return (NULL);
933 
934 	while (totlen > 0) {
935 		if (top == NULL) {	/* First one, must be PKTHDR */
936 			if (totlen + off >= MINCLSIZE) {
937 				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
938 				len = MCLBYTES;
939 			} else {
940 				m = m_gethdr(M_NOWAIT, MT_DATA);
941 				len = MHLEN;
942 
943 				/* Place initial small packet/header at end of mbuf */
944 				if (m && totlen + off + max_linkhdr <= MLEN) {
945 					m->m_data += max_linkhdr;
946 					len -= max_linkhdr;
947 				}
948 			}
949 			if (m == NULL)
950 				return NULL;
951 			m->m_pkthdr.rcvif = ifp;
952 			m->m_pkthdr.len = totlen;
953 		} else {
954 			if (totlen + off >= MINCLSIZE) {
955 				m = m_getcl(M_NOWAIT, MT_DATA, 0);
956 				len = MCLBYTES;
957 			} else {
958 				m = m_get(M_NOWAIT, MT_DATA);
959 				len = MLEN;
960 			}
961 			if (m == NULL) {
962 				m_freem(top);
963 				return NULL;
964 			}
965 		}
966 		if (off) {
967 			m->m_data += off;
968 			len -= off;
969 			off = 0;
970 		}
971 		m->m_len = len = min(totlen, len);
972 		if (copy)
973 			copy(buf, mtod(m, caddr_t), (u_int)len);
974 		else
975 			bcopy(buf, mtod(m, caddr_t), (u_int)len);
976 		buf += len;
977 		*mp = m;
978 		mp = &m->m_next;
979 		totlen -= len;
980 	}
981 	return (top);
982 }
983 
984 /*
985  * Copy data from a buffer back into the indicated mbuf chain,
986  * starting "off" bytes from the beginning, extending the mbuf
987  * chain if necessary.
988  */
989 void
990 m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
991 {
992 	int mlen;
993 	struct mbuf *m = m0, *n;
994 	int totlen = 0;
995 
996 	if (m0 == NULL)
997 		return;
998 	while (off > (mlen = m->m_len)) {
999 		off -= mlen;
1000 		totlen += mlen;
1001 		if (m->m_next == NULL) {
1002 			n = m_get(M_NOWAIT, m->m_type);
1003 			if (n == NULL)
1004 				goto out;
1005 			bzero(mtod(n, caddr_t), MLEN);
1006 			n->m_len = min(MLEN, len + off);
1007 			m->m_next = n;
1008 		}
1009 		m = m->m_next;
1010 	}
1011 	while (len > 0) {
1012 		if (m->m_next == NULL && (len > m->m_len - off)) {
1013 			m->m_len += min(len - (m->m_len - off),
1014 			    M_TRAILINGSPACE(m));
1015 		}
1016 		mlen = min (m->m_len - off, len);
1017 		bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen);
1018 		cp += mlen;
1019 		len -= mlen;
1020 		mlen += off;
1021 		off = 0;
1022 		totlen += mlen;
1023 		if (len == 0)
1024 			break;
1025 		if (m->m_next == NULL) {
1026 			n = m_get(M_NOWAIT, m->m_type);
1027 			if (n == NULL)
1028 				break;
1029 			n->m_len = min(MLEN, len);
1030 			m->m_next = n;
1031 		}
1032 		m = m->m_next;
1033 	}
1034 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
1035 		m->m_pkthdr.len = totlen;
1036 }
1037 
1038 /*
1039  * Append the specified data to the indicated mbuf chain,
1040  * Extend the mbuf chain if the new data does not fit in
1041  * existing space.
1042  *
1043  * Return 1 if able to complete the job; otherwise 0.
1044  */
1045 int
1046 m_append(struct mbuf *m0, int len, c_caddr_t cp)
1047 {
1048 	struct mbuf *m, *n;
1049 	int remainder, space;
1050 
1051 	for (m = m0; m->m_next != NULL; m = m->m_next)
1052 		;
1053 	remainder = len;
1054 	space = M_TRAILINGSPACE(m);
1055 	if (space > 0) {
1056 		/*
1057 		 * Copy into available space.
1058 		 */
1059 		if (space > remainder)
1060 			space = remainder;
1061 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
1062 		m->m_len += space;
1063 		cp += space, remainder -= space;
1064 	}
1065 	while (remainder > 0) {
1066 		/*
1067 		 * Allocate a new mbuf; could check space
1068 		 * and allocate a cluster instead.
1069 		 */
1070 		n = m_get(M_NOWAIT, m->m_type);
1071 		if (n == NULL)
1072 			break;
1073 		n->m_len = min(MLEN, remainder);
1074 		bcopy(cp, mtod(n, caddr_t), n->m_len);
1075 		cp += n->m_len, remainder -= n->m_len;
1076 		m->m_next = n;
1077 		m = n;
1078 	}
1079 	if (m0->m_flags & M_PKTHDR)
1080 		m0->m_pkthdr.len += len - remainder;
1081 	return (remainder == 0);
1082 }
1083 
1084 /*
1085  * Apply function f to the data in an mbuf chain starting "off" bytes from
1086  * the beginning, continuing for "len" bytes.
1087  */
1088 int
1089 m_apply(struct mbuf *m, int off, int len,
1090     int (*f)(void *, void *, u_int), void *arg)
1091 {
1092 	u_int count;
1093 	int rval;
1094 
1095 	KASSERT(off >= 0, ("m_apply, negative off %d", off));
1096 	KASSERT(len >= 0, ("m_apply, negative len %d", len));
1097 	while (off > 0) {
1098 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
1099 		if (off < m->m_len)
1100 			break;
1101 		off -= m->m_len;
1102 		m = m->m_next;
1103 	}
1104 	while (len > 0) {
1105 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
1106 		count = min(m->m_len - off, len);
1107 		rval = (*f)(arg, mtod(m, caddr_t) + off, count);
1108 		if (rval)
1109 			return (rval);
1110 		len -= count;
1111 		off = 0;
1112 		m = m->m_next;
1113 	}
1114 	return (0);
1115 }
1116 
1117 /*
1118  * Return a pointer to mbuf/offset of location in mbuf chain.
1119  */
1120 struct mbuf *
1121 m_getptr(struct mbuf *m, int loc, int *off)
1122 {
1123 
1124 	while (loc >= 0) {
1125 		/* Normal end of search. */
1126 		if (m->m_len > loc) {
1127 			*off = loc;
1128 			return (m);
1129 		} else {
1130 			loc -= m->m_len;
1131 			if (m->m_next == NULL) {
1132 				if (loc == 0) {
1133 					/* Point at the end of valid data. */
1134 					*off = m->m_len;
1135 					return (m);
1136 				}
1137 				return (NULL);
1138 			}
1139 			m = m->m_next;
1140 		}
1141 	}
1142 	return (NULL);
1143 }
1144 
1145 void
1146 m_print(const struct mbuf *m, int maxlen)
1147 {
1148 	int len;
1149 	int pdata;
1150 	const struct mbuf *m2;
1151 
1152 	if (m == NULL) {
1153 		printf("mbuf: %p\n", m);
1154 		return;
1155 	}
1156 
1157 	if (m->m_flags & M_PKTHDR)
1158 		len = m->m_pkthdr.len;
1159 	else
1160 		len = -1;
1161 	m2 = m;
1162 	while (m2 != NULL && (len == -1 || len)) {
1163 		pdata = m2->m_len;
1164 		if (maxlen != -1 && pdata > maxlen)
1165 			pdata = maxlen;
1166 		printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len,
1167 		    m2->m_next, m2->m_flags, "\20\20freelist\17skipfw"
1168 		    "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly"
1169 		    "\3eor\2pkthdr\1ext", pdata ? "" : "\n");
1170 		if (pdata)
1171 			printf(", %*D\n", pdata, (u_char *)m2->m_data, "-");
1172 		if (len != -1)
1173 			len -= m2->m_len;
1174 		m2 = m2->m_next;
1175 	}
1176 	if (len > 0)
1177 		printf("%d bytes unaccounted for.\n", len);
1178 	return;
1179 }
1180 
1181 u_int
1182 m_fixhdr(struct mbuf *m0)
1183 {
1184 	u_int len;
1185 
1186 	len = m_length(m0, NULL);
1187 	m0->m_pkthdr.len = len;
1188 	return (len);
1189 }
1190 
1191 u_int
1192 m_length(struct mbuf *m0, struct mbuf **last)
1193 {
1194 	struct mbuf *m;
1195 	u_int len;
1196 
1197 	len = 0;
1198 	for (m = m0; m != NULL; m = m->m_next) {
1199 		len += m->m_len;
1200 		if (m->m_next == NULL)
1201 			break;
1202 	}
1203 	if (last != NULL)
1204 		*last = m;
1205 	return (len);
1206 }
1207 
1208 /*
1209  * Defragment a mbuf chain, returning the shortest possible
1210  * chain of mbufs and clusters.  If allocation fails and
1211  * this cannot be completed, NULL will be returned, but
1212  * the passed in chain will be unchanged.  Upon success,
1213  * the original chain will be freed, and the new chain
1214  * will be returned.
1215  *
1216  * If a non-packet header is passed in, the original
1217  * mbuf (chain?) will be returned unharmed.
1218  */
1219 struct mbuf *
1220 m_defrag(struct mbuf *m0, int how)
1221 {
1222 	struct mbuf *m_new = NULL, *m_final = NULL;
1223 	int progress = 0, length;
1224 
1225 	MBUF_CHECKSLEEP(how);
1226 	if (!(m0->m_flags & M_PKTHDR))
1227 		return (m0);
1228 
1229 	m_fixhdr(m0); /* Needed sanity check */
1230 
1231 #ifdef MBUF_STRESS_TEST
1232 	if (m_defragrandomfailures) {
1233 		int temp = arc4random() & 0xff;
1234 		if (temp == 0xba)
1235 			goto nospace;
1236 	}
1237 #endif
1238 
1239 	if (m0->m_pkthdr.len > MHLEN)
1240 		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
1241 	else
1242 		m_final = m_gethdr(how, MT_DATA);
1243 
1244 	if (m_final == NULL)
1245 		goto nospace;
1246 
1247 	if (m_dup_pkthdr(m_final, m0, how) == 0)
1248 		goto nospace;
1249 
1250 	m_new = m_final;
1251 
1252 	while (progress < m0->m_pkthdr.len) {
1253 		length = m0->m_pkthdr.len - progress;
1254 		if (length > MCLBYTES)
1255 			length = MCLBYTES;
1256 
1257 		if (m_new == NULL) {
1258 			if (length > MLEN)
1259 				m_new = m_getcl(how, MT_DATA, 0);
1260 			else
1261 				m_new = m_get(how, MT_DATA);
1262 			if (m_new == NULL)
1263 				goto nospace;
1264 		}
1265 
1266 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
1267 		progress += length;
1268 		m_new->m_len = length;
1269 		if (m_new != m_final)
1270 			m_cat(m_final, m_new);
1271 		m_new = NULL;
1272 	}
1273 #ifdef MBUF_STRESS_TEST
1274 	if (m0->m_next == NULL)
1275 		m_defraguseless++;
1276 #endif
1277 	m_freem(m0);
1278 	m0 = m_final;
1279 #ifdef MBUF_STRESS_TEST
1280 	m_defragpackets++;
1281 	m_defragbytes += m0->m_pkthdr.len;
1282 #endif
1283 	return (m0);
1284 nospace:
1285 #ifdef MBUF_STRESS_TEST
1286 	m_defragfailure++;
1287 #endif
1288 	if (m_final)
1289 		m_freem(m_final);
1290 	return (NULL);
1291 }
1292 
1293 /*
1294  * Defragment an mbuf chain, returning at most maxfrags separate
1295  * mbufs+clusters.  If this is not possible NULL is returned and
1296  * the original mbuf chain is left in it's present (potentially
1297  * modified) state.  We use two techniques: collapsing consecutive
1298  * mbufs and replacing consecutive mbufs by a cluster.
1299  *
1300  * NB: this should really be named m_defrag but that name is taken
1301  */
1302 struct mbuf *
1303 m_collapse(struct mbuf *m0, int how, int maxfrags)
1304 {
1305 	struct mbuf *m, *n, *n2, **prev;
1306 	u_int curfrags;
1307 
1308 	/*
1309 	 * Calculate the current number of frags.
1310 	 */
1311 	curfrags = 0;
1312 	for (m = m0; m != NULL; m = m->m_next)
1313 		curfrags++;
1314 	/*
1315 	 * First, try to collapse mbufs.  Note that we always collapse
1316 	 * towards the front so we don't need to deal with moving the
1317 	 * pkthdr.  This may be suboptimal if the first mbuf has much
1318 	 * less data than the following.
1319 	 */
1320 	m = m0;
1321 again:
1322 	for (;;) {
1323 		n = m->m_next;
1324 		if (n == NULL)
1325 			break;
1326 		if (M_WRITABLE(m) &&
1327 		    n->m_len < M_TRAILINGSPACE(m)) {
1328 			bcopy(mtod(n, void *), mtod(m, char *) + m->m_len,
1329 				n->m_len);
1330 			m->m_len += n->m_len;
1331 			m->m_next = n->m_next;
1332 			m_free(n);
1333 			if (--curfrags <= maxfrags)
1334 				return m0;
1335 		} else
1336 			m = n;
1337 	}
1338 	KASSERT(maxfrags > 1,
1339 		("maxfrags %u, but normal collapse failed", maxfrags));
1340 	/*
1341 	 * Collapse consecutive mbufs to a cluster.
1342 	 */
1343 	prev = &m0->m_next;		/* NB: not the first mbuf */
1344 	while ((n = *prev) != NULL) {
1345 		if ((n2 = n->m_next) != NULL &&
1346 		    n->m_len + n2->m_len < MCLBYTES) {
1347 			m = m_getcl(how, MT_DATA, 0);
1348 			if (m == NULL)
1349 				goto bad;
1350 			bcopy(mtod(n, void *), mtod(m, void *), n->m_len);
1351 			bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len,
1352 				n2->m_len);
1353 			m->m_len = n->m_len + n2->m_len;
1354 			m->m_next = n2->m_next;
1355 			*prev = m;
1356 			m_free(n);
1357 			m_free(n2);
1358 			if (--curfrags <= maxfrags)	/* +1 cl -2 mbufs */
1359 				return m0;
1360 			/*
1361 			 * Still not there, try the normal collapse
1362 			 * again before we allocate another cluster.
1363 			 */
1364 			goto again;
1365 		}
1366 		prev = &n->m_next;
1367 	}
1368 	/*
1369 	 * No place where we can collapse to a cluster; punt.
1370 	 * This can occur if, for example, you request 2 frags
1371 	 * but the packet requires that both be clusters (we
1372 	 * never reallocate the first mbuf to avoid moving the
1373 	 * packet header).
1374 	 */
1375 bad:
1376 	return NULL;
1377 }
1378 
1379 #ifdef MBUF_STRESS_TEST
1380 
1381 /*
1382  * Fragment an mbuf chain.  There's no reason you'd ever want to do
1383  * this in normal usage, but it's great for stress testing various
1384  * mbuf consumers.
1385  *
1386  * If fragmentation is not possible, the original chain will be
1387  * returned.
1388  *
1389  * Possible length values:
1390  * 0	 no fragmentation will occur
1391  * > 0	each fragment will be of the specified length
1392  * -1	each fragment will be the same random value in length
1393  * -2	each fragment's length will be entirely random
1394  * (Random values range from 1 to 256)
1395  */
1396 struct mbuf *
1397 m_fragment(struct mbuf *m0, int how, int length)
1398 {
1399 	struct mbuf *m_new = NULL, *m_final = NULL;
1400 	int progress = 0;
1401 
1402 	if (!(m0->m_flags & M_PKTHDR))
1403 		return (m0);
1404 
1405 	if ((length == 0) || (length < -2))
1406 		return (m0);
1407 
1408 	m_fixhdr(m0); /* Needed sanity check */
1409 
1410 	m_final = m_getcl(how, MT_DATA, M_PKTHDR);
1411 
1412 	if (m_final == NULL)
1413 		goto nospace;
1414 
1415 	if (m_dup_pkthdr(m_final, m0, how) == 0)
1416 		goto nospace;
1417 
1418 	m_new = m_final;
1419 
1420 	if (length == -1)
1421 		length = 1 + (arc4random() & 255);
1422 
1423 	while (progress < m0->m_pkthdr.len) {
1424 		int fraglen;
1425 
1426 		if (length > 0)
1427 			fraglen = length;
1428 		else
1429 			fraglen = 1 + (arc4random() & 255);
1430 		if (fraglen > m0->m_pkthdr.len - progress)
1431 			fraglen = m0->m_pkthdr.len - progress;
1432 
1433 		if (fraglen > MCLBYTES)
1434 			fraglen = MCLBYTES;
1435 
1436 		if (m_new == NULL) {
1437 			m_new = m_getcl(how, MT_DATA, 0);
1438 			if (m_new == NULL)
1439 				goto nospace;
1440 		}
1441 
1442 		m_copydata(m0, progress, fraglen, mtod(m_new, caddr_t));
1443 		progress += fraglen;
1444 		m_new->m_len = fraglen;
1445 		if (m_new != m_final)
1446 			m_cat(m_final, m_new);
1447 		m_new = NULL;
1448 	}
1449 	m_freem(m0);
1450 	m0 = m_final;
1451 	return (m0);
1452 nospace:
1453 	if (m_final)
1454 		m_freem(m_final);
1455 	/* Return the original chain on failure */
1456 	return (m0);
1457 }
1458 
1459 #endif
1460 
1461 /*
1462  * Copy the contents of uio into a properly sized mbuf chain.
1463  */
1464 struct mbuf *
1465 m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
1466 {
1467 	struct mbuf *m, *mb;
1468 	int error, length;
1469 	ssize_t total;
1470 	int progress = 0;
1471 
1472 	/*
1473 	 * len can be zero or an arbitrary large value bound by
1474 	 * the total data supplied by the uio.
1475 	 */
1476 	if (len > 0)
1477 		total = min(uio->uio_resid, len);
1478 	else
1479 		total = uio->uio_resid;
1480 
1481 	/*
1482 	 * The smallest unit returned by m_getm2() is a single mbuf
1483 	 * with pkthdr.  We can't align past it.
1484 	 */
1485 	if (align >= MHLEN)
1486 		return (NULL);
1487 
1488 	/*
1489 	 * Give us the full allocation or nothing.
1490 	 * If len is zero return the smallest empty mbuf.
1491 	 */
1492 	m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags);
1493 	if (m == NULL)
1494 		return (NULL);
1495 	m->m_data += align;
1496 
1497 	/* Fill all mbufs with uio data and update header information. */
1498 	for (mb = m; mb != NULL; mb = mb->m_next) {
1499 		length = min(M_TRAILINGSPACE(mb), total - progress);
1500 
1501 		error = uiomove(mtod(mb, void *), length, uio);
1502 		if (error) {
1503 			m_freem(m);
1504 			return (NULL);
1505 		}
1506 
1507 		mb->m_len = length;
1508 		progress += length;
1509 		if (flags & M_PKTHDR)
1510 			m->m_pkthdr.len += length;
1511 	}
1512 	KASSERT(progress == total, ("%s: progress != total", __func__));
1513 
1514 	return (m);
1515 }
1516 
1517 /*
1518  * Copy an mbuf chain into a uio limited by len if set.
1519  */
1520 int
1521 m_mbuftouio(struct uio *uio, struct mbuf *m, int len)
1522 {
1523 	int error, length, total;
1524 	int progress = 0;
1525 
1526 	if (len > 0)
1527 		total = min(uio->uio_resid, len);
1528 	else
1529 		total = uio->uio_resid;
1530 
1531 	/* Fill the uio with data from the mbufs. */
1532 	for (; m != NULL; m = m->m_next) {
1533 		length = min(m->m_len, total - progress);
1534 
1535 		error = uiomove(mtod(m, void *), length, uio);
1536 		if (error)
1537 			return (error);
1538 
1539 		progress += length;
1540 	}
1541 
1542 	return (0);
1543 }
1544 
1545 /*
1546  * Create a writable copy of the mbuf chain.  While doing this
1547  * we compact the chain with a goal of producing a chain with
1548  * at most two mbufs.  The second mbuf in this chain is likely
1549  * to be a cluster.  The primary purpose of this work is to create
1550  * a writable packet for encryption, compression, etc.  The
1551  * secondary goal is to linearize the data so the data can be
1552  * passed to crypto hardware in the most efficient manner possible.
1553  */
1554 struct mbuf *
1555 m_unshare(struct mbuf *m0, int how)
1556 {
1557 	struct mbuf *m, *mprev;
1558 	struct mbuf *n, *mfirst, *mlast;
1559 	int len, off;
1560 
1561 	mprev = NULL;
1562 	for (m = m0; m != NULL; m = mprev->m_next) {
1563 		/*
1564 		 * Regular mbufs are ignored unless there's a cluster
1565 		 * in front of it that we can use to coalesce.  We do
1566 		 * the latter mainly so later clusters can be coalesced
1567 		 * also w/o having to handle them specially (i.e. convert
1568 		 * mbuf+cluster -> cluster).  This optimization is heavily
1569 		 * influenced by the assumption that we're running over
1570 		 * Ethernet where MCLBYTES is large enough that the max
1571 		 * packet size will permit lots of coalescing into a
1572 		 * single cluster.  This in turn permits efficient
1573 		 * crypto operations, especially when using hardware.
1574 		 */
1575 		if ((m->m_flags & M_EXT) == 0) {
1576 			if (mprev && (mprev->m_flags & M_EXT) &&
1577 			    m->m_len <= M_TRAILINGSPACE(mprev)) {
1578 				/* XXX: this ignores mbuf types */
1579 				memcpy(mtod(mprev, caddr_t) + mprev->m_len,
1580 				    mtod(m, caddr_t), m->m_len);
1581 				mprev->m_len += m->m_len;
1582 				mprev->m_next = m->m_next;	/* unlink from chain */
1583 				m_free(m);			/* reclaim mbuf */
1584 #if 0
1585 				newipsecstat.ips_mbcoalesced++;
1586 #endif
1587 			} else {
1588 				mprev = m;
1589 			}
1590 			continue;
1591 		}
1592 		/*
1593 		 * Writable mbufs are left alone (for now).
1594 		 */
1595 		if (M_WRITABLE(m)) {
1596 			mprev = m;
1597 			continue;
1598 		}
1599 
1600 		/*
1601 		 * Not writable, replace with a copy or coalesce with
1602 		 * the previous mbuf if possible (since we have to copy
1603 		 * it anyway, we try to reduce the number of mbufs and
1604 		 * clusters so that future work is easier).
1605 		 */
1606 		KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
1607 		/* NB: we only coalesce into a cluster or larger */
1608 		if (mprev != NULL && (mprev->m_flags & M_EXT) &&
1609 		    m->m_len <= M_TRAILINGSPACE(mprev)) {
1610 			/* XXX: this ignores mbuf types */
1611 			memcpy(mtod(mprev, caddr_t) + mprev->m_len,
1612 			    mtod(m, caddr_t), m->m_len);
1613 			mprev->m_len += m->m_len;
1614 			mprev->m_next = m->m_next;	/* unlink from chain */
1615 			m_free(m);			/* reclaim mbuf */
1616 #if 0
1617 			newipsecstat.ips_clcoalesced++;
1618 #endif
1619 			continue;
1620 		}
1621 
1622 		/*
1623 		 * Allocate new space to hold the copy and copy the data.
1624 		 * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by
1625 		 * splitting them into clusters.  We could just malloc a
1626 		 * buffer and make it external but too many device drivers
1627 		 * don't know how to break up the non-contiguous memory when
1628 		 * doing DMA.
1629 		 */
1630 		n = m_getcl(how, m->m_type, m->m_flags);
1631 		if (n == NULL) {
1632 			m_freem(m0);
1633 			return (NULL);
1634 		}
1635 		if (m->m_flags & M_PKTHDR) {
1636 			KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR",
1637 			    __func__, m0, m));
1638 			m_move_pkthdr(n, m);
1639 		}
1640 		len = m->m_len;
1641 		off = 0;
1642 		mfirst = n;
1643 		mlast = NULL;
1644 		for (;;) {
1645 			int cc = min(len, MCLBYTES);
1646 			memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
1647 			n->m_len = cc;
1648 			if (mlast != NULL)
1649 				mlast->m_next = n;
1650 			mlast = n;
1651 #if 0
1652 			newipsecstat.ips_clcopied++;
1653 #endif
1654 
1655 			len -= cc;
1656 			if (len <= 0)
1657 				break;
1658 			off += cc;
1659 
1660 			n = m_getcl(how, m->m_type, m->m_flags);
1661 			if (n == NULL) {
1662 				m_freem(mfirst);
1663 				m_freem(m0);
1664 				return (NULL);
1665 			}
1666 		}
1667 		n->m_next = m->m_next;
1668 		if (mprev == NULL)
1669 			m0 = mfirst;		/* new head of chain */
1670 		else
1671 			mprev->m_next = mfirst;	/* replace old mbuf */
1672 		m_free(m);			/* release old mbuf */
1673 		mprev = mfirst;
1674 	}
1675 	return (m0);
1676 }
1677 
1678 #ifdef MBUF_PROFILING
1679 
#define MP_BUCKETS 32 /* don't just change this as things may overflow.*/
/*
 * Histograms maintained by m_profile(), one sample per profiled chain.
 * "wasted" and "used" are indexed by fls() of a byte count; "segments"
 * is indexed by the number of mbufs in the chain.
 */
struct mbufprofile {
	uintmax_t wasted[MP_BUCKETS];
	uintmax_t used[MP_BUCKETS];
	uintmax_t segments[MP_BUCKETS];
} mbprof;

/*
 * Sizing of the text buffer that mbprof_textify() formats the
 * histograms into.
 */
#define MP_MAXDIGITS 21	/* strlen("16,000,000,000,000,000,000") == 21 */
/* Upper bound on formatted rows: three histograms, up to two rows each. */
#define MP_NUMLINES 6
/* Counters printed on one row. */
#define MP_NUMSPERLINE 16
#define MP_EXTRABYTES 64	/* > strlen("used:\nwasted:\nsegments:\n") */
/* work out max space needed and add a bit of spare space too */
/* One row: each counter plus one separator character. */
#define MP_MAXLINE ((MP_MAXDIGITS+1) * MP_NUMSPERLINE)
/* All rows, a terminating NUL, and slack for the section labels. */
#define MP_BUFSIZE ((MP_MAXLINE * MP_NUMLINES) + 1 + MP_EXTRABYTES)

/* Text rendering of mbprof, rebuilt by mbprof_textify(). */
char mbprofbuf[MP_BUFSIZE];
1696 
1697 void
1698 m_profile(struct mbuf *m)
1699 {
1700 	int segments = 0;
1701 	int used = 0;
1702 	int wasted = 0;
1703 
1704 	while (m) {
1705 		segments++;
1706 		used += m->m_len;
1707 		if (m->m_flags & M_EXT) {
1708 			wasted += MHLEN - sizeof(m->m_ext) +
1709 			    m->m_ext.ext_size - m->m_len;
1710 		} else {
1711 			if (m->m_flags & M_PKTHDR)
1712 				wasted += MHLEN - m->m_len;
1713 			else
1714 				wasted += MLEN - m->m_len;
1715 		}
1716 		m = m->m_next;
1717 	}
1718 	/* be paranoid.. it helps */
1719 	if (segments > MP_BUCKETS - 1)
1720 		segments = MP_BUCKETS - 1;
1721 	if (used > 100000)
1722 		used = 100000;
1723 	if (wasted > 100000)
1724 		wasted = 100000;
1725 	/* store in the appropriate bucket */
1726 	/* don't bother locking. if it's slightly off, so what? */
1727 	mbprof.segments[segments]++;
1728 	mbprof.used[fls(used)]++;
1729 	mbprof.wasted[fls(wasted)]++;
1730 }
1731 
1732 static void
1733 mbprof_textify(void)
1734 {
1735 	int offset;
1736 	char *c;
1737 	uint64_t *p;
1738 
1739 	p = &mbprof.wasted[0];
1740 	c = mbprofbuf;
1741 	offset = snprintf(c, MP_MAXLINE + 10,
1742 	    "wasted:\n"
1743 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1744 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
1745 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1746 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1747 #ifdef BIG_ARRAY
1748 	p = &mbprof.wasted[16];
1749 	c += offset;
1750 	offset = snprintf(c, MP_MAXLINE,
1751 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1752 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
1753 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1754 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1755 #endif
1756 	p = &mbprof.used[0];
1757 	c += offset;
1758 	offset = snprintf(c, MP_MAXLINE + 10,
1759 	    "used:\n"
1760 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1761 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
1762 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1763 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1764 #ifdef BIG_ARRAY
1765 	p = &mbprof.used[16];
1766 	c += offset;
1767 	offset = snprintf(c, MP_MAXLINE,
1768 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1769 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
1770 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1771 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1772 #endif
1773 	p = &mbprof.segments[0];
1774 	c += offset;
1775 	offset = snprintf(c, MP_MAXLINE + 10,
1776 	    "segments:\n"
1777 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1778 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
1779 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1780 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1781 #ifdef BIG_ARRAY
1782 	p = &mbprof.segments[16];
1783 	c += offset;
1784 	offset = snprintf(c, MP_MAXLINE,
1785 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
1786 	    "%ju %ju %ju %ju %ju %ju %ju %jju",
1787 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
1788 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
1789 #endif
1790 }
1791 
1792 static int
1793 mbprof_handler(SYSCTL_HANDLER_ARGS)
1794 {
1795 	int error;
1796 
1797 	mbprof_textify();
1798 	error = SYSCTL_OUT(req, mbprofbuf, strlen(mbprofbuf) + 1);
1799 	return (error);
1800 }
1801 
1802 static int
1803 mbprof_clr_handler(SYSCTL_HANDLER_ARGS)
1804 {
1805 	int clear, error;
1806 
1807 	clear = 0;
1808 	error = sysctl_handle_int(oidp, &clear, 0, req);
1809 	if (error || !req->newptr)
1810 		return (error);
1811 
1812 	if (clear) {
1813 		bzero(&mbprof, sizeof(mbprof));
1814 	}
1815 
1816 	return (error);
1817 }
1818 
1819 
/*
 * Read-only string node "mbufprofile" under _kern_ipc: its value is
 * produced by mbprof_handler().
 */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile, CTLTYPE_STRING|CTLFLAG_RD,
	    NULL, 0, mbprof_handler, "A", "mbuf profiling statistics");

/*
 * Read/write integer node "mbufprofileclr" under _kern_ipc, serviced by
 * mbprof_clr_handler(): writing a non-zero value clears the statistics.
 */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr, CTLTYPE_INT|CTLFLAG_RW,
	    NULL, 0, mbprof_clr_handler, "I", "clear mbuf profiling statistics");
1825 #endif
1826 
1827