xref: /freebsd/sys/kern/uipc_mbuf.c (revision 6af83ee0d2941d18880b6aaa2b4facd1d30c6106)
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include "opt_mac.h"
36 #include "opt_param.h"
37 #include "opt_mbuf_stress_test.h"
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/kernel.h>
42 #include <sys/limits.h>
43 #include <sys/lock.h>
44 #include <sys/mac.h>
45 #include <sys/malloc.h>
46 #include <sys/mbuf.h>
47 #include <sys/sysctl.h>
48 #include <sys/domain.h>
49 #include <sys/protosw.h>
50 #include <sys/uio.h>
51 
/*
 * Worst-case header-size tunables.  NOTE(review): these are defined here
 * but initialized elsewhere (presumably during domain/protocol setup);
 * confirm against the domain initialization code.
 */
int	max_linkhdr;
int	max_protohdr;
int	max_hdr;
int	max_datalen;
#ifdef MBUF_STRESS_TEST
/* Statistics maintained by m_defrag() below (stress-test builds only). */
int	m_defragpackets;
int	m_defragbytes;
int	m_defraguseless;
int	m_defragfailure;
int	m_defragrandomfailures;
#endif

/*
 * sysctl(8) exported objects
 */
SYSCTL_DECL(_kern_ipc);
SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
	   &max_linkhdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
	   &max_protohdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
	   &max_datalen, 0, "");
#ifdef MBUF_STRESS_TEST
/* Read-only counters, except the RW knob that injects random failures. */
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
	   &m_defragpackets, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
	   &m_defragbytes, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
	   &m_defraguseless, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
	   &m_defragfailure, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
	   &m_defragrandomfailures, 0, "");
#endif

/*
 * Malloc-type for external ext_buf ref counts.
 */
static MALLOC_DEFINE(M_MBUF, "mbextcnt", "mbuf external ref counts");
92 
93 /*
94  * Allocate a given length worth of mbufs and/or clusters (whatever fits
95  * best) and return a pointer to the top of the allocated chain.  If an
96  * existing mbuf chain is provided, then we will append the new chain
97  * to the existing one but still return the top of the newly allocated
98  * chain.
99  */
struct mbuf *
m_getm(struct mbuf *m, int len, int how, short type)
{
	struct mbuf *mb, *top, *cur, *mtail;
	int num, rem;
	int i;

	KASSERT(len >= 0, ("m_getm(): len is < 0"));

	/* If m != NULL, we will append to the end of that chain. */
	if (m != NULL)
		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
	else
		mtail = NULL;

	/*
	 * Calculate how many mbufs+clusters ("packets") we need and how much
	 * leftover there is after that and allocate the first mbuf+cluster
	 * if required.
	 */
	num = len / MCLBYTES;
	rem = len % MCLBYTES;
	top = cur = NULL;
	if (num > 0) {
		if ((top = cur = m_getcl(how, type, 0)) == NULL)
			goto failed;
		top->m_len = 0;
	}
	/*
	 * One cluster was already allocated above (when num > 0), so the
	 * loop below only needs num - 1 more.  When num == 0 this makes
	 * num negative and the loop is simply skipped.
	 */
	num--;

	for (i = 0; i < num; i++) {
		mb = m_getcl(how, type, 0);
		if (mb == NULL)
			goto failed;
		mb->m_len = 0;
		cur = (cur->m_next = mb);
	}
	if (rem > 0) {
		/*
		 * The leftover only rates a cluster if it is too big to
		 * fit the internal data area of a plain mbuf.
		 */
		mb = (rem > MINCLSIZE) ?
		    m_getcl(how, type, 0) : m_get(how, type);
		if (mb == NULL)
			goto failed;
		mb->m_len = 0;
		if (cur == NULL)
			top = mb;
		else
			cur->m_next = mb;
	}

	/* Splice the new chain onto the caller's chain, if any. */
	if (mtail != NULL)
		mtail->m_next = top;
	return top;
failed:
	/* Free everything allocated so far; the caller's chain is intact. */
	if (top != NULL)
		m_freem(top);
	return NULL;
}
157 
158 /*
159  * Free an entire chain of mbufs and associated external buffers, if
160  * applicable.
161  */
162 void
163 m_freem(struct mbuf *mb)
164 {
165 
166 	while (mb != NULL)
167 		mb = m_free(mb);
168 }
169 
170 /*-
171  * Configure a provided mbuf to refer to the provided external storage
172  * buffer and setup a reference count for said buffer.  If the setting
173  * up of the reference count fails, the M_EXT bit will not be set.  If
 * successful, the M_EXT bit is set in the mbuf's flags.
175  *
176  * Arguments:
177  *    mb     The existing mbuf to which to attach the provided buffer.
178  *    buf    The address of the provided external storage buffer.
179  *    size   The size of the provided buffer.
180  *    freef  A pointer to a routine that is responsible for freeing the
181  *           provided external storage buffer.
182  *    args   A pointer to an argument structure (of any type) to be passed
183  *           to the provided freef routine (may be NULL).
184  *    flags  Any other flags to be passed to the provided mbuf.
185  *    type   The type that the external storage buffer should be
186  *           labeled with.
187  *
188  * Returns:
189  *    Nothing.
190  */
191 void
192 m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
193     void (*freef)(void *, void *), void *args, int flags, int type)
194 {
195 	u_int *ref_cnt = NULL;
196 
197 	/* XXX Shouldn't be adding EXT_CLUSTER with this API */
198 	if (type == EXT_CLUSTER)
199 		ref_cnt = (u_int *)uma_find_refcnt(zone_clust,
200 		    mb->m_ext.ext_buf);
201 	else if (type == EXT_EXTREF)
202 		ref_cnt = mb->m_ext.ref_cnt;
203 	mb->m_ext.ref_cnt = (ref_cnt == NULL) ?
204 	    malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)ref_cnt;
205 	if (mb->m_ext.ref_cnt != NULL) {
206 		*(mb->m_ext.ref_cnt) = 1;
207 		mb->m_flags |= (M_EXT | flags);
208 		mb->m_ext.ext_buf = buf;
209 		mb->m_data = mb->m_ext.ext_buf;
210 		mb->m_ext.ext_size = size;
211 		mb->m_ext.ext_free = freef;
212 		mb->m_ext.ext_args = args;
213 		mb->m_ext.ext_type = type;
214         }
215 }
216 
217 /*
218  * Non-directly-exported function to clean up after mbufs with M_EXT
219  * storage attached to them if the reference count hits 0.
220  */
void
mb_free_ext(struct mbuf *m)
{
	u_int cnt;
	int dofree;

	/* Account for lazy ref count assign. */
	if (m->m_ext.ref_cnt == NULL)
		dofree = 1;
	else
		dofree = 0;

	/*
	 * This is tricky.  We need to make sure to decrement the
	 * refcount in a safe way but to also clean up if we're the
	 * last reference.  This method seems to do it without race.
	 */
	while (dofree == 0) {
		/*
		 * Snapshot the count, then try to CAS it down by one.
		 * If the CAS loses a race the loop simply retries with
		 * a fresh snapshot; if it wins and the snapshot was 1,
		 * we held the last reference and must free the storage.
		 */
		cnt = *(m->m_ext.ref_cnt);
		if (atomic_cmpset_int(m->m_ext.ref_cnt, cnt, cnt - 1)) {
			if (cnt == 1)
				dofree = 1;
			break;
		}
	}

	if (dofree) {
		/*
		 * Do the free, should be safe.
		 */
		if (m->m_ext.ext_type == EXT_PACKET) {
			/* Packet zone frees mbuf and cluster together. */
			uma_zfree(zone_pack, m);
			return;
		} else if (m->m_ext.ext_type == EXT_CLUSTER) {
			uma_zfree(zone_clust, m->m_ext.ext_buf);
			m->m_ext.ext_buf = NULL;
		} else {
			/* Caller-supplied storage: invoke its free routine. */
			(*(m->m_ext.ext_free))(m->m_ext.ext_buf,
			    m->m_ext.ext_args);
			if (m->m_ext.ext_type != EXT_EXTREF) {
				/* The refcount word was malloc'd by m_extadd(). */
				if (m->m_ext.ref_cnt != NULL)
					free(m->m_ext.ref_cnt, M_MBUF);
				m->m_ext.ref_cnt = NULL;
			}
			m->m_ext.ext_buf = NULL;
		}
	}
	/* The mbuf itself is always returned, last reference or not. */
	uma_zfree(zone_mbuf, m);
}
270 
271 /*
272  * "Move" mbuf pkthdr from "from" to "to".
273  * "from" must have M_PKTHDR set, and "to" must be empty.
274  */
void
m_move_pkthdr(struct mbuf *to, struct mbuf *from)
{

#if 0
	/* see below for why these are not enabled */
	M_ASSERTPKTHDR(to);
	/* Note: with MAC, this may not be a good assertion. */
	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
	    ("m_move_pkthdr: to has tags"));
#endif
	KASSERT((to->m_flags & M_EXT) == 0, ("m_move_pkthdr: to has cluster"));
#ifdef MAC
	/*
	 * XXXMAC: It could be this should also occur for non-MAC?
	 */
	if (to->m_flags & M_PKTHDR)
		m_tag_delete_chain(to, NULL);
#endif
	/*
	 * The KASSERT above guarantees 'to' has no cluster, so m_data can
	 * be reset to the internal pkthdr data area unconditionally.
	 */
	to->m_flags = from->m_flags & M_COPYFLAGS;
	to->m_data = to->m_pktdat;
	/* Struct copy transfers tag-list ownership to 'to'... */
	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
	/* ...so 'from' must drop its (now dangling) references. */
	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
	from->m_flags &= ~M_PKTHDR;
}
300 
301 /*
302  * Duplicate "from"'s mbuf pkthdr in "to".
303  * "from" must have M_PKTHDR set, and "to" must be empty.
304  * In particular, this does a deep copy of the packet tags.
305  */
int
m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
{

#if 0
	/*
	 * The mbuf allocator only initializes the pkthdr
	 * when the mbuf is allocated with MGETHDR. Many users
	 * (e.g. m_copy*, m_prepend) use MGET and then
	 * smash the pkthdr as needed causing these
	 * assertions to trip.  For now just disable them.
	 */
	M_ASSERTPKTHDR(to);
	/* Note: with MAC, this may not be a good assertion. */
	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags"));
#endif
	MBUF_CHECKSLEEP(how);
#ifdef MAC
	if (to->m_flags & M_PKTHDR)
		m_tag_delete_chain(to, NULL);
#endif
	/* Unlike m_move_pkthdr(), 'to' may own a cluster; preserve M_EXT. */
	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
	if ((to->m_flags & M_EXT) == 0)
		to->m_data = to->m_pktdat;
	/*
	 * Struct copy, then reset the tag list so the deep copy below
	 * does not alias 'from's tags.
	 */
	to->m_pkthdr = from->m_pkthdr;
	SLIST_INIT(&to->m_pkthdr.tags);
	/* Returns nonzero on success, 0 on allocation failure. */
	return (m_tag_copy_chain(to, from, MBTOM(how)));
}
334 
335 /*
336  * Lesser-used path for M_PREPEND:
337  * allocate new mbuf to prepend to chain,
338  * copy junk along.
339  */
struct mbuf *
m_prepend(struct mbuf *m, int len, int how)
{
	struct mbuf *mn;

	/* Match the allocation flavor of the chain head. */
	if (m->m_flags & M_PKTHDR)
		MGETHDR(mn, how, m->m_type);
	else
		MGET(mn, how, m->m_type);
	if (mn == NULL) {
		/* On failure the entire original chain is freed. */
		m_freem(m);
		return (NULL);
	}
	if (m->m_flags & M_PKTHDR)
		M_MOVE_PKTHDR(mn, m);
	mn->m_next = m;
	m = mn;
	/*
	 * NOTE(review): MHLEN is used as the bound here even when the new
	 * mbuf was allocated with MGET (no pkthdr, MLEN data area) —
	 * confirm this asymmetry is intentional.
	 */
	if (len < MHLEN)
		MH_ALIGN(m, len);
	m->m_len = len;
	return (m);
}
362 
363 /*
364  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
365  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
366  * The wait parameter is a choice of M_TRYWAIT/M_DONTWAIT from caller.
367  * Note that the copy is read-only, because clusters are not copied,
368  * only their reference counts are incremented.
369  */
struct mbuf *
m_copym(struct mbuf *m, int off0, int len, int wait)
{
	struct mbuf *n, **np;
	int off = off0;
	struct mbuf *top;
	int copyhdr = 0;

	KASSERT(off >= 0, ("m_copym, negative off %d", off));
	KASSERT(len >= 0, ("m_copym, negative len %d", len));
	MBUF_CHECKSLEEP(wait);
	/* Only a copy starting at offset 0 can carry the packet header. */
	if (off == 0 && m->m_flags & M_PKTHDR)
		copyhdr = 1;
	/* Skip ahead to the mbuf containing the starting offset. */
	while (off > 0) {
		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	np = &top;
	top = 0;
	while (len > 0) {
		if (m == NULL) {
			/* Running off the end is only legal for M_COPYALL. */
			KASSERT(len == M_COPYALL,
			    ("m_copym, length > size of mbuf chain"));
			break;
		}
		if (copyhdr)
			MGETHDR(n, wait, m->m_type);
		else
			MGET(n, wait, m->m_type);
		*np = n;
		if (n == NULL)
			goto nospace;
		if (copyhdr) {
			if (!m_dup_pkthdr(n, m, wait))
				goto nospace;
			if (len == M_COPYALL)
				n->m_pkthdr.len -= off0;
			else
				n->m_pkthdr.len = len;
			copyhdr = 0;
		}
		n->m_len = min(len, m->m_len - off);
		if (m->m_flags & M_EXT) {
			/*
			 * Share the cluster instead of copying: bump the
			 * refcount and point the copy into the same buffer.
			 * This is why the result is read-only.
			 */
			n->m_data = m->m_data + off;
			n->m_ext = m->m_ext;
			n->m_flags |= M_EXT;
			MEXT_ADD_REF(m);
			n->m_ext.ref_cnt = m->m_ext.ref_cnt;
		} else
			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
			    (u_int)n->m_len);
		if (len != M_COPYALL)
			len -= n->m_len;
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}
	if (top == NULL)
		mbstat.m_mcfail++;	/* XXX: No consistency. */

	return (top);
nospace:
	m_freem(top);
	mbstat.m_mcfail++;	/* XXX: No consistency. */
	return (NULL);
}
439 
440 /*
441  * Copy an entire packet, including header (which must be present).
442  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
443  * Note that the copy is read-only, because clusters are not copied,
444  * only their reference counts are incremented.
445  * Preserve alignment of the first mbuf so if the creator has left
446  * some room at the beginning (e.g. for inserting protocol headers)
447  * the copies still have the room available.
448  */
struct mbuf *
m_copypacket(struct mbuf *m, int how)
{
	struct mbuf *top, *n, *o;

	MBUF_CHECKSLEEP(how);
	MGET(n, how, m->m_type);
	top = n;
	if (n == NULL)
		goto nospace;

	if (!m_dup_pkthdr(n, m, how))
		goto nospace;
	n->m_len = m->m_len;
	if (m->m_flags & M_EXT) {
		/* Share the cluster (read-only copy), bumping its refcount. */
		n->m_data = m->m_data;
		n->m_ext = m->m_ext;
		n->m_flags |= M_EXT;
		MEXT_ADD_REF(m);
		n->m_ext.ref_cnt = m->m_ext.ref_cnt;
	} else {
		/* Preserve the original's leading-space alignment. */
		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
	}

	/* First mbuf handled above; now duplicate the rest of the chain. */
	m = m->m_next;
	while (m) {
		MGET(o, how, m->m_type);
		if (o == NULL)
			goto nospace;

		n->m_next = o;
		n = n->m_next;

		n->m_len = m->m_len;
		if (m->m_flags & M_EXT) {
			n->m_data = m->m_data;
			n->m_ext = m->m_ext;
			n->m_flags |= M_EXT;
			MEXT_ADD_REF(m);
			n->m_ext.ref_cnt = m->m_ext.ref_cnt;
		} else {
			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
		}

		m = m->m_next;
	}
	return top;
nospace:
	/* Free the partial copy; cluster refcounts unwind via m_freem(). */
	m_freem(top);
	mbstat.m_mcfail++;	/* XXX: No consistency. */
	return (NULL);
}
502 
503 /*
504  * Copy data from an mbuf chain starting "off" bytes from the beginning,
505  * continuing for "len" bytes, into the indicated buffer.
506  */
507 void
508 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
509 {
510 	u_int count;
511 
512 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
513 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
514 	while (off > 0) {
515 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
516 		if (off < m->m_len)
517 			break;
518 		off -= m->m_len;
519 		m = m->m_next;
520 	}
521 	while (len > 0) {
522 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
523 		count = min(m->m_len - off, len);
524 		bcopy(mtod(m, caddr_t) + off, cp, count);
525 		len -= count;
526 		cp += count;
527 		off = 0;
528 		m = m->m_next;
529 	}
530 }
531 
532 /*
533  * Copy a packet header mbuf chain into a completely new chain, including
534  * copying any mbuf clusters.  Use this instead of m_copypacket() when
535  * you need a writable copy of an mbuf chain.
536  */
struct mbuf *
m_dup(struct mbuf *m, int how)
{
	struct mbuf **p, *top = NULL;
	int remain, moff, nsize;

	MBUF_CHECKSLEEP(how);
	/* Sanity check */
	if (m == NULL)
		return (NULL);
	M_ASSERTPKTHDR(m);

	/* While there's more data, get a new mbuf, tack it on, and fill it */
	remain = m->m_pkthdr.len;
	moff = 0;
	p = &top;
	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
		struct mbuf *n;

		/* Get the next new mbuf */
		if (remain >= MINCLSIZE) {
			n = m_getcl(how, m->m_type, 0);
			nsize = MCLBYTES;
		} else {
			n = m_get(how, m->m_type);
			nsize = MLEN;
		}
		if (n == NULL)
			goto nospace;

		if (top == NULL) {		/* First one, must be PKTHDR */
			if (!m_dup_pkthdr(n, m, how)) {
				m_free(n);
				goto nospace;
			}
			/* Header mbuf has less internal room than MLEN. */
			nsize = MHLEN;
		}
		n->m_len = 0;

		/* Link it into the new chain */
		*p = n;
		p = &n->m_next;

		/*
		 * Copy data from original mbuf(s) into new mbuf.
		 * moff tracks our read position inside the current
		 * source mbuf; a new destination may span several
		 * (or part of one) source mbufs.
		 */
		while (n->m_len < nsize && m != NULL) {
			int chunk = min(nsize - n->m_len, m->m_len - moff);

			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
			moff += chunk;
			n->m_len += chunk;
			remain -= chunk;
			if (moff == m->m_len) {
				m = m->m_next;
				moff = 0;
			}
		}

		/* Check correct total mbuf length */
		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
		    	("%s: bogus m_pkthdr.len", __func__));
	}
	return (top);

nospace:
	m_freem(top);
	mbstat.m_mcfail++;	/* XXX: No consistency. */
	return (NULL);
}
605 
606 /*
607  * Concatenate mbuf chain n to m.
608  * Both chains must be of the same type (e.g. MT_DATA).
609  * Any m_pkthdr is not updated.
610  */
611 void
612 m_cat(struct mbuf *m, struct mbuf *n)
613 {
614 	while (m->m_next)
615 		m = m->m_next;
616 	while (n) {
617 		if (m->m_flags & M_EXT ||
618 		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
619 			/* just join the two chains */
620 			m->m_next = n;
621 			return;
622 		}
623 		/* splat the data from one into the other */
624 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
625 		    (u_int)n->m_len);
626 		m->m_len += n->m_len;
627 		n = m_free(n);
628 	}
629 }
630 
void
m_adj(struct mbuf *mp, int req_len)
{
	int len = req_len;
	struct mbuf *m;
	int count;

	if ((m = mp) == NULL)
		return;
	if (len >= 0) {
		/*
		 * Trim from head.
		 */
		while (m != NULL && len > 0) {
			if (m->m_len <= len) {
				/* Consume this mbuf entirely (left empty). */
				len -= m->m_len;
				m->m_len = 0;
				m = m->m_next;
			} else {
				m->m_len -= len;
				m->m_data += len;
				len = 0;
			}
		}
		m = mp;
		/* (req_len - len) is what was actually trimmed. */
		if (mp->m_flags & M_PKTHDR)
			m->m_pkthdr.len -= (req_len - len);
	} else {
		/*
		 * Trim from tail.  Scan the mbuf chain,
		 * calculating its length and finding the last mbuf.
		 * If the adjustment only affects this mbuf, then just
		 * adjust and return.  Otherwise, rescan and truncate
		 * after the remaining size.
		 */
		len = -len;
		count = 0;
		for (;;) {
			count += m->m_len;
			if (m->m_next == (struct mbuf *)0)
				break;
			m = m->m_next;
		}
		/* Fast path: trim fits entirely within the last mbuf. */
		if (m->m_len >= len) {
			m->m_len -= len;
			if (mp->m_flags & M_PKTHDR)
				mp->m_pkthdr.len -= len;
			return;
		}
		count -= len;
		if (count < 0)
			count = 0;
		/*
		 * Correct length for chain is "count".
		 * Find the mbuf with last data, adjust its length,
		 * and toss data from remaining mbufs on chain.
		 */
		m = mp;
		if (m->m_flags & M_PKTHDR)
			m->m_pkthdr.len = count;
		for (; m; m = m->m_next) {
			if (m->m_len >= count) {
				m->m_len = count;
				break;
			}
			count -= m->m_len;
		}
		/* Zero out the lengths of the trailing, now-empty mbufs. */
		while (m->m_next)
			(m = m->m_next) ->m_len = 0;
	}
}
702 
703 /*
 * Rearrange an mbuf chain so that len bytes are contiguous
705  * and in the data area of an mbuf (so that mtod and dtom
706  * will work for a structure of size len).  Returns the resulting
707  * mbuf chain on success, frees it and returns null on failure.
708  * If there is room, it will add up to max_protohdr-len extra bytes to the
709  * contiguous region in an attempt to avoid being called next time.
710  */
struct mbuf *
m_pullup(struct mbuf *n, int len)
{
	struct mbuf *m;
	int count;
	int space;

	/*
	 * If first mbuf has no cluster, and has room for len bytes
	 * without shifting current data, pullup into it,
	 * otherwise allocate a new mbuf to prepend to the chain.
	 */
	if ((n->m_flags & M_EXT) == 0 &&
	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
		if (n->m_len >= len)
			return (n);
		/* Pull the remainder from the following mbufs. */
		m = n;
		n = n->m_next;
		len -= m->m_len;
	} else {
		/* A fresh mbuf can hold at most MHLEN contiguous bytes. */
		if (len > MHLEN)
			goto bad;
		MGET(m, M_DONTWAIT, n->m_type);
		if (m == NULL)
			goto bad;
		m->m_len = 0;
		if (n->m_flags & M_PKTHDR)
			M_MOVE_PKTHDR(m, n);
	}
	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	do {
		/*
		 * Pull at least len bytes but opportunistically up to
		 * max_protohdr, to reduce the chance of being called again.
		 */
		count = min(min(max(len, max_protohdr), space), n->m_len);
		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
		  (u_int)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len)
			n->m_data += count;
		else
			n = m_free(n);
	} while (len > 0 && n);
	if (len > 0) {
		/* Chain was too short to satisfy the request. */
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return (m);
bad:
	m_freem(n);
	mbstat.m_mpfail++;	/* XXX: No consistency. */
	return (NULL);
}
765 
766 /*
767  * Partition an mbuf chain in two pieces, returning the tail --
768  * all but the first len0 bytes.  In case of failure, it returns NULL and
769  * attempts to restore the chain to its original state.
770  *
771  * Note that the resulting mbufs might be read-only, because the new
772  * mbuf can end up sharing an mbuf cluster with the original mbuf if
773  * the "breaking point" happens to lie within a cluster mbuf. Use the
774  * M_WRITABLE() macro to check for this case.
775  */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
	struct mbuf *m, *n;
	u_int len = len0, remain;

	MBUF_CHECKSLEEP(wait);
	/* Find the mbuf containing the split point. */
	for (m = m0; m && len > m->m_len; m = m->m_next)
		len -= m->m_len;
	if (m == NULL)
		return (NULL);
	/* 'remain' is how much of this mbuf belongs to the tail chain. */
	remain = m->m_len - len;
	if (m0->m_flags & M_PKTHDR) {
		MGETHDR(n, wait, m0->m_type);
		if (n == NULL)
			return (NULL);
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;
		if (m->m_flags & M_EXT)
			goto extpacket;
		if (remain > MHLEN) {
			/* m can't be the lead packet */
			MH_ALIGN(n, 0);
			/*
			 * Too much data to copy into the new header mbuf;
			 * recurse (this time without M_PKTHDR on m) and
			 * hang the result off a zero-length header.
			 */
			n->m_next = m_split(m, len, wait);
			if (n->m_next == NULL) {
				(void) m_free(n);
				return (NULL);
			} else {
				n->m_len = 0;
				return (n);
			}
		} else
			MH_ALIGN(n, remain);
	} else if (remain == 0) {
		/* Split falls exactly on an mbuf boundary; no copy needed. */
		n = m->m_next;
		m->m_next = NULL;
		return (n);
	} else {
		MGET(n, wait, m->m_type);
		if (n == NULL)
			return (NULL);
		M_ALIGN(n, remain);
	}
extpacket:
	if (m->m_flags & M_EXT) {
		/* Share the cluster; both halves reference the same buffer. */
		n->m_flags |= M_EXT;
		n->m_ext = m->m_ext;
		MEXT_ADD_REF(m);
		n->m_ext.ref_cnt = m->m_ext.ref_cnt;
		n->m_data = m->m_data + len;
	} else {
		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
	}
	n->m_len = remain;
	m->m_len = len;
	n->m_next = m->m_next;
	m->m_next = NULL;
	return (n);
}
836 /*
837  * Routine to copy from device local memory into mbufs.
838  * Note that `off' argument is offset into first mbuf of target chain from
839  * which to begin copying the data to.
840  */
struct mbuf *
m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
	 void (*copy)(char *from, caddr_t to, u_int len))
{
	struct mbuf *m;
	struct mbuf *top = NULL, **mp = &top;
	int len;

	/* 'off' must fit in the first (header) mbuf's data area. */
	if (off < 0 || off > MHLEN)
		return (NULL);

	while (totlen > 0) {
		if (top == NULL) {	/* First one, must be PKTHDR */
			if (totlen + off >= MINCLSIZE) {
				m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
				len = MCLBYTES;
			} else {
				m = m_gethdr(M_DONTWAIT, MT_DATA);
				len = MHLEN;

				/* Place initial small packet/header at end of mbuf */
				if (m && totlen + off + max_linkhdr <= MLEN) {
					m->m_data += max_linkhdr;
					len -= max_linkhdr;
				}
			}
			if (m == NULL)
				return NULL;
			m->m_pkthdr.rcvif = ifp;
			m->m_pkthdr.len = totlen;
		} else {
			if (totlen + off >= MINCLSIZE) {
				m = m_getcl(M_DONTWAIT, MT_DATA, 0);
				len = MCLBYTES;
			} else {
				m = m_get(M_DONTWAIT, MT_DATA);
				len = MLEN;
			}
			if (m == NULL) {
				/* Free the partial chain built so far. */
				m_freem(top);
				return NULL;
			}
		}
		/* Offset only applies to the first mbuf of the chain. */
		if (off) {
			m->m_data += off;
			len -= off;
			off = 0;
		}
		m->m_len = len = min(totlen, len);
		/* Use the device-specific copy routine if one was given. */
		if (copy)
			copy(buf, mtod(m, caddr_t), (u_int)len);
		else
			bcopy(buf, mtod(m, caddr_t), (u_int)len);
		buf += len;
		*mp = m;
		mp = &m->m_next;
		totlen -= len;
	}
	return (top);
}
901 
902 /*
903  * Copy data from a buffer back into the indicated mbuf chain,
904  * starting "off" bytes from the beginning, extending the mbuf
905  * chain if necessary.
906  */
void
m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
{
	int mlen;
	struct mbuf *m = m0, *n;
	int totlen = 0;

	if (m0 == NULL)
		return;
	/* Walk to the offset, extending the chain with zeroed mbufs. */
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			/* Allocation failure: give up, fix pkthdr below. */
			n = m_get(M_DONTWAIT, m->m_type);
			if (n == NULL)
				goto out;
			bzero(mtod(n, caddr_t), MLEN);
			n->m_len = min(MLEN, len + off);
			m->m_next = n;
		}
		m = m->m_next;
	}
	while (len > 0) {
		mlen = min (m->m_len - off, len);
		bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen);
		cp += mlen;
		len -= mlen;
		/* Fold the first-iteration offset into the running total. */
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0)
			break;
		if (m->m_next == NULL) {
			/*
			 * NOTE(review): unlike the extension above, this
			 * mbuf is not bzero'd — any slack beyond the copied
			 * bytes is uninitialized.  Confirm callers never
			 * read past the copied region.
			 */
			n = m_get(M_DONTWAIT, m->m_type);
			if (n == NULL)
				break;
			n->m_len = min(MLEN, len);
			m->m_next = n;
		}
		m = m->m_next;
	}
	/* Grow (never shrink) the packet header length to cover the write. */
out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
		m->m_pkthdr.len = totlen;
}
951 
952 /*
953  * Append the specified data to the indicated mbuf chain,
954  * Extend the mbuf chain if the new data does not fit in
955  * existing space.
956  *
957  * Return 1 if able to complete the job; otherwise 0.
958  */
959 int
960 m_append(struct mbuf *m0, int len, c_caddr_t cp)
961 {
962 	struct mbuf *m, *n;
963 	int remainder, space;
964 
965 	for (m = m0; m->m_next != NULL; m = m->m_next)
966 		;
967 	remainder = len;
968 	space = M_TRAILINGSPACE(m);
969 	if (space > 0) {
970 		/*
971 		 * Copy into available space.
972 		 */
973 		if (space > remainder)
974 			space = remainder;
975 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
976 		m->m_len += space;
977 		cp += space, remainder -= space;
978 	}
979 	while (remainder > 0) {
980 		/*
981 		 * Allocate a new mbuf; could check space
982 		 * and allocate a cluster instead.
983 		 */
984 		n = m_get(M_DONTWAIT, m->m_type);
985 		if (n == NULL)
986 			break;
987 		n->m_len = min(MLEN, remainder);
988 		bcopy(cp, mtod(n, caddr_t), n->m_len);
989 		cp += n->m_len, remainder -= n->m_len;
990 		m->m_next = n;
991 		m = n;
992 	}
993 	if (m0->m_flags & M_PKTHDR)
994 		m0->m_pkthdr.len += len - remainder;
995 	return (remainder == 0);
996 }
997 
998 /*
999  * Apply function f to the data in an mbuf chain starting "off" bytes from
1000  * the beginning, continuing for "len" bytes.
1001  */
1002 int
1003 m_apply(struct mbuf *m, int off, int len,
1004     int (*f)(void *, void *, u_int), void *arg)
1005 {
1006 	u_int count;
1007 	int rval;
1008 
1009 	KASSERT(off >= 0, ("m_apply, negative off %d", off));
1010 	KASSERT(len >= 0, ("m_apply, negative len %d", len));
1011 	while (off > 0) {
1012 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
1013 		if (off < m->m_len)
1014 			break;
1015 		off -= m->m_len;
1016 		m = m->m_next;
1017 	}
1018 	while (len > 0) {
1019 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
1020 		count = min(m->m_len - off, len);
1021 		rval = (*f)(arg, mtod(m, caddr_t) + off, count);
1022 		if (rval)
1023 			return (rval);
1024 		len -= count;
1025 		off = 0;
1026 		m = m->m_next;
1027 	}
1028 	return (0);
1029 }
1030 
1031 /*
1032  * Return a pointer to mbuf/offset of location in mbuf chain.
1033  */
1034 struct mbuf *
1035 m_getptr(struct mbuf *m, int loc, int *off)
1036 {
1037 
1038 	while (loc >= 0) {
1039 		/* Normal end of search. */
1040 		if (m->m_len > loc) {
1041 			*off = loc;
1042 			return (m);
1043 		} else {
1044 			loc -= m->m_len;
1045 			if (m->m_next == NULL) {
1046 				if (loc == 0) {
1047 					/* Point at the end of valid data. */
1048 					*off = m->m_len;
1049 					return (m);
1050 				}
1051 				return (NULL);
1052 			}
1053 			m = m->m_next;
1054 		}
1055 	}
1056 	return (NULL);
1057 }
1058 
void
m_print(const struct mbuf *m, int maxlen)
{
	int len;
	int pdata;
	const struct mbuf *m2;

	/* len == -1 means "no pkthdr, walk until the chain ends". */
	if (m->m_flags & M_PKTHDR)
		len = m->m_pkthdr.len;
	else
		len = -1;
	m2 = m;
	while (m2 != NULL && (len == -1 || len)) {
		/* pdata: how many data bytes of this mbuf to dump. */
		pdata = m2->m_len;
		if (maxlen != -1 && pdata > maxlen)
			pdata = maxlen;
		/* %b decodes m_flags using the bit-name string that follows. */
		printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len,
		    m2->m_next, m2->m_flags, "\20\20freelist\17skipfw"
		    "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly"
		    "\3eor\2pkthdr\1ext", pdata ? "" : "\n");
		if (pdata)
			/* %D hexdumps the buffer with "-" separators. */
			printf(", %*D\n", m2->m_len, (u_char *)m2->m_data, "-");
		if (len != -1)
			len -= m2->m_len;
		m2 = m2->m_next;
	}
	/* A positive remainder means pkthdr.len exceeded the chain's data. */
	if (len > 0)
		printf("%d bytes unaccounted for.\n", len);
	return;
}
1089 
1090 u_int
1091 m_fixhdr(struct mbuf *m0)
1092 {
1093 	u_int len;
1094 
1095 	len = m_length(m0, NULL);
1096 	m0->m_pkthdr.len = len;
1097 	return (len);
1098 }
1099 
1100 u_int
1101 m_length(struct mbuf *m0, struct mbuf **last)
1102 {
1103 	struct mbuf *m;
1104 	u_int len;
1105 
1106 	len = 0;
1107 	for (m = m0; m != NULL; m = m->m_next) {
1108 		len += m->m_len;
1109 		if (m->m_next == NULL)
1110 			break;
1111 	}
1112 	if (last != NULL)
1113 		*last = m;
1114 	return (len);
1115 }
1116 
1117 /*
1118  * Defragment a mbuf chain, returning the shortest possible
1119  * chain of mbufs and clusters.  If allocation fails and
1120  * this cannot be completed, NULL will be returned, but
1121  * the passed in chain will be unchanged.  Upon success,
1122  * the original chain will be freed, and the new chain
1123  * will be returned.
1124  *
1125  * If a non-packet header is passed in, the original
1126  * mbuf (chain?) will be returned unharmed.
1127  */
1128 struct mbuf *
1129 m_defrag(struct mbuf *m0, int how)
1130 {
1131 	struct mbuf *m_new = NULL, *m_final = NULL;
1132 	int progress = 0, length;
1133 
1134 	MBUF_CHECKSLEEP(how);
1135 	if (!(m0->m_flags & M_PKTHDR))
1136 		return (m0);
1137 
1138 	m_fixhdr(m0); /* Needed sanity check */
1139 
1140 #ifdef MBUF_STRESS_TEST
1141 	if (m_defragrandomfailures) {
1142 		int temp = arc4random() & 0xff;
1143 		if (temp == 0xba)
1144 			goto nospace;
1145 	}
1146 #endif
1147 
1148 	if (m0->m_pkthdr.len > MHLEN)
1149 		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
1150 	else
1151 		m_final = m_gethdr(how, MT_DATA);
1152 
1153 	if (m_final == NULL)
1154 		goto nospace;
1155 
1156 	if (m_dup_pkthdr(m_final, m0, how) == 0)
1157 		goto nospace;
1158 
1159 	m_new = m_final;
1160 
1161 	while (progress < m0->m_pkthdr.len) {
1162 		length = m0->m_pkthdr.len - progress;
1163 		if (length > MCLBYTES)
1164 			length = MCLBYTES;
1165 
1166 		if (m_new == NULL) {
1167 			if (length > MLEN)
1168 				m_new = m_getcl(how, MT_DATA, 0);
1169 			else
1170 				m_new = m_get(how, MT_DATA);
1171 			if (m_new == NULL)
1172 				goto nospace;
1173 		}
1174 
1175 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
1176 		progress += length;
1177 		m_new->m_len = length;
1178 		if (m_new != m_final)
1179 			m_cat(m_final, m_new);
1180 		m_new = NULL;
1181 	}
1182 #ifdef MBUF_STRESS_TEST
1183 	if (m0->m_next == NULL)
1184 		m_defraguseless++;
1185 #endif
1186 	m_freem(m0);
1187 	m0 = m_final;
1188 #ifdef MBUF_STRESS_TEST
1189 	m_defragpackets++;
1190 	m_defragbytes += m0->m_pkthdr.len;
1191 #endif
1192 	return (m0);
1193 nospace:
1194 #ifdef MBUF_STRESS_TEST
1195 	m_defragfailure++;
1196 #endif
1197 	if (m_new)
1198 		m_free(m_new);
1199 	if (m_final)
1200 		m_freem(m_final);
1201 	return (NULL);
1202 }
1203 
1204 #ifdef MBUF_STRESS_TEST
1205 
1206 /*
1207  * Fragment an mbuf chain.  There's no reason you'd ever want to do
1208  * this in normal usage, but it's great for stress testing various
1209  * mbuf consumers.
1210  *
1211  * If fragmentation is not possible, the original chain will be
1212  * returned.
1213  *
1214  * Possible length values:
1215  * 0	 no fragmentation will occur
1216  * > 0	each fragment will be of the specified length
1217  * -1	each fragment will be the same random value in length
1218  * -2	each fragment's length will be entirely random
1219  * (Random values range from 1 to 256)
1220  */
1221 struct mbuf *
1222 m_fragment(struct mbuf *m0, int how, int length)
1223 {
1224 	struct mbuf *m_new = NULL, *m_final = NULL;
1225 	int progress = 0;
1226 
1227 	if (!(m0->m_flags & M_PKTHDR))
1228 		return (m0);
1229 
1230 	if ((length == 0) || (length < -2))
1231 		return (m0);
1232 
1233 	m_fixhdr(m0); /* Needed sanity check */
1234 
1235 	m_final = m_getcl(how, MT_DATA, M_PKTHDR);
1236 
1237 	if (m_final == NULL)
1238 		goto nospace;
1239 
1240 	if (m_dup_pkthdr(m_final, m0, how) == 0)
1241 		goto nospace;
1242 
1243 	m_new = m_final;
1244 
1245 	if (length == -1)
1246 		length = 1 + (arc4random() & 255);
1247 
1248 	while (progress < m0->m_pkthdr.len) {
1249 		int fraglen;
1250 
1251 		if (length > 0)
1252 			fraglen = length;
1253 		else
1254 			fraglen = 1 + (arc4random() & 255);
1255 		if (fraglen > m0->m_pkthdr.len - progress)
1256 			fraglen = m0->m_pkthdr.len - progress;
1257 
1258 		if (fraglen > MCLBYTES)
1259 			fraglen = MCLBYTES;
1260 
1261 		if (m_new == NULL) {
1262 			m_new = m_getcl(how, MT_DATA, 0);
1263 			if (m_new == NULL)
1264 				goto nospace;
1265 		}
1266 
1267 		m_copydata(m0, progress, fraglen, mtod(m_new, caddr_t));
1268 		progress += fraglen;
1269 		m_new->m_len = fraglen;
1270 		if (m_new != m_final)
1271 			m_cat(m_final, m_new);
1272 		m_new = NULL;
1273 	}
1274 	m_freem(m0);
1275 	m0 = m_final;
1276 	return (m0);
1277 nospace:
1278 	if (m_new)
1279 		m_free(m_new);
1280 	if (m_final)
1281 		m_freem(m_final);
1282 	/* Return the original chain on failure */
1283 	return (m0);
1284 }
1285 
1286 #endif
1287 
1288 struct mbuf *
1289 m_uiotombuf(struct uio *uio, int how, int len)
1290 {
1291 	struct mbuf *m_new = NULL, *m_final = NULL;
1292 	int progress = 0, error = 0, length, total;
1293 
1294 	if (len > 0)
1295 		total = min(uio->uio_resid, len);
1296 	else
1297 		total = uio->uio_resid;
1298 	if (total > MHLEN)
1299 		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
1300 	else
1301 		m_final = m_gethdr(how, MT_DATA);
1302 	if (m_final == NULL)
1303 		goto nospace;
1304 	m_new = m_final;
1305 	while (progress < total) {
1306 		length = total - progress;
1307 		if (length > MCLBYTES)
1308 			length = MCLBYTES;
1309 		if (m_new == NULL) {
1310 			if (length > MLEN)
1311 				m_new = m_getcl(how, MT_DATA, 0);
1312 			else
1313 				m_new = m_get(how, MT_DATA);
1314 			if (m_new == NULL)
1315 				goto nospace;
1316 		}
1317 		error = uiomove(mtod(m_new, void *), length, uio);
1318 		if (error)
1319 			goto nospace;
1320 		progress += length;
1321 		m_new->m_len = length;
1322 		if (m_new != m_final)
1323 			m_cat(m_final, m_new);
1324 		m_new = NULL;
1325 	}
1326 	m_fixhdr(m_final);
1327 	return (m_final);
1328 nospace:
1329 	if (m_new)
1330 		m_free(m_new);
1331 	if (m_final)
1332 		m_freem(m_final);
1333 	return (NULL);
1334 }
1335