xref: /freebsd/sys/kern/uipc_mbuf.c (revision ee41f1b1cf5e3d4f586cb85b46123b416275862c)
/*
 * Copyright (c) 1982, 1986, 1988, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
 * $FreeBSD$
 */

#include "opt_param.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

static void mbinit(void *);
SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL)

struct mbuf *mbutl;
struct mbstat mbstat;
u_long	mbtypes[MT_NTYPES];
int	max_linkhdr;
int	max_protohdr;
int	max_hdr;
int	max_datalen;
int	nmbclusters;
int	nmbufs;
int	nmbcnt;
u_long	m_mballoc_wid = 0;
u_long	m_clalloc_wid = 0;

/*
 * Free list header structures: mbffree_lst, mclfree_lst and mcntfree_lst
 * head the mbuf, mbuf cluster and m_ext reference counter free lists,
 * respectively.
 */
struct mbffree_lst mmbfree;
struct mclfree_lst mclfree;
struct mcntfree_lst mcntfree;

/*
 * sysctl(8) exported objects
 */
SYSCTL_DECL(_kern_ipc);
SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
	   &max_linkhdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
	   &max_protohdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
	   &max_datalen, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
	   &mbuf_wait, 0, "");
SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD, &mbstat, mbstat, "");
SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mbtypes, CTLFLAG_RD, mbtypes,
	   sizeof(mbtypes), "LU", "");
SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD,
	   &nmbclusters, 0, "Maximum number of mbuf clusters available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
	   "Maximum number of mbufs available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RD, &nmbcnt, 0,
	   "Maximum number of ext_buf counters available");
#ifndef NMBCLUSTERS
#define NMBCLUSTERS	(512 + MAXUSERS * 16)
#endif
TUNABLE_INT_DECL("kern.ipc.nmbclusters", NMBCLUSTERS, nmbclusters);
TUNABLE_INT_DECL("kern.ipc.nmbufs", NMBCLUSTERS * 4, nmbufs);
TUNABLE_INT_DECL("kern.ipc.nmbcnt", EXT_COUNTERS, nmbcnt);
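
/*
 * [Editor's note: an illustrative worked example, not part of the original
 * file.]  With the era-typical i386 values MAXUSERS = 32, MSIZE = 256 and
 * MCLBYTES = 2048, and no tunables set, the defaults above give:
 *
 *	nmbclusters = 512 + 32 * 16 = 1024
 *	nmbufs	    = 1024 * 4	    = 4096
 *
 * so mbinit() below reserves 4096 * 256 = 1MB of mb_map for mbufs plus
 * 1024 * 2048 = 2MB for clusters, in addition to the
 * nmbcnt * sizeof(union mext_refcnt) bytes for reference counters.
 */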

static void	m_reclaim(void);

/* Initial allocation numbers */
#define NCL_INIT	2
#define NMB_INIT	16
#define REF_INIT	NMBCLUSTERS

/*
 * Full mbuf subsystem initialization done here.
 *
 * XXX: If ever we have system specific map setups to do, then move them to
 *      machdep.c - for now, there is no reason for this stuff to go there.
 */
static void
mbinit(void *dummy)
{
	vm_offset_t maxaddr, mb_map_size;

	/*
	 * Set up mb_map and allocate the requested VM space.
	 */
	mb_map_size = nmbufs * MSIZE + nmbclusters * MCLBYTES + nmbcnt
	    * sizeof(union mext_refcnt);
	mb_map_size = roundup2(mb_map_size, PAGE_SIZE);
	mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr,
	    mb_map_size);
	/* XXX XXX XXX: mb_map->system_map = 1; */

	/*
	 * Initialize the free list headers and set up the locks that
	 * protect the lists.
	 */
	mmbfree.m_head = NULL;
	mclfree.m_head = NULL;
	mcntfree.m_head = NULL;
	mtx_init(&mmbfree.m_mtx, "mbuf free list lock", MTX_DEF);
	mtx_init(&mclfree.m_mtx, "mcluster free list lock", MTX_DEF);
	mtx_init(&mcntfree.m_mtx, "m_ext counter free list lock", MTX_DEF);

	/*
	 * Initialize the mbuf subsystem (sysctl exported) statistics
	 * structure.
	 */
	mbstat.m_msize = MSIZE;
	mbstat.m_mclbytes = MCLBYTES;
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;

	/*
	 * Perform some initial allocations.
	 */
	mtx_lock(&mcntfree.m_mtx);
	if (m_alloc_ref(REF_INIT, M_DONTWAIT) == 0)
		goto bad;
	mtx_unlock(&mcntfree.m_mtx);

	mtx_lock(&mmbfree.m_mtx);
	if (m_mballoc(NMB_INIT, M_DONTWAIT) == 0)
		goto bad;
	mtx_unlock(&mmbfree.m_mtx);

	mtx_lock(&mclfree.m_mtx);
	if (m_clalloc(NCL_INIT, M_DONTWAIT) == 0)
		goto bad;
	mtx_unlock(&mclfree.m_mtx);

	return;
bad:
	panic("mbinit: failed to initialize mbuf subsystem!");
}

/*
 * Allocate at least nmb reference count structs and place them
 * on the ref cnt free list.
 *
 * Must be called with the mcntfree lock held.
 */
int
m_alloc_ref(u_int nmb, int how)
{
	caddr_t p;
	u_int nbytes;
	int i;

	/*
	 * We don't cap the amount of memory that can be used
	 * by the reference counters, like we do for mbufs and
	 * mbuf clusters. In fact, we're absolutely sure that we
	 * won't ever be going over our allocated space. We keep enough
	 * space in mb_map to accommodate maximum values of allocatable
	 * external buffers including, but not limited to, clusters.
	 * (That's also why we won't have to have wait routines for
	 * counters).
	 *
	 * If we're in here, we're absolutely certain to be returning
	 * successfully, as long as there is physical memory to accommodate
	 * us. And if there isn't, but we're willing to wait, then
	 * kmem_malloc() will do the only waiting needed.
	 */

	nbytes = round_page(nmb * sizeof(union mext_refcnt));
	mtx_unlock(&mcntfree.m_mtx);
	if ((p = (caddr_t)kmem_malloc(mb_map, nbytes, how == M_TRYWAIT ?
	    M_WAITOK : M_NOWAIT)) == NULL) {
		mtx_lock(&mcntfree.m_mtx);
		return (0);
	}
	nmb = nbytes / sizeof(union mext_refcnt);

	/*
	 * Reacquire the mutex and do not let go of it again before
	 * returning, to avoid a race.  It is up to the caller to
	 * release it.
	 */
	mtx_lock(&mcntfree.m_mtx);
	for (i = 0; i < nmb; i++) {
		((union mext_refcnt *)p)->next_ref = mcntfree.m_head;
		mcntfree.m_head = (union mext_refcnt *)p;
		p += sizeof(union mext_refcnt);
		mbstat.m_refree++;
	}
	mbstat.m_refcnt += nmb;

	return (1);
}

/*
 * Allocate at least nmb mbufs and place on mbuf free list.
 *
 * Must be called with the mmbfree lock held.
 */
int
m_mballoc(int nmb, int how)
{
	caddr_t p;
	int i;
	int nbytes;

	/*
	 * If we've hit the mbuf limit, stop allocating from mb_map.
	 * Also, once we run out of map space, it will be impossible to
	 * get any more (nothing is ever freed back to the map).
	 */
	if (mb_map_full || ((nmb + mbstat.m_mbufs) > nmbufs)) {
		/*
		 * Needs to be atomic as we may be incrementing it
		 * while holding another mutex, like mclfree. In other
		 * words, m_drops is not reserved solely for mbufs,
		 * but is also available for clusters.
		 */
		atomic_add_long(&mbstat.m_drops, 1);
		return (0);
	}

	nbytes = round_page(nmb * MSIZE);

	mtx_unlock(&mmbfree.m_mtx);
	p = (caddr_t)kmem_malloc(mb_map, nbytes, M_NOWAIT);
	if (p == NULL && how == M_TRYWAIT) {
		atomic_add_long(&mbstat.m_wait, 1);
		p = (caddr_t)kmem_malloc(mb_map, nbytes, M_WAITOK);
	}
	mtx_lock(&mmbfree.m_mtx);

	/*
	 * Either the map is now full, or `how' is M_DONTWAIT and there
	 * are no pages left.
	 */
	if (p == NULL)
		return (0);

	nmb = nbytes / MSIZE;

	/*
	 * We don't let go of the mutex in order to avoid a race.
	 * It is up to the caller to let go of the mutex when done
	 * with grabbing the mbuf from the free list.
	 */
	for (i = 0; i < nmb; i++) {
		((struct mbuf *)p)->m_next = mmbfree.m_head;
		mmbfree.m_head = (struct mbuf *)p;
		p += MSIZE;
	}
	mbstat.m_mbufs += nmb;
	mbtypes[MT_FREE] += nmb;
	return (1);
}

/*
 * Once mb_map has been exhausted, and if the call to the allocation macros
 * (or, in some cases, functions) was made with M_TRYWAIT, then it is
 * necessary to rely solely on reclaimed mbufs.
 *
 * Here we ask the protocols to free up some resources and, if we still
 * cannot get anything, then we wait for an mbuf to be freed for a
 * designated (mbuf_wait) time.
 *
 * Must be called with the mmbfree mutex held.
 */
struct mbuf *
m_mballoc_wait(void)
{
	struct mbuf *p = NULL;

	/*
	 * See if we can drain some resources out of the protocols.
	 * We drop the mmbfree mutex to avoid recursing into it in some of
	 * the drain routines. Clearly, we're faced with a race here because
	 * once something is freed during the drain, it may be grabbed right
	 * from under us by some other thread. But we accept this possibility
	 * in order to avoid a potentially large lock recursion and, more
	 * importantly, to avoid a potential lock order reversal which may
	 * result in deadlock (see comment above m_reclaim()).
	 */
	mtx_unlock(&mmbfree.m_mtx);
	m_reclaim();

	mtx_lock(&mmbfree.m_mtx);
	_MGET(p, M_DONTWAIT);

	if (p == NULL) {
		m_mballoc_wid++;
		msleep(&m_mballoc_wid, &mmbfree.m_mtx, PVM, "mballc",
		    mbuf_wait);
		m_mballoc_wid--;

		/*
		 * Try again (one last time).
		 *
		 * We retry the fetch _even_ if the sleep timed out. This
		 * is left this way, purposely, in the [unlikely] case
		 * that an mbuf was freed but the sleep was not awakened
		 * in time.
		 *
		 * If the sleep didn't time out (i.e. we got woken up) then
		 * we have the lock, so we just grab an mbuf, hopefully.
		 */
		_MGET(p, M_DONTWAIT);
	}

	/* If we waited and got something... */
	if (p != NULL) {
		atomic_add_long(&mbstat.m_wait, 1);
		if (mmbfree.m_head != NULL)
			MBWAKEUP(m_mballoc_wid);
	} else
		atomic_add_long(&mbstat.m_drops, 1);

	return (p);
}

/*
 * Allocate some number of mbuf clusters
 * and place on cluster free list.
 *
 * Must be called with the mclfree lock held.
 */
int
m_clalloc(int ncl, int how)
{
	caddr_t p;
	int i;
	int npg;

	/*
	 * If the map is now full (nothing will ever be freed back to it),
	 * or if we've hit the mbuf cluster limit, stop allocating from
	 * mb_map.
	 */
	if (mb_map_full || ((ncl + mbstat.m_clusters) > nmbclusters)) {
		atomic_add_long(&mbstat.m_drops, 1);
		return (0);
	}

	npg = ncl;
	mtx_unlock(&mclfree.m_mtx);
	p = (caddr_t)kmem_malloc(mb_map, ctob(npg),
				 how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
	ncl = ncl * PAGE_SIZE / MCLBYTES;
	mtx_lock(&mclfree.m_mtx);

	/*
	 * Either the map is now full, or `how' is M_DONTWAIT and there
	 * are no pages left.
	 */
	if (p == NULL) {
		atomic_add_long(&mbstat.m_drops, 1);
		return (0);
	}

	/*
	 * We don't let go of the mutex in order to avoid a race.
	 */
	for (i = 0; i < ncl; i++) {
		((union mcluster *)p)->mcl_next = mclfree.m_head;
		mclfree.m_head = (union mcluster *)p;
		p += MCLBYTES;
		mbstat.m_clfree++;
	}
	mbstat.m_clusters += ncl;
	return (1);
}

/*
 * Once the mb_map submap has been exhausted and the allocation is called with
 * M_TRYWAIT, we rely on the mclfree list. If nothing is free, we will
 * sleep for a designated amount of time (mbuf_wait) or until we're woken up
 * due to sudden mcluster availability.
 *
 * Must be called with the mclfree lock held.
 */
caddr_t
m_clalloc_wait(void)
{
	caddr_t p = NULL;

	m_clalloc_wid++;
	msleep(&m_clalloc_wid, &mclfree.m_mtx, PVM, "mclalc", mbuf_wait);
	m_clalloc_wid--;

	/*
	 * Now that we (think we) have something, try again.
	 */
	_MCLALLOC(p, M_DONTWAIT);

	/* If we waited and got something... */
	if (p != NULL) {
		atomic_add_long(&mbstat.m_wait, 1);
		if (mclfree.m_head != NULL)
			MBWAKEUP(m_clalloc_wid);
	} else
		atomic_add_long(&mbstat.m_drops, 1);

	return (p);
}

/*
 * m_reclaim: drain protocols in the hope of freeing up some resources.
 *
 * XXX: No locks should be held going in here. The drain routines have
 * to presently acquire some locks, which raises the possibility of a lock
 * order violation if we are holding a mutex that one of the drain routines
 * acquires in the reverse order.
 */
static void
m_reclaim(void)
{
	struct domain *dp;
	struct protosw *pr;

#ifdef WITNESS
	KASSERT(witness_list(CURPROC) == 0,
	    ("m_reclaim called with locks held"));
#endif

	for (dp = domains; dp; dp = dp->dom_next)
		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
			if (pr->pr_drain)
				(*pr->pr_drain)();
	mbstat.m_drain++;
}

/*
 * Space allocation routines.
 * Some of these are also available as macros
 * for critical paths.
 */
struct mbuf *
m_get(int how, int type)
{
	struct mbuf *m;

	MGET(m, how, type);
	return (m);
}

struct mbuf *
m_gethdr(int how, int type)
{
	struct mbuf *m;

	MGETHDR(m, how, type);
	return (m);
}

struct mbuf *
m_getclr(int how, int type)
{
	struct mbuf *m;

	MGET(m, how, type);
	if (m == NULL)
		return (NULL);
	bzero(mtod(m, caddr_t), MLEN);
	return (m);
}

struct mbuf *
m_free(struct mbuf *m)
{
	struct mbuf *n;

	MFREE(m, n);
	return (n);
}
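
/*
 * [Editor's example -- an illustrative sketch, not part of the original
 * file.]  A typical caller pairs these allocators with m_freem(); the
 * error-handling context here is assumed:
 *
 *	struct mbuf *m;
 *
 *	m = m_gethdr(M_TRYWAIT, MT_DATA);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	m->m_pkthdr.len = m->m_len = 0;
 *	...
 *	m_freem(m);
 */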

/*
 * struct mbuf *
 * m_getm(m, len, how, type)
 *
 * This will allocate len-worth of mbufs and/or mbuf clusters (whatever fits
 * best) and return a pointer to the top of the allocated chain. If m is
 * non-null, then we assume that it is a single mbuf or an mbuf chain to
 * which we want len bytes worth of mbufs and/or clusters attached, and so
 * if we succeed in allocating it, we will just return a pointer to m.
 *
 * If we happen to fail at any point during the allocation, we will free
 * up everything we have already allocated and return NULL.
 */
struct mbuf *
m_getm(struct mbuf *m, int len, int how, int type)
{
	struct mbuf *top, *tail, *mp, *mtail = NULL;

	KASSERT(len >= 0, ("len is < 0 in m_getm"));

	MGET(mp, how, type);
	if (mp == NULL)
		return (NULL);
	else if (len > MINCLSIZE) {
		MCLGET(mp, how);
		if ((mp->m_flags & M_EXT) == 0) {
			m_free(mp);
			return (NULL);
		}
	}
	mp->m_len = 0;
	len -= M_TRAILINGSPACE(mp);

	if (m != NULL)
		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
			;
	else
		m = mp;

	top = tail = mp;
	while (len > 0) {
		MGET(mp, how, type);
		if (mp == NULL)
			goto failed;

		tail->m_next = mp;
		tail = mp;
		if (len > MINCLSIZE) {
			MCLGET(mp, how);
			if ((mp->m_flags & M_EXT) == 0)
				goto failed;
		}

		mp->m_len = 0;
		len -= M_TRAILINGSPACE(mp);
	}

	if (mtail != NULL)
		mtail->m_next = top;
	return (m);

failed:
	m_freem(top);
	return (NULL);
}
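
/*
 * [Editor's example -- an illustrative sketch, not part of the original
 * file; "size" stands for whatever space the caller needs.]
 *
 *	struct mbuf *m;
 *
 *	if ((m = m_getm(NULL, size, M_TRYWAIT, MT_DATA)) == NULL)
 *		return (ENOBUFS);
 *	(m now heads a chain with at least "size" bytes of free space.)
 */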

void
m_freem(struct mbuf *m)
{
	struct mbuf *n;

	if (m == NULL)
		return;
	do {
		/*
		 * We need to check non-first mbufs as well, since some
		 * existing code does not call M_PREPEND properly (for
		 * example, calls to bpf_mtap from drivers).
		 */
		if ((m->m_flags & M_PKTHDR) != 0 && m->m_pkthdr.aux) {
			m_freem(m->m_pkthdr.aux);
			m->m_pkthdr.aux = NULL;
		}
		MFREE(m, n);
		m = n;
	} while (m);
}

/*
 * Lesser-used path for M_PREPEND:
 * allocate new mbuf to prepend to chain,
 * copy junk along.
 */
struct mbuf *
m_prepend(struct mbuf *m, int len, int how)
{
	struct mbuf *mn;

	MGET(mn, how, m->m_type);
	if (mn == NULL) {
		m_freem(m);
		return (NULL);
	}
	if (m->m_flags & M_PKTHDR) {
		M_COPY_PKTHDR(mn, m);
		m->m_flags &= ~M_PKTHDR;
	}
	mn->m_next = m;
	m = mn;
	if (len < MHLEN)
		MH_ALIGN(m, len);
	m->m_len = len;
	return (m);
}
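
/*
 * [Editor's note: an illustrative sketch, not part of the original file.]
 * Callers normally go through the M_PREPEND() macro, which takes this path
 * only when the first mbuf lacks leading space; "struct myhdr" is a
 * hypothetical header:
 *
 *	M_PREPEND(m, sizeof(struct myhdr), M_DONTWAIT);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	hdr = mtod(m, struct myhdr *);
 */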

/*
 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
 * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
 * The wait parameter is a choice of M_TRYWAIT/M_DONTWAIT from caller.
 * Note that the copy is read-only, because clusters are not copied,
 * only their reference counts are incremented.
 */
#define MCFail (mbstat.m_mcfail)

struct mbuf *
m_copym(struct mbuf *m, int off0, int len, int wait)
{
	struct mbuf *n, **np;
	int off = off0;
	struct mbuf *top;
	int copyhdr = 0;

	KASSERT(off >= 0, ("m_copym, negative off %d", off));
	KASSERT(len >= 0, ("m_copym, negative len %d", len));
	if (off == 0 && m->m_flags & M_PKTHDR)
		copyhdr = 1;
	while (off > 0) {
		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	np = &top;
	top = NULL;
	while (len > 0) {
		if (m == NULL) {
			KASSERT(len == M_COPYALL,
			    ("m_copym, length > size of mbuf chain"));
			break;
		}
		MGET(n, wait, m->m_type);
		*np = n;
		if (n == NULL)
			goto nospace;
		if (copyhdr) {
			M_COPY_PKTHDR(n, m);
			if (len == M_COPYALL)
				n->m_pkthdr.len -= off0;
			else
				n->m_pkthdr.len = len;
			copyhdr = 0;
		}
		n->m_len = min(len, m->m_len - off);
		if (m->m_flags & M_EXT) {
			n->m_data = m->m_data + off;
			n->m_ext = m->m_ext;
			n->m_flags |= M_EXT;
			MEXT_ADD_REF(m);
		} else
			bcopy(mtod(m, caddr_t) + off, mtod(n, caddr_t),
			    (unsigned)n->m_len);
		if (len != M_COPYALL)
			len -= n->m_len;
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}
	if (top == NULL)
		atomic_add_long(&MCFail, 1);
	return (top);
nospace:
	m_freem(top);
	atomic_add_long(&MCFail, 1);
	return (NULL);
}
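
/*
 * [Editor's example -- an illustrative sketch, not part of the original
 * file.]  A retransmission path, for instance, hands the output routine a
 * read-only copy rather than the stored chain itself ("ifp", "dst" and
 * "rt" are assumed caller context):
 *
 *	struct mbuf *n;
 *
 *	if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL)
 *		return (ENOBUFS);
 *	error = (*ifp->if_output)(ifp, n, dst, rt);
 */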

/*
 * Copy an entire packet, including header (which must be present).
 * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
 * Note that the copy is read-only, because clusters are not copied,
 * only their reference counts are incremented.
 */
struct mbuf *
m_copypacket(struct mbuf *m, int how)
{
	struct mbuf *top, *n, *o;

	MGET(n, how, m->m_type);
	top = n;
	if (n == NULL)
		goto nospace;

	M_COPY_PKTHDR(n, m);
	n->m_len = m->m_len;
	if (m->m_flags & M_EXT) {
		n->m_data = m->m_data;
		n->m_ext = m->m_ext;
		n->m_flags |= M_EXT;
		MEXT_ADD_REF(m);
	} else {
		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
	}

	m = m->m_next;
	while (m) {
		MGET(o, how, m->m_type);
		if (o == NULL)
			goto nospace;

		n->m_next = o;
		n = n->m_next;

		n->m_len = m->m_len;
		if (m->m_flags & M_EXT) {
			n->m_data = m->m_data;
			n->m_ext = m->m_ext;
			n->m_flags |= M_EXT;
			MEXT_ADD_REF(m);
		} else {
			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
		}

		m = m->m_next;
	}
	return (top);
nospace:
	m_freem(top);
	atomic_add_long(&MCFail, 1);
	return (NULL);
}

/*
 * Copy data from an mbuf chain starting "off" bytes from the beginning,
 * continuing for "len" bytes, into the indicated buffer.
 */
void
m_copydata(struct mbuf *m, int off, int len, caddr_t cp)
{
	unsigned count;

	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
	while (off > 0) {
		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	while (len > 0) {
		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
		count = min(m->m_len - off, len);
		bcopy(mtod(m, caddr_t) + off, cp, count);
		len -= count;
		cp += count;
		off = 0;
		m = m->m_next;
	}
}
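
/*
 * [Editor's example -- an illustrative sketch, not part of the original
 * file.]  Copying a header out of a chain without requiring the chain to
 * be contiguous:
 *
 *	struct ether_header eh;
 *
 *	if (m->m_pkthdr.len >= sizeof(eh))
 *		m_copydata(m, 0, sizeof(eh), (caddr_t)&eh);
 */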

/*
 * Copy a packet header mbuf chain into a completely new chain, including
 * copying any mbuf clusters.  Use this instead of m_copypacket() when
 * you need a writable copy of an mbuf chain.
 */
struct mbuf *
m_dup(struct mbuf *m, int how)
{
	struct mbuf **p, *top = NULL;
	int remain, moff, nsize;

	/* Sanity check */
	if (m == NULL)
		return (NULL);
	KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __FUNCTION__));

	/* While there's more data, get a new mbuf, tack it on, and fill it */
	remain = m->m_pkthdr.len;
	moff = 0;
	p = &top;
	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
		struct mbuf *n;

		/* Get the next new mbuf */
		MGET(n, how, m->m_type);
		if (n == NULL)
			goto nospace;
		if (top == NULL) {		/* first one, must be PKTHDR */
			M_COPY_PKTHDR(n, m);
			nsize = MHLEN;
		} else				/* not the first one */
			nsize = MLEN;
		if (remain >= MINCLSIZE) {
			MCLGET(n, how);
			if ((n->m_flags & M_EXT) == 0) {
				(void)m_free(n);
				goto nospace;
			}
			nsize = MCLBYTES;
		}
		n->m_len = 0;

		/* Link it into the new chain */
		*p = n;
		p = &n->m_next;

		/* Copy data from original mbuf(s) into new mbuf */
		while (n->m_len < nsize && m != NULL) {
			int chunk = min(nsize - n->m_len, m->m_len - moff);

			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
			moff += chunk;
			n->m_len += chunk;
			remain -= chunk;
			if (moff == m->m_len) {
				m = m->m_next;
				moff = 0;
			}
		}

		/* Check correct total mbuf length */
		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
		    ("%s: bogus m_pkthdr.len", __FUNCTION__));
	}
	return (top);

nospace:
	m_freem(top);
	atomic_add_long(&MCFail, 1);
	return (NULL);
}
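
/*
 * [Editor's note: an illustrative sketch, not part of the original file.]
 * Unlike an m_copym()/m_copypacket() copy, the result may be modified in
 * place without disturbing the original chain:
 *
 *	struct mbuf *n;
 *
 *	if ((n = m_dup(m, M_DONTWAIT)) == NULL)
 *		return (ENOBUFS);
 *	(n is a deep, writable copy; m is untouched.)
 */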

/*
 * Concatenate mbuf chain n to m.
 * Both chains must be of the same type (e.g. MT_DATA).
 * Any m_pkthdr is not updated.
 */
void
m_cat(struct mbuf *m, struct mbuf *n)
{
	while (m->m_next)
		m = m->m_next;
	while (n) {
		if (m->m_flags & M_EXT ||
		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
			/* just join the two chains */
			m->m_next = n;
			return;
		}
		/* splat the data from one into the other */
		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
		    (u_int)n->m_len);
		m->m_len += n->m_len;
		n = m_free(n);
	}
}
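
/*
 * [Editor's example -- an illustrative sketch, not part of the original
 * file.]  Reassembly-style code glues fragments together and then fixes
 * the packet header length by hand, since m_cat() does not update it.
 * n may be freed by m_cat(), so its length is saved first:
 *
 *	int nlen = n->m_pkthdr.len;
 *
 *	m_cat(m, n);
 *	m->m_pkthdr.len += nlen;
 */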

void
m_adj(struct mbuf *mp, int req_len)
{
	int len = req_len;
	struct mbuf *m;
	int count;

	if ((m = mp) == NULL)
		return;
	if (len >= 0) {
		/*
		 * Trim from head.
		 */
		while (m != NULL && len > 0) {
			if (m->m_len <= len) {
				len -= m->m_len;
				m->m_len = 0;
				m = m->m_next;
			} else {
				m->m_len -= len;
				m->m_data += len;
				len = 0;
			}
		}
		m = mp;
		if (mp->m_flags & M_PKTHDR)
			m->m_pkthdr.len -= (req_len - len);
	} else {
		/*
		 * Trim from tail.  Scan the mbuf chain,
		 * calculating its length and finding the last mbuf.
		 * If the adjustment only affects this mbuf, then just
		 * adjust and return.  Otherwise, rescan and truncate
		 * after the remaining size.
		 */
		len = -len;
		count = 0;
		for (;;) {
			count += m->m_len;
			if (m->m_next == NULL)
				break;
			m = m->m_next;
		}
		if (m->m_len >= len) {
			m->m_len -= len;
			if (mp->m_flags & M_PKTHDR)
				mp->m_pkthdr.len -= len;
			return;
		}
		count -= len;
		if (count < 0)
			count = 0;
		/*
		 * Correct length for chain is "count".
		 * Find the mbuf with last data, adjust its length,
		 * and toss data from remaining mbufs on chain.
		 */
		m = mp;
		if (m->m_flags & M_PKTHDR)
			m->m_pkthdr.len = count;
		for (; m; m = m->m_next) {
			if (m->m_len >= count) {
				m->m_len = count;
				break;
			}
			count -= m->m_len;
		}
		while (m->m_next)
			(m = m->m_next)->m_len = 0;
	}
}
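
/*
 * [Editor's example -- an illustrative sketch, not part of the original
 * file.]  A positive length trims from the head of the chain, a negative
 * one from the tail; e.g. stripping an Ethernet header and trailing CRC:
 *
 *	m_adj(m, sizeof(struct ether_header));
 *	m_adj(m, -ETHER_CRC_LEN);
 */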

/*
 * Rearrange an mbuf chain so that len bytes are contiguous
 * and in the data area of an mbuf (so that mtod and dtom
 * will work for a structure of size len).  Returns the resulting
 * mbuf chain on success, frees it and returns NULL on failure.
 * If there is room, it will add up to max_protohdr - len extra bytes to the
 * contiguous region in an attempt to avoid being called next time.
 */
#define MPFail (mbstat.m_mpfail)

struct mbuf *
m_pullup(struct mbuf *n, int len)
{
	struct mbuf *m;
	int count;
	int space;

	/*
	 * If first mbuf has no cluster, and has room for len bytes
	 * without shifting current data, pullup into it,
	 * otherwise allocate a new mbuf to prepend to the chain.
	 */
	if ((n->m_flags & M_EXT) == 0 &&
	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
		if (n->m_len >= len)
			return (n);
		m = n;
		n = n->m_next;
		len -= m->m_len;
	} else {
		if (len > MHLEN)
			goto bad;
		MGET(m, M_DONTWAIT, n->m_type);
		if (m == NULL)
			goto bad;
		m->m_len = 0;
		if (n->m_flags & M_PKTHDR) {
			M_COPY_PKTHDR(m, n);
			n->m_flags &= ~M_PKTHDR;
		}
	}
	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	do {
		count = min(min(max(len, max_protohdr), space), n->m_len);
		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
		    (unsigned)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len)
			n->m_data += count;
		else
			n = m_free(n);
	} while (len > 0 && n);
	if (len > 0) {
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return (m);
bad:
	m_freem(n);
	atomic_add_long(&MPFail, 1);
	return (NULL);
}
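
/*
 * [Editor's note: the canonical usage sketch, not part of the original
 * file.]  Protocol input routines make their header contiguous before
 * casting; on failure m_pullup() has already freed the chain:
 *
 *	if (m->m_len < sizeof(struct ip) &&
 *	    (m = m_pullup(m, sizeof(struct ip))) == NULL)
 *		return;
 *	ip = mtod(m, struct ip *);
 */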

/*
 * Partition an mbuf chain in two pieces, returning the tail --
 * all but the first len0 bytes.  In case of failure, it returns NULL and
 * attempts to restore the chain to its original state.
 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
	struct mbuf *m, *n;
	unsigned len = len0, remain;

	for (m = m0; m && len > m->m_len; m = m->m_next)
		len -= m->m_len;
	if (m == NULL)
		return (NULL);
	remain = m->m_len - len;
	if (m0->m_flags & M_PKTHDR) {
		MGETHDR(n, wait, m0->m_type);
		if (n == NULL)
			return (NULL);
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;
		if (m->m_flags & M_EXT)
			goto extpacket;
		if (remain > MHLEN) {
			/* m can't be the lead packet */
			MH_ALIGN(n, 0);
			n->m_next = m_split(m, len, wait);
			if (n->m_next == NULL) {
				(void) m_free(n);
				return (NULL);
			} else
				return (n);
		} else
			MH_ALIGN(n, remain);
	} else if (remain == 0) {
		n = m->m_next;
		m->m_next = NULL;
		return (n);
	} else {
		MGET(n, wait, m->m_type);
		if (n == NULL)
			return (NULL);
		M_ALIGN(n, remain);
	}
extpacket:
	if (m->m_flags & M_EXT) {
		n->m_flags |= M_EXT;
		n->m_ext = m->m_ext;
		MEXT_ADD_REF(m);
		m->m_ext.ext_size = 0; /* For Accounting XXXXXX danger */
		n->m_data = m->m_data + len;
	} else {
		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
	}
	n->m_len = remain;
	m->m_len = len;
	n->m_next = m->m_next;
	m->m_next = NULL;
	return (n);
}
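
/*
 * [Editor's example -- an illustrative sketch, not part of the original
 * file; "reclen" is an assumed record boundary.]
 *
 *	struct mbuf *tail;
 *
 *	if ((tail = m_split(m, reclen, M_TRYWAIT)) == NULL)
 *		return (ENOBUFS);
 *	(m now holds the first reclen bytes, tail the remainder.)
 */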

/*
 * Routine to copy from device local memory into mbufs.
 */
struct mbuf *
m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
	 void (*copy)(char *from, caddr_t to, u_int len))
{
	struct mbuf *m;
	struct mbuf *top = NULL, **mp = &top;
	int off = off0, len;
	char *cp;
	char *epkt;

	cp = buf;
	epkt = cp + totlen;
	if (off) {
		cp += off + 2 * sizeof(u_short);
		totlen -= 2 * sizeof(u_short);
	}
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.len = totlen;
	m->m_len = MHLEN;

	while (totlen > 0) {
		if (top) {
			MGET(m, M_DONTWAIT, MT_DATA);
			if (m == NULL) {
				m_freem(top);
				return (NULL);
			}
			m->m_len = MLEN;
		}
		len = min(totlen, epkt - cp);
		if (len >= MINCLSIZE) {
			MCLGET(m, M_DONTWAIT);
			if (m->m_flags & M_EXT)
				m->m_len = len = min(len, MCLBYTES);
			else
				len = m->m_len;
		} else {
			/*
			 * Place initial small packet/header at end of mbuf.
			 */
			if (len < m->m_len) {
				if (top == NULL && len +
				    max_linkhdr <= m->m_len)
					m->m_data += max_linkhdr;
				m->m_len = len;
			} else
				len = m->m_len;
		}
		if (copy)
			copy(cp, mtod(m, caddr_t), (unsigned)len);
		else
			bcopy(cp, mtod(m, caddr_t), (unsigned)len);
		cp += len;
		*mp = m;
		mp = &m->m_next;
		totlen -= len;
		if (cp == epkt)
			cp = buf;
	}
	return (top);
}
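
/*
 * [Editor's note: an illustrative sketch, not part of the original file.]
 * A driver that cannot DMA directly into mbufs copies a received frame
 * out of board memory in its receive interrupt, roughly ("sc" and
 * "pktlen" are assumed driver context), then hands the chain to the
 * input path:
 *
 *	m = m_devget(sc->rx_buf, pktlen, 0, &sc->arpcom.ac_if, NULL);
 *	if (m == NULL)
 *		return;
 */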

/*
 * Copy data from a buffer back into the indicated mbuf chain,
 * starting "off" bytes from the beginning, extending the mbuf
 * chain if necessary.
 */
void
m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
{
	int mlen;
	struct mbuf *m = m0, *n;
	int totlen = 0;

	if (m0 == NULL)
		return;
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			n = m_getclr(M_DONTWAIT, m->m_type);
			if (n == NULL)
				goto out;
			n->m_len = min(MLEN, len + off);
			m->m_next = n;
		}
		m = m->m_next;
	}
	while (len > 0) {
		mlen = min(m->m_len - off, len);
		bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
		cp += mlen;
		len -= mlen;
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0)
			break;
		if (m->m_next == NULL) {
			n = m_get(M_DONTWAIT, m->m_type);
			if (n == NULL)
				break;
			n->m_len = min(MLEN, len);
			m->m_next = n;
		}
		m = m->m_next;
	}
out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
		m->m_pkthdr.len = totlen;
}
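
/*
 * [Editor's example -- an illustrative sketch, not part of the original
 * file; "ckoff" is an assumed checksum offset.]  Overwriting a field in
 * place, extending the chain if it is too short:
 *
 *	u_int16_t sum = 0;
 *
 *	m_copyback(m, ckoff, sizeof(sum), (caddr_t)&sum);
 */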

void
m_print(const struct mbuf *m)
{
	int len;
	const struct mbuf *m2;

	len = m->m_pkthdr.len;
	m2 = m;
	while (len) {
		printf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
		len -= m2->m_len;
		m2 = m2->m_next;
	}
	return;
}
1209