xref: /freebsd/sys/kern/uipc_mbuf.c (revision 1d66272a85cde1c8a69c58f4b5dd649babd6eca6)
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
34  * $FreeBSD$
35  */
36 
37 #include "opt_param.h"
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/mutex.h>
43 #include <sys/kernel.h>
44 #include <sys/sysctl.h>
45 #include <sys/domain.h>
46 #include <sys/protosw.h>
47 #include <vm/vm.h>
48 #include <vm/vm_kern.h>
49 #include <vm/vm_extern.h>
50 
51 static void mbinit __P((void *));
52 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL)
53 
54 struct mbuf *mbutl;
55 struct mbstat mbstat;
56 u_long	mbtypes[MT_NTYPES];
57 int	max_linkhdr;
58 int	max_protohdr;
59 int	max_hdr;
60 int	max_datalen;
61 int	nmbclusters;
62 int	nmbufs;
63 int	nmbcnt;
64 u_long	m_mballoc_wid = 0;
65 u_long	m_clalloc_wid = 0;
66 
67 /*
68  * Free list header structures:
69  * mbffree_lst, mclfree_lst, mcntfree_lst
70  */
71 struct mbffree_lst mmbfree;
72 struct mclfree_lst mclfree;
73 struct mcntfree_lst mcntfree;
74 
75 /*
76  * sysctl(8) exported objects
77  */
78 SYSCTL_DECL(_kern_ipc);
79 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
80 	   &max_linkhdr, 0, "");
81 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
82 	   &max_protohdr, 0, "");
83 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
84 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
85 	   &max_datalen, 0, "");
86 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
87 	   &mbuf_wait, 0, "");
88 SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD, &mbstat, mbstat, "");
89 SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mbtypes, CTLFLAG_RD, mbtypes,
90 	   sizeof(mbtypes), "LU", "");
91 SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD,
92 	   &nmbclusters, 0, "Maximum number of mbuf clusters available");
93 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
94 	   "Maximum number of mbufs available");
95 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RD, &nmbcnt, 0,
96 	   "Maximum number of ext_buf counters available");
97 #ifndef NMBCLUSTERS
98 #define NMBCLUSTERS	(512 + MAXUSERS * 16)
99 #endif
100 TUNABLE_INT_DECL("kern.ipc.nmbclusters", NMBCLUSTERS, nmbclusters);
101 TUNABLE_INT_DECL("kern.ipc.nmbufs", NMBCLUSTERS * 4, nmbufs);
102 TUNABLE_INT_DECL("kern.ipc.nmbcnt", EXT_COUNTERS, nmbcnt);
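
/*
 * Illustrative sketch (an assumption, not part of this file): the statistics
 * exported above can be examined from userland with sysctlbyname(3), e.g. in
 * a small program that includes <sys/types.h>, <sys/sysctl.h> and
 * <sys/mbuf.h>:
 *
 *	struct mbstat mbs;
 *	size_t len = sizeof(mbs);
 *
 *	if (sysctlbyname("kern.ipc.mbstat", &mbs, &len, NULL, 0) == 0)
 *		printf("%lu mbufs allocated\n", (u_long)mbs.m_mbufs);
 *
 * The kern.ipc.nmbclusters, kern.ipc.nmbufs and kern.ipc.nmbcnt values are
 * boot-time tunables; their sysctls are read-only.
 */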
103 
104 static void	m_reclaim __P((void));
105 
106 /* Initial allocation numbers */
107 #define NCL_INIT	2
108 #define NMB_INIT	16
109 #define REF_INIT	NMBCLUSTERS
110 
111 /*
112  * Full mbuf subsystem initialization done here.
113  *
114  * XXX: If we ever have system-specific map setups to do, move them to
115  *      machdep.c; for now, there is no reason for this stuff to go there.
116  */
117 static void
118 mbinit(dummy)
119 	void *dummy;
120 {
121 	vm_offset_t maxaddr, mb_map_size;
122 
123 	/*
124 	 * Set up the mb_map and allocate the requested VM space.
125 	 */
126 	mb_map_size = nmbufs * MSIZE + nmbclusters * MCLBYTES + nmbcnt
127 	    * sizeof(union mext_refcnt);
128 	mb_map_size = roundup2(mb_map_size, PAGE_SIZE);
129 	mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr,
130 	    mb_map_size);
131 	/* XXX: mb_map->system_map = 1; */
132 
133 	/*
134 	 * Initialize the free list headers, and set up the locks for the lists.
135 	 */
136 	mmbfree.m_head = NULL;
137 	mclfree.m_head = NULL;
138 	mcntfree.m_head = NULL;
139 	mtx_init(&mmbfree.m_mtx, "mbuf free list lock", MTX_DEF);
140 	mtx_init(&mclfree.m_mtx, "mcluster free list lock", MTX_DEF);
141 	mtx_init(&mcntfree.m_mtx, "m_ext counter free list lock", MTX_DEF);
142 
143 	/*
144 	 * Initialize mbuf subsystem (sysctl exported) statistics structure.
145 	 */
146 	mbstat.m_msize = MSIZE;
147 	mbstat.m_mclbytes = MCLBYTES;
148 	mbstat.m_minclsize = MINCLSIZE;
149 	mbstat.m_mlen = MLEN;
150 	mbstat.m_mhlen = MHLEN;
151 
152 	/*
153 	 * Perform some initial allocations.
154 	 */
155 	mtx_enter(&mcntfree.m_mtx, MTX_DEF);
156 	if (m_alloc_ref(REF_INIT, M_DONTWAIT) == 0)
157 		goto bad;
158 	mtx_exit(&mcntfree.m_mtx, MTX_DEF);
159 
160 	mtx_enter(&mmbfree.m_mtx, MTX_DEF);
161 	if (m_mballoc(NMB_INIT, M_DONTWAIT) == 0)
162 		goto bad;
163 	mtx_exit(&mmbfree.m_mtx, MTX_DEF);
164 
165 	mtx_enter(&mclfree.m_mtx, MTX_DEF);
166 	if (m_clalloc(NCL_INIT, M_DONTWAIT) == 0)
167 		goto bad;
168 	mtx_exit(&mclfree.m_mtx, MTX_DEF);
169 
170 	return;
171 bad:
172 	panic("mbinit: failed to initialize mbuf subsystem!");
173 }
174 
175 /*
176  * Allocate at least nmb reference count structs and place them
177  * on the ref cnt free list.
178  *
179  * Must be called with the mcntfree lock held.
180  */
181 int
182 m_alloc_ref(nmb, how)
183 	u_int nmb;
184 	int how;
185 {
186 	caddr_t p;
187 	u_int nbytes;
188 	int i;
189 
190 	/*
191 	 * We don't cap the amount of memory that can be used
192 	 * by the reference counters, like we do for mbufs and
193 	 * mbuf clusters. In fact, we're absolutely sure that we
194 	 * won't ever be going over our allocated space. We keep enough
195 	 * space in mb_map to accommodate maximum values of allocatable
196 	 * external buffers including, but not limited to, clusters.
197 	 * (That's also why we won't have to have wait routines for
198 	 * counters).
199 	 *
200 	 * If we're in here, we're absolutely certain to be returning
201 	 * successfully, as long as there is physical memory to accommodate
202 	 * us. And if there isn't, but we're willing to wait, then
203 	 * kmem_malloc() will do the only waiting needed.
204 	 */
205 
206 	nbytes = round_page(nmb * sizeof(union mext_refcnt));
207 	mtx_exit(&mcntfree.m_mtx, MTX_DEF);
208 	mtx_enter(&Giant, MTX_DEF);
209 	if ((p = (caddr_t)kmem_malloc(mb_map, nbytes, how == M_TRYWAIT ?
210 	    M_WAITOK : M_NOWAIT)) == NULL) {
211 		mtx_exit(&Giant, MTX_DEF);
212 		mtx_enter(&mcntfree.m_mtx, MTX_DEF);
213 		/* XXX: We must be holding it on the way out. */
214 		return (0);
215 	}
216 	mtx_exit(&Giant, MTX_DEF);
217 	nmb = nbytes / sizeof(union mext_refcnt);
218 
219 	/*
220 	 * We don't let go of the mutex in order to avoid a race.
221 	 * It is up to the caller to let go of the mutex.
222 	 */
223 	mtx_enter(&mcntfree.m_mtx, MTX_DEF);
224 	for (i = 0; i < nmb; i++) {
225 		((union mext_refcnt *)p)->next_ref = mcntfree.m_head;
226 		mcntfree.m_head = (union mext_refcnt *)p;
227 		p += sizeof(union mext_refcnt);
228 		mbstat.m_refree++;
229 	}
230 	mbstat.m_refcnt += nmb;
231 
232 	return (1);
233 }
234 
235 /*
236  * Allocate at least nmb mbufs and place on mbuf free list.
237  *
238  * Must be called with the mmbfree lock held.
239  */
240 int
241 m_mballoc(nmb, how)
242 	register int nmb;
243 	int how;
244 {
245 	register caddr_t p;
246 	register int i;
247 	int nbytes;
248 
249 	/*
250 	 * If we've hit the mbuf limit, stop allocating from mb_map.
251 	 * Also, once we run out of map space, it will be impossible to
252 	 * get any more (nothing is ever freed back to the map).
253 	 */
254 	if (mb_map_full || ((nmb + mbstat.m_mbufs) > nmbufs)) {
255 		/*
256 		 * Needs to be atomic as we may be incrementing it
257 		 * while holding another mutex, like mclfree. In other
258 		 * words, m_drops is not reserved solely for mbufs,
259 		 * but is also available for clusters.
260 		 */
261 		atomic_add_long(&mbstat.m_drops, 1);
262 		return (0);
263 	}
264 
265 	nbytes = round_page(nmb * MSIZE);
266 
267 	/* XXX: Releasing the mmbfree lock here may eventually be done only
268 	 * for M_TRYWAIT calls to kmem_malloc(). */
269 	mtx_exit(&mmbfree.m_mtx, MTX_DEF);
270 	mtx_enter(&Giant, MTX_DEF);
271 	p = (caddr_t)kmem_malloc(mb_map, nbytes, M_NOWAIT);
272 	if (p == 0 && how == M_TRYWAIT) {
273 		atomic_add_long(&mbstat.m_wait, 1);
274 		p = (caddr_t)kmem_malloc(mb_map, nbytes, M_WAITOK);
275 	}
276 	mtx_exit(&Giant, MTX_DEF);
277 	mtx_enter(&mmbfree.m_mtx, MTX_DEF);
278 
279 	/*
280 	 * Either the map is now full, or `how' is M_DONTWAIT and there
281 	 * are no pages left.
282 	 */
283 	if (p == NULL)
284 		return (0);
285 
286 	nmb = nbytes / MSIZE;
287 
288 	/*
289 	 * We don't let go of the mutex in order to avoid a race.
290 	 * It is up to the caller to let go of the mutex when done
291 	 * with grabbing the mbuf from the free list.
292 	 */
293 	for (i = 0; i < nmb; i++) {
294 		((struct mbuf *)p)->m_next = mmbfree.m_head;
295 		mmbfree.m_head = (struct mbuf *)p;
296 		p += MSIZE;
297 	}
298 	mbstat.m_mbufs += nmb;
299 	mbtypes[MT_FREE] += nmb;
300 	return (1);
301 }
302 
303 /*
304  * Once mb_map has been exhausted, and the call to the allocation macros
305  * (or, in some cases, functions) was made with M_TRYWAIT, it is necessary to
306  * rely solely on reclaimed mbufs.
307  *
308  * Here we request that the protocols free up some resources and, if we
309  * still cannot get anything, we wait for an mbuf to be freed for a
310  * designated (mbuf_wait) time.
311  *
312  * Must be called with the mmbfree mutex held.
313  */
314 struct mbuf *
315 m_mballoc_wait(void)
316 {
317 	struct mbuf *p = NULL;
318 
319 	/*
320 	 * See if we can drain some resources out of the protocols.
321 	 * We drop the mmbfree mutex to avoid recursing into it in some of
322 	 * the drain routines. Clearly, we're faced with a race here because
323 	 * once something is freed during the drain, it may be grabbed right
324 	 * from under us by some other thread. But we accept this possibility
325 	 * in order to avoid a potentially large lock recursion and, more
326 	 * importantly, to avoid a potential lock order reversal which may
327 	 * result in deadlock (See comment above m_reclaim()).
328 	 */
329 	mtx_exit(&mmbfree.m_mtx, MTX_DEF);
330 	m_reclaim();
331 
332 	mtx_enter(&mmbfree.m_mtx, MTX_DEF);
333 	_MGET(p, M_DONTWAIT);
334 
335 	if (p == NULL) {
336 		m_mballoc_wid++;
337 		if (msleep(&m_mballoc_wid, &mmbfree.m_mtx, PVM, "mballc",
338 		    mbuf_wait) == EWOULDBLOCK)
339 			m_mballoc_wid--;
340 
341 		/*
342 		 * Try again (one last time).
343 		 *
344 		 * We retry the fetch _even_ if the sleep timed out. This
345 		 * is done on purpose, for the [unlikely] case
346 		 * that an mbuf was freed but the sleeper was not woken
347 		 * in time.
348 		 *
349 		 * If the sleep didn't time out (i.e. we got woken up) then
350 		 * we have the lock so we just grab an mbuf, hopefully.
351 		 */
352 		_MGET(p, M_DONTWAIT);
353 	}
354 
355 	/* If we waited and got something... */
356 	if (p != NULL) {
357 		atomic_add_long(&mbstat.m_wait, 1);
358 		if (mmbfree.m_head != NULL)
359 			MBWAKEUP(m_mballoc_wid);
360 	} else
361 		atomic_add_long(&mbstat.m_drops, 1);
362 
363 	return (p);
364 }
365 
366 /*
367  * Allocate some number of mbuf clusters
368  * and place on cluster free list.
369  *
370  * Must be called with the mclfree lock held.
371  */
372 int
373 m_clalloc(ncl, how)
374 	register int ncl;
375 	int how;
376 {
377 	register caddr_t p;
378 	register int i;
379 	int npg;
380 
381 	/*
382 	 * If the map is now full (nothing will ever be freed back to it),
383 	 * or if we've hit the mbuf cluster number limit, stop allocating
384 	 * from mb_map.
385 	 */
386 	if (mb_map_full || ((ncl + mbstat.m_clusters) > nmbclusters)) {
387 		atomic_add_long(&mbstat.m_drops, 1);
388 		return (0);
389 	}
390 
391 	npg = ncl;
392 	mtx_exit(&mclfree.m_mtx, MTX_DEF);
393 	mtx_enter(&Giant, MTX_DEF);
394 	p = (caddr_t)kmem_malloc(mb_map, ctob(npg),
395 				 how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
396 	mtx_exit(&Giant, MTX_DEF);
397 	ncl = ncl * PAGE_SIZE / MCLBYTES;
398 	mtx_enter(&mclfree.m_mtx, MTX_DEF);
399 
400 	/*
401 	 * Either the map is now full, or `how' is M_DONTWAIT and there
402 	 * are no pages left.
403 	 */
404 	if (p == NULL) {
405 		atomic_add_long(&mbstat.m_drops, 1);
406 		return (0);
407 	}
408 
409 	/*
410 	 * We don't let go of the mutex in order to avoid a race.
411 	 */
412 	for (i = 0; i < ncl; i++) {
413 		((union mcluster *)p)->mcl_next = mclfree.m_head;
414 		mclfree.m_head = (union mcluster *)p;
415 		p += MCLBYTES;
416 		mbstat.m_clfree++;
417 	}
418 	mbstat.m_clusters += ncl;
419 	return (1);
420 }
421 
422 /*
423  * Once the mb_map submap has been exhausted and the allocation is called with
424  * M_TRYWAIT, we rely on the mclfree list. If nothing is free, we will
425  * sleep for a designated amount of time (mbuf_wait) or until we're woken up
426  * due to sudden mcluster availability.
427  *
428  * Must be called with the mclfree lock held.
429  */
430 caddr_t
431 m_clalloc_wait(void)
432 {
433 	caddr_t p = NULL;
434 
435 	m_clalloc_wid++;
436 	if (msleep(&m_clalloc_wid, &mclfree.m_mtx, PVM, "mclalc", mbuf_wait)
437 	    == EWOULDBLOCK)
438 		m_clalloc_wid--;
439 
440 	/*
441 	 * Now that we think we've got something, try again.
442 	 */
443 	_MCLALLOC(p, M_DONTWAIT);
444 
445 	/* If we waited and got something ... */
446 	if (p != NULL) {
447 		atomic_add_long(&mbstat.m_wait, 1);
448 		if (mclfree.m_head != NULL)
449 			MBWAKEUP(m_clalloc_wid);
450 	} else
451 		atomic_add_long(&mbstat.m_drops, 1);
452 
453 	return (p);
454 }
455 
456 /*
457  * m_reclaim: drain protocols in hopes of freeing up some resources.
458  *
459  * XXX: No locks should be held going in here. The drain routines
460  * presently have to acquire some locks, which raises the possibility of a
461  * lock order violation if we're holding any mutex that is acquired in
462  * reverse order relative to one of the locks in the drain routines.
463  */
464 static void
465 m_reclaim()
466 {
467 	register struct domain *dp;
468 	register struct protosw *pr;
469 
470 	for (dp = domains; dp; dp = dp->dom_next)
471 		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
472 			if (pr->pr_drain)
473 				(*pr->pr_drain)();
474 	mbstat.m_drain++;
475 }
476 
477 /*
478  * Space allocation routines.
479  * These are also available as macros
480  * for critical paths.
481  */
482 struct mbuf *
483 m_get(how, type)
484 	int how, type;
485 {
486 	register struct mbuf *m;
487 
488 	MGET(m, how, type);
489 	return (m);
490 }
491 
492 struct mbuf *
493 m_gethdr(how, type)
494 	int how, type;
495 {
496 	register struct mbuf *m;
497 
498 	MGETHDR(m, how, type);
499 	return (m);
500 }
501 
502 struct mbuf *
503 m_getclr(how, type)
504 	int how, type;
505 {
506 	register struct mbuf *m;
507 
508 	MGET(m, how, type);
509 	if (m == 0)
510 		return (0);
511 	bzero(mtod(m, caddr_t), MLEN);
512 	return (m);
513 }
514 
515 struct mbuf *
516 m_free(m)
517 	struct mbuf *m;
518 {
519 	register struct mbuf *n;
520 
521 	MFREE(m, n);
522 	return (n);
523 }
524 
525 void
526 m_freem(m)
527 	register struct mbuf *m;
528 {
529 	register struct mbuf *n;
530 
531 	if (m == NULL)
532 		return;
533 	do {
534 		/*
535 		 * We do need to check non-first mbufs, since some existing
536 		 * code does not call M_PREPEND properly
537 		 * (for example, calls to bpf_mtap from drivers).
538 		 */
539 		if ((m->m_flags & M_PKTHDR) != 0 && m->m_pkthdr.aux) {
540 			m_freem(m->m_pkthdr.aux);
541 			m->m_pkthdr.aux = NULL;
542 		}
543 		MFREE(m, n);
544 		m = n;
545 	} while (m);
546 }
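
/*
 * Illustrative sketch of the common allocate/fill/free pattern built on the
 * routines and macros above (assumptions: "buf" and "len" are supplied by
 * the caller and len <= MCLBYTES; ENOBUFS is just an example error):
 *
 *	struct mbuf *m;
 *
 *	MGETHDR(m, M_DONTWAIT, MT_DATA);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	if (len > MHLEN) {
 *		MCLGET(m, M_DONTWAIT);
 *		if ((m->m_flags & M_EXT) == 0) {
 *			m_freem(m);
 *			return (ENOBUFS);
 *		}
 *	}
 *	bcopy(buf, mtod(m, caddr_t), len);
 *	m->m_len = m->m_pkthdr.len = len;
 *	...
 *	m_freem(m);
 */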
547 
548 /*
549  * Mbuffer utility routines.
550  */
551 
552 /*
553  * Lesser-used path for M_PREPEND:
554  * allocate new mbuf to prepend to chain,
555  * copy junk along.
556  */
557 struct mbuf *
558 m_prepend(m, len, how)
559 	register struct mbuf *m;
560 	int len, how;
561 {
562 	struct mbuf *mn;
563 
564 	MGET(mn, how, m->m_type);
565 	if (mn == (struct mbuf *)NULL) {
566 		m_freem(m);
567 		return ((struct mbuf *)NULL);
568 	}
569 	if (m->m_flags & M_PKTHDR) {
570 		M_COPY_PKTHDR(mn, m);
571 		m->m_flags &= ~M_PKTHDR;
572 	}
573 	mn->m_next = m;
574 	m = mn;
575 	if (len < MHLEN)
576 		MH_ALIGN(m, len);
577 	m->m_len = len;
578 	return (m);
579 }
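
/*
 * Illustrative sketch: callers normally use the M_PREPEND() macro, which
 * falls back to m_prepend() only when there is no leading space in the
 * first mbuf (assumption: an Ethernet header is just an example, and "eh"
 * is a caller-supplied pointer):
 *
 *	M_PREPEND(m, sizeof(struct ether_header), M_DONTWAIT);
 *	if (m == NULL)
 *		return;
 *	eh = mtod(m, struct ether_header *);
 */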
580 
581 /*
582  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
583  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
584  * The wait parameter is the caller's choice of M_TRYWAIT or M_DONTWAIT.
585  * Note that the copy is read-only, because clusters are not copied,
586  * only their reference counts are incremented.
587  */
588 #define MCFail (mbstat.m_mcfail)
589 
590 struct mbuf *
591 m_copym(m, off0, len, wait)
592 	register struct mbuf *m;
593 	int off0, wait;
594 	register int len;
595 {
596 	register struct mbuf *n, **np;
597 	register int off = off0;
598 	struct mbuf *top;
599 	int copyhdr = 0;
600 
601 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
602 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
603 	if (off == 0 && m->m_flags & M_PKTHDR)
604 		copyhdr = 1;
605 	while (off > 0) {
606 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
607 		if (off < m->m_len)
608 			break;
609 		off -= m->m_len;
610 		m = m->m_next;
611 	}
612 	np = &top;
613 	top = 0;
614 	while (len > 0) {
615 		if (m == 0) {
616 			KASSERT(len == M_COPYALL,
617 			    ("m_copym, length > size of mbuf chain"));
618 			break;
619 		}
620 		MGET(n, wait, m->m_type);
621 		*np = n;
622 		if (n == 0)
623 			goto nospace;
624 		if (copyhdr) {
625 			M_COPY_PKTHDR(n, m);
626 			if (len == M_COPYALL)
627 				n->m_pkthdr.len -= off0;
628 			else
629 				n->m_pkthdr.len = len;
630 			copyhdr = 0;
631 		}
632 		n->m_len = min(len, m->m_len - off);
633 		if (m->m_flags & M_EXT) {
634 			n->m_data = m->m_data + off;
635 			n->m_ext = m->m_ext;
636 			n->m_flags |= M_EXT;
637 			MEXT_ADD_REF(m);
638 		} else
639 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
640 			    (unsigned)n->m_len);
641 		if (len != M_COPYALL)
642 			len -= n->m_len;
643 		off = 0;
644 		m = m->m_next;
645 		np = &n->m_next;
646 	}
647 	if (top == 0)
648 		atomic_add_long(&MCFail, 1);
649 	return (top);
650 nospace:
651 	m_freem(top);
652 	atomic_add_long(&MCFail, 1);
653 	return (0);
654 }
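
/*
 * Illustrative sketch: taking a read-only copy of a whole packet, e.g. to
 * keep the original around for retransmission (assumptions: the caller owns
 * "m", and the int/ENOBUFS error convention is only an example):
 *
 *	n = m_copym(m, 0, M_COPYALL, M_DONTWAIT);
 *	if (n == NULL)
 *		return (ENOBUFS);
 */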
655 
656 /*
657  * Copy an entire packet, including header (which must be present).
658  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
659  * Note that the copy is read-only, because clusters are not copied,
660  * only their reference counts are incremented.
661  */
662 struct mbuf *
663 m_copypacket(m, how)
664 	struct mbuf *m;
665 	int how;
666 {
667 	struct mbuf *top, *n, *o;
668 
669 	MGET(n, how, m->m_type);
670 	top = n;
671 	if (!n)
672 		goto nospace;
673 
674 	M_COPY_PKTHDR(n, m);
675 	n->m_len = m->m_len;
676 	if (m->m_flags & M_EXT) {
677 		n->m_data = m->m_data;
678 		n->m_ext = m->m_ext;
679 		n->m_flags |= M_EXT;
680 		MEXT_ADD_REF(m);
681 	} else {
682 		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
683 	}
684 
685 	m = m->m_next;
686 	while (m) {
687 		MGET(o, how, m->m_type);
688 		if (!o)
689 			goto nospace;
690 
691 		n->m_next = o;
692 		n = n->m_next;
693 
694 		n->m_len = m->m_len;
695 		if (m->m_flags & M_EXT) {
696 			n->m_data = m->m_data;
697 			n->m_ext = m->m_ext;
698 			n->m_flags |= M_EXT;
699 			MEXT_ADD_REF(m);
700 		} else {
701 			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
702 		}
703 
704 		m = m->m_next;
705 	}
706 	return top;
707 nospace:
708 	m_freem(top);
709 	atomic_add_long(&MCFail, 1);
710 	return 0;
711 }
712 
713 /*
714  * Copy data from an mbuf chain starting "off" bytes from the beginning,
715  * continuing for "len" bytes, into the indicated buffer.
716  */
717 void
718 m_copydata(m, off, len, cp)
719 	register struct mbuf *m;
720 	register int off;
721 	register int len;
722 	caddr_t cp;
723 {
724 	register unsigned count;
725 
726 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
727 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
728 	while (off > 0) {
729 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
730 		if (off < m->m_len)
731 			break;
732 		off -= m->m_len;
733 		m = m->m_next;
734 	}
735 	while (len > 0) {
736 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
737 		count = min(m->m_len - off, len);
738 		bcopy(mtod(m, caddr_t) + off, cp, count);
739 		len -= count;
740 		cp += count;
741 		off = 0;
742 		m = m->m_next;
743 	}
744 }
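
/*
 * Illustrative sketch: copying a fixed-size header out of a chain into a
 * flat buffer, regardless of how the chain is fragmented (assumptions: an
 * IP header is just an example, and the chain is known to hold at least
 * sizeof(ip) bytes):
 *
 *	struct ip ip;
 *
 *	m_copydata(m, 0, sizeof(ip), (caddr_t)&ip);
 */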
745 
746 /*
747  * Copy a packet header mbuf chain into a completely new chain, including
748  * copying any mbuf clusters.  Use this instead of m_copypacket() when
749  * you need a writable copy of an mbuf chain.
750  */
751 struct mbuf *
752 m_dup(m, how)
753 	struct mbuf *m;
754 	int how;
755 {
756 	struct mbuf **p, *top = NULL;
757 	int remain, moff, nsize;
758 
759 	/* Sanity check */
760 	if (m == NULL)
761 		return (0);
762 	KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __FUNCTION__));
763 
764 	/* While there's more data, get a new mbuf, tack it on, and fill it */
765 	remain = m->m_pkthdr.len;
766 	moff = 0;
767 	p = &top;
768 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
769 		struct mbuf *n;
770 
771 		/* Get the next new mbuf */
772 		MGET(n, how, m->m_type);
773 		if (n == NULL)
774 			goto nospace;
775 		if (top == NULL) {		/* first one, must be PKTHDR */
776 			M_COPY_PKTHDR(n, m);
777 			nsize = MHLEN;
778 		} else				/* not the first one */
779 			nsize = MLEN;
780 		if (remain >= MINCLSIZE) {
781 			MCLGET(n, how);
782 			if ((n->m_flags & M_EXT) == 0) {
783 				(void)m_free(n);
784 				goto nospace;
785 			}
786 			nsize = MCLBYTES;
787 		}
788 		n->m_len = 0;
789 
790 		/* Link it into the new chain */
791 		*p = n;
792 		p = &n->m_next;
793 
794 		/* Copy data from original mbuf(s) into new mbuf */
795 		while (n->m_len < nsize && m != NULL) {
796 			int chunk = min(nsize - n->m_len, m->m_len - moff);
797 
798 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
799 			moff += chunk;
800 			n->m_len += chunk;
801 			remain -= chunk;
802 			if (moff == m->m_len) {
803 				m = m->m_next;
804 				moff = 0;
805 			}
806 		}
807 
808 		/* Check correct total mbuf length */
809 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
810 		    	("%s: bogus m_pkthdr.len", __FUNCTION__));
811 	}
812 	return (top);
813 
814 nospace:
815 	m_freem(top);
816 	atomic_add_long(&MCFail, 1);
817 	return (0);
818 }
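
/*
 * Illustrative sketch of choosing between the two packet copy routines
 * above: m_copypacket() shares clusters and yields a read-only copy, while
 * m_dup() copies the data so the result may be written to (assumption: the
 * ENOBUFS error convention is only an example):
 *
 *	n = m_dup(m, M_DONTWAIT);
 *	if (n == NULL)
 *		return (ENOBUFS);
 */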
819 
820 /*
821  * Concatenate mbuf chain n to m.
822  * Both chains must be of the same type (e.g. MT_DATA).
823  * The m_pkthdr, if present, is not updated.
824  */
825 void
826 m_cat(m, n)
827 	register struct mbuf *m, *n;
828 {
829 	while (m->m_next)
830 		m = m->m_next;
831 	while (n) {
832 		if (m->m_flags & M_EXT ||
833 		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
834 			/* just join the two chains */
835 			m->m_next = n;
836 			return;
837 		}
838 		/* splat the data from one into the other */
839 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
840 		    (u_int)n->m_len);
841 		m->m_len += n->m_len;
842 		n = m_free(n);
843 	}
844 }
845 
846 void
847 m_adj(mp, req_len)
848 	struct mbuf *mp;
849 	int req_len;
850 {
851 	register int len = req_len;
852 	register struct mbuf *m;
853 	register int count;
854 
855 	if ((m = mp) == NULL)
856 		return;
857 	if (len >= 0) {
858 		/*
859 		 * Trim from head.
860 		 */
861 		while (m != NULL && len > 0) {
862 			if (m->m_len <= len) {
863 				len -= m->m_len;
864 				m->m_len = 0;
865 				m = m->m_next;
866 			} else {
867 				m->m_len -= len;
868 				m->m_data += len;
869 				len = 0;
870 			}
871 		}
872 		m = mp;
873 		if (mp->m_flags & M_PKTHDR)
874 			m->m_pkthdr.len -= (req_len - len);
875 	} else {
876 		/*
877 		 * Trim from tail.  Scan the mbuf chain,
878 		 * calculating its length and finding the last mbuf.
879 		 * If the adjustment only affects this mbuf, then just
880 		 * adjust and return.  Otherwise, rescan and truncate
881 		 * after the remaining size.
882 		 */
883 		len = -len;
884 		count = 0;
885 		for (;;) {
886 			count += m->m_len;
887 			if (m->m_next == (struct mbuf *)0)
888 				break;
889 			m = m->m_next;
890 		}
891 		if (m->m_len >= len) {
892 			m->m_len -= len;
893 			if (mp->m_flags & M_PKTHDR)
894 				mp->m_pkthdr.len -= len;
895 			return;
896 		}
897 		count -= len;
898 		if (count < 0)
899 			count = 0;
900 		/*
901 		 * Correct length for chain is "count".
902 		 * Find the mbuf with last data, adjust its length,
903 		 * and toss data from remaining mbufs on chain.
904 		 */
905 		m = mp;
906 		if (m->m_flags & M_PKTHDR)
907 			m->m_pkthdr.len = count;
908 		for (; m; m = m->m_next) {
909 			if (m->m_len >= count) {
910 				m->m_len = count;
911 				break;
912 			}
913 			count -= m->m_len;
914 		}
915 		while (m->m_next)
916 		(m = m->m_next)->m_len = 0;
917 	}
918 }
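
/*
 * Illustrative sketch: trimming a link-level header from the front of a
 * packet and a trailing CRC from the end (assumption: struct ether_header
 * and ETHER_CRC_LEN from <net/ethernet.h> are used purely as examples):
 *
 *	m_adj(m, sizeof(struct ether_header));		(trim the head)
 *	m_adj(m, -ETHER_CRC_LEN);			(trim the tail)
 */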
919 
920 /*
921  * Rearrange an mbuf chain so that len bytes are contiguous
922  * and in the data area of an mbuf (so that mtod and dtom
923  * will work for a structure of size len).  Returns the resulting
924  * mbuf chain on success, frees it and returns null on failure.
925  * If there is room, it will add up to max_protohdr-len extra bytes to the
926  * contiguous region in an attempt to avoid being called next time.
927  */
928 #define MPFail (mbstat.m_mpfail)
929 
930 struct mbuf *
931 m_pullup(n, len)
932 	register struct mbuf *n;
933 	int len;
934 {
935 	register struct mbuf *m;
936 	register int count;
937 	int space;
938 
939 	/*
940 	 * If first mbuf has no cluster, and has room for len bytes
941 	 * without shifting current data, pullup into it,
942 	 * otherwise allocate a new mbuf to prepend to the chain.
943 	 */
944 	if ((n->m_flags & M_EXT) == 0 &&
945 	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
946 		if (n->m_len >= len)
947 			return (n);
948 		m = n;
949 		n = n->m_next;
950 		len -= m->m_len;
951 	} else {
952 		if (len > MHLEN)
953 			goto bad;
954 		MGET(m, M_DONTWAIT, n->m_type);
955 		if (m == 0)
956 			goto bad;
957 		m->m_len = 0;
958 		if (n->m_flags & M_PKTHDR) {
959 			M_COPY_PKTHDR(m, n);
960 			n->m_flags &= ~M_PKTHDR;
961 		}
962 	}
963 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
964 	do {
965 		count = min(min(max(len, max_protohdr), space), n->m_len);
966 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
967 		  (unsigned)count);
968 		len -= count;
969 		m->m_len += count;
970 		n->m_len -= count;
971 		space -= count;
972 		if (n->m_len)
973 			n->m_data += count;
974 		else
975 			n = m_free(n);
976 	} while (len > 0 && n);
977 	if (len > 0) {
978 		(void) m_free(m);
979 		goto bad;
980 	}
981 	m->m_next = n;
982 	return (m);
983 bad:
984 	m_freem(n);
985 	atomic_add_long(&MPFail, 1);
986 	return (0);
987 }
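
/*
 * Illustrative sketch of the classic m_pullup() idiom used by protocol
 * input routines before dereferencing a header (assumption: an IP header
 * is just an example; on failure m_pullup() has already freed the chain):
 *
 *	if (m->m_len < sizeof(struct ip) &&
 *	    (m = m_pullup(m, sizeof(struct ip))) == NULL)
 *		return;
 *	ip = mtod(m, struct ip *);
 */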
988 
989 /*
990  * Partition an mbuf chain in two pieces, returning the tail --
991  * all but the first len0 bytes.  In case of failure, it returns NULL and
992  * attempts to restore the chain to its original state.
993  */
994 struct mbuf *
995 m_split(m0, len0, wait)
996 	register struct mbuf *m0;
997 	int len0, wait;
998 {
999 	register struct mbuf *m, *n;
1000 	unsigned len = len0, remain;
1001 
1002 	for (m = m0; m && len > m->m_len; m = m->m_next)
1003 		len -= m->m_len;
1004 	if (m == 0)
1005 		return (0);
1006 	remain = m->m_len - len;
1007 	if (m0->m_flags & M_PKTHDR) {
1008 		MGETHDR(n, wait, m0->m_type);
1009 		if (n == 0)
1010 			return (0);
1011 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1012 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1013 		m0->m_pkthdr.len = len0;
1014 		if (m->m_flags & M_EXT)
1015 			goto extpacket;
1016 		if (remain > MHLEN) {
1017 			/* m can't be the lead packet */
1018 			MH_ALIGN(n, 0);
1019 			n->m_next = m_split(m, len, wait);
1020 			if (n->m_next == 0) {
1021 				(void) m_free(n);
1022 				return (0);
1023 			} else
1024 				return (n);
1025 		} else
1026 			MH_ALIGN(n, remain);
1027 	} else if (remain == 0) {
1028 		n = m->m_next;
1029 		m->m_next = 0;
1030 		return (n);
1031 	} else {
1032 		MGET(n, wait, m->m_type);
1033 		if (n == 0)
1034 			return (0);
1035 		M_ALIGN(n, remain);
1036 	}
1037 extpacket:
1038 	if (m->m_flags & M_EXT) {
1039 		n->m_flags |= M_EXT;
1040 		n->m_ext = m->m_ext;
1041 		MEXT_ADD_REF(m);
1042 		m->m_ext.ext_size = 0; /* For Accounting XXXXXX danger */
1043 		n->m_data = m->m_data + len;
1044 	} else {
1045 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
1046 	}
1047 	n->m_len = remain;
1048 	m->m_len = len;
1049 	n->m_next = m->m_next;
1050 	m->m_next = 0;
1051 	return (n);
1052 }
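
/*
 * Illustrative sketch: splitting a chain at a record boundary, keeping the
 * first "reclen" bytes in "m" and the remainder in "n" (assumptions:
 * "reclen" is supplied by the caller; the ENOBUFS error convention is only
 * an example):
 *
 *	n = m_split(m, reclen, M_TRYWAIT);
 *	if (n == NULL)
 *		return (ENOBUFS);
 */
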
1053 /*
1054  * Routine to copy from device local memory into mbufs.
1055  */
1056 struct mbuf *
1057 m_devget(buf, totlen, off0, ifp, copy)
1058 	char *buf;
1059 	int totlen, off0;
1060 	struct ifnet *ifp;
1061 	void (*copy) __P((char *from, caddr_t to, u_int len));
1062 {
1063 	register struct mbuf *m;
1064 	struct mbuf *top = 0, **mp = &top;
1065 	register int off = off0, len;
1066 	register char *cp;
1067 	char *epkt;
1068 
1069 	cp = buf;
1070 	epkt = cp + totlen;
1071 	if (off) {
1072 		cp += off + 2 * sizeof(u_short);
1073 		totlen -= 2 * sizeof(u_short);
1074 	}
1075 	MGETHDR(m, M_DONTWAIT, MT_DATA);
1076 	if (m == 0)
1077 		return (0);
1078 	m->m_pkthdr.rcvif = ifp;
1079 	m->m_pkthdr.len = totlen;
1080 	m->m_len = MHLEN;
1081 
1082 	while (totlen > 0) {
1083 		if (top) {
1084 			MGET(m, M_DONTWAIT, MT_DATA);
1085 			if (m == 0) {
1086 				m_freem(top);
1087 				return (0);
1088 			}
1089 			m->m_len = MLEN;
1090 		}
1091 		len = min(totlen, epkt - cp);
1092 		if (len >= MINCLSIZE) {
1093 			MCLGET(m, M_DONTWAIT);
1094 			if (m->m_flags & M_EXT)
1095 				m->m_len = len = min(len, MCLBYTES);
1096 			else
1097 				len = m->m_len;
1098 		} else {
1099 			/*
1100 			 * Place initial small packet/header at end of mbuf.
1101 			 */
1102 			if (len < m->m_len) {
1103 				if (top == 0 && len + max_linkhdr <= m->m_len)
1104 					m->m_data += max_linkhdr;
1105 				m->m_len = len;
1106 			} else
1107 				len = m->m_len;
1108 		}
1109 		if (copy)
1110 			copy(cp, mtod(m, caddr_t), (unsigned)len);
1111 		else
1112 			bcopy(cp, mtod(m, caddr_t), (unsigned)len);
1113 		cp += len;
1114 		*mp = m;
1115 		mp = &m->m_next;
1116 		totlen -= len;
1117 		if (cp == epkt)
1118 			cp = buf;
1119 	}
1120 	return (top);
1121 }
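
/*
 * Illustrative sketch of a driver receive path using m_devget() to copy a
 * frame out of board memory into an mbuf chain before handing it to the
 * link layer (assumptions: "buf", "pktlen" and "ifp" come from the driver;
 * the NULL copy argument selects plain bcopy()):
 *
 *	m = m_devget(buf, pktlen, 0, ifp, NULL);
 *	if (m == NULL) {
 *		ifp->if_ierrors++;
 *		return;
 *	}
 */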
1122 
1123 /*
1124  * Copy data from a buffer back into the indicated mbuf chain,
1125  * starting "off" bytes from the beginning, extending the mbuf
1126  * chain if necessary.
1127  */
1128 void
1129 m_copyback(m0, off, len, cp)
1130 	struct	mbuf *m0;
1131 	register int off;
1132 	register int len;
1133 	caddr_t cp;
1134 {
1135 	register int mlen;
1136 	register struct mbuf *m = m0, *n;
1137 	int totlen = 0;
1138 
1139 	if (m0 == 0)
1140 		return;
1141 	while (off > (mlen = m->m_len)) {
1142 		off -= mlen;
1143 		totlen += mlen;
1144 		if (m->m_next == 0) {
1145 			n = m_getclr(M_DONTWAIT, m->m_type);
1146 			if (n == 0)
1147 				goto out;
1148 			n->m_len = min(MLEN, len + off);
1149 			m->m_next = n;
1150 		}
1151 		m = m->m_next;
1152 	}
1153 	while (len > 0) {
1154 		mlen = min (m->m_len - off, len);
1155 		bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
1156 		cp += mlen;
1157 		len -= mlen;
1158 		mlen += off;
1159 		off = 0;
1160 		totlen += mlen;
1161 		if (len == 0)
1162 			break;
1163 		if (m->m_next == 0) {
1164 			n = m_get(M_DONTWAIT, m->m_type);
1165 			if (n == 0)
1166 				break;
1167 			n->m_len = min(MLEN, len);
1168 			m->m_next = n;
1169 		}
1170 		m = m->m_next;
1171 	}
1172 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
1173 		m->m_pkthdr.len = totlen;
1174 }
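
/*
 * Illustrative sketch: overwriting a field inside an existing chain at a
 * known offset, e.g. patching in a checksum (assumptions: "csum" and "off"
 * are computed by the caller):
 *
 *	m_copyback(m, off, sizeof(csum), (caddr_t)&csum);
 */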
1175 
1176 void
1177 m_print(const struct mbuf *m)
1178 {
1179 	int len;
1180 	const struct mbuf *m2;
1181 
1182 	len = m->m_pkthdr.len;
1183 	m2 = m;
1184 	while (len) {
1185 		printf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
1186 		len -= m2->m_len;
1187 		m2 = m2->m_next;
1188 	}
1189 	return;
1190 }
1191