xref: /freebsd/sys/kern/kern_mbuf.c (revision aa77200569e397d6ff1fdb4d255d0fa254d0a128)
/*-
 * Copyright (c) 2004, 2005,
 * 	Bosko Milekic <bmilekic@FreeBSD.org>.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_param.h"

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/uma.h>
#include <vm/uma_int.h>
#include <vm/uma_dbg.h>
/*
 * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
 * Zones.
 *
 * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
 * Zone.  The Zone can be capped at kern.ipc.nmbclusters, if the
 * administrator so desires.
 *
 * Mbufs are allocated from a UMA Master Zone called the Mbuf
 * Zone.
 *
 * Additionally, FreeBSD provides a Packet Zone, which it
 * configures as a Secondary Zone to the Mbuf Master Zone,
 * thus sharing backend Slab kegs with the Mbuf Master Zone.
 *
 * Thus common-case allocations and locking are simplified:
 *
 *  m_clget()                m_getcl()
 *    |                         |
 *    |   .------------>[(Packet Cache)]    m_get(), m_gethdr()
 *    |   |             [     Packet   ]            |
 *  [(Cluster Cache)]   [    Secondary ]   [ (Mbuf Cache)     ]
 *  [ Cluster Zone  ]   [     Zone     ]   [ Mbuf Master Zone ]
 *        |                       \________         |
 *  [ Cluster Keg   ]                      \       /
 *        |                              [ Mbuf Keg   ]
 *  [ Cluster Slabs ]                         |
 *        |                              [ Mbuf Slabs ]
 *         \____________(VM)_________________/
 *
 *
 * Whenever an object is allocated with uma_zalloc() out of
 * one of the Zones, its _ctor_ function is executed.  Likewise,
 * whenever an object is deallocated through uma_zfree(), its
 * _dtor_ function is executed.
 *
 * Caches are per-CPU and are filled from the Master Zone.
 *
 * Whenever an object is allocated from the underlying global
 * memory pool it gets pre-initialized with the _zinit_ functions.
 * When the Kegs are overfull, objects get decommissioned with
 * _zfini_ functions and freed back to the global memory pool.
 */

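/*
 * Example (an illustrative sketch, not part of this file's logic): the
 * common-case consumers shown in the diagram above allocate an mbuf
 * plus a 2K cluster in a single trip through the Packet Zone:
 *
 *	struct mbuf *m;
 *
 *	m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	...
 *	m_freem(m);
 */
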
int nmbufs;			/* limits number of mbufs */
int nmbclusters;		/* limits number of mbuf clusters */
int nmbjumbop;			/* limits number of page size jumbo clusters */
int nmbjumbo9;			/* limits number of 9k jumbo clusters */
int nmbjumbo16;			/* limits number of 16k jumbo clusters */
struct mbstat mbstat;

/*
 * tunable_mbinit() has to run before init_maxsockets(); thus
 * the SYSINIT order below is SI_ORDER_MIDDLE while init_maxsockets()
 * runs at SI_ORDER_ANY.
 *
 * NB: This has to be done before VM init.
 */
static void
tunable_mbinit(void *dummy)
{

	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
	if (nmbclusters == 0)
		nmbclusters = maxmbufmem / MCLBYTES / 4;

	TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop);
	if (nmbjumbop == 0)
		nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4;

	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9);
	if (nmbjumbo9 == 0)
		nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6;

	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16);
	if (nmbjumbo16 == 0)
		nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6;

	/*
	 * We need at least as many mbufs as we have clusters of
	 * the various types added together.
	 */
	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
	if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16)
		nmbufs = lmax(maxmbufmem / MSIZE / 5,
		    nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16);
}
SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_MIDDLE, tunable_mbinit, NULL);
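
/*
 * Example (illustrative): the limits above may be seeded at boot time
 * via loader tunables, e.g. in /boot/loader.conf (the value shown is
 * arbitrary):
 *
 *	kern.ipc.nmbclusters="262144"
 */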

static int
sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbclusters;

	newnmbclusters = nmbclusters;
	error = sysctl_handle_int(oidp, &newnmbclusters, 0, req);
	if (error == 0 && req->newptr) {
		if (newnmbclusters > nmbclusters &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbclusters = newnmbclusters;
			uma_zone_set_max(zone_clust, nmbclusters);
			nmbclusters = uma_zone_get_max(zone_clust);
			EVENTHANDLER_INVOKE(nmbclusters_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT|CTLFLAG_RW,
    &nmbclusters, 0, sysctl_nmbclusters, "IU",
    "Maximum number of mbuf clusters allowed");
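
/*
 * Example (illustrative): given the handler above, the limit can only
 * be raised at runtime, never lowered, and only while nmbufs still
 * covers the cluster total:
 *
 *	# sysctl kern.ipc.nmbclusters=524288
 */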

static int
sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbjumbop;

	newnmbjumbop = nmbjumbop;
	error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req);
	if (error == 0 && req->newptr) {
		if (newnmbjumbop > nmbjumbop &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbjumbop = newnmbjumbop;
			uma_zone_set_max(zone_jumbop, nmbjumbop);
			nmbjumbop = uma_zone_get_max(zone_jumbop);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT|CTLFLAG_RW,
    &nmbjumbop, 0, sysctl_nmbjumbop, "IU",
    "Maximum number of mbuf page size jumbo clusters allowed");

static int
sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbjumbo9;

	newnmbjumbo9 = nmbjumbo9;
	error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req);
	if (error == 0 && req->newptr) {
		if (newnmbjumbo9 > nmbjumbo9 &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbjumbo9 = newnmbjumbo9;
			uma_zone_set_max(zone_jumbo9, nmbjumbo9);
			nmbjumbo9 = uma_zone_get_max(zone_jumbo9);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT|CTLFLAG_RW,
    &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU",
    "Maximum number of mbuf 9k jumbo clusters allowed");

static int
sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbjumbo16;

	newnmbjumbo16 = nmbjumbo16;
	error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req);
	if (error == 0 && req->newptr) {
		if (newnmbjumbo16 > nmbjumbo16 &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbjumbo16 = newnmbjumbo16;
			uma_zone_set_max(zone_jumbo16, nmbjumbo16);
			nmbjumbo16 = uma_zone_get_max(zone_jumbo16);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT|CTLFLAG_RW,
    &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU",
    "Maximum number of mbuf 16k jumbo clusters allowed");

static int
sysctl_nmbufs(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbufs;

	newnmbufs = nmbufs;
	error = sysctl_handle_int(oidp, &newnmbufs, 0, req);
	if (error == 0 && req->newptr) {
		if (newnmbufs > nmbufs) {
			nmbufs = newnmbufs;
			uma_zone_set_max(zone_mbuf, nmbufs);
			nmbufs = uma_zone_get_max(zone_mbuf);
			EVENTHANDLER_INVOKE(nmbufs_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT|CTLFLAG_RW,
    &nmbufs, 0, sysctl_nmbufs, "IU",
    "Maximum number of mbufs allowed");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
    "Mbuf general information and statistics");

/*
 * Zones from which we allocate.
 */
uma_zone_t	zone_mbuf;
uma_zone_t	zone_clust;
uma_zone_t	zone_pack;
uma_zone_t	zone_jumbop;
uma_zone_t	zone_jumbo9;
uma_zone_t	zone_jumbo16;
uma_zone_t	zone_ext_refcnt;

/*
 * Local prototypes.
 */
static int	mb_ctor_mbuf(void *, int, void *, int);
static int	mb_ctor_clust(void *, int, void *, int);
static int	mb_ctor_pack(void *, int, void *, int);
static void	mb_dtor_mbuf(void *, int, void *);
static void	mb_dtor_clust(void *, int, void *);
static void	mb_dtor_pack(void *, int, void *);
static int	mb_zinit_pack(void *, int, int);
static void	mb_zfini_pack(void *, int);

static void	mb_reclaim(void *);
static void	mbuf_init(void *);
static void    *mbuf_jumbo_alloc(uma_zone_t, int, uint8_t *, int);

/*
 * Ensure that MSIZE is a power of 2: for such a value, XORing with
 * MSIZE - 1 sets every bit up to and including MSIZE's single set bit,
 * so adding 1 and shifting right recovers MSIZE exactly.
 */
CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);

/*
 * Initialize FreeBSD Network buffer allocation.
 */
SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL);
static void
mbuf_init(void *dummy)
{

	/*
	 * Configure UMA zones for Mbufs, Clusters, and Packets.
	 */
	zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE,
	    mb_ctor_mbuf, mb_dtor_mbuf,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    MSIZE - 1, UMA_ZONE_MAXBUCKET);
	if (nmbufs > 0) {
		uma_zone_set_max(zone_mbuf, nmbufs);
		nmbufs = uma_zone_get_max(zone_mbuf);
	}

	zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
	    mb_ctor_clust, mb_dtor_clust,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
	if (nmbclusters > 0) {
		uma_zone_set_max(zone_clust, nmbclusters);
		nmbclusters = uma_zone_get_max(zone_clust);
	}

	zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
	    mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf);

	/* Make the jumbo frame zones too: page size, 9k, and 16k. */
	zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE,
	    mb_ctor_clust, mb_dtor_clust,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
	if (nmbjumbop > 0) {
		uma_zone_set_max(zone_jumbop, nmbjumbop);
		nmbjumbop = uma_zone_get_max(zone_jumbop);
	}

	zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES,
	    mb_ctor_clust, mb_dtor_clust,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
	uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc);
	if (nmbjumbo9 > 0) {
		uma_zone_set_max(zone_jumbo9, nmbjumbo9);
		nmbjumbo9 = uma_zone_get_max(zone_jumbo9);
	}

	zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES,
	    mb_ctor_clust, mb_dtor_clust,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
	uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc);
	if (nmbjumbo16 > 0) {
		uma_zone_set_max(zone_jumbo16, nmbjumbo16);
		nmbjumbo16 = uma_zone_get_max(zone_jumbo16);
	}

	zone_ext_refcnt = uma_zcreate(MBUF_EXTREFCNT_MEM_NAME, sizeof(u_int),
	    NULL, NULL,
	    NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_ZINIT);

	/* uma_prealloc() goes here... */

	/*
	 * Hook event handler for low-memory situation, used to
	 * drain protocols and push data back to the caches (UMA
	 * later pushes it back to VM).
	 */
	EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
	    EVENTHANDLER_PRI_FIRST);

	/*
	 * [Re]set counters and local statistics knobs.
	 * XXX Some of these should go and be replaced, but UMA stat
	 * gathering needs to be revised.
	 */
	mbstat.m_mbufs = 0;
	mbstat.m_mclusts = 0;
	mbstat.m_drain = 0;
	mbstat.m_msize = MSIZE;
	mbstat.m_mclbytes = MCLBYTES;
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_numtypes = MT_NTYPES;

	mbstat.m_mcfail = mbstat.m_mpfail = 0;
	mbstat.sf_iocnt = 0;
	mbstat.sf_allocwait = mbstat.sf_allocfail = 0;
}

/*
 * UMA backend page allocator for the jumbo frame zones.
 *
 * Allocates kernel virtual memory that is backed by contiguous physical
 * pages.
 */
static void *
mbuf_jumbo_alloc(uma_zone_t zone, int bytes, uint8_t *flags, int wait)
{

	/* Inform UMA that this allocator uses kernel_map/object. */
	*flags = UMA_SLAB_KERNEL;
	return ((void *)kmem_alloc_contig(kernel_map, bytes, wait,
	    (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT));
}
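
/*
 * Example (illustrative): consumers reach the jumbo zones served by the
 * allocator above through m_getjcl(), naming the cluster size
 * explicitly:
 *
 *	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUM9BYTES);
 */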

/*
 * Constructor for Mbuf master zone.
 *
 * The 'arg' pointer points to an mb_args structure which
 * contains call-specific information required to support the
 * mbuf allocation API.  See mbuf.h.
 */
static int
mb_ctor_mbuf(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;
	struct mb_args *args;
#ifdef MAC
	int error;
#endif
	int flags;
	short type;

#ifdef INVARIANTS
	trash_ctor(mem, size, arg, how);
#endif
	m = (struct mbuf *)mem;
	args = (struct mb_args *)arg;
	flags = args->flags;
	type = args->type;

	/*
	 * The mbuf is initialized later.  The caller has the
	 * responsibility to set up any MAC labels too.
	 */
	if (type == MT_NOINIT)
		return (0);

	m->m_next = NULL;
	m->m_nextpkt = NULL;
	m->m_len = 0;
	m->m_flags = flags;
	m->m_type = type;
	if (flags & M_PKTHDR) {
		m->m_data = m->m_pktdat;
		m->m_pkthdr.rcvif = NULL;
		m->m_pkthdr.header = NULL;
		m->m_pkthdr.len = 0;
		m->m_pkthdr.csum_flags = 0;
		m->m_pkthdr.csum_data = 0;
		m->m_pkthdr.tso_segsz = 0;
		m->m_pkthdr.ether_vtag = 0;
		m->m_pkthdr.flowid = 0;
		SLIST_INIT(&m->m_pkthdr.tags);
#ifdef MAC
		/* If the label init fails, fail the alloc */
		error = mac_mbuf_init(m, how);
		if (error)
			return (error);
#endif
	} else
		m->m_data = m->m_dat;
	return (0);
}

/*
 * The Mbuf master zone destructor.
 */
static void
mb_dtor_mbuf(void *mem, int size, void *arg)
{
	struct mbuf *m;
	unsigned long flags;

	m = (struct mbuf *)mem;
	flags = (unsigned long)arg;

	if ((flags & MB_NOTAGS) == 0 && (m->m_flags & M_PKTHDR) != 0)
		m_tag_delete_chain(m, NULL);
	KASSERT((m->m_flags & M_EXT) == 0, ("%s: M_EXT set", __func__));
	KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
#ifdef INVARIANTS
	trash_dtor(mem, size, arg);
#endif
}

/*
 * The Mbuf Packet zone destructor.
 */
static void
mb_dtor_pack(void *mem, int size, void *arg)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;
	if ((m->m_flags & M_PKTHDR) != 0)
		m_tag_delete_chain(m, NULL);

	/* Make sure we've got a clean cluster back. */
	KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
	KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__));
	KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__));
	KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__));
	KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__));
	KASSERT(m->m_ext.ext_size == MCLBYTES,
	    ("%s: ext_size != MCLBYTES", __func__));
	KASSERT(m->m_ext.ext_type == EXT_PACKET,
	    ("%s: ext_type != EXT_PACKET", __func__));
	KASSERT(*m->m_ext.ref_cnt == 1, ("%s: ref_cnt != 1", __func__));
#ifdef INVARIANTS
	trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg);
#endif
	/*
	 * If there are processes blocked on zone_clust, waiting for pages
	 * to be freed up, cause them to be woken up by draining the
	 * packet zone.  We are exposed to a race here (in the check for
	 * the UMA_ZFLAG_FULL) where we might miss the flag set, but that
	 * is deliberate.  We don't want to acquire the zone lock for every
	 * mbuf free.
	 */
	if (uma_zone_exhausted_nolock(zone_clust))
		zone_drain(zone_pack);
}

/*
 * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor.
 *
 * Here the 'arg' pointer points to the Mbuf which we
 * are configuring cluster storage for.  If 'arg' is
 * NULL we allocate just the cluster without attaching
 * it to any mbuf.  See mbuf.h.
 */
static int
mb_ctor_clust(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;
	u_int *refcnt;
	int type;
	uma_zone_t zone;

#ifdef INVARIANTS
	trash_ctor(mem, size, arg, how);
#endif
	switch (size) {
	case MCLBYTES:
		type = EXT_CLUSTER;
		zone = zone_clust;
		break;
#if MJUMPAGESIZE != MCLBYTES
	case MJUMPAGESIZE:
		type = EXT_JUMBOP;
		zone = zone_jumbop;
		break;
#endif
	case MJUM9BYTES:
		type = EXT_JUMBO9;
		zone = zone_jumbo9;
		break;
	case MJUM16BYTES:
		type = EXT_JUMBO16;
		zone = zone_jumbo16;
		break;
	default:
		panic("unknown cluster size");
		break;
	}

	m = (struct mbuf *)arg;
	refcnt = uma_find_refcnt(zone, mem);
	*refcnt = 1;
	if (m != NULL) {
		m->m_ext.ext_buf = (caddr_t)mem;
		m->m_data = m->m_ext.ext_buf;
		m->m_flags |= M_EXT;
		m->m_ext.ext_free = NULL;
		m->m_ext.ext_arg1 = NULL;
		m->m_ext.ext_arg2 = NULL;
		m->m_ext.ext_size = size;
		m->m_ext.ext_type = type;
		m->m_ext.ref_cnt = refcnt;
	}

	return (0);
}
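
/*
 * Example (illustrative): attaching a standard 2K cluster to an
 * already-allocated mbuf arrives at this constructor via m_clget();
 * failure is signalled by M_EXT remaining clear:
 *
 *	m_clget(m, M_NOWAIT);
 *	if ((m->m_flags & M_EXT) == 0)
 *		... handle allocation failure ...
 */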

/*
 * The Mbuf Cluster zone destructor.
 */
static void
mb_dtor_clust(void *mem, int size, void *arg)
{
#ifdef INVARIANTS
	uma_zone_t zone;

	zone = m_getzone(size);
	KASSERT(*(uma_find_refcnt(zone, mem)) <= 1,
	    ("%s: refcnt incorrect %u", __func__,
	    *(uma_find_refcnt(zone, mem))));

	trash_dtor(mem, size, arg);
#endif
}

/*
 * The Packet secondary zone's init routine, executed on the
 * object's transition from mbuf keg slab to zone cache.
 */
static int
mb_zinit_pack(void *mem, int size, int how)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;		/* m is virgin. */
	if (uma_zalloc_arg(zone_clust, m, how) == NULL ||
	    m->m_ext.ext_buf == NULL)
		return (ENOMEM);
	m->m_ext.ext_type = EXT_PACKET;	/* Override. */
#ifdef INVARIANTS
	trash_init(m->m_ext.ext_buf, MCLBYTES, how);
#endif
	return (0);
}

/*
 * The Packet secondary zone's fini routine, executed on the
 * object's transition from zone cache to keg slab.
 */
static void
mb_zfini_pack(void *mem, int size)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;
#ifdef INVARIANTS
	trash_fini(m->m_ext.ext_buf, MCLBYTES);
#endif
	uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
#ifdef INVARIANTS
	trash_dtor(mem, size, NULL);
#endif
}

/*
 * The "packet" keg constructor.
 */
static int
mb_ctor_pack(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;
	struct mb_args *args;
#ifdef MAC
	int error;
#endif
	int flags;
	short type;

	m = (struct mbuf *)mem;
	args = (struct mb_args *)arg;
	flags = args->flags;
	type = args->type;

#ifdef INVARIANTS
	trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how);
#endif
	m->m_next = NULL;
	m->m_nextpkt = NULL;
	m->m_data = m->m_ext.ext_buf;
	m->m_len = 0;
	m->m_flags = (flags | M_EXT);
	m->m_type = type;

	if (flags & M_PKTHDR) {
		m->m_pkthdr.rcvif = NULL;
		m->m_pkthdr.len = 0;
		m->m_pkthdr.header = NULL;
		m->m_pkthdr.csum_flags = 0;
		m->m_pkthdr.csum_data = 0;
		m->m_pkthdr.tso_segsz = 0;
		m->m_pkthdr.ether_vtag = 0;
		m->m_pkthdr.flowid = 0;
		SLIST_INIT(&m->m_pkthdr.tags);
#ifdef MAC
		/* If the label init fails, fail the alloc */
		error = mac_mbuf_init(m, how);
		if (error)
			return (error);
#endif
	}
	/* m_ext is already initialized. */

	return (0);
}

int
m_pkthdr_init(struct mbuf *m, int how)
{
#ifdef MAC
	int error;
#endif
	m->m_data = m->m_pktdat;
	SLIST_INIT(&m->m_pkthdr.tags);
	m->m_pkthdr.rcvif = NULL;
	m->m_pkthdr.header = NULL;
	m->m_pkthdr.len = 0;
	m->m_pkthdr.flowid = 0;
	m->m_pkthdr.csum_flags = 0;
	m->m_pkthdr.csum_data = 0;
	m->m_pkthdr.tso_segsz = 0;
	m->m_pkthdr.ether_vtag = 0;
#ifdef MAC
	/* If the label init fails, fail the alloc */
	error = mac_mbuf_init(m, how);
	if (error)
		return (error);
#endif

	return (0);
}
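
/*
 * Example (illustrative sketch of a hypothetical caller): code that
 * builds a packet header mbuf by hand can use m_pkthdr_init() to get
 * consistent header state.  Note that it resets m_data to the internal
 * buffer, so call it before attaching external storage:
 *
 *	m->m_flags |= M_PKTHDR;
 *	if (m_pkthdr_init(m, M_NOWAIT) != 0)
 *		... fail the allocation ...
 */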

/*
 * This is the protocol drain routine.
 *
 * No locks should be held when this is called.  The drain routines
 * presently have to acquire some locks, which raises the possibility
 * of lock order reversal.
 */
static void
mb_reclaim(void *junk)
{
	struct domain *dp;
	struct protosw *pr;

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL,
	    "mb_reclaim()");

	for (dp = domains; dp != NULL; dp = dp->dom_next)
		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
			if (pr->pr_drain != NULL)
				(*pr->pr_drain)();
}
757