/*-
 * Copyright (c) 2004, 2005,
 *	Bosko Milekic <bmilekic@FreeBSD.org>.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_param.h"

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/uma.h>
#include <vm/uma_dbg.h>

/*
 * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
 * Zones.
 *
 * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
 * Zone.  The Zone can be capped at kern.ipc.nmbclusters, if the
 * administrator so desires.
 *
 * Mbufs are allocated from a UMA Master Zone called the Mbuf
 * Zone.
 *
 * Additionally, FreeBSD provides a Packet Zone, which it
 * configures as a Secondary Zone to the Mbuf Master Zone,
 * thus sharing backend Slab kegs with the Mbuf Master Zone.
 *
 * Thus common-case allocations and locking are simplified:
 *
 *  m_clget()                m_getcl()
 *    |                         |
 *    |   .------------>[(Packet Cache)]    m_get(), m_gethdr()
 *    |   |             [     Packet   ]            |
 *  [(Cluster Cache)]   [    Secondary ]   [ (Mbuf Cache)     ]
 *  [ Cluster Zone  ]   [     Zone     ]   [ Mbuf Master Zone ]
 *        |                       \________         |
 *  [ Cluster Keg   ]                      \       /
 *        |                        [ Mbuf Keg   ]
 *  [ Cluster Slabs ]                         |
 *        |                              [ Mbuf Slabs ]
 *         \____________(VM)_________________/
 *
 *
 * Whenever an object is allocated with uma_zalloc() out of
 * one of the Zones, its _ctor_ function is executed.  Likewise,
 * for any deallocation through uma_zfree() the _dtor_ function
 * is executed.
 *
 * Caches are per-CPU and are filled from the Master Zone.
 *
 * Whenever an object is allocated from the underlying global
 * memory pool it gets pre-initialized with the _zinit_ functions.
 * When the Kegs are overfull, objects get decommissioned with
 * _zfini_ functions and freed back to the global memory pool.
 *
 */
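
/*
 * A minimal usage sketch (illustrative only, not part of this file's
 * API): m_getcl() services the common case from the Packet Zone in a
 * single allocation, while the two-step path touches the Mbuf and
 * Cluster Zones separately:
 *
 *	struct mbuf *m;
 *
 *	m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);	Packet Zone
 *
 *	m = m_gethdr(M_NOWAIT, MT_DATA);		Mbuf Zone
 *	if (m != NULL) {
 *		MCLGET(m, M_NOWAIT);			Cluster Zone
 *		if ((m->m_flags & M_EXT) == 0)
 *			m = m_free(m);
 *	}
 */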

int nmbufs;			/* limits number of mbufs */
int nmbclusters;		/* limits number of mbuf clusters */
int nmbjumbop;			/* limits number of page size jumbo clusters */
int nmbjumbo9;			/* limits number of 9k jumbo clusters */
int nmbjumbo16;			/* limits number of 16k jumbo clusters */

static quad_t maxmbufmem;	/* overall real memory limit for all mbufs */

SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &maxmbufmem, 0,
    "Maximum real memory allocatable to various mbuf types");

113 /*
114  * tunable_mbinit() has to be run before any mbuf allocations are done.
115  */
116 static void
117 tunable_mbinit(void *dummy)
118 {
119 	quad_t realmem;
120 
121 	/*
122 	 * The default limit for all mbuf related memory is 1/2 of all
123 	 * available kernel memory (physical or kmem).
124 	 * At most it can be 3/4 of available kernel memory.
125 	 */
126 	realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size);
127 	maxmbufmem = realmem / 2;
128 	TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem);
129 	if (maxmbufmem > realmem / 4 * 3)
130 		maxmbufmem = realmem / 4 * 3;
131 
132 	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
133 	if (nmbclusters == 0)
134 		nmbclusters = maxmbufmem / MCLBYTES / 4;
135 
136 	TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop);
137 	if (nmbjumbop == 0)
138 		nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4;
139 
140 	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9);
141 	if (nmbjumbo9 == 0)
142 		nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6;
143 
144 	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16);
145 	if (nmbjumbo16 == 0)
146 		nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6;
147 
148 	/*
149 	 * We need at least as many mbufs as we have clusters of
150 	 * the various types added together.
151 	 */
152 	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
153 	if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16)
154 		nmbufs = lmax(maxmbufmem / MSIZE / 5,
155 		    nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16);
156 }
157 SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL);
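
/*
 * Illustrative only: the limits fetched above can be seeded as loader
 * tunables in /boot/loader.conf before boot (the values below are
 * hypothetical):
 *
 *	kern.ipc.maxmbufmem="1073741824"
 *	kern.ipc.nmbclusters="262144"
 */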

static int
sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbclusters;

	newnmbclusters = nmbclusters;
	error = sysctl_handle_int(oidp, &newnmbclusters, 0, req);
	if (error == 0 && req->newptr && newnmbclusters != nmbclusters) {
		if (newnmbclusters > nmbclusters &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbclusters = newnmbclusters;
			nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
			EVENTHANDLER_INVOKE(nmbclusters_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT|CTLFLAG_RW,
    &nmbclusters, 0, sysctl_nmbclusters, "IU",
    "Maximum number of mbuf clusters allowed");
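
/*
 * Sketch of runtime usage (hypothetical value): the handler above only
 * accepts an increase, and only while nmbufs still covers the combined
 * cluster limits:
 *
 *	# sysctl kern.ipc.nmbclusters=524288
 */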

static int
sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbjumbop;

	newnmbjumbop = nmbjumbop;
	error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req);
	if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) {
		if (newnmbjumbop > nmbjumbop &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbjumbop = newnmbjumbop;
			nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT|CTLFLAG_RW,
    &nmbjumbop, 0, sysctl_nmbjumbop, "IU",
    "Maximum number of mbuf page size jumbo clusters allowed");

static int
sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbjumbo9;

	newnmbjumbo9 = nmbjumbo9;
	error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req);
	if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) {
		if (newnmbjumbo9 > nmbjumbo9 &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbjumbo9 = newnmbjumbo9;
			nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT|CTLFLAG_RW,
    &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU",
    "Maximum number of mbuf 9k jumbo clusters allowed");

static int
sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbjumbo16;

	newnmbjumbo16 = nmbjumbo16;
	error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req);
	if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) {
		if (newnmbjumbo16 > nmbjumbo16 &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbjumbo16 = newnmbjumbo16;
			nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT|CTLFLAG_RW,
    &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU",
    "Maximum number of mbuf 16k jumbo clusters allowed");

static int
sysctl_nmbufs(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbufs;

	newnmbufs = nmbufs;
	error = sysctl_handle_int(oidp, &newnmbufs, 0, req);
	if (error == 0 && req->newptr && newnmbufs != nmbufs) {
		if (newnmbufs > nmbufs) {
			nmbufs = newnmbufs;
			nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
			EVENTHANDLER_INVOKE(nmbufs_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT|CTLFLAG_RW,
    &nmbufs, 0, sysctl_nmbufs, "IU",
    "Maximum number of mbufs allowed");

/*
 * Zones from which we allocate.
 */
uma_zone_t	zone_mbuf;
uma_zone_t	zone_clust;
uma_zone_t	zone_pack;
uma_zone_t	zone_jumbop;
uma_zone_t	zone_jumbo9;
uma_zone_t	zone_jumbo16;
uma_zone_t	zone_ext_refcnt;

/*
 * Local prototypes.
 */
static int	mb_ctor_mbuf(void *, int, void *, int);
static int	mb_ctor_clust(void *, int, void *, int);
static int	mb_ctor_pack(void *, int, void *, int);
static void	mb_dtor_mbuf(void *, int, void *);
static void	mb_dtor_clust(void *, int, void *);
static void	mb_dtor_pack(void *, int, void *);
static int	mb_zinit_pack(void *, int, int);
static void	mb_zfini_pack(void *, int);

static void	mb_reclaim(uma_zone_t, int);
static void	*mbuf_jumbo_alloc(uma_zone_t, vm_size_t, uint8_t *, int);

/* Ensure that MSIZE is a power of 2. */
CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
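/*
 * Why the check above works: for a power of two, (MSIZE - 1) ^ MSIZE
 * sets every bit up to and including the most significant set bit, so
 * adding 1 and shifting right by one yields MSIZE again; e.g. for
 * MSIZE = 256, (255 ^ 256) = 511 and (511 + 1) >> 1 = 256.  Any
 * non-power-of-two value fails the comparison.
 */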

/*
 * Initialize FreeBSD Network buffer allocation.
 */
static void
mbuf_init(void *dummy)
{

	/*
	 * Configure UMA zones for Mbufs, Clusters, and Packets.
	 */
	zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE,
	    mb_ctor_mbuf, mb_dtor_mbuf,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    MSIZE - 1, UMA_ZONE_MAXBUCKET);
	if (nmbufs > 0)
		nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
	uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached");
	uma_zone_set_maxaction(zone_mbuf, mb_reclaim);

	zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
	    mb_ctor_clust, mb_dtor_clust,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
	if (nmbclusters > 0)
		nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
	uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached");
	uma_zone_set_maxaction(zone_clust, mb_reclaim);

	zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
	    mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf);

	/* Make the jumbo frame zones too: page size, 9k and 16k. */
	zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE,
	    mb_ctor_clust, mb_dtor_clust,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
	if (nmbjumbop > 0)
		nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
	uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached");
	uma_zone_set_maxaction(zone_jumbop, mb_reclaim);

	zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES,
	    mb_ctor_clust, mb_dtor_clust,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
	uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc);
	if (nmbjumbo9 > 0)
		nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
	uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached");
	uma_zone_set_maxaction(zone_jumbo9, mb_reclaim);

	zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES,
	    mb_ctor_clust, mb_dtor_clust,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
	uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc);
	if (nmbjumbo16 > 0)
		nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
	uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
	uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);

	zone_ext_refcnt = uma_zcreate(MBUF_EXTREFCNT_MEM_NAME, sizeof(u_int),
	    NULL, NULL,
	    NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_ZINIT);

	/*
	 * Hook event handler for low-memory situation, used to
	 * drain protocols and push data back to the caches (UMA
	 * later pushes it back to VM).
	 */
	EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
	    EVENTHANDLER_PRI_FIRST);
}
SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL);

/*
 * UMA backend page allocator for the jumbo frame zones.
 *
 * Allocates kernel virtual memory that is backed by contiguous physical
 * pages.
 */
static void *
mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
{

	/* Inform UMA that this allocator uses kernel_map/object. */
	*flags = UMA_SLAB_KERNEL;
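	/*
	 * The call below imposes no physical address, alignment, or
	 * boundary constraints (low 0, high ~0, alignment 1, boundary 0);
	 * only physical contiguity is requested.
	 */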
	return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait,
	    (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT));
}

/*
 * Constructor for Mbuf master zone.
 *
 * The 'arg' pointer points to an mb_args structure which
 * contains call-specific information required to support the
 * mbuf allocation API.  See mbuf.h.
 */
static int
mb_ctor_mbuf(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;
	struct mb_args *args;
	int error;
	int flags;
	short type;

#ifdef INVARIANTS
	trash_ctor(mem, size, arg, how);
#endif
	args = (struct mb_args *)arg;
	type = args->type;

	/*
	 * The mbuf is initialized later.  The caller has the
	 * responsibility to set up any MAC labels too.
	 */
	if (type == MT_NOINIT)
		return (0);

	m = (struct mbuf *)mem;
	flags = args->flags;

	error = m_init(m, NULL, size, how, type, flags);

	return (error);
}
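
/*
 * For reference, a sketch of how mb_args reaches this constructor: the
 * allocation wrappers in mbuf.h hand it to UMA via uma_zalloc_arg(),
 * along the lines of (see mbuf.h for the authoritative version):
 *
 *	struct mb_args args;
 *
 *	args.flags = 0;
 *	args.type = type;
 *	return (uma_zalloc_arg(zone_mbuf, &args, how));
 */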

/*
 * The Mbuf master zone destructor.
 */
static void
mb_dtor_mbuf(void *mem, int size, void *arg)
{
	struct mbuf *m;
	unsigned long flags;

	m = (struct mbuf *)mem;
	flags = (unsigned long)arg;

	KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
	if ((m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags))
		m_tag_delete_chain(m, NULL);
#ifdef INVARIANTS
	trash_dtor(mem, size, arg);
#endif
}

/*
 * The Mbuf Packet zone destructor.
 */
static void
mb_dtor_pack(void *mem, int size, void *arg)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;
	if ((m->m_flags & M_PKTHDR) != 0)
		m_tag_delete_chain(m, NULL);

	/* Make sure we've got a clean cluster back. */
	KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
	KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__));
	KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__));
	KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__));
	KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__));
	KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__));
	KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__));
	KASSERT(*m->m_ext.ext_cnt == 1, ("%s: ext_cnt != 1", __func__));
#ifdef INVARIANTS
	trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg);
#endif
	/*
	 * If there are processes blocked on zone_clust, waiting for pages
	 * to be freed up, cause them to be woken up by draining the
	 * packet zone.  We are exposed to a race here (in the check for
	 * the UMA_ZFLAG_FULL) where we might miss the flag set, but that
	 * is deliberate.  We don't want to acquire the zone lock for every
	 * mbuf free.
	 */
	if (uma_zone_exhausted_nolock(zone_clust))
		zone_drain(zone_pack);
}

/*
 * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor.
 *
 * Here the 'arg' pointer points to the Mbuf which we
 * are configuring cluster storage for.  If 'arg' is
 * NULL we allocate just the cluster without attaching
 * it to any mbuf.  See mbuf.h.
 */
static int
mb_ctor_clust(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;
	u_int *refcnt;
	int type;
	uma_zone_t zone;

#ifdef INVARIANTS
	trash_ctor(mem, size, arg, how);
#endif
	switch (size) {
	case MCLBYTES:
		type = EXT_CLUSTER;
		zone = zone_clust;
		break;
#if MJUMPAGESIZE != MCLBYTES
	case MJUMPAGESIZE:
		type = EXT_JUMBOP;
		zone = zone_jumbop;
		break;
#endif
	case MJUM9BYTES:
		type = EXT_JUMBO9;
		zone = zone_jumbo9;
		break;
	case MJUM16BYTES:
		type = EXT_JUMBO16;
		zone = zone_jumbo16;
		break;
	default:
		panic("unknown cluster size");
		break;
	}

	m = (struct mbuf *)arg;
	refcnt = uma_find_refcnt(zone, mem);
	*refcnt = 1;
	if (m != NULL) {
		m->m_ext.ext_buf = (caddr_t)mem;
		m->m_data = m->m_ext.ext_buf;
		m->m_flags |= M_EXT;
		m->m_ext.ext_free = NULL;
		m->m_ext.ext_arg1 = NULL;
		m->m_ext.ext_arg2 = NULL;
		m->m_ext.ext_size = size;
		m->m_ext.ext_type = type;
		m->m_ext.ext_flags = 0;
		m->m_ext.ext_cnt = refcnt;
	}

	return (0);
}
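
/*
 * Illustrative callers (a sketch; see mbuf.h for the real wrappers):
 * attaching cluster storage to an existing mbuf passes that mbuf as
 * the allocation argument, while a bare cluster passes NULL:
 *
 *	(void)uma_zalloc_arg(zone_clust, m, how);	attach to m
 *	p = uma_zalloc_arg(zone_clust, NULL, how);	bare cluster
 */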

/*
 * The Mbuf Cluster zone destructor.
 */
static void
mb_dtor_clust(void *mem, int size, void *arg)
{
#ifdef INVARIANTS
	uma_zone_t zone;

	zone = m_getzone(size);
	KASSERT(*(uma_find_refcnt(zone, mem)) <= 1,
	    ("%s: refcnt incorrect %u", __func__,
	    *(uma_find_refcnt(zone, mem))));

	trash_dtor(mem, size, arg);
#endif
}

/*
 * The Packet secondary zone's init routine, executed on the
 * object's transition from mbuf keg slab to zone cache.
 */
static int
mb_zinit_pack(void *mem, int size, int how)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;		/* m is virgin. */
	if (uma_zalloc_arg(zone_clust, m, how) == NULL ||
	    m->m_ext.ext_buf == NULL)
		return (ENOMEM);
	m->m_ext.ext_type = EXT_PACKET;	/* Override. */
#ifdef INVARIANTS
	trash_init(m->m_ext.ext_buf, MCLBYTES, how);
#endif
	return (0);
}

/*
 * The Packet secondary zone's fini routine, executed on the
 * object's transition from zone cache to keg slab.
 */
static void
mb_zfini_pack(void *mem, int size)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;
#ifdef INVARIANTS
	trash_fini(m->m_ext.ext_buf, MCLBYTES);
#endif
	uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
#ifdef INVARIANTS
	trash_dtor(mem, size, NULL);
#endif
}

/*
 * The "packet" keg constructor.
 */
static int
mb_ctor_pack(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;
	struct mb_args *args;
	int error, flags;
	short type;

	m = (struct mbuf *)mem;
	args = (struct mb_args *)arg;
	flags = args->flags;
	type = args->type;

#ifdef INVARIANTS
	trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how);
#endif

	error = m_init(m, NULL, size, how, type, flags);

	/* m_ext is already initialized. */
	m->m_data = m->m_ext.ext_buf;
	m->m_flags = (flags | M_EXT);

	return (error);
}

int
m_pkthdr_init(struct mbuf *m, int how)
{
#ifdef MAC
	int error;
#endif
	m->m_data = m->m_pktdat;
	bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
#ifdef MAC
	/* If the label init fails, fail the alloc. */
	error = mac_mbuf_init(m, how);
	if (error)
		return (error);
#endif

	return (0);
}

/*
 * This is the protocol drain routine.  Called by UMA whenever any of the
 * mbuf zones is close to its limit.
 *
 * No locks should be held when this is called.  The drain routines have to
 * presently acquire some locks which raises the possibility of lock order
 * reversal.
 */
static void
mb_reclaim(uma_zone_t zone __unused, int pending __unused)
{
	struct domain *dp;
	struct protosw *pr;

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__);

	for (dp = domains; dp != NULL; dp = dp->dom_next)
		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
			if (pr->pr_drain != NULL)
				(*pr->pr_drain)();
}