xref: /titanic_44/usr/src/uts/common/io/multidata.c (revision 52244c0958bdf281ca42932b449f644b4decfdc2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Multidata, as described in the following papers:
29  *
30  * Adi Masputra,
31  * Multidata V.2: VA-Disjoint Packet Extents Framework Interface
32  * Design Specification.  August 2004.
33  * Available as http://sac.sfbay/PSARC/2004/594/materials/mmd2.pdf.
34  *
35  * Adi Masputra,
36  * Multidata Interface Design Specification.  Sep 2002.
37  * Available as http://sac.sfbay/PSARC/2002/276/materials/mmd.pdf.
38  *
39  * Adi Masputra, Frank DiMambro, Kacheong Poon,
40  * An Efficient Networking Transmit Mechanism for Solaris:
41  * Multidata Transmit (MDT).  May 2002.
42  * Available as http://sac.sfbay/PSARC/2002/276/materials/mdt.pdf.
43  */
44 
45 #include <sys/types.h>
46 #include <sys/stream.h>
47 #include <sys/dlpi.h>
48 #include <sys/stropts.h>
49 #include <sys/strsun.h>
50 #include <sys/strlog.h>
51 #include <sys/strsubr.h>
52 #include <sys/sysmacros.h>
53 #include <sys/cmn_err.h>
54 #include <sys/debug.h>
55 #include <sys/kmem.h>
56 #include <sys/atomic.h>
57 
58 #include <sys/multidata.h>
59 #include <sys/multidata_impl.h>
60 
61 static int mmd_constructor(void *, void *, int);
62 static void mmd_destructor(void *, void *);
63 static int pdslab_constructor(void *, void *, int);
64 static void pdslab_destructor(void *, void *);
65 static int pattbl_constructor(void *, void *, int);
66 static void pattbl_destructor(void *, void *);
67 static void mmd_esballoc_free(caddr_t);
68 static int mmd_copy_pattbl(patbkt_t *, multidata_t *, pdesc_t *, int);
69 
70 static boolean_t pbuf_ref_valid(multidata_t *, pdescinfo_t *);
71 #pragma inline(pbuf_ref_valid)
72 
73 static boolean_t pdi_in_range(pdescinfo_t *, pdescinfo_t *);
74 #pragma inline(pdi_in_range)
75 
76 static pdesc_t *mmd_addpdesc_int(multidata_t *, pdescinfo_t *, int *, int);
77 #pragma inline(mmd_addpdesc_int)
78 
79 static void mmd_destroy_pattbl(patbkt_t **);
80 #pragma inline(mmd_destroy_pattbl)
81 
82 static pattr_t *mmd_find_pattr(patbkt_t *, uint_t);
83 #pragma inline(mmd_find_pattr)
84 
85 static pdesc_t *mmd_destroy_pdesc(multidata_t *, pdesc_t *);
86 #pragma inline(mmd_destroy_pdesc)
87 
88 static pdesc_t *mmd_getpdesc(multidata_t *, pdesc_t *, pdescinfo_t *, uint_t,
89     boolean_t);
90 #pragma inline(mmd_getpdesc)
91 
92 static struct kmem_cache *mmd_cache;
93 static struct kmem_cache *pd_slab_cache;
94 static struct kmem_cache *pattbl_cache;
95 
/*
 * Debug message switch: messages passed to MMD_DEBUG() are handed to
 * cmn_err() only while mmd_debug is greater than zero.
 *
 * The argument is a complete, parenthesized cmn_err() argument list,
 * e.g. MMD_DEBUG((CE_WARN, "fmt %d\n", val));  The expansion is wrapped
 * in do { } while (0) so that the macro behaves as a single statement
 * and cannot capture the "else" of an enclosing if statement.
 */
int mmd_debug = 1;
#define	MMD_DEBUG(s)	do { if (mmd_debug > 0) cmn_err s; } while (0)
98 
99 /*
100  * Set to this to true to bypass pdesc bounds checking.
101  */
102 boolean_t mmd_speed_over_safety = B_FALSE;
103 
104 /*
105  * Patchable kmem_cache flags.
106  */
107 int mmd_kmem_flags = 0;
108 int pdslab_kmem_flags = 0;
109 int pattbl_kmem_flags = 0;
110 
111 /*
112  * Alignment (in bytes) of our kmem caches.
113  */
114 #define	MULTIDATA_CACHE_ALIGN	64
115 
116 /*
117  * Default number of packet descriptors per descriptor slab.  Making
118  * this too small will trigger more descriptor slab allocation; making
119  * it too large will create too many unclaimed descriptors.
120  */
121 #define	PDSLAB_SZ	15
122 uint_t pdslab_sz = PDSLAB_SZ;
123 
124 /*
125  * Default attribute hash table size.  It's okay to set this to a small
126  * value (even to 1) because there aren't that many attributes currently
127  * defined, and because we assume there won't be many attributes associated
128  * with a Multidata at a given time.  Increasing the size will reduce
129  * attribute search time (given a large number of attributes in a Multidata),
130  * and decreasing it will reduce the memory footprints and the overhead
131  * associated with managing the table.
132  */
133 #define	PATTBL_SZ	1
134 uint_t pattbl_sz = PATTBL_SZ;
135 
/*
 * Map an attribute hash key onto a bucket index: the key modulo the
 * number of buckets in the table.
 */
#define	PATTBL_HASH(key, nbkt)	((key) % (nbkt))
140 
141 /*
142  * Structure that precedes each Multidata metadata.
143  */
144 struct mmd_buf_info {
145 	frtn_t	frp;		/* free routine */
146 	uint_t	buf_len;	/* length of kmem buffer */
147 };
148 
149 /*
150  * The size of each metadata buffer.
151  */
152 #define	MMD_CACHE_SIZE	\
153 	(sizeof (struct mmd_buf_info) + sizeof (multidata_t))
154 
155 /*
156  * Called during startup in order to create the Multidata kmem caches.
157  */
158 void
159 mmd_init(void)
160 {
161 	pdslab_sz = MAX(1, pdslab_sz);	/* at least 1 descriptor */
162 	pattbl_sz = MAX(1, pattbl_sz);	/* at least 1 bucket */
163 
164 	mmd_cache = kmem_cache_create("multidata", MMD_CACHE_SIZE,
165 	    MULTIDATA_CACHE_ALIGN, mmd_constructor, mmd_destructor,
166 	    NULL, NULL, NULL, mmd_kmem_flags);
167 
168 	pd_slab_cache = kmem_cache_create("multidata_pdslab",
169 	    PDESC_SLAB_SIZE(pdslab_sz), MULTIDATA_CACHE_ALIGN,
170 	    pdslab_constructor, pdslab_destructor, NULL,
171 	    (void *)(uintptr_t)pdslab_sz, NULL, pdslab_kmem_flags);
172 
173 	pattbl_cache = kmem_cache_create("multidata_pattbl",
174 	    sizeof (patbkt_t) * pattbl_sz, MULTIDATA_CACHE_ALIGN,
175 	    pattbl_constructor, pattbl_destructor, NULL,
176 	    (void *)(uintptr_t)pattbl_sz, NULL, pattbl_kmem_flags);
177 }
178 
179 /*
180  * Create a Multidata message block.
181  */
182 multidata_t *
183 mmd_alloc(mblk_t *hdr_mp, mblk_t **mmd_mp, int kmflags)
184 {
185 	uchar_t *buf;
186 	multidata_t *mmd;
187 	uint_t mmd_mplen;
188 	struct mmd_buf_info *buf_info;
189 
190 	ASSERT(hdr_mp != NULL);
191 	ASSERT(mmd_mp != NULL);
192 
193 	/*
194 	 * Caller should never pass in a chain of mblks since we
195 	 * only care about the first one, hence the assertions.
196 	 */
197 	ASSERT(hdr_mp->b_cont == NULL);
198 
199 	if ((buf = kmem_cache_alloc(mmd_cache, kmflags)) == NULL)
200 		return (NULL);
201 
202 	buf_info = (struct mmd_buf_info *)buf;
203 	buf_info->frp.free_arg = (caddr_t)buf;
204 
205 	mmd = (multidata_t *)(buf_info + 1);
206 	mmd_mplen = sizeof (*mmd);
207 
208 	if ((*mmd_mp = desballoc((uchar_t *)mmd, mmd_mplen, BPRI_HI,
209 	    &(buf_info->frp))) == NULL) {
210 		kmem_cache_free(mmd_cache, buf);
211 		return (NULL);
212 	}
213 
214 	DB_TYPE(*mmd_mp) = M_MULTIDATA;
215 	(*mmd_mp)->b_wptr += mmd_mplen;
216 	mmd->mmd_dp = (*mmd_mp)->b_datap;
217 	mmd->mmd_hbuf = hdr_mp;
218 
219 	return (mmd);
220 }
221 
222 /*
223  * Associate additional payload buffer to the Multidata.
224  */
225 int
226 mmd_addpldbuf(multidata_t *mmd, mblk_t *pld_mp)
227 {
228 	int i;
229 
230 	ASSERT(mmd != NULL);
231 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
232 	ASSERT(pld_mp != NULL);
233 
234 	mutex_enter(&mmd->mmd_pd_slab_lock);
235 	for (i = 0; i < MULTIDATA_MAX_PBUFS &&
236 	    mmd->mmd_pbuf_cnt < MULTIDATA_MAX_PBUFS; i++) {
237 		if (mmd->mmd_pbuf[i] == pld_mp) {
238 			/* duplicate entry */
239 			MMD_DEBUG((CE_WARN, "mmd_addpldbuf: error adding "
240 			    "pld 0x%p to mmd 0x%p since it has been "
241 			    "previously added into slot %d (total %d)\n",
242 			    (void *)pld_mp, (void *)mmd, i, mmd->mmd_pbuf_cnt));
243 			mutex_exit(&mmd->mmd_pd_slab_lock);
244 			return (-1);
245 		} else if (mmd->mmd_pbuf[i] == NULL) {
246 			mmd->mmd_pbuf[i] = pld_mp;
247 			mmd->mmd_pbuf_cnt++;
248 			mutex_exit(&mmd->mmd_pd_slab_lock);
249 			return (i);
250 		}
251 	}
252 
253 	/* all slots are taken */
254 	MMD_DEBUG((CE_WARN, "mmd_addpldbuf: error adding pld 0x%p to mmd 0x%p "
255 	    "since no slot space is left (total %d max %d)\n", (void *)pld_mp,
256 	    (void *)mmd, mmd->mmd_pbuf_cnt, MULTIDATA_MAX_PBUFS));
257 	mutex_exit(&mmd->mmd_pd_slab_lock);
258 
259 	return (-1);
260 }
261 
262 /*
263  * Multidata metadata kmem cache constructor routine.
264  */
265 /* ARGSUSED */
266 static int
267 mmd_constructor(void *buf, void *cdrarg, int kmflags)
268 {
269 	struct mmd_buf_info *buf_info;
270 	multidata_t *mmd;
271 
272 	bzero((void *)buf, MMD_CACHE_SIZE);
273 
274 	buf_info = (struct mmd_buf_info *)buf;
275 	buf_info->frp.free_func = mmd_esballoc_free;
276 	buf_info->buf_len = MMD_CACHE_SIZE;
277 
278 	mmd = (multidata_t *)(buf_info + 1);
279 	mmd->mmd_magic = MULTIDATA_MAGIC;
280 
281 	mutex_init(&(mmd->mmd_pd_slab_lock), NULL, MUTEX_DRIVER, NULL);
282 	QL_INIT(&(mmd->mmd_pd_slab_q));
283 	QL_INIT(&(mmd->mmd_pd_q));
284 
285 	return (0);
286 }
287 
288 /*
289  * Multidata metadata kmem cache destructor routine.
290  */
291 /* ARGSUSED */
292 static void
293 mmd_destructor(void *buf, void *cdrarg)
294 {
295 	multidata_t *mmd;
296 #ifdef DEBUG
297 	int i;
298 #endif
299 
300 	mmd = (multidata_t *)((uchar_t *)buf + sizeof (struct mmd_buf_info));
301 
302 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
303 	ASSERT(mmd->mmd_dp == NULL);
304 	ASSERT(mmd->mmd_hbuf == NULL);
305 	ASSERT(mmd->mmd_pbuf_cnt == 0);
306 #ifdef DEBUG
307 	for (i = 0; i < MULTIDATA_MAX_PBUFS; i++)
308 		ASSERT(mmd->mmd_pbuf[i] == NULL);
309 #endif
310 	ASSERT(mmd->mmd_pattbl == NULL);
311 
312 	mutex_destroy(&(mmd->mmd_pd_slab_lock));
313 	ASSERT(mmd->mmd_pd_slab_q.ql_next == &(mmd->mmd_pd_slab_q));
314 	ASSERT(mmd->mmd_slab_cnt == 0);
315 	ASSERT(mmd->mmd_pd_q.ql_next == &(mmd->mmd_pd_q));
316 	ASSERT(mmd->mmd_pd_cnt == 0);
317 	ASSERT(mmd->mmd_hbuf_ref == 0);
318 	ASSERT(mmd->mmd_pbuf_ref == 0);
319 }
320 
321 /*
322  * Multidata message block free callback routine.
323  */
324 static void
325 mmd_esballoc_free(caddr_t buf)
326 {
327 	multidata_t *mmd;
328 	pdesc_t *pd;
329 	pdesc_slab_t *slab;
330 	int i;
331 
332 	ASSERT(buf != NULL);
333 	ASSERT(((struct mmd_buf_info *)buf)->buf_len == MMD_CACHE_SIZE);
334 
335 	mmd = (multidata_t *)(buf + sizeof (struct mmd_buf_info));
336 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
337 
338 	ASSERT(mmd->mmd_dp != NULL);
339 	ASSERT(mmd->mmd_dp->db_ref == 1);
340 
341 	/* remove all packet descriptors and private attributes */
342 	pd = Q2PD(mmd->mmd_pd_q.ql_next);
343 	while (pd != Q2PD(&(mmd->mmd_pd_q)))
344 		pd = mmd_destroy_pdesc(mmd, pd);
345 
346 	ASSERT(mmd->mmd_pd_q.ql_next == &(mmd->mmd_pd_q));
347 	ASSERT(mmd->mmd_pd_cnt == 0);
348 	ASSERT(mmd->mmd_hbuf_ref == 0);
349 	ASSERT(mmd->mmd_pbuf_ref == 0);
350 
351 	/* remove all global attributes */
352 	if (mmd->mmd_pattbl != NULL)
353 		mmd_destroy_pattbl(&(mmd->mmd_pattbl));
354 
355 	/* remove all descriptor slabs */
356 	slab = Q2PDSLAB(mmd->mmd_pd_slab_q.ql_next);
357 	while (slab != Q2PDSLAB(&(mmd->mmd_pd_slab_q))) {
358 		pdesc_slab_t *slab_next = Q2PDSLAB(slab->pds_next);
359 
360 		remque(&(slab->pds_next));
361 		slab->pds_next = NULL;
362 		slab->pds_prev = NULL;
363 		slab->pds_mmd = NULL;
364 		slab->pds_used = 0;
365 		kmem_cache_free(pd_slab_cache, slab);
366 
367 		ASSERT(mmd->mmd_slab_cnt > 0);
368 		mmd->mmd_slab_cnt--;
369 		slab = slab_next;
370 	}
371 	ASSERT(mmd->mmd_pd_slab_q.ql_next == &(mmd->mmd_pd_slab_q));
372 	ASSERT(mmd->mmd_slab_cnt == 0);
373 
374 	mmd->mmd_dp = NULL;
375 
376 	/* finally, free all associated message blocks */
377 	if (mmd->mmd_hbuf != NULL) {
378 		freeb(mmd->mmd_hbuf);
379 		mmd->mmd_hbuf = NULL;
380 	}
381 
382 	for (i = 0; i < MULTIDATA_MAX_PBUFS; i++) {
383 		if (mmd->mmd_pbuf[i] != NULL) {
384 			freeb(mmd->mmd_pbuf[i]);
385 			mmd->mmd_pbuf[i] = NULL;
386 			ASSERT(mmd->mmd_pbuf_cnt > 0);
387 			mmd->mmd_pbuf_cnt--;
388 		}
389 	}
390 
391 	ASSERT(mmd->mmd_pbuf_cnt == 0);
392 	ASSERT(MUTEX_NOT_HELD(&(mmd->mmd_pd_slab_lock)));
393 	kmem_cache_free(mmd_cache, buf);
394 }
395 
396 /*
397  * Multidata message block copy routine, called by copyb() when it
398  * encounters a M_MULTIDATA data block type.  This routine should
399  * not be called by anyone other than copyb(), since it may go away
400  * (read: become static to this module) once some sort of copy callback
401  * routine is made available.
402  */
403 mblk_t *
404 mmd_copy(mblk_t *bp, int kmflags)
405 {
406 	multidata_t *mmd, *n_mmd;
407 	mblk_t *n_hbuf = NULL, *n_pbuf[MULTIDATA_MAX_PBUFS];
408 	mblk_t **pmp_last = &n_pbuf[MULTIDATA_MAX_PBUFS - 1];
409 	mblk_t **pmp;
410 	mblk_t *n_bp = NULL;
411 	pdesc_t *pd;
412 	uint_t n_pbuf_cnt = 0;
413 	int idx, i;
414 
415 #define	FREE_PBUFS() {					\
416 	for (pmp = &n_pbuf[0]; pmp <= pmp_last; pmp++)	\
417 		if (*pmp != NULL) freeb(*pmp);		\
418 }
419 
420 #define	REL_OFF(p, base, n_base)			\
421 	((uchar_t *)(n_base) + ((uchar_t *)(p) - (uchar_t *)base))
422 
423 	ASSERT(bp != NULL && DB_TYPE(bp) == M_MULTIDATA);
424 	mmd = mmd_getmultidata(bp);
425 
426 	/* copy the header buffer */
427 	if (mmd->mmd_hbuf != NULL && (n_hbuf = copyb(mmd->mmd_hbuf)) == NULL)
428 		return (NULL);
429 
430 	/* copy the payload buffer(s) */
431 	mutex_enter(&mmd->mmd_pd_slab_lock);
432 	bzero((void *)&n_pbuf[0], sizeof (mblk_t *) * MULTIDATA_MAX_PBUFS);
433 	n_pbuf_cnt = mmd->mmd_pbuf_cnt;
434 	for (i = 0; i < n_pbuf_cnt; i++) {
435 		ASSERT(mmd->mmd_pbuf[i] != NULL);
436 		n_pbuf[i] = copyb(mmd->mmd_pbuf[i]);
437 		if (n_pbuf[i] == NULL) {
438 			FREE_PBUFS();
439 			mutex_exit(&mmd->mmd_pd_slab_lock);
440 			return (NULL);
441 		}
442 	}
443 
444 	/* allocate new Multidata */
445 	n_mmd = mmd_alloc(n_hbuf, &n_bp, kmflags);
446 	if (n_mmd == NULL) {
447 		if (n_hbuf != NULL)
448 			freeb(n_hbuf);
449 		if (n_pbuf_cnt != 0)
450 			FREE_PBUFS();
451 		mutex_exit(&mmd->mmd_pd_slab_lock);
452 		return (NULL);
453 	}
454 
455 	/*
456 	 * Add payload buffer(s); upon success, leave n_pbuf array
457 	 * alone, as the newly-created Multidata had already contained
458 	 * the mblk pointers stored in the array.  These will be freed
459 	 * along with the Multidata itself.
460 	 */
461 	for (i = 0, pmp = &n_pbuf[0]; i < n_pbuf_cnt; i++, pmp++) {
462 		idx = mmd_addpldbuf(n_mmd, *pmp);
463 		if (idx < 0) {
464 			FREE_PBUFS();
465 			freeb(n_bp);
466 			mutex_exit(&mmd->mmd_pd_slab_lock);
467 			return (NULL);
468 		}
469 	}
470 
471 	/* copy over global attributes */
472 	if (mmd->mmd_pattbl != NULL &&
473 	    mmd_copy_pattbl(mmd->mmd_pattbl, n_mmd, NULL, kmflags) < 0) {
474 		freeb(n_bp);
475 		mutex_exit(&mmd->mmd_pd_slab_lock);
476 		return (NULL);
477 	}
478 
479 	/* copy over packet descriptors and their atttributes */
480 	pd = mmd_getpdesc(mmd, NULL, NULL, 1, B_TRUE);	/* first pdesc */
481 	while (pd != NULL) {
482 		pdesc_t *n_pd;
483 		pdescinfo_t *pdi, n_pdi;
484 		uchar_t *n_base, *base;
485 		pdesc_t *pd_next;
486 
487 		/* next pdesc */
488 		pd_next = mmd_getpdesc(pd->pd_slab->pds_mmd, pd, NULL,
489 		    1, B_TRUE);
490 
491 		/* skip if already removed */
492 		if (pd->pd_flags & PDESC_REM_DEFER) {
493 			pd = pd_next;
494 			continue;
495 		}
496 
497 		pdi = &(pd->pd_pdi);
498 		bzero(&n_pdi, sizeof (n_pdi));
499 
500 		/*
501 		 * Calculate new descriptor values based on the offset of
502 		 * each pointer relative to the associated buffer(s).
503 		 */
504 		ASSERT(pdi->flags & PDESC_HAS_REF);
505 		if (pdi->flags & PDESC_HBUF_REF) {
506 			n_base = n_mmd->mmd_hbuf->b_rptr;
507 			base = mmd->mmd_hbuf->b_rptr;
508 
509 			n_pdi.flags |= PDESC_HBUF_REF;
510 			n_pdi.hdr_base = REL_OFF(pdi->hdr_base, base, n_base);
511 			n_pdi.hdr_rptr = REL_OFF(pdi->hdr_rptr, base, n_base);
512 			n_pdi.hdr_wptr = REL_OFF(pdi->hdr_wptr, base, n_base);
513 			n_pdi.hdr_lim = REL_OFF(pdi->hdr_lim, base, n_base);
514 		}
515 
516 		if (pdi->flags & PDESC_PBUF_REF) {
517 			n_pdi.flags |= PDESC_PBUF_REF;
518 			n_pdi.pld_cnt = pdi->pld_cnt;
519 
520 			for (i = 0; i < pdi->pld_cnt; i++) {
521 				idx = pdi->pld_ary[i].pld_pbuf_idx;
522 				ASSERT(idx < MULTIDATA_MAX_PBUFS);
523 				ASSERT(n_mmd->mmd_pbuf[idx] != NULL);
524 				ASSERT(mmd->mmd_pbuf[idx] != NULL);
525 
526 				n_base = n_mmd->mmd_pbuf[idx]->b_rptr;
527 				base = mmd->mmd_pbuf[idx]->b_rptr;
528 
529 				n_pdi.pld_ary[i].pld_pbuf_idx = idx;
530 
531 				/*
532 				 * We can't copy the pointers just like that,
533 				 * so calculate the relative offset.
534 				 */
535 				n_pdi.pld_ary[i].pld_rptr =
536 				    REL_OFF(pdi->pld_ary[i].pld_rptr,
537 					base, n_base);
538 				n_pdi.pld_ary[i].pld_wptr =
539 				    REL_OFF(pdi->pld_ary[i].pld_wptr,
540 					base, n_base);
541 			}
542 		}
543 
544 		/* add the new descriptor to the new Multidata */
545 		n_pd = mmd_addpdesc_int(n_mmd, &n_pdi, NULL, kmflags);
546 
547 		if (n_pd == NULL || (pd->pd_pattbl != NULL &&
548 		    mmd_copy_pattbl(pd->pd_pattbl, n_mmd, n_pd, kmflags) < 0)) {
549 			freeb(n_bp);
550 			mutex_exit(&mmd->mmd_pd_slab_lock);
551 			return (NULL);
552 		}
553 
554 		pd = pd_next;
555 	}
556 #undef REL_OFF
557 #undef FREE_PBUFS
558 
559 	mutex_exit(&mmd->mmd_pd_slab_lock);
560 	return (n_bp);
561 }
562 
563 /*
564  * Given a Multidata message block, return the Multidata metadata handle.
565  */
566 multidata_t *
567 mmd_getmultidata(mblk_t *mp)
568 {
569 	multidata_t *mmd;
570 
571 	ASSERT(mp != NULL);
572 
573 	if (DB_TYPE(mp) != M_MULTIDATA)
574 		return (NULL);
575 
576 	mmd = (multidata_t *)mp->b_rptr;
577 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
578 
579 	return (mmd);
580 }
581 
582 /*
583  * Return the start and end addresses of the associated buffer(s).
584  */
585 void
586 mmd_getregions(multidata_t *mmd, mbufinfo_t *mbi)
587 {
588 	int i;
589 
590 	ASSERT(mmd != NULL);
591 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
592 	ASSERT(mbi != NULL);
593 
594 	bzero((void *)mbi, sizeof (mbufinfo_t));
595 
596 	if (mmd->mmd_hbuf != NULL) {
597 		mbi->hbuf_rptr = mmd->mmd_hbuf->b_rptr;
598 		mbi->hbuf_wptr = mmd->mmd_hbuf->b_wptr;
599 	}
600 
601 	mutex_enter(&mmd->mmd_pd_slab_lock);
602 	for (i = 0; i < mmd->mmd_pbuf_cnt; i++) {
603 		ASSERT(mmd->mmd_pbuf[i] != NULL);
604 		mbi->pbuf_ary[i].pbuf_rptr = mmd->mmd_pbuf[i]->b_rptr;
605 		mbi->pbuf_ary[i].pbuf_wptr = mmd->mmd_pbuf[i]->b_wptr;
606 
607 	}
608 	mbi->pbuf_cnt = mmd->mmd_pbuf_cnt;
609 	mutex_exit(&mmd->mmd_pd_slab_lock);
610 }
611 
612 /*
613  * Return the Multidata statistics.
614  */
615 uint_t
616 mmd_getcnt(multidata_t *mmd, uint_t *hbuf_ref, uint_t *pbuf_ref)
617 {
618 	uint_t pd_cnt;
619 
620 	ASSERT(mmd != NULL);
621 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
622 
623 	mutex_enter(&(mmd->mmd_pd_slab_lock));
624 	if (hbuf_ref != NULL)
625 		*hbuf_ref = mmd->mmd_hbuf_ref;
626 	if (pbuf_ref != NULL)
627 		*pbuf_ref = mmd->mmd_pbuf_ref;
628 	pd_cnt = mmd->mmd_pd_cnt;
629 	mutex_exit(&(mmd->mmd_pd_slab_lock));
630 
631 	return (pd_cnt);
632 }
633 
/*
 * Bounds check the header area described by pdi against the header
 * buffer of mmd.  Evaluates to true only when:
 *   - the Multidata has a header buffer;
 *   - hdr_base, hdr_rptr, hdr_wptr and hdr_lim are all non-NULL;
 *   - hdr_base <= hdr_rptr <= hdr_wptr <= hdr_lim;
 *   - hdr_base does not precede the header buffer's b_rptr; and
 *   - MBLKIN() accepts the offset of hdr_base together with
 *     PDESC_HDRSIZE(pdi).
 *
 * Both arguments are evaluated several times, so they must be free of
 * side effects.  Every use of an argument is parenthesized so that
 * expression arguments (e.g. "ary + 1") expand correctly.
 */
#define	HBUF_REF_VALID(mmd, pdi)					\
	((mmd)->mmd_hbuf != NULL && (pdi)->hdr_rptr != NULL &&		\
	(pdi)->hdr_wptr != NULL && (pdi)->hdr_base != NULL &&		\
	(pdi)->hdr_lim != NULL && (pdi)->hdr_lim >= (pdi)->hdr_base &&	\
	(pdi)->hdr_wptr >= (pdi)->hdr_rptr &&				\
	(pdi)->hdr_base <= (pdi)->hdr_rptr &&				\
	(pdi)->hdr_lim >= (pdi)->hdr_wptr &&				\
	(pdi)->hdr_base >= (mmd)->mmd_hbuf->b_rptr &&			\
	MBLKIN((mmd)->mmd_hbuf,						\
	((pdi)->hdr_base - (mmd)->mmd_hbuf->b_rptr),			\
	PDESC_HDRSIZE(pdi)))
645 
646 /*
647  * Bounds check payload area(s).
648  */
649 static boolean_t
650 pbuf_ref_valid(multidata_t *mmd, pdescinfo_t *pdi)
651 {
652 	int i = 0, idx;
653 	boolean_t valid = B_TRUE;
654 	struct pld_ary_s *pa;
655 
656 	mutex_enter(&mmd->mmd_pd_slab_lock);
657 	if (pdi->pld_cnt == 0 || pdi->pld_cnt > mmd->mmd_pbuf_cnt) {
658 		mutex_exit(&mmd->mmd_pd_slab_lock);
659 		return (B_FALSE);
660 	}
661 
662 	pa = &pdi->pld_ary[0];
663 	while (valid && i < pdi->pld_cnt) {
664 		valid = (((idx = pa->pld_pbuf_idx) < mmd->mmd_pbuf_cnt) &&
665 		    pa->pld_rptr != NULL && pa->pld_wptr != NULL &&
666 		    pa->pld_wptr >= pa->pld_rptr &&
667 		    pa->pld_rptr >= mmd->mmd_pbuf[idx]->b_rptr &&
668 		    MBLKIN(mmd->mmd_pbuf[idx], (pa->pld_rptr -
669 			mmd->mmd_pbuf[idx]->b_rptr),
670 			PDESC_PLD_SPAN_SIZE(pdi, i)));
671 
672 		if (!valid) {
673 			MMD_DEBUG((CE_WARN,
674 			    "pbuf_ref_valid: pdi 0x%p pld out of bound; "
675 			    "index %d has pld_cnt %d pbuf_idx %d "
676 			    "(mmd_pbuf_cnt %d), "
677 			    "pld_rptr 0x%p pld_wptr 0x%p len %d "
678 			    "(valid 0x%p-0x%p len %d)\n", (void *)pdi,
679 			    i, pdi->pld_cnt, idx, mmd->mmd_pbuf_cnt,
680 			    (void *)pa->pld_rptr,
681 			    (void *)pa->pld_wptr,
682 			    (int)PDESC_PLD_SPAN_SIZE(pdi, i),
683 			    (void *)mmd->mmd_pbuf[idx]->b_rptr,
684 			    (void *)mmd->mmd_pbuf[idx]->b_wptr,
685 			    (int)MBLKL(mmd->mmd_pbuf[idx])));
686 		}
687 
688 		/* advance to next entry */
689 		i++;
690 		pa++;
691 	}
692 
693 	mutex_exit(&mmd->mmd_pd_slab_lock);
694 	return (valid);
695 }
696 
697 /*
698  * Add a packet descriptor to the Multidata.
699  */
700 pdesc_t *
701 mmd_addpdesc(multidata_t *mmd, pdescinfo_t *pdi, int *err, int kmflags)
702 {
703 	ASSERT(mmd != NULL);
704 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
705 	ASSERT(pdi != NULL);
706 	ASSERT(pdi->flags & PDESC_HAS_REF);
707 
708 	/* do the references refer to invalid memory regions? */
709 	if (!mmd_speed_over_safety &&
710 	    (((pdi->flags & PDESC_HBUF_REF) && !HBUF_REF_VALID(mmd, pdi)) ||
711 	    ((pdi->flags & PDESC_PBUF_REF) && !pbuf_ref_valid(mmd, pdi)))) {
712 		if (err != NULL)
713 			*err = EINVAL;
714 		return (NULL);
715 	}
716 
717 	return (mmd_addpdesc_int(mmd, pdi, err, kmflags));
718 }
719 
720 /*
721  * Internal routine to add a packet descriptor, called when mmd_addpdesc
722  * or mmd_copy tries to allocate and add a descriptor to a Multidata.
723  */
724 static pdesc_t *
725 mmd_addpdesc_int(multidata_t *mmd, pdescinfo_t *pdi, int *err, int kmflags)
726 {
727 	pdesc_slab_t *slab, *slab_last;
728 	pdesc_t *pd;
729 
730 	ASSERT(pdi->flags & PDESC_HAS_REF);
731 	ASSERT(!(pdi->flags & PDESC_HBUF_REF) || HBUF_REF_VALID(mmd, pdi));
732 	ASSERT(!(pdi->flags & PDESC_PBUF_REF) || pbuf_ref_valid(mmd, pdi));
733 
734 	if (err != NULL)
735 		*err = 0;
736 
737 	mutex_enter(&(mmd->mmd_pd_slab_lock));
738 	/*
739 	 * Is slab list empty or the last-added slab is full?  If so,
740 	 * allocate new slab for the descriptor; otherwise, use the
741 	 * last-added slab instead.
742 	 */
743 	slab_last = Q2PDSLAB(mmd->mmd_pd_slab_q.ql_prev);
744 	if (mmd->mmd_pd_slab_q.ql_next == &(mmd->mmd_pd_slab_q) ||
745 	    slab_last->pds_used == slab_last->pds_sz) {
746 		slab = kmem_cache_alloc(pd_slab_cache, kmflags);
747 		if (slab == NULL) {
748 			if (err != NULL)
749 				*err = ENOMEM;
750 			mutex_exit(&(mmd->mmd_pd_slab_lock));
751 			return (NULL);
752 		}
753 		slab->pds_mmd = mmd;
754 
755 		ASSERT(slab->pds_used == 0);
756 		ASSERT(slab->pds_next == NULL && slab->pds_prev == NULL);
757 
758 		/* insert slab at end of list */
759 		insque(&(slab->pds_next), mmd->mmd_pd_slab_q.ql_prev);
760 		mmd->mmd_slab_cnt++;
761 	} else {
762 		slab = slab_last;
763 	}
764 	ASSERT(slab->pds_used < slab->pds_sz);
765 	pd = &(slab->pds_free_desc[slab->pds_used++]);
766 	ASSERT(pd->pd_magic == PDESC_MAGIC);
767 	pd->pd_next = NULL;
768 	pd->pd_prev = NULL;
769 	pd->pd_slab = slab;
770 	pd->pd_pattbl = NULL;
771 
772 	/* copy over the descriptor info from caller */
773 	PDI_COPY(pdi, &(pd->pd_pdi));
774 
775 	if (pd->pd_flags & PDESC_HBUF_REF)
776 		mmd->mmd_hbuf_ref++;
777 	if (pd->pd_flags & PDESC_PBUF_REF)
778 		mmd->mmd_pbuf_ref += pd->pd_pdi.pld_cnt;
779 	mmd->mmd_pd_cnt++;
780 
781 	/* insert descriptor at end of list */
782 	insque(&(pd->pd_next), mmd->mmd_pd_q.ql_prev);
783 	mutex_exit(&(mmd->mmd_pd_slab_lock));
784 
785 	return (pd);
786 }
787 
788 /*
789  * Packet descriptor slab kmem cache constructor routine.
790  */
791 /* ARGSUSED */
792 static int
793 pdslab_constructor(void *buf, void *cdrarg, int kmflags)
794 {
795 	pdesc_slab_t *slab;
796 	uint_t cnt = (uint_t)(uintptr_t)cdrarg;
797 	int i;
798 
799 	ASSERT(cnt > 0);	/* slab size can't be zero */
800 
801 	slab = (pdesc_slab_t *)buf;
802 	slab->pds_next = NULL;
803 	slab->pds_prev = NULL;
804 	slab->pds_mmd = NULL;
805 	slab->pds_used = 0;
806 	slab->pds_sz = cnt;
807 
808 	for (i = 0; i < cnt; i++) {
809 		pdesc_t *pd = &(slab->pds_free_desc[i]);
810 		pd->pd_magic = PDESC_MAGIC;
811 	}
812 	return (0);
813 }
814 
815 /*
816  * Packet descriptor slab kmem cache destructor routine.
817  */
818 /* ARGSUSED */
819 static void
820 pdslab_destructor(void *buf, void *cdrarg)
821 {
822 	pdesc_slab_t *slab;
823 
824 	slab = (pdesc_slab_t *)buf;
825 	ASSERT(slab->pds_next == NULL);
826 	ASSERT(slab->pds_prev == NULL);
827 	ASSERT(slab->pds_mmd == NULL);
828 	ASSERT(slab->pds_used == 0);
829 	ASSERT(slab->pds_sz > 0);
830 }
831 
832 /*
833  * Remove a packet descriptor from the in-use descriptor list,
834  * called by mmd_rempdesc or during free.
835  */
836 static pdesc_t *
837 mmd_destroy_pdesc(multidata_t *mmd, pdesc_t *pd)
838 {
839 	pdesc_t *pd_next;
840 
841 	pd_next = Q2PD(pd->pd_next);
842 	remque(&(pd->pd_next));
843 
844 	/* remove all local attributes */
845 	if (pd->pd_pattbl != NULL)
846 		mmd_destroy_pattbl(&(pd->pd_pattbl));
847 
848 	/* don't decrease counts for a removed descriptor */
849 	if (!(pd->pd_flags & PDESC_REM_DEFER)) {
850 		if (pd->pd_flags & PDESC_HBUF_REF) {
851 			ASSERT(mmd->mmd_hbuf_ref > 0);
852 			mmd->mmd_hbuf_ref--;
853 		}
854 		if (pd->pd_flags & PDESC_PBUF_REF) {
855 			ASSERT(mmd->mmd_pbuf_ref > 0);
856 			mmd->mmd_pbuf_ref -= pd->pd_pdi.pld_cnt;
857 		}
858 		ASSERT(mmd->mmd_pd_cnt > 0);
859 		mmd->mmd_pd_cnt--;
860 	}
861 	return (pd_next);
862 }
863 
864 /*
865  * Remove a packet descriptor from the Multidata.
866  */
867 void
868 mmd_rempdesc(pdesc_t *pd)
869 {
870 	multidata_t *mmd;
871 
872 	ASSERT(pd->pd_magic == PDESC_MAGIC);
873 	ASSERT(pd->pd_slab != NULL);
874 
875 	mmd = pd->pd_slab->pds_mmd;
876 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
877 
878 	mutex_enter(&(mmd->mmd_pd_slab_lock));
879 	/*
880 	 * We can't deallocate the associated resources if the Multidata
881 	 * is shared with other threads, because it's possible that the
882 	 * descriptor handle value is held by those threads.  That's why
883 	 * we simply mark the entry as "removed" and decrement the counts.
884 	 * If there are no other threads, then we free the descriptor.
885 	 */
886 	if (mmd->mmd_dp->db_ref > 1) {
887 		pd->pd_flags |= PDESC_REM_DEFER;
888 		if (pd->pd_flags & PDESC_HBUF_REF) {
889 			ASSERT(mmd->mmd_hbuf_ref > 0);
890 			mmd->mmd_hbuf_ref--;
891 		}
892 		if (pd->pd_flags & PDESC_PBUF_REF) {
893 			ASSERT(mmd->mmd_pbuf_ref > 0);
894 			mmd->mmd_pbuf_ref -= pd->pd_pdi.pld_cnt;
895 		}
896 		ASSERT(mmd->mmd_pd_cnt > 0);
897 		mmd->mmd_pd_cnt--;
898 	} else {
899 		(void) mmd_destroy_pdesc(mmd, pd);
900 	}
901 	mutex_exit(&(mmd->mmd_pd_slab_lock));
902 }
903 
904 /*
905  * A generic routine to traverse the packet descriptor in-use list.
906  */
907 static pdesc_t *
908 mmd_getpdesc(multidata_t *mmd, pdesc_t *pd, pdescinfo_t *pdi, uint_t forw,
909     boolean_t mutex_held)
910 {
911 	pdesc_t *pd_head;
912 
913 	ASSERT(pd == NULL || pd->pd_slab->pds_mmd == mmd);
914 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
915 	ASSERT(!mutex_held || MUTEX_HELD(&(mmd->mmd_pd_slab_lock)));
916 
917 	if (!mutex_held)
918 		mutex_enter(&(mmd->mmd_pd_slab_lock));
919 	pd_head = Q2PD(&(mmd->mmd_pd_q));
920 
921 	if (pd == NULL) {
922 		/*
923 		 * We're called by mmd_get{first,last}pdesc, and so
924 		 * return either the first or last list element.
925 		 */
926 		pd = forw ? Q2PD(mmd->mmd_pd_q.ql_next) :
927 		    Q2PD(mmd->mmd_pd_q.ql_prev);
928 	} else {
929 		/*
930 		 * We're called by mmd_get{next,prev}pdesc, and so
931 		 * return either the next or previous list element.
932 		 */
933 		pd = forw ? Q2PD(pd->pd_next) : Q2PD(pd->pd_prev);
934 	}
935 
936 	while (pd != pd_head) {
937 		/* skip element if it has been removed */
938 		if (!(pd->pd_flags & PDESC_REM_DEFER))
939 			break;
940 		pd = forw ? Q2PD(pd->pd_next) : Q2PD(pd->pd_prev);
941 	}
942 	if (!mutex_held)
943 		mutex_exit(&(mmd->mmd_pd_slab_lock));
944 
945 	/* return NULL if we're back at the beginning */
946 	if (pd == pd_head)
947 		pd = NULL;
948 
949 	/* got an entry; copy descriptor info to caller */
950 	if (pd != NULL && pdi != NULL)
951 		PDI_COPY(&(pd->pd_pdi), pdi);
952 
953 	ASSERT(pd == NULL || pd->pd_magic == PDESC_MAGIC);
954 	return (pd);
955 
956 }
957 
958 /*
959  * Return the first packet descriptor in the in-use list.
960  */
961 pdesc_t *
962 mmd_getfirstpdesc(multidata_t *mmd, pdescinfo_t *pdi)
963 {
964 	return (mmd_getpdesc(mmd, NULL, pdi, 1, B_FALSE));
965 }
966 
967 /*
968  * Return the last packet descriptor in the in-use list.
969  */
970 pdesc_t *
971 mmd_getlastpdesc(multidata_t *mmd, pdescinfo_t *pdi)
972 {
973 	return (mmd_getpdesc(mmd, NULL, pdi, 0, B_FALSE));
974 }
975 
976 /*
977  * Return the next packet descriptor in the in-use list.
978  */
979 pdesc_t *
980 mmd_getnextpdesc(pdesc_t *pd, pdescinfo_t *pdi)
981 {
982 	return (mmd_getpdesc(pd->pd_slab->pds_mmd, pd, pdi, 1, B_FALSE));
983 }
984 
985 /*
986  * Return the previous packet descriptor in the in-use list.
987  */
988 pdesc_t *
989 mmd_getprevpdesc(pdesc_t *pd, pdescinfo_t *pdi)
990 {
991 	return (mmd_getpdesc(pd->pd_slab->pds_mmd, pd, pdi, 0, B_FALSE));
992 }
993 
994 /*
995  * Check to see if pdi stretches over c_pdi; used to ensure that a packet
996  * descriptor's header and payload span may not be extended beyond the
997  * current boundaries.
998  */
999 static boolean_t
1000 pdi_in_range(pdescinfo_t *pdi, pdescinfo_t *c_pdi)
1001 {
1002 	int i;
1003 	struct pld_ary_s *pa = &pdi->pld_ary[0];
1004 	struct pld_ary_s *c_pa = &c_pdi->pld_ary[0];
1005 
1006 	if (pdi->hdr_base < c_pdi->hdr_base || pdi->hdr_lim > c_pdi->hdr_lim)
1007 		return (B_FALSE);
1008 
1009 	/*
1010 	 * We don't allow the number of span to be reduced, for the sake
1011 	 * of simplicity.  Instead, we provide PDESC_PLD_SPAN_CLEAR() to
1012 	 * clear a packet descriptor.  Note that we allow the span count to
1013 	 * be increased, and the bounds check for the new one happens
1014 	 * in pbuf_ref_valid.
1015 	 */
1016 	if (pdi->pld_cnt < c_pdi->pld_cnt)
1017 		return (B_FALSE);
1018 
1019 	/* compare only those which are currently defined */
1020 	for (i = 0; i < c_pdi->pld_cnt; i++, pa++, c_pa++) {
1021 		if (pa->pld_pbuf_idx != c_pa->pld_pbuf_idx ||
1022 		    pa->pld_rptr < c_pa->pld_rptr ||
1023 		    pa->pld_wptr > c_pa->pld_wptr)
1024 			return (B_FALSE);
1025 	}
1026 	return (B_TRUE);
1027 }
1028 
1029 /*
1030  * Modify the layout of a packet descriptor.
1031  */
1032 pdesc_t *
1033 mmd_adjpdesc(pdesc_t *pd, pdescinfo_t *pdi)
1034 {
1035 	multidata_t *mmd;
1036 	pdescinfo_t *c_pdi;
1037 
1038 	ASSERT(pd != NULL);
1039 	ASSERT(pdi != NULL);
1040 	ASSERT(pd->pd_magic == PDESC_MAGIC);
1041 
1042 	mmd = pd->pd_slab->pds_mmd;
1043 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1044 
1045 	/* entry has been removed */
1046 	if (pd->pd_flags & PDESC_REM_DEFER)
1047 		return (NULL);
1048 
1049 	/* caller doesn't intend to specify any buffer reference? */
1050 	if (!(pdi->flags & PDESC_HAS_REF))
1051 		return (NULL);
1052 
1053 	/* do the references refer to invalid memory regions? */
1054 	if (!mmd_speed_over_safety &&
1055 	    (((pdi->flags & PDESC_HBUF_REF) && !HBUF_REF_VALID(mmd, pdi)) ||
1056 	    ((pdi->flags & PDESC_PBUF_REF) && !pbuf_ref_valid(mmd, pdi))))
1057 		return (NULL);
1058 
1059 	/* they're not subsets of current references? */
1060 	c_pdi = &(pd->pd_pdi);
1061 	if (!pdi_in_range(pdi, c_pdi))
1062 		return (NULL);
1063 
1064 	/* copy over the descriptor info from caller */
1065 	PDI_COPY(pdi, c_pdi);
1066 
1067 	return (pd);
1068 }
1069 
1070 /*
1071  * Copy the contents of a packet descriptor into a new buffer.  If the
1072  * descriptor points to more than one buffer fragments, the contents
1073  * of both fragments will be joined, with the header buffer fragment
1074  * preceding the payload buffer fragment(s).
1075  */
1076 mblk_t *
1077 mmd_transform(pdesc_t *pd)
1078 {
1079 	multidata_t *mmd;
1080 	pdescinfo_t *pdi;
1081 	mblk_t *mp;
1082 	int h_size = 0, p_size = 0;
1083 	int i, len;
1084 
1085 	ASSERT(pd != NULL);
1086 	ASSERT(pd->pd_magic == PDESC_MAGIC);
1087 
1088 	mmd = pd->pd_slab->pds_mmd;
1089 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1090 
1091 	/* entry has been removed */
1092 	if (pd->pd_flags & PDESC_REM_DEFER)
1093 		return (NULL);
1094 
1095 	mutex_enter(&mmd->mmd_pd_slab_lock);
1096 	pdi = &(pd->pd_pdi);
1097 	if (pdi->flags & PDESC_HBUF_REF)
1098 		h_size = PDESC_HDRL(pdi);
1099 	if (pdi->flags & PDESC_PBUF_REF) {
1100 		for (i = 0; i < pdi->pld_cnt; i++)
1101 			p_size += PDESC_PLD_SPAN_SIZE(pdi, i);
1102 	}
1103 
1104 	/* allocate space large enough to hold the fragment(s) */
1105 	ASSERT(h_size + p_size >= 0);
1106 	if ((mp = allocb(h_size + p_size, BPRI_HI)) == NULL) {
1107 		mutex_exit(&mmd->mmd_pd_slab_lock);
1108 		return (NULL);
1109 	}
1110 
1111 	/* copy over the header fragment */
1112 	if ((pdi->flags & PDESC_HBUF_REF) && h_size > 0) {
1113 		bcopy(pdi->hdr_rptr, mp->b_wptr, h_size);
1114 		mp->b_wptr += h_size;
1115 	}
1116 
1117 	/* copy over the payload fragment */
1118 	if ((pdi->flags & PDESC_PBUF_REF) && p_size > 0) {
1119 		for (i = 0; i < pdi->pld_cnt; i++) {
1120 			len = PDESC_PLD_SPAN_SIZE(pdi, i);
1121 			if (len > 0) {
1122 				bcopy(pdi->pld_ary[i].pld_rptr,
1123 				    mp->b_wptr, len);
1124 				mp->b_wptr += len;
1125 			}
1126 		}
1127 	}
1128 
1129 	mutex_exit(&mmd->mmd_pd_slab_lock);
1130 	return (mp);
1131 }
1132 
1133 /*
1134  * Return a chain of mblks representing the Multidata packet.
1135  */
1136 mblk_t *
1137 mmd_transform_link(pdesc_t *pd)
1138 {
1139 	multidata_t *mmd;
1140 	pdescinfo_t *pdi;
1141 	mblk_t *nmp = NULL;
1142 
1143 	ASSERT(pd != NULL);
1144 	ASSERT(pd->pd_magic == PDESC_MAGIC);
1145 
1146 	mmd = pd->pd_slab->pds_mmd;
1147 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1148 
1149 	/* entry has been removed */
1150 	if (pd->pd_flags & PDESC_REM_DEFER)
1151 		return (NULL);
1152 
1153 	pdi = &(pd->pd_pdi);
1154 
1155 	/* duplicate header buffer */
1156 	if ((pdi->flags & PDESC_HBUF_REF)) {
1157 		if ((nmp = dupb(mmd->mmd_hbuf)) == NULL)
1158 			return (NULL);
1159 		nmp->b_rptr = pdi->hdr_rptr;
1160 		nmp->b_wptr = pdi->hdr_wptr;
1161 	}
1162 
1163 	/* duplicate payload buffer(s) */
1164 	if (pdi->flags & PDESC_PBUF_REF) {
1165 		int i;
1166 		mblk_t *mp;
1167 		struct pld_ary_s *pa = &pdi->pld_ary[0];
1168 
1169 		mutex_enter(&mmd->mmd_pd_slab_lock);
1170 		for (i = 0; i < pdi->pld_cnt; i++, pa++) {
1171 			ASSERT(mmd->mmd_pbuf[pa->pld_pbuf_idx] != NULL);
1172 
1173 			/* skip empty ones */
1174 			if (PDESC_PLD_SPAN_SIZE(pdi, i) == 0)
1175 				continue;
1176 
1177 			mp = dupb(mmd->mmd_pbuf[pa->pld_pbuf_idx]);
1178 			if (mp == NULL) {
1179 				if (nmp != NULL)
1180 					freemsg(nmp);
1181 				mutex_exit(&mmd->mmd_pd_slab_lock);
1182 				return (NULL);
1183 			}
1184 			mp->b_rptr = pa->pld_rptr;
1185 			mp->b_wptr = pa->pld_wptr;
1186 			if (nmp == NULL)
1187 				nmp = mp;
1188 			else
1189 				linkb(nmp, mp);
1190 		}
1191 		mutex_exit(&mmd->mmd_pd_slab_lock);
1192 	}
1193 
1194 	return (nmp);
1195 }
1196 
1197 /*
1198  * Return duplicate message block(s) of the associated buffer(s).
1199  */
1200 int
1201 mmd_dupbufs(multidata_t *mmd, mblk_t **hmp, mblk_t **pmp)
1202 {
1203 	ASSERT(mmd != NULL);
1204 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1205 
1206 	if (hmp != NULL) {
1207 		*hmp = NULL;
1208 		if (mmd->mmd_hbuf != NULL &&
1209 		    (*hmp = dupb(mmd->mmd_hbuf)) == NULL)
1210 			return (-1);
1211 	}
1212 
1213 	if (pmp != NULL) {
1214 		int i;
1215 		mblk_t *mp;
1216 
1217 		mutex_enter(&mmd->mmd_pd_slab_lock);
1218 		*pmp = NULL;
1219 		for (i = 0; i < mmd->mmd_pbuf_cnt; i++) {
1220 			ASSERT(mmd->mmd_pbuf[i] != NULL);
1221 			mp = dupb(mmd->mmd_pbuf[i]);
1222 			if (mp == NULL) {
1223 				if (hmp != NULL && *hmp != NULL)
1224 					freeb(*hmp);
1225 				if (*pmp != NULL)
1226 					freemsg(*pmp);
1227 				mutex_exit(&mmd->mmd_pd_slab_lock);
1228 				return (-1);
1229 			}
1230 			if (*pmp == NULL)
1231 				*pmp = mp;
1232 			else
1233 				linkb(*pmp, mp);
1234 		}
1235 		mutex_exit(&mmd->mmd_pd_slab_lock);
1236 	}
1237 
1238 	return (0);
1239 }
1240 
1241 /*
1242  * Return the layout of a packet descriptor.
1243  */
1244 int
1245 mmd_getpdescinfo(pdesc_t *pd, pdescinfo_t *pdi)
1246 {
1247 	ASSERT(pd != NULL);
1248 	ASSERT(pd->pd_magic == PDESC_MAGIC);
1249 	ASSERT(pd->pd_slab != NULL);
1250 	ASSERT(pd->pd_slab->pds_mmd->mmd_magic == MULTIDATA_MAGIC);
1251 	ASSERT(pdi != NULL);
1252 
1253 	/* entry has been removed */
1254 	if (pd->pd_flags & PDESC_REM_DEFER)
1255 		return (-1);
1256 
1257 	/* copy descriptor info to caller */
1258 	PDI_COPY(&(pd->pd_pdi), pdi);
1259 
1260 	return (0);
1261 }
1262 
1263 /*
1264  * Add a global or local attribute to a Multidata.  Global attribute
1265  * association is specified by a NULL packet descriptor.
1266  */
1267 pattr_t *
1268 mmd_addpattr(multidata_t *mmd, pdesc_t *pd, pattrinfo_t *pai,
1269     boolean_t persistent, int kmflags)
1270 {
1271 	patbkt_t **tbl_p;
1272 	patbkt_t *tbl, *o_tbl;
1273 	patbkt_t *bkt;
1274 	pattr_t *pa;
1275 	uint_t size;
1276 
1277 	ASSERT(mmd != NULL);
1278 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1279 	ASSERT(pd == NULL || pd->pd_magic == PDESC_MAGIC);
1280 	ASSERT(pai != NULL);
1281 
1282 	/* pointer to the attribute hash table (local or global) */
1283 	tbl_p = pd != NULL ? &(pd->pd_pattbl) : &(mmd->mmd_pattbl);
1284 
1285 	/*
1286 	 * See if the hash table has not yet been created; if so,
1287 	 * we create the table and store its address atomically.
1288 	 */
1289 	if ((tbl = *tbl_p) == NULL) {
1290 		tbl = kmem_cache_alloc(pattbl_cache, kmflags);
1291 		if (tbl == NULL)
1292 			return (NULL);
1293 
1294 		/* if someone got there first, use his table instead */
1295 		if ((o_tbl = atomic_cas_ptr(tbl_p, NULL, tbl)) != NULL) {
1296 			kmem_cache_free(pattbl_cache, tbl);
1297 			tbl = o_tbl;
1298 		}
1299 	}
1300 
1301 	ASSERT(tbl->pbkt_tbl_sz > 0);
1302 	bkt = &(tbl[PATTBL_HASH(pai->type, tbl->pbkt_tbl_sz)]);
1303 
1304 	/* attribute of the same type already exists? */
1305 	if ((pa = mmd_find_pattr(bkt, pai->type)) != NULL)
1306 		return (NULL);
1307 
1308 	size = sizeof (*pa) + pai->len;
1309 	if ((pa = kmem_zalloc(size, kmflags)) == NULL)
1310 		return (NULL);
1311 
1312 	pa->pat_magic = PATTR_MAGIC;
1313 	pa->pat_lock = &(bkt->pbkt_lock);
1314 	pa->pat_mmd = mmd;
1315 	pa->pat_buflen = size;
1316 	pa->pat_type = pai->type;
1317 	pai->buf = pai->len > 0 ? ((uchar_t *)(pa + 1)) : NULL;
1318 
1319 	if (persistent)
1320 		pa->pat_flags = PATTR_PERSIST;
1321 
1322 	/* insert attribute at end of hash chain */
1323 	mutex_enter(&(bkt->pbkt_lock));
1324 	insque(&(pa->pat_next), bkt->pbkt_pattr_q.ql_prev);
1325 	mutex_exit(&(bkt->pbkt_lock));
1326 
1327 	return (pa);
1328 }
1329 
1330 /*
1331  * Attribute hash table kmem cache constructor routine.
1332  */
1333 /* ARGSUSED */
1334 static int
1335 pattbl_constructor(void *buf, void *cdrarg, int kmflags)
1336 {
1337 	patbkt_t *bkt;
1338 	uint_t tbl_sz = (uint_t)(uintptr_t)cdrarg;
1339 	uint_t i;
1340 
1341 	ASSERT(tbl_sz > 0);	/* table size can't be zero */
1342 
1343 	for (i = 0, bkt = (patbkt_t *)buf; i < tbl_sz; i++, bkt++) {
1344 		mutex_init(&(bkt->pbkt_lock), NULL, MUTEX_DRIVER, NULL);
1345 		QL_INIT(&(bkt->pbkt_pattr_q));
1346 
1347 		/* first bucket contains the table size */
1348 		bkt->pbkt_tbl_sz = i == 0 ? tbl_sz : 0;
1349 	}
1350 	return (0);
1351 }
1352 
1353 /*
1354  * Attribute hash table kmem cache destructor routine.
1355  */
1356 /* ARGSUSED */
1357 static void
1358 pattbl_destructor(void *buf, void *cdrarg)
1359 {
1360 	patbkt_t *bkt;
1361 	uint_t tbl_sz = (uint_t)(uintptr_t)cdrarg;
1362 	uint_t i;
1363 
1364 	ASSERT(tbl_sz > 0);	/* table size can't be zero */
1365 
1366 	for (i = 0, bkt = (patbkt_t *)buf; i < tbl_sz; i++, bkt++) {
1367 		mutex_destroy(&(bkt->pbkt_lock));
1368 		ASSERT(bkt->pbkt_pattr_q.ql_next == &(bkt->pbkt_pattr_q));
1369 		ASSERT(i > 0 || bkt->pbkt_tbl_sz == tbl_sz);
1370 	}
1371 }
1372 
1373 /*
1374  * Destroy an attribute hash table, called by mmd_rempdesc or during free.
1375  */
1376 static void
1377 mmd_destroy_pattbl(patbkt_t **tbl)
1378 {
1379 	patbkt_t *bkt;
1380 	pattr_t *pa, *pa_next;
1381 	uint_t i, tbl_sz;
1382 
1383 	ASSERT(tbl != NULL);
1384 	bkt = *tbl;
1385 	tbl_sz = bkt->pbkt_tbl_sz;
1386 
1387 	/* make sure caller passes in the first bucket */
1388 	ASSERT(tbl_sz > 0);
1389 
1390 	/* destroy the contents of each bucket */
1391 	for (i = 0; i < tbl_sz; i++, bkt++) {
1392 		/* we ought to be exclusive at this point */
1393 		ASSERT(MUTEX_NOT_HELD(&(bkt->pbkt_lock)));
1394 
1395 		pa = Q2PATTR(bkt->pbkt_pattr_q.ql_next);
1396 		while (pa != Q2PATTR(&(bkt->pbkt_pattr_q))) {
1397 			ASSERT(pa->pat_magic == PATTR_MAGIC);
1398 			pa_next = Q2PATTR(pa->pat_next);
1399 			remque(&(pa->pat_next));
1400 			kmem_free(pa, pa->pat_buflen);
1401 			pa = pa_next;
1402 		}
1403 	}
1404 
1405 	kmem_cache_free(pattbl_cache, *tbl);
1406 	*tbl = NULL;
1407 
1408 	/* commit all previous stores */
1409 	membar_producer();
1410 }
1411 
1412 /*
1413  * Copy the contents of an attribute hash table, called by mmd_copy.
1414  */
1415 static int
1416 mmd_copy_pattbl(patbkt_t *src_tbl, multidata_t *n_mmd, pdesc_t *n_pd,
1417     int kmflags)
1418 {
1419 	patbkt_t *bkt;
1420 	pattr_t *pa;
1421 	pattrinfo_t pai;
1422 	uint_t i, tbl_sz;
1423 
1424 	ASSERT(src_tbl != NULL);
1425 	bkt = src_tbl;
1426 	tbl_sz = bkt->pbkt_tbl_sz;
1427 
1428 	/* make sure caller passes in the first bucket */
1429 	ASSERT(tbl_sz > 0);
1430 
1431 	for (i = 0; i < tbl_sz; i++, bkt++) {
1432 		mutex_enter(&(bkt->pbkt_lock));
1433 		pa = Q2PATTR(bkt->pbkt_pattr_q.ql_next);
1434 		while (pa != Q2PATTR(&(bkt->pbkt_pattr_q))) {
1435 			pattr_t *pa_next = Q2PATTR(pa->pat_next);
1436 
1437 			/* skip if it's removed */
1438 			if (pa->pat_flags & PATTR_REM_DEFER) {
1439 				pa = pa_next;
1440 				continue;
1441 			}
1442 
1443 			pai.type = pa->pat_type;
1444 			pai.len = pa->pat_buflen - sizeof (*pa);
1445 			if (mmd_addpattr(n_mmd, n_pd, &pai, (pa->pat_flags &
1446 			    PATTR_PERSIST) != 0, kmflags) == NULL) {
1447 				mutex_exit(&(bkt->pbkt_lock));
1448 				return (-1);
1449 			}
1450 
1451 			/* copy over the contents */
1452 			if (pai.buf != NULL)
1453 				bcopy(pa + 1, pai.buf, pai.len);
1454 
1455 			pa = pa_next;
1456 		}
1457 		mutex_exit(&(bkt->pbkt_lock));
1458 	}
1459 
1460 	return (0);
1461 }
1462 
1463 /*
1464  * Search for an attribute type within an attribute hash bucket.
1465  */
1466 static pattr_t *
1467 mmd_find_pattr(patbkt_t *bkt, uint_t type)
1468 {
1469 	pattr_t *pa_head, *pa;
1470 
1471 	mutex_enter(&(bkt->pbkt_lock));
1472 	pa_head = Q2PATTR(&(bkt->pbkt_pattr_q));
1473 	pa = Q2PATTR(bkt->pbkt_pattr_q.ql_next);
1474 
1475 	while (pa != pa_head) {
1476 		ASSERT(pa->pat_magic == PATTR_MAGIC);
1477 
1478 		/* return a match; we treat removed entry as non-existent */
1479 		if (pa->pat_type == type && !(pa->pat_flags & PATTR_REM_DEFER))
1480 			break;
1481 		pa = Q2PATTR(pa->pat_next);
1482 	}
1483 	mutex_exit(&(bkt->pbkt_lock));
1484 
1485 	return (pa == pa_head ? NULL : pa);
1486 }
1487 
1488 /*
1489  * Remove an attribute from a Multidata.
1490  */
1491 void
1492 mmd_rempattr(pattr_t *pa)
1493 {
1494 	kmutex_t *pat_lock = pa->pat_lock;
1495 
1496 	ASSERT(pa->pat_magic == PATTR_MAGIC);
1497 
1498 	/* ignore if attribute was marked as persistent */
1499 	if ((pa->pat_flags & PATTR_PERSIST) != 0)
1500 		return;
1501 
1502 	mutex_enter(pat_lock);
1503 	/*
1504 	 * We can't deallocate the associated resources if the Multidata
1505 	 * is shared with other threads, because it's possible that the
1506 	 * attribute handle value is held by those threads.  That's why
1507 	 * we simply mark the entry as "removed".  If there are no other
1508 	 * threads, then we free the attribute.
1509 	 */
1510 	if (pa->pat_mmd->mmd_dp->db_ref > 1) {
1511 		pa->pat_flags |= PATTR_REM_DEFER;
1512 	} else {
1513 		remque(&(pa->pat_next));
1514 		kmem_free(pa, pa->pat_buflen);
1515 	}
1516 	mutex_exit(pat_lock);
1517 }
1518 
1519 /*
1520  * Find an attribute (according to its type) and return its handle.
1521  */
1522 pattr_t *
1523 mmd_getpattr(multidata_t *mmd, pdesc_t *pd, pattrinfo_t *pai)
1524 {
1525 	patbkt_t *tbl, *bkt;
1526 	pattr_t *pa;
1527 
1528 	ASSERT(mmd != NULL);
1529 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1530 	ASSERT(pai != NULL);
1531 
1532 	/* get the right attribute hash table (local or global) */
1533 	tbl = pd != NULL ? pd->pd_pattbl : mmd->mmd_pattbl;
1534 
1535 	/* attribute hash table doesn't exist? */
1536 	if (tbl == NULL)
1537 		return (NULL);
1538 
1539 	ASSERT(tbl->pbkt_tbl_sz > 0);
1540 	bkt = &(tbl[PATTBL_HASH(pai->type, tbl->pbkt_tbl_sz)]);
1541 
1542 	if ((pa = mmd_find_pattr(bkt, pai->type)) != NULL) {
1543 		ASSERT(pa->pat_buflen >= sizeof (*pa));
1544 		pai->len = pa->pat_buflen - sizeof (*pa);
1545 		pai->buf = pai->len > 0 ?
1546 		    (uchar_t *)pa + sizeof (pattr_t) : NULL;
1547 	}
1548 	ASSERT(pa == NULL || pa->pat_magic == PATTR_MAGIC);
1549 	return (pa);
1550 }
1551 
1552 /*
1553  * Return total size of buffers and total size of areas referenced
1554  * by all in-use (unremoved) packet descriptors.
1555  */
1556 void
1557 mmd_getsize(multidata_t *mmd, uint_t *ptotal, uint_t *pinuse)
1558 {
1559 	pdesc_t *pd;
1560 	pdescinfo_t *pdi;
1561 	int i;
1562 
1563 	ASSERT(mmd != NULL);
1564 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1565 
1566 	mutex_enter(&mmd->mmd_pd_slab_lock);
1567 	if (ptotal != NULL) {
1568 		*ptotal = 0;
1569 
1570 		if (mmd->mmd_hbuf != NULL)
1571 			*ptotal += MBLKL(mmd->mmd_hbuf);
1572 
1573 		for (i = 0; i < mmd->mmd_pbuf_cnt; i++) {
1574 			ASSERT(mmd->mmd_pbuf[i] != NULL);
1575 			*ptotal += MBLKL(mmd->mmd_pbuf[i]);
1576 		}
1577 	}
1578 	if (pinuse != NULL) {
1579 		*pinuse = 0;
1580 
1581 		/* first pdesc */
1582 		pd = mmd_getpdesc(mmd, NULL, NULL, 1, B_TRUE);
1583 		while (pd != NULL) {
1584 			pdi = &pd->pd_pdi;
1585 
1586 			/* next pdesc */
1587 			pd = mmd_getpdesc(mmd, pd, NULL, 1, B_TRUE);
1588 
1589 			/* skip over removed descriptor */
1590 			if (pdi->flags & PDESC_REM_DEFER)
1591 				continue;
1592 
1593 			if (pdi->flags & PDESC_HBUF_REF)
1594 				*pinuse += PDESC_HDRL(pdi);
1595 
1596 			if (pdi->flags & PDESC_PBUF_REF) {
1597 				for (i = 0; i < pdi->pld_cnt; i++)
1598 					*pinuse += PDESC_PLDL(pdi, i);
1599 			}
1600 		}
1601 	}
1602 	mutex_exit(&mmd->mmd_pd_slab_lock);
1603 }
1604