xref: /titanic_51/usr/src/uts/common/io/multidata.c (revision 8eea8e29cc4374d1ee24c25a07f45af132db3499)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Multidata, as described in the following papers:
31  *
32  * Adi Masputra,
33  * Multidata V.2: VA-Disjoint Packet Extents Framework Interface
34  * Design Specification.  August 2004.
35  * Available as http://sac.sfbay/PSARC/2004/594/materials/mmd2.pdf.
36  *
37  * Adi Masputra,
38  * Multidata Interface Design Specification.  Sep 2002.
39  * Available as http://sac.sfbay/PSARC/2002/276/materials/mmd.pdf.
40  *
41  * Adi Masputra, Frank DiMambro, Kacheong Poon,
42  * An Efficient Networking Transmit Mechanism for Solaris:
43  * Multidata Transmit (MDT).  May 2002.
44  * Available as http://sac.sfbay/PSARC/2002/276/materials/mdt.pdf.
45  */
46 
47 #include <sys/types.h>
48 #include <sys/stream.h>
49 #include <sys/dlpi.h>
50 #include <sys/stropts.h>
51 #include <sys/strsun.h>
52 #include <sys/strlog.h>
53 #include <sys/strsubr.h>
54 #include <sys/sysmacros.h>
55 #include <sys/cmn_err.h>
56 #include <sys/debug.h>
57 #include <sys/kmem.h>
58 #include <sys/atomic.h>
59 
60 #include <sys/multidata.h>
61 #include <sys/multidata_impl.h>
62 
63 extern mblk_t *desballoc(unsigned char *, size_t, uint_t, frtn_t *);
64 
65 static int mmd_constructor(void *, void *, int);
66 static void mmd_destructor(void *, void *);
67 static int pdslab_constructor(void *, void *, int);
68 static void pdslab_destructor(void *, void *);
69 static int pattbl_constructor(void *, void *, int);
70 static void pattbl_destructor(void *, void *);
71 static void mmd_esballoc_free(caddr_t);
72 static int mmd_copy_pattbl(patbkt_t *, multidata_t *, pdesc_t *, int);
73 
74 static boolean_t pbuf_ref_valid(multidata_t *, pdescinfo_t *);
75 #pragma inline(pbuf_ref_valid)
76 
77 static boolean_t pdi_in_range(pdescinfo_t *, pdescinfo_t *);
78 #pragma inline(pdi_in_range)
79 
80 static pdesc_t *mmd_addpdesc_int(multidata_t *, pdescinfo_t *, int *, int);
81 #pragma inline(mmd_addpdesc_int)
82 
83 static void mmd_destroy_pattbl(patbkt_t **);
84 #pragma inline(mmd_destroy_pattbl)
85 
86 static pattr_t *mmd_find_pattr(patbkt_t *, uint_t);
87 #pragma inline(mmd_find_pattr)
88 
89 static pdesc_t *mmd_destroy_pdesc(multidata_t *, pdesc_t *);
90 #pragma inline(mmd_destroy_pdesc)
91 
92 static pdesc_t *mmd_getpdesc(multidata_t *, pdesc_t *, pdescinfo_t *, uint_t,
93     boolean_t);
94 #pragma inline(mmd_getpdesc)
95 
96 static struct kmem_cache *mmd_cache;
97 static struct kmem_cache *pd_slab_cache;
98 static struct kmem_cache *pattbl_cache;
99 
/*
 * Debug message switch; messages are emitted only while mmd_debug > 0.
 *
 * MMD_DEBUG takes a fully parenthesized cmn_err() argument list, e.g.
 *	MMD_DEBUG((CE_WARN, "format %d\n", value));
 * The expansion is wrapped in do { } while (0) so that it behaves as a
 * single statement and cannot capture the "else" of an enclosing
 * if/else written without braces.
 */
int mmd_debug = 1;
#define	MMD_DEBUG(s)	\
	do { if (mmd_debug > 0) cmn_err s; } while (0)
102 
103 /*
104  * Set to this to true to bypass pdesc bounds checking.
105  */
106 boolean_t mmd_speed_over_safety = B_FALSE;
107 
108 /*
109  * Patchable kmem_cache flags.
110  */
111 int mmd_kmem_flags = 0;
112 int pdslab_kmem_flags = 0;
113 int pattbl_kmem_flags = 0;
114 
115 /*
116  * Alignment (in bytes) of our kmem caches.
117  */
118 #define	MULTIDATA_CACHE_ALIGN	64
119 
120 /*
121  * Default number of packet descriptors per descriptor slab.  Making
122  * this too small will trigger more descriptor slab allocation; making
123  * it too large will create too many unclaimed descriptors.
124  */
125 #define	PDSLAB_SZ	15
126 uint_t pdslab_sz = PDSLAB_SZ;
127 
128 /*
129  * Default attribute hash table size.  It's okay to set this to a small
130  * value (even to 1) because there aren't that many attributes currently
131  * defined, and because we assume there won't be many attributes associated
132  * with a Multidata at a given time.  Increasing the size will reduce
133  * attribute search time (given a large number of attributes in a Multidata),
134  * and decreasing it will reduce the memory footprints and the overhead
135  * associated with managing the table.
136  */
137 #define	PATTBL_SZ	1
138 uint_t pattbl_sz = PATTBL_SZ;
139 
140 /*
141  * Attribute hash key.
142  */
143 #define	PATTBL_HASH(x, sz)	((x) % (sz))
144 
145 /*
146  * Structure that precedes each Multidata metadata.
147  */
148 struct mmd_buf_info {
149 	frtn_t	frp;		/* free routine */
150 	uint_t	buf_len;	/* length of kmem buffer */
151 };
152 
153 /*
154  * The size of each metadata buffer.
155  */
156 #define	MMD_CACHE_SIZE	\
157 	(sizeof (struct mmd_buf_info) + sizeof (multidata_t))
158 
159 /*
160  * Called during startup in order to create the Multidata kmem caches.
161  */
162 void
163 mmd_init(void)
164 {
165 	pdslab_sz = MAX(1, pdslab_sz);	/* at least 1 descriptor */
166 	pattbl_sz = MAX(1, pattbl_sz);	/* at least 1 bucket */
167 
168 	mmd_cache = kmem_cache_create("multidata", MMD_CACHE_SIZE,
169 	    MULTIDATA_CACHE_ALIGN, mmd_constructor, mmd_destructor,
170 	    NULL, NULL, NULL, mmd_kmem_flags);
171 
172 	pd_slab_cache = kmem_cache_create("multidata_pdslab",
173 	    PDESC_SLAB_SIZE(pdslab_sz), MULTIDATA_CACHE_ALIGN,
174 	    pdslab_constructor, pdslab_destructor, NULL,
175 	    (void *)(uintptr_t)pdslab_sz, NULL, pdslab_kmem_flags);
176 
177 	pattbl_cache = kmem_cache_create("multidata_pattbl",
178 	    sizeof (patbkt_t) * pattbl_sz, MULTIDATA_CACHE_ALIGN,
179 	    pattbl_constructor, pattbl_destructor, NULL,
180 	    (void *)(uintptr_t)pattbl_sz, NULL, pattbl_kmem_flags);
181 }
182 
183 /*
184  * Create a Multidata message block.
185  */
186 multidata_t *
187 mmd_alloc(mblk_t *hdr_mp, mblk_t **mmd_mp, int kmflags)
188 {
189 	uchar_t *buf;
190 	multidata_t *mmd;
191 	uint_t mmd_mplen;
192 	struct mmd_buf_info *buf_info;
193 
194 	ASSERT(hdr_mp != NULL);
195 	ASSERT(mmd_mp != NULL);
196 
197 	/*
198 	 * Caller should never pass in a chain of mblks since we
199 	 * only care about the first one, hence the assertions.
200 	 */
201 	ASSERT(hdr_mp->b_cont == NULL);
202 
203 	if ((buf = kmem_cache_alloc(mmd_cache, kmflags)) == NULL)
204 		return (NULL);
205 
206 	buf_info = (struct mmd_buf_info *)buf;
207 	buf_info->frp.free_arg = (caddr_t)buf;
208 
209 	mmd = (multidata_t *)(buf_info + 1);
210 	mmd_mplen = sizeof (*mmd);
211 
212 	if ((*mmd_mp = desballoc((uchar_t *)mmd, mmd_mplen, BPRI_HI,
213 	    &(buf_info->frp))) == NULL) {
214 		kmem_cache_free(mmd_cache, buf);
215 		return (NULL);
216 	}
217 
218 	DB_TYPE(*mmd_mp) = M_MULTIDATA;
219 	(*mmd_mp)->b_wptr += mmd_mplen;
220 	mmd->mmd_dp = (*mmd_mp)->b_datap;
221 	mmd->mmd_hbuf = hdr_mp;
222 
223 	return (mmd);
224 }
225 
226 /*
227  * Associate additional payload buffer to the Multidata.
228  */
229 int
230 mmd_addpldbuf(multidata_t *mmd, mblk_t *pld_mp)
231 {
232 	int i;
233 
234 	ASSERT(mmd != NULL);
235 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
236 	ASSERT(pld_mp != NULL);
237 
238 	mutex_enter(&mmd->mmd_pd_slab_lock);
239 	for (i = 0; i < MULTIDATA_MAX_PBUFS &&
240 	    mmd->mmd_pbuf_cnt < MULTIDATA_MAX_PBUFS; i++) {
241 		if (mmd->mmd_pbuf[i] == pld_mp) {
242 			/* duplicate entry */
243 			MMD_DEBUG((CE_WARN, "mmd_addpldbuf: error adding "
244 			    "pld 0x%p to mmd 0x%p since it has been "
245 			    "previously added into slot %d (total %d)\n",
246 			    (void *)pld_mp, (void *)mmd, i, mmd->mmd_pbuf_cnt));
247 			mutex_exit(&mmd->mmd_pd_slab_lock);
248 			return (-1);
249 		} else if (mmd->mmd_pbuf[i] == NULL) {
250 			mmd->mmd_pbuf[i] = pld_mp;
251 			mmd->mmd_pbuf_cnt++;
252 			mutex_exit(&mmd->mmd_pd_slab_lock);
253 			return (i);
254 		}
255 	}
256 
257 	/* all slots are taken */
258 	MMD_DEBUG((CE_WARN, "mmd_addpldbuf: error adding pld 0x%p to mmd 0x%p "
259 	    "since no slot space is left (total %d max %d)\n", (void *)pld_mp,
260 	    (void *)mmd, mmd->mmd_pbuf_cnt, MULTIDATA_MAX_PBUFS));
261 	mutex_exit(&mmd->mmd_pd_slab_lock);
262 
263 	return (-1);
264 }
265 
266 /*
267  * Multidata metadata kmem cache constructor routine.
268  */
269 /* ARGSUSED */
270 static int
271 mmd_constructor(void *buf, void *cdrarg, int kmflags)
272 {
273 	struct mmd_buf_info *buf_info;
274 	multidata_t *mmd;
275 
276 	bzero((void *)buf, MMD_CACHE_SIZE);
277 
278 	buf_info = (struct mmd_buf_info *)buf;
279 	buf_info->frp.free_func = mmd_esballoc_free;
280 	buf_info->buf_len = MMD_CACHE_SIZE;
281 
282 	mmd = (multidata_t *)(buf_info + 1);
283 	mmd->mmd_magic = MULTIDATA_MAGIC;
284 
285 	mutex_init(&(mmd->mmd_pd_slab_lock), NULL, MUTEX_DRIVER, NULL);
286 	QL_INIT(&(mmd->mmd_pd_slab_q));
287 	QL_INIT(&(mmd->mmd_pd_q));
288 
289 	return (0);
290 }
291 
292 /*
293  * Multidata metadata kmem cache destructor routine.
294  */
295 /* ARGSUSED */
296 static void
297 mmd_destructor(void *buf, void *cdrarg)
298 {
299 	multidata_t *mmd;
300 #ifdef DEBUG
301 	int i;
302 #endif
303 
304 	mmd = (multidata_t *)((uchar_t *)buf + sizeof (struct mmd_buf_info));
305 
306 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
307 	ASSERT(mmd->mmd_dp == NULL);
308 	ASSERT(mmd->mmd_hbuf == NULL);
309 	ASSERT(mmd->mmd_pbuf_cnt == 0);
310 #ifdef DEBUG
311 	for (i = 0; i < MULTIDATA_MAX_PBUFS; i++)
312 		ASSERT(mmd->mmd_pbuf[i] == NULL);
313 #endif
314 	ASSERT(mmd->mmd_pattbl == NULL);
315 
316 	mutex_destroy(&(mmd->mmd_pd_slab_lock));
317 	ASSERT(mmd->mmd_pd_slab_q.ql_next == &(mmd->mmd_pd_slab_q));
318 	ASSERT(mmd->mmd_slab_cnt == 0);
319 	ASSERT(mmd->mmd_pd_q.ql_next == &(mmd->mmd_pd_q));
320 	ASSERT(mmd->mmd_pd_cnt == 0);
321 	ASSERT(mmd->mmd_hbuf_ref == 0);
322 	ASSERT(mmd->mmd_pbuf_ref == 0);
323 }
324 
325 /*
326  * Multidata message block free callback routine.
327  */
328 static void
329 mmd_esballoc_free(caddr_t buf)
330 {
331 	multidata_t *mmd;
332 	pdesc_t *pd;
333 	pdesc_slab_t *slab;
334 	int i;
335 
336 	ASSERT(buf != NULL);
337 	ASSERT(((struct mmd_buf_info *)buf)->buf_len == MMD_CACHE_SIZE);
338 
339 	mmd = (multidata_t *)(buf + sizeof (struct mmd_buf_info));
340 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
341 
342 	ASSERT(mmd->mmd_dp != NULL);
343 	ASSERT(mmd->mmd_dp->db_ref == 1);
344 
345 	/* remove all packet descriptors and private attributes */
346 	pd = Q2PD(mmd->mmd_pd_q.ql_next);
347 	while (pd != Q2PD(&(mmd->mmd_pd_q)))
348 		pd = mmd_destroy_pdesc(mmd, pd);
349 
350 	ASSERT(mmd->mmd_pd_q.ql_next == &(mmd->mmd_pd_q));
351 	ASSERT(mmd->mmd_pd_cnt == 0);
352 	ASSERT(mmd->mmd_hbuf_ref == 0);
353 	ASSERT(mmd->mmd_pbuf_ref == 0);
354 
355 	/* remove all global attributes */
356 	if (mmd->mmd_pattbl != NULL)
357 		mmd_destroy_pattbl(&(mmd->mmd_pattbl));
358 
359 	/* remove all descriptor slabs */
360 	slab = Q2PDSLAB(mmd->mmd_pd_slab_q.ql_next);
361 	while (slab != Q2PDSLAB(&(mmd->mmd_pd_slab_q))) {
362 		pdesc_slab_t *slab_next = Q2PDSLAB(slab->pds_next);
363 
364 		remque(&(slab->pds_next));
365 		slab->pds_next = NULL;
366 		slab->pds_prev = NULL;
367 		slab->pds_mmd = NULL;
368 		slab->pds_used = 0;
369 		kmem_cache_free(pd_slab_cache, slab);
370 
371 		ASSERT(mmd->mmd_slab_cnt > 0);
372 		mmd->mmd_slab_cnt--;
373 		slab = slab_next;
374 	}
375 	ASSERT(mmd->mmd_pd_slab_q.ql_next == &(mmd->mmd_pd_slab_q));
376 	ASSERT(mmd->mmd_slab_cnt == 0);
377 
378 	mmd->mmd_dp = NULL;
379 
380 	/* finally, free all associated message blocks */
381 	if (mmd->mmd_hbuf != NULL) {
382 		freeb(mmd->mmd_hbuf);
383 		mmd->mmd_hbuf = NULL;
384 	}
385 
386 	for (i = 0; i < MULTIDATA_MAX_PBUFS; i++) {
387 		if (mmd->mmd_pbuf[i] != NULL) {
388 			freeb(mmd->mmd_pbuf[i]);
389 			mmd->mmd_pbuf[i] = NULL;
390 			ASSERT(mmd->mmd_pbuf_cnt > 0);
391 			mmd->mmd_pbuf_cnt--;
392 		}
393 	}
394 
395 	ASSERT(mmd->mmd_pbuf_cnt == 0);
396 	ASSERT(MUTEX_NOT_HELD(&(mmd->mmd_pd_slab_lock)));
397 	kmem_cache_free(mmd_cache, buf);
398 }
399 
400 /*
401  * Multidata message block copy routine, called by copyb() when it
402  * encounters a M_MULTIDATA data block type.  This routine should
403  * not be called by anyone other than copyb(), since it may go away
404  * (read: become static to this module) once some sort of copy callback
405  * routine is made available.
406  */
407 mblk_t *
408 mmd_copy(mblk_t *bp, int kmflags)
409 {
410 	multidata_t *mmd, *n_mmd;
411 	mblk_t *n_hbuf = NULL, *n_pbuf[MULTIDATA_MAX_PBUFS];
412 	mblk_t **pmp_last = &n_pbuf[MULTIDATA_MAX_PBUFS - 1];
413 	mblk_t **pmp;
414 	mblk_t *n_bp = NULL;
415 	pdesc_t *pd;
416 	uint_t n_pbuf_cnt = 0;
417 	int idx, i;
418 
419 #define	FREE_PBUFS() {					\
420 	for (pmp = &n_pbuf[0]; pmp <= pmp_last; pmp++)	\
421 		if (*pmp != NULL) freeb(*pmp);		\
422 }
423 
424 #define	REL_OFF(p, base, n_base)			\
425 	((uchar_t *)(n_base) + ((uchar_t *)(p) - (uchar_t *)base))
426 
427 	ASSERT(bp != NULL && DB_TYPE(bp) == M_MULTIDATA);
428 	mmd = mmd_getmultidata(bp);
429 
430 	/* copy the header buffer */
431 	if (mmd->mmd_hbuf != NULL && (n_hbuf = copyb(mmd->mmd_hbuf)) == NULL)
432 		return (NULL);
433 
434 	/* copy the payload buffer(s) */
435 	mutex_enter(&mmd->mmd_pd_slab_lock);
436 	bzero((void *)&n_pbuf[0], sizeof (mblk_t *) * MULTIDATA_MAX_PBUFS);
437 	n_pbuf_cnt = mmd->mmd_pbuf_cnt;
438 	for (i = 0; i < n_pbuf_cnt; i++) {
439 		ASSERT(mmd->mmd_pbuf[i] != NULL);
440 		n_pbuf[i] = copyb(mmd->mmd_pbuf[i]);
441 		if (n_pbuf[i] == NULL) {
442 			FREE_PBUFS();
443 			mutex_exit(&mmd->mmd_pd_slab_lock);
444 			return (NULL);
445 		}
446 	}
447 
448 	/* allocate new Multidata */
449 	n_mmd = mmd_alloc(n_hbuf, &n_bp, kmflags);
450 	if (n_mmd == NULL) {
451 		if (n_hbuf != NULL)
452 			freeb(n_hbuf);
453 		if (n_pbuf_cnt != 0)
454 			FREE_PBUFS();
455 		mutex_exit(&mmd->mmd_pd_slab_lock);
456 		return (NULL);
457 	}
458 
459 	/*
460 	 * Add payload buffer(s); upon success, leave n_pbuf array
461 	 * alone, as the newly-created Multidata had already contained
462 	 * the mblk pointers stored in the array.  These will be freed
463 	 * along with the Multidata itself.
464 	 */
465 	for (i = 0, pmp = &n_pbuf[0]; i < n_pbuf_cnt; i++, pmp++) {
466 		idx = mmd_addpldbuf(n_mmd, *pmp);
467 		if (idx < 0) {
468 			FREE_PBUFS();
469 			freeb(n_bp);
470 			mutex_exit(&mmd->mmd_pd_slab_lock);
471 			return (NULL);
472 		}
473 	}
474 
475 	/* copy over global attributes */
476 	if (mmd->mmd_pattbl != NULL &&
477 	    mmd_copy_pattbl(mmd->mmd_pattbl, n_mmd, NULL, kmflags) < 0) {
478 		freeb(n_bp);
479 		mutex_exit(&mmd->mmd_pd_slab_lock);
480 		return (NULL);
481 	}
482 
483 	/* copy over packet descriptors and their atttributes */
484 	pd = mmd_getpdesc(mmd, NULL, NULL, 1, B_TRUE);	/* first pdesc */
485 	while (pd != NULL) {
486 		pdesc_t *n_pd;
487 		pdescinfo_t *pdi, n_pdi;
488 		uchar_t *n_base, *base;
489 		pdesc_t *pd_next;
490 
491 		/* next pdesc */
492 		pd_next = mmd_getpdesc(pd->pd_slab->pds_mmd, pd, NULL,
493 		    1, B_TRUE);
494 
495 		/* skip if already removed */
496 		if (pd->pd_flags & PDESC_REM_DEFER) {
497 			pd = pd_next;
498 			continue;
499 		}
500 
501 		pdi = &(pd->pd_pdi);
502 		bzero(&n_pdi, sizeof (n_pdi));
503 
504 		/*
505 		 * Calculate new descriptor values based on the offset of
506 		 * each pointer relative to the associated buffer(s).
507 		 */
508 		ASSERT(pdi->flags & PDESC_HAS_REF);
509 		if (pdi->flags & PDESC_HBUF_REF) {
510 			n_base = n_mmd->mmd_hbuf->b_rptr;
511 			base = mmd->mmd_hbuf->b_rptr;
512 
513 			n_pdi.flags |= PDESC_HBUF_REF;
514 			n_pdi.hdr_base = REL_OFF(pdi->hdr_base, base, n_base);
515 			n_pdi.hdr_rptr = REL_OFF(pdi->hdr_rptr, base, n_base);
516 			n_pdi.hdr_wptr = REL_OFF(pdi->hdr_wptr, base, n_base);
517 			n_pdi.hdr_lim = REL_OFF(pdi->hdr_lim, base, n_base);
518 		}
519 
520 		if (pdi->flags & PDESC_PBUF_REF) {
521 			n_pdi.flags |= PDESC_PBUF_REF;
522 			n_pdi.pld_cnt = pdi->pld_cnt;
523 
524 			for (i = 0; i < pdi->pld_cnt; i++) {
525 				idx = pdi->pld_ary[i].pld_pbuf_idx;
526 				ASSERT(idx < MULTIDATA_MAX_PBUFS);
527 				ASSERT(n_mmd->mmd_pbuf[idx] != NULL);
528 				ASSERT(mmd->mmd_pbuf[idx] != NULL);
529 
530 				n_base = n_mmd->mmd_pbuf[idx]->b_rptr;
531 				base = mmd->mmd_pbuf[idx]->b_rptr;
532 
533 				n_pdi.pld_ary[i].pld_pbuf_idx = idx;
534 
535 				/*
536 				 * We can't copy the pointers just like that,
537 				 * so calculate the relative offset.
538 				 */
539 				n_pdi.pld_ary[i].pld_rptr =
540 				    REL_OFF(pdi->pld_ary[i].pld_rptr,
541 					base, n_base);
542 				n_pdi.pld_ary[i].pld_wptr =
543 				    REL_OFF(pdi->pld_ary[i].pld_wptr,
544 					base, n_base);
545 			}
546 		}
547 
548 		/* add the new descriptor to the new Multidata */
549 		n_pd = mmd_addpdesc_int(n_mmd, &n_pdi, NULL, kmflags);
550 
551 		if (n_pd == NULL || (pd->pd_pattbl != NULL &&
552 		    mmd_copy_pattbl(pd->pd_pattbl, n_mmd, n_pd, kmflags) < 0)) {
553 			freeb(n_bp);
554 			mutex_exit(&mmd->mmd_pd_slab_lock);
555 			return (NULL);
556 		}
557 
558 		pd = pd_next;
559 	}
560 #undef REL_OFF
561 #undef FREE_PBUFS
562 
563 	mutex_exit(&mmd->mmd_pd_slab_lock);
564 	return (n_bp);
565 }
566 
567 /*
568  * Given a Multidata message block, return the Multidata metadata handle.
569  */
570 multidata_t *
571 mmd_getmultidata(mblk_t *mp)
572 {
573 	multidata_t *mmd;
574 
575 	ASSERT(mp != NULL);
576 
577 	if (DB_TYPE(mp) != M_MULTIDATA)
578 		return (NULL);
579 
580 	mmd = (multidata_t *)mp->b_rptr;
581 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
582 
583 	return (mmd);
584 }
585 
586 /*
587  * Return the start and end addresses of the associated buffer(s).
588  */
589 void
590 mmd_getregions(multidata_t *mmd, mbufinfo_t *mbi)
591 {
592 	int i;
593 
594 	ASSERT(mmd != NULL);
595 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
596 	ASSERT(mbi != NULL);
597 
598 	bzero((void *)mbi, sizeof (mbufinfo_t));
599 
600 	if (mmd->mmd_hbuf != NULL) {
601 		mbi->hbuf_rptr = mmd->mmd_hbuf->b_rptr;
602 		mbi->hbuf_wptr = mmd->mmd_hbuf->b_wptr;
603 	}
604 
605 	mutex_enter(&mmd->mmd_pd_slab_lock);
606 	for (i = 0; i < mmd->mmd_pbuf_cnt; i++) {
607 		ASSERT(mmd->mmd_pbuf[i] != NULL);
608 		mbi->pbuf_ary[i].pbuf_rptr = mmd->mmd_pbuf[i]->b_rptr;
609 		mbi->pbuf_ary[i].pbuf_wptr = mmd->mmd_pbuf[i]->b_wptr;
610 
611 	}
612 	mbi->pbuf_cnt = mmd->mmd_pbuf_cnt;
613 	mutex_exit(&mmd->mmd_pd_slab_lock);
614 }
615 
616 /*
617  * Return the Multidata statistics.
618  */
619 uint_t
620 mmd_getcnt(multidata_t *mmd, uint_t *hbuf_ref, uint_t *pbuf_ref)
621 {
622 	uint_t pd_cnt;
623 
624 	ASSERT(mmd != NULL);
625 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
626 
627 	mutex_enter(&(mmd->mmd_pd_slab_lock));
628 	if (hbuf_ref != NULL)
629 		*hbuf_ref = mmd->mmd_hbuf_ref;
630 	if (pbuf_ref != NULL)
631 		*pbuf_ref = mmd->mmd_pbuf_ref;
632 	pd_cnt = mmd->mmd_pd_cnt;
633 	mutex_exit(&(mmd->mmd_pd_slab_lock));
634 
635 	return (pd_cnt);
636 }
637 
/*
 * Bounds check the header area of descriptor info pdi against the
 * header buffer of mmd.  True only if the Multidata has a header
 * buffer, all four header pointers are non-NULL and ordered as
 * hdr_base <= hdr_rptr <= hdr_wptr <= hdr_lim, hdr_base does not
 * precede the buffer's read pointer, and MBLKIN() accepts the region of
 * PDESC_HDRSIZE(pdi) bytes starting at hdr_base.
 *
 * Every use of both arguments is parenthesized, so arbitrary pointer
 * expressions may be passed.  Note that both arguments are evaluated
 * more than once.
 */
#define	HBUF_REF_VALID(mmd, pdi)					\
	((mmd)->mmd_hbuf != NULL && (pdi)->hdr_rptr != NULL &&		\
	(pdi)->hdr_wptr != NULL && (pdi)->hdr_base != NULL &&		\
	(pdi)->hdr_lim != NULL && (pdi)->hdr_lim >= (pdi)->hdr_base &&	\
	(pdi)->hdr_wptr >= (pdi)->hdr_rptr &&				\
	(pdi)->hdr_base <= (pdi)->hdr_rptr &&				\
	(pdi)->hdr_lim >= (pdi)->hdr_wptr &&				\
	(pdi)->hdr_base >= (mmd)->mmd_hbuf->b_rptr &&			\
	MBLKIN((mmd)->mmd_hbuf,						\
	((pdi)->hdr_base - (mmd)->mmd_hbuf->b_rptr),			\
	PDESC_HDRSIZE(pdi)))
649 
650 /*
651  * Bounds check payload area(s).
652  */
653 static boolean_t
654 pbuf_ref_valid(multidata_t *mmd, pdescinfo_t *pdi)
655 {
656 	int i = 0, idx;
657 	boolean_t valid = B_TRUE;
658 	struct pld_ary_s *pa;
659 
660 	mutex_enter(&mmd->mmd_pd_slab_lock);
661 	if (pdi->pld_cnt == 0 || pdi->pld_cnt > mmd->mmd_pbuf_cnt) {
662 		mutex_exit(&mmd->mmd_pd_slab_lock);
663 		return (B_FALSE);
664 	}
665 
666 	pa = &pdi->pld_ary[0];
667 	while (valid && i < pdi->pld_cnt) {
668 		valid = (((idx = pa->pld_pbuf_idx) < mmd->mmd_pbuf_cnt) &&
669 		    pa->pld_rptr != NULL && pa->pld_wptr != NULL &&
670 		    pa->pld_wptr >= pa->pld_rptr &&
671 		    pa->pld_rptr >= mmd->mmd_pbuf[idx]->b_rptr &&
672 		    MBLKIN(mmd->mmd_pbuf[idx], (pa->pld_rptr -
673 			mmd->mmd_pbuf[idx]->b_rptr),
674 			PDESC_PLD_SPAN_SIZE(pdi, i)));
675 
676 		if (!valid) {
677 			MMD_DEBUG((CE_WARN,
678 			    "pbuf_ref_valid: pdi 0x%p pld out of bound; "
679 			    "index %d has pld_cnt %d pbuf_idx %d "
680 			    "(mmd_pbuf_cnt %d), "
681 			    "pld_rptr 0x%p pld_wptr 0x%p len %d "
682 			    "(valid 0x%p-0x%p len %d)\n", (void *)pdi,
683 			    i, pdi->pld_cnt, idx, mmd->mmd_pbuf_cnt,
684 			    (void *)pa->pld_rptr,
685 			    (void *)pa->pld_wptr,
686 			    (int)PDESC_PLD_SPAN_SIZE(pdi, i),
687 			    (void *)mmd->mmd_pbuf[idx]->b_rptr,
688 			    (void *)mmd->mmd_pbuf[idx]->b_wptr,
689 			    (int)MBLKL(mmd->mmd_pbuf[idx])));
690 		}
691 
692 		/* advance to next entry */
693 		i++;
694 		pa++;
695 	}
696 
697 	mutex_exit(&mmd->mmd_pd_slab_lock);
698 	return (valid);
699 }
700 
701 /*
702  * Add a packet descriptor to the Multidata.
703  */
704 pdesc_t *
705 mmd_addpdesc(multidata_t *mmd, pdescinfo_t *pdi, int *err, int kmflags)
706 {
707 	ASSERT(mmd != NULL);
708 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
709 	ASSERT(pdi != NULL);
710 	ASSERT(pdi->flags & PDESC_HAS_REF);
711 
712 	/* do the references refer to invalid memory regions? */
713 	if (!mmd_speed_over_safety &&
714 	    (((pdi->flags & PDESC_HBUF_REF) && !HBUF_REF_VALID(mmd, pdi)) ||
715 	    ((pdi->flags & PDESC_PBUF_REF) && !pbuf_ref_valid(mmd, pdi)))) {
716 		if (err != NULL)
717 			*err = EINVAL;
718 		return (NULL);
719 	}
720 
721 	return (mmd_addpdesc_int(mmd, pdi, err, kmflags));
722 }
723 
724 /*
725  * Internal routine to add a packet descriptor, called when mmd_addpdesc
726  * or mmd_copy tries to allocate and add a descriptor to a Multidata.
727  */
728 static pdesc_t *
729 mmd_addpdesc_int(multidata_t *mmd, pdescinfo_t *pdi, int *err, int kmflags)
730 {
731 	pdesc_slab_t *slab, *slab_last;
732 	pdesc_t *pd;
733 
734 	ASSERT(pdi->flags & PDESC_HAS_REF);
735 	ASSERT(!(pdi->flags & PDESC_HBUF_REF) || HBUF_REF_VALID(mmd, pdi));
736 	ASSERT(!(pdi->flags & PDESC_PBUF_REF) || pbuf_ref_valid(mmd, pdi));
737 
738 	if (err != NULL)
739 		*err = 0;
740 
741 	mutex_enter(&(mmd->mmd_pd_slab_lock));
742 	/*
743 	 * Is slab list empty or the last-added slab is full?  If so,
744 	 * allocate new slab for the descriptor; otherwise, use the
745 	 * last-added slab instead.
746 	 */
747 	slab_last = Q2PDSLAB(mmd->mmd_pd_slab_q.ql_prev);
748 	if (mmd->mmd_pd_slab_q.ql_next == &(mmd->mmd_pd_slab_q) ||
749 	    slab_last->pds_used == slab_last->pds_sz) {
750 		slab = kmem_cache_alloc(pd_slab_cache, kmflags);
751 		if (slab == NULL) {
752 			if (err != NULL)
753 				*err = ENOMEM;
754 			mutex_exit(&(mmd->mmd_pd_slab_lock));
755 			return (NULL);
756 		}
757 		slab->pds_mmd = mmd;
758 
759 		ASSERT(slab->pds_used == 0);
760 		ASSERT(slab->pds_next == NULL && slab->pds_prev == NULL);
761 
762 		/* insert slab at end of list */
763 		insque(&(slab->pds_next), mmd->mmd_pd_slab_q.ql_prev);
764 		mmd->mmd_slab_cnt++;
765 	} else {
766 		slab = slab_last;
767 	}
768 	ASSERT(slab->pds_used < slab->pds_sz);
769 	pd = &(slab->pds_free_desc[slab->pds_used++]);
770 	ASSERT(pd->pd_magic == PDESC_MAGIC);
771 	pd->pd_next = NULL;
772 	pd->pd_prev = NULL;
773 	pd->pd_slab = slab;
774 	pd->pd_pattbl = NULL;
775 
776 	/* copy over the descriptor info from caller */
777 	PDI_COPY(pdi, &(pd->pd_pdi));
778 
779 	if (pd->pd_flags & PDESC_HBUF_REF)
780 		mmd->mmd_hbuf_ref++;
781 	if (pd->pd_flags & PDESC_PBUF_REF)
782 		mmd->mmd_pbuf_ref += pd->pd_pdi.pld_cnt;
783 	mmd->mmd_pd_cnt++;
784 
785 	/* insert descriptor at end of list */
786 	insque(&(pd->pd_next), mmd->mmd_pd_q.ql_prev);
787 	mutex_exit(&(mmd->mmd_pd_slab_lock));
788 
789 	return (pd);
790 }
791 
792 /*
793  * Packet descriptor slab kmem cache constructor routine.
794  */
795 /* ARGSUSED */
796 static int
797 pdslab_constructor(void *buf, void *cdrarg, int kmflags)
798 {
799 	pdesc_slab_t *slab;
800 	uint_t cnt = (uint_t)(uintptr_t)cdrarg;
801 	int i;
802 
803 	ASSERT(cnt > 0);	/* slab size can't be zero */
804 
805 	slab = (pdesc_slab_t *)buf;
806 	slab->pds_next = NULL;
807 	slab->pds_prev = NULL;
808 	slab->pds_mmd = NULL;
809 	slab->pds_used = 0;
810 	slab->pds_sz = cnt;
811 
812 	for (i = 0; i < cnt; i++) {
813 		pdesc_t *pd = &(slab->pds_free_desc[i]);
814 		pd->pd_magic = PDESC_MAGIC;
815 	}
816 	return (0);
817 }
818 
819 /*
820  * Packet descriptor slab kmem cache destructor routine.
821  */
822 /* ARGSUSED */
823 static void
824 pdslab_destructor(void *buf, void *cdrarg)
825 {
826 	pdesc_slab_t *slab;
827 
828 	slab = (pdesc_slab_t *)buf;
829 	ASSERT(slab->pds_next == NULL);
830 	ASSERT(slab->pds_prev == NULL);
831 	ASSERT(slab->pds_mmd == NULL);
832 	ASSERT(slab->pds_used == 0);
833 	ASSERT(slab->pds_sz > 0);
834 }
835 
836 /*
837  * Remove a packet descriptor from the in-use descriptor list,
838  * called by mmd_rempdesc or during free.
839  */
840 static pdesc_t *
841 mmd_destroy_pdesc(multidata_t *mmd, pdesc_t *pd)
842 {
843 	pdesc_t *pd_next;
844 
845 	pd_next = Q2PD(pd->pd_next);
846 	remque(&(pd->pd_next));
847 
848 	/* remove all local attributes */
849 	if (pd->pd_pattbl != NULL)
850 		mmd_destroy_pattbl(&(pd->pd_pattbl));
851 
852 	/* don't decrease counts for a removed descriptor */
853 	if (!(pd->pd_flags & PDESC_REM_DEFER)) {
854 		if (pd->pd_flags & PDESC_HBUF_REF) {
855 			ASSERT(mmd->mmd_hbuf_ref > 0);
856 			mmd->mmd_hbuf_ref--;
857 		}
858 		if (pd->pd_flags & PDESC_PBUF_REF) {
859 			ASSERT(mmd->mmd_pbuf_ref > 0);
860 			mmd->mmd_pbuf_ref -= pd->pd_pdi.pld_cnt;
861 		}
862 		ASSERT(mmd->mmd_pd_cnt > 0);
863 		mmd->mmd_pd_cnt--;
864 	}
865 	return (pd_next);
866 }
867 
868 /*
869  * Remove a packet descriptor from the Multidata.
870  */
871 void
872 mmd_rempdesc(pdesc_t *pd)
873 {
874 	multidata_t *mmd;
875 
876 	ASSERT(pd->pd_magic == PDESC_MAGIC);
877 	ASSERT(pd->pd_slab != NULL);
878 
879 	mmd = pd->pd_slab->pds_mmd;
880 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
881 
882 	mutex_enter(&(mmd->mmd_pd_slab_lock));
883 	/*
884 	 * We can't deallocate the associated resources if the Multidata
885 	 * is shared with other threads, because it's possible that the
886 	 * descriptor handle value is held by those threads.  That's why
887 	 * we simply mark the entry as "removed" and decrement the counts.
888 	 * If there are no other threads, then we free the descriptor.
889 	 */
890 	if (mmd->mmd_dp->db_ref > 1) {
891 		pd->pd_flags |= PDESC_REM_DEFER;
892 		if (pd->pd_flags & PDESC_HBUF_REF) {
893 			ASSERT(mmd->mmd_hbuf_ref > 0);
894 			mmd->mmd_hbuf_ref--;
895 		}
896 		if (pd->pd_flags & PDESC_PBUF_REF) {
897 			ASSERT(mmd->mmd_pbuf_ref > 0);
898 			mmd->mmd_pbuf_ref -= pd->pd_pdi.pld_cnt;
899 		}
900 		ASSERT(mmd->mmd_pd_cnt > 0);
901 		mmd->mmd_pd_cnt--;
902 	} else {
903 		(void) mmd_destroy_pdesc(mmd, pd);
904 	}
905 	mutex_exit(&(mmd->mmd_pd_slab_lock));
906 }
907 
908 /*
909  * A generic routine to traverse the packet descriptor in-use list.
910  */
911 static pdesc_t *
912 mmd_getpdesc(multidata_t *mmd, pdesc_t *pd, pdescinfo_t *pdi, uint_t forw,
913     boolean_t mutex_held)
914 {
915 	pdesc_t *pd_head;
916 
917 	ASSERT(pd == NULL || pd->pd_slab->pds_mmd == mmd);
918 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
919 	ASSERT(!mutex_held || MUTEX_HELD(&(mmd->mmd_pd_slab_lock)));
920 
921 	if (!mutex_held)
922 		mutex_enter(&(mmd->mmd_pd_slab_lock));
923 	pd_head = Q2PD(&(mmd->mmd_pd_q));
924 
925 	if (pd == NULL) {
926 		/*
927 		 * We're called by mmd_get{first,last}pdesc, and so
928 		 * return either the first or last list element.
929 		 */
930 		pd = forw ? Q2PD(mmd->mmd_pd_q.ql_next) :
931 		    Q2PD(mmd->mmd_pd_q.ql_prev);
932 	} else {
933 		/*
934 		 * We're called by mmd_get{next,prev}pdesc, and so
935 		 * return either the next or previous list element.
936 		 */
937 		pd = forw ? Q2PD(pd->pd_next) : Q2PD(pd->pd_prev);
938 	}
939 
940 	while (pd != pd_head) {
941 		/* skip element if it has been removed */
942 		if (!(pd->pd_flags & PDESC_REM_DEFER))
943 			break;
944 		pd = forw ? Q2PD(pd->pd_next) : Q2PD(pd->pd_prev);
945 	}
946 	if (!mutex_held)
947 		mutex_exit(&(mmd->mmd_pd_slab_lock));
948 
949 	/* return NULL if we're back at the beginning */
950 	if (pd == pd_head)
951 		pd = NULL;
952 
953 	/* got an entry; copy descriptor info to caller */
954 	if (pd != NULL && pdi != NULL)
955 		PDI_COPY(&(pd->pd_pdi), pdi);
956 
957 	ASSERT(pd == NULL || pd->pd_magic == PDESC_MAGIC);
958 	return (pd);
959 
960 }
961 
962 /*
963  * Return the first packet descriptor in the in-use list.
964  */
965 pdesc_t *
966 mmd_getfirstpdesc(multidata_t *mmd, pdescinfo_t *pdi)
967 {
968 	return (mmd_getpdesc(mmd, NULL, pdi, 1, B_FALSE));
969 }
970 
971 /*
972  * Return the last packet descriptor in the in-use list.
973  */
974 pdesc_t *
975 mmd_getlastpdesc(multidata_t *mmd, pdescinfo_t *pdi)
976 {
977 	return (mmd_getpdesc(mmd, NULL, pdi, 0, B_FALSE));
978 }
979 
980 /*
981  * Return the next packet descriptor in the in-use list.
982  */
983 pdesc_t *
984 mmd_getnextpdesc(pdesc_t *pd, pdescinfo_t *pdi)
985 {
986 	return (mmd_getpdesc(pd->pd_slab->pds_mmd, pd, pdi, 1, B_FALSE));
987 }
988 
989 /*
990  * Return the previous packet descriptor in the in-use list.
991  */
992 pdesc_t *
993 mmd_getprevpdesc(pdesc_t *pd, pdescinfo_t *pdi)
994 {
995 	return (mmd_getpdesc(pd->pd_slab->pds_mmd, pd, pdi, 0, B_FALSE));
996 }
997 
998 /*
999  * Check to see if pdi stretches over c_pdi; used to ensure that a packet
1000  * descriptor's header and payload span may not be extended beyond the
1001  * current boundaries.
1002  */
1003 static boolean_t
1004 pdi_in_range(pdescinfo_t *pdi, pdescinfo_t *c_pdi)
1005 {
1006 	int i;
1007 	struct pld_ary_s *pa = &pdi->pld_ary[0];
1008 	struct pld_ary_s *c_pa = &c_pdi->pld_ary[0];
1009 
1010 	if (pdi->hdr_base < c_pdi->hdr_base || pdi->hdr_lim > c_pdi->hdr_lim)
1011 		return (B_FALSE);
1012 
1013 	/*
1014 	 * We don't allow the number of span to be reduced, for the sake
1015 	 * of simplicity.  Instead, we provide PDESC_PLD_SPAN_CLEAR() to
1016 	 * clear a packet descriptor.  Note that we allow the span count to
1017 	 * be increased, and the bounds check for the new one happens
1018 	 * in pbuf_ref_valid.
1019 	 */
1020 	if (pdi->pld_cnt < c_pdi->pld_cnt)
1021 		return (B_FALSE);
1022 
1023 	/* compare only those which are currently defined */
1024 	for (i = 0; i < c_pdi->pld_cnt; i++, pa++, c_pa++) {
1025 		if (pa->pld_pbuf_idx != c_pa->pld_pbuf_idx ||
1026 		    pa->pld_rptr < c_pa->pld_rptr ||
1027 		    pa->pld_wptr > c_pa->pld_wptr)
1028 			return (B_FALSE);
1029 	}
1030 	return (B_TRUE);
1031 }
1032 
1033 /*
1034  * Modify the layout of a packet descriptor.
1035  */
1036 pdesc_t *
1037 mmd_adjpdesc(pdesc_t *pd, pdescinfo_t *pdi)
1038 {
1039 	multidata_t *mmd;
1040 	pdescinfo_t *c_pdi;
1041 
1042 	ASSERT(pd != NULL);
1043 	ASSERT(pdi != NULL);
1044 	ASSERT(pd->pd_magic == PDESC_MAGIC);
1045 
1046 	mmd = pd->pd_slab->pds_mmd;
1047 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1048 
1049 	/* entry has been removed */
1050 	if (pd->pd_flags & PDESC_REM_DEFER)
1051 		return (NULL);
1052 
1053 	/* caller doesn't intend to specify any buffer reference? */
1054 	if (!(pdi->flags & PDESC_HAS_REF))
1055 		return (NULL);
1056 
1057 	/* do the references refer to invalid memory regions? */
1058 	if (!mmd_speed_over_safety &&
1059 	    (((pdi->flags & PDESC_HBUF_REF) && !HBUF_REF_VALID(mmd, pdi)) ||
1060 	    ((pdi->flags & PDESC_PBUF_REF) && !pbuf_ref_valid(mmd, pdi))))
1061 		return (NULL);
1062 
1063 	/* they're not subsets of current references? */
1064 	c_pdi = &(pd->pd_pdi);
1065 	if (!pdi_in_range(pdi, c_pdi))
1066 		return (NULL);
1067 
1068 	/* copy over the descriptor info from caller */
1069 	PDI_COPY(pdi, c_pdi);
1070 
1071 	return (pd);
1072 }
1073 
1074 /*
1075  * Copy the contents of a packet descriptor into a new buffer.  If the
1076  * descriptor points to more than one buffer fragments, the contents
1077  * of both fragments will be joined, with the header buffer fragment
1078  * preceding the payload buffer fragment(s).
1079  */
1080 mblk_t *
1081 mmd_transform(pdesc_t *pd)
1082 {
1083 	multidata_t *mmd;
1084 	pdescinfo_t *pdi;
1085 	mblk_t *mp;
1086 	int h_size = 0, p_size = 0;
1087 	int i, len;
1088 
1089 	ASSERT(pd != NULL);
1090 	ASSERT(pd->pd_magic == PDESC_MAGIC);
1091 
1092 	mmd = pd->pd_slab->pds_mmd;
1093 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1094 
1095 	/* entry has been removed */
1096 	if (pd->pd_flags & PDESC_REM_DEFER)
1097 		return (NULL);
1098 
1099 	mutex_enter(&mmd->mmd_pd_slab_lock);
1100 	pdi = &(pd->pd_pdi);
1101 	if (pdi->flags & PDESC_HBUF_REF)
1102 		h_size = PDESC_HDRL(pdi);
1103 	if (pdi->flags & PDESC_PBUF_REF) {
1104 		for (i = 0; i < pdi->pld_cnt; i++)
1105 			p_size += PDESC_PLD_SPAN_SIZE(pdi, i);
1106 	}
1107 
1108 	/* allocate space large enough to hold the fragment(s) */
1109 	ASSERT(h_size + p_size >= 0);
1110 	if ((mp = allocb(h_size + p_size, BPRI_HI)) == NULL) {
1111 		mutex_exit(&mmd->mmd_pd_slab_lock);
1112 		return (NULL);
1113 	}
1114 
1115 	/* copy over the header fragment */
1116 	if ((pdi->flags & PDESC_HBUF_REF) && h_size > 0) {
1117 		bcopy(pdi->hdr_rptr, mp->b_wptr, h_size);
1118 		mp->b_wptr += h_size;
1119 	}
1120 
1121 	/* copy over the payload fragment */
1122 	if ((pdi->flags & PDESC_PBUF_REF) && p_size > 0) {
1123 		for (i = 0; i < pdi->pld_cnt; i++) {
1124 			len = PDESC_PLD_SPAN_SIZE(pdi, i);
1125 			if (len > 0) {
1126 				bcopy(pdi->pld_ary[i].pld_rptr,
1127 				    mp->b_wptr, len);
1128 				mp->b_wptr += len;
1129 			}
1130 		}
1131 	}
1132 
1133 	mutex_exit(&mmd->mmd_pd_slab_lock);
1134 	return (mp);
1135 }
1136 
1137 /*
1138  * Return a chain of mblks representing the Multidata packet.
1139  */
1140 mblk_t *
1141 mmd_transform_link(pdesc_t *pd)
1142 {
1143 	multidata_t *mmd;
1144 	pdescinfo_t *pdi;
1145 	mblk_t *nmp = NULL;
1146 
1147 	ASSERT(pd != NULL);
1148 	ASSERT(pd->pd_magic == PDESC_MAGIC);
1149 
1150 	mmd = pd->pd_slab->pds_mmd;
1151 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1152 
1153 	/* entry has been removed */
1154 	if (pd->pd_flags & PDESC_REM_DEFER)
1155 		return (NULL);
1156 
1157 	pdi = &(pd->pd_pdi);
1158 
1159 	/* duplicate header buffer */
1160 	if ((pdi->flags & PDESC_HBUF_REF)) {
1161 		if ((nmp = dupb(mmd->mmd_hbuf)) == NULL)
1162 			return (NULL);
1163 		nmp->b_rptr = pdi->hdr_rptr;
1164 		nmp->b_wptr = pdi->hdr_wptr;
1165 	}
1166 
1167 	/* duplicate payload buffer(s) */
1168 	if (pdi->flags & PDESC_PBUF_REF) {
1169 		int i;
1170 		mblk_t *mp;
1171 		struct pld_ary_s *pa = &pdi->pld_ary[0];
1172 
1173 		mutex_enter(&mmd->mmd_pd_slab_lock);
1174 		for (i = 0; i < pdi->pld_cnt; i++, pa++) {
1175 			ASSERT(mmd->mmd_pbuf[pa->pld_pbuf_idx] != NULL);
1176 
1177 			/* skip empty ones */
1178 			if (PDESC_PLD_SPAN_SIZE(pdi, i) == 0)
1179 				continue;
1180 
1181 			mp = dupb(mmd->mmd_pbuf[pa->pld_pbuf_idx]);
1182 			if (mp == NULL) {
1183 				if (nmp != NULL)
1184 					freemsg(nmp);
1185 				mutex_exit(&mmd->mmd_pd_slab_lock);
1186 				return (NULL);
1187 			}
1188 			mp->b_rptr = pa->pld_rptr;
1189 			mp->b_wptr = pa->pld_wptr;
1190 			if (nmp == NULL)
1191 				nmp = mp;
1192 			else
1193 				linkb(nmp, mp);
1194 		}
1195 		mutex_exit(&mmd->mmd_pd_slab_lock);
1196 	}
1197 
1198 	return (nmp);
1199 }
1200 
1201 /*
1202  * Return duplicate message block(s) of the associated buffer(s).
1203  */
1204 int
1205 mmd_dupbufs(multidata_t *mmd, mblk_t **hmp, mblk_t **pmp)
1206 {
1207 	ASSERT(mmd != NULL);
1208 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1209 
1210 	if (hmp != NULL) {
1211 		*hmp = NULL;
1212 		if (mmd->mmd_hbuf != NULL &&
1213 		    (*hmp = dupb(mmd->mmd_hbuf)) == NULL)
1214 			return (-1);
1215 	}
1216 
1217 	if (pmp != NULL) {
1218 		int i;
1219 		mblk_t *mp;
1220 
1221 		mutex_enter(&mmd->mmd_pd_slab_lock);
1222 		*pmp = NULL;
1223 		for (i = 0; i < mmd->mmd_pbuf_cnt; i++) {
1224 			ASSERT(mmd->mmd_pbuf[i] != NULL);
1225 			mp = dupb(mmd->mmd_pbuf[i]);
1226 			if (mp == NULL) {
1227 				if (hmp != NULL && *hmp != NULL)
1228 					freeb(*hmp);
1229 				if (*pmp != NULL)
1230 					freemsg(*pmp);
1231 				mutex_exit(&mmd->mmd_pd_slab_lock);
1232 				return (-1);
1233 			}
1234 			if (*pmp == NULL)
1235 				*pmp = mp;
1236 			else
1237 				linkb(*pmp, mp);
1238 		}
1239 		mutex_exit(&mmd->mmd_pd_slab_lock);
1240 	}
1241 
1242 	return (0);
1243 }
1244 
1245 /*
1246  * Return the layout of a packet descriptor.
1247  */
1248 int
1249 mmd_getpdescinfo(pdesc_t *pd, pdescinfo_t *pdi)
1250 {
1251 	ASSERT(pd != NULL);
1252 	ASSERT(pd->pd_magic == PDESC_MAGIC);
1253 	ASSERT(pd->pd_slab != NULL);
1254 	ASSERT(pd->pd_slab->pds_mmd->mmd_magic == MULTIDATA_MAGIC);
1255 	ASSERT(pdi != NULL);
1256 
1257 	/* entry has been removed */
1258 	if (pd->pd_flags & PDESC_REM_DEFER)
1259 		return (-1);
1260 
1261 	/* copy descriptor info to caller */
1262 	PDI_COPY(&(pd->pd_pdi), pdi);
1263 
1264 	return (0);
1265 }
1266 
1267 /*
1268  * Add a global or local attribute to a Multidata.  Global attribute
1269  * association is specified by a NULL packet descriptor.
1270  */
1271 pattr_t *
1272 mmd_addpattr(multidata_t *mmd, pdesc_t *pd, pattrinfo_t *pai,
1273     boolean_t persistent, int kmflags)
1274 {
1275 	patbkt_t **tbl_p;
1276 	patbkt_t *tbl, *o_tbl;
1277 	patbkt_t *bkt;
1278 	pattr_t *pa;
1279 	uint_t size;
1280 
1281 	ASSERT(mmd != NULL);
1282 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1283 	ASSERT(pd == NULL || pd->pd_magic == PDESC_MAGIC);
1284 	ASSERT(pai != NULL);
1285 
1286 	/* pointer to the attribute hash table (local or global) */
1287 	tbl_p = pd != NULL ? &(pd->pd_pattbl) : &(mmd->mmd_pattbl);
1288 
1289 	/*
1290 	 * See if the hash table has not yet been created; if so,
1291 	 * we create the table and store its address atomically.
1292 	 */
1293 	if ((tbl = *tbl_p) == NULL) {
1294 		tbl = kmem_cache_alloc(pattbl_cache, kmflags);
1295 		if (tbl == NULL)
1296 			return (NULL);
1297 
1298 		/* if someone got there first, use his table instead */
1299 		if ((o_tbl = casptr(tbl_p, NULL, tbl)) != NULL) {
1300 			kmem_cache_free(pattbl_cache, tbl);
1301 			tbl = o_tbl;
1302 		}
1303 	}
1304 
1305 	ASSERT(tbl->pbkt_tbl_sz > 0);
1306 	bkt = &(tbl[PATTBL_HASH(pai->type, tbl->pbkt_tbl_sz)]);
1307 
1308 	/* attribute of the same type already exists? */
1309 	if ((pa = mmd_find_pattr(bkt, pai->type)) != NULL)
1310 		return (NULL);
1311 
1312 	size = sizeof (*pa) + pai->len;
1313 	if ((pa = kmem_zalloc(size, kmflags)) == NULL)
1314 		return (NULL);
1315 
1316 	pa->pat_magic = PATTR_MAGIC;
1317 	pa->pat_lock = &(bkt->pbkt_lock);
1318 	pa->pat_mmd = mmd;
1319 	pa->pat_buflen = size;
1320 	pa->pat_type = pai->type;
1321 	pai->buf = pai->len > 0 ? ((uchar_t *)(pa + 1)) : NULL;
1322 
1323 	if (persistent)
1324 		pa->pat_flags = PATTR_PERSIST;
1325 
1326 	/* insert attribute at end of hash chain */
1327 	mutex_enter(&(bkt->pbkt_lock));
1328 	insque(&(pa->pat_next), bkt->pbkt_pattr_q.ql_prev);
1329 	mutex_exit(&(bkt->pbkt_lock));
1330 
1331 	return (pa);
1332 }
1333 
1334 /*
1335  * Attribute hash table kmem cache constructor routine.
1336  */
1337 /* ARGSUSED */
1338 static int
1339 pattbl_constructor(void *buf, void *cdrarg, int kmflags)
1340 {
1341 	patbkt_t *bkt;
1342 	uint_t tbl_sz = (uint_t)(uintptr_t)cdrarg;
1343 	uint_t i;
1344 
1345 	ASSERT(tbl_sz > 0);	/* table size can't be zero */
1346 
1347 	for (i = 0, bkt = (patbkt_t *)buf; i < tbl_sz; i++, bkt++) {
1348 		mutex_init(&(bkt->pbkt_lock), NULL, MUTEX_DRIVER, NULL);
1349 		QL_INIT(&(bkt->pbkt_pattr_q));
1350 
1351 		/* first bucket contains the table size */
1352 		bkt->pbkt_tbl_sz = i == 0 ? tbl_sz : 0;
1353 	}
1354 	return (0);
1355 }
1356 
1357 /*
1358  * Attribute hash table kmem cache destructor routine.
1359  */
1360 /* ARGSUSED */
1361 static void
1362 pattbl_destructor(void *buf, void *cdrarg)
1363 {
1364 	patbkt_t *bkt;
1365 	uint_t tbl_sz = (uint_t)(uintptr_t)cdrarg;
1366 	uint_t i;
1367 
1368 	ASSERT(tbl_sz > 0);	/* table size can't be zero */
1369 
1370 	for (i = 0, bkt = (patbkt_t *)buf; i < tbl_sz; i++, bkt++) {
1371 		mutex_destroy(&(bkt->pbkt_lock));
1372 		ASSERT(bkt->pbkt_pattr_q.ql_next == &(bkt->pbkt_pattr_q));
1373 		ASSERT(i > 0 || bkt->pbkt_tbl_sz == tbl_sz);
1374 	}
1375 }
1376 
1377 /*
1378  * Destroy an attribute hash table, called by mmd_rempdesc or during free.
1379  */
1380 static void
1381 mmd_destroy_pattbl(patbkt_t **tbl)
1382 {
1383 	patbkt_t *bkt;
1384 	pattr_t *pa, *pa_next;
1385 	uint_t i, tbl_sz;
1386 
1387 	ASSERT(tbl != NULL);
1388 	bkt = *tbl;
1389 	tbl_sz = bkt->pbkt_tbl_sz;
1390 
1391 	/* make sure caller passes in the first bucket */
1392 	ASSERT(tbl_sz > 0);
1393 
1394 	/* destroy the contents of each bucket */
1395 	for (i = 0; i < tbl_sz; i++, bkt++) {
1396 		/* we ought to be exclusive at this point */
1397 		ASSERT(MUTEX_NOT_HELD(&(bkt->pbkt_lock)));
1398 
1399 		pa = Q2PATTR(bkt->pbkt_pattr_q.ql_next);
1400 		while (pa != Q2PATTR(&(bkt->pbkt_pattr_q))) {
1401 			ASSERT(pa->pat_magic == PATTR_MAGIC);
1402 			pa_next = Q2PATTR(pa->pat_next);
1403 			remque(&(pa->pat_next));
1404 			kmem_free(pa, pa->pat_buflen);
1405 			pa = pa_next;
1406 		}
1407 	}
1408 
1409 	kmem_cache_free(pattbl_cache, *tbl);
1410 	*tbl = NULL;
1411 
1412 	/* commit all previous stores */
1413 	membar_producer();
1414 }
1415 
1416 /*
1417  * Copy the contents of an attribute hash table, called by mmd_copy.
1418  */
1419 static int
1420 mmd_copy_pattbl(patbkt_t *src_tbl, multidata_t *n_mmd, pdesc_t *n_pd,
1421     int kmflags)
1422 {
1423 	patbkt_t *bkt;
1424 	pattr_t *pa;
1425 	pattrinfo_t pai;
1426 	uint_t i, tbl_sz;
1427 
1428 	ASSERT(src_tbl != NULL);
1429 	bkt = src_tbl;
1430 	tbl_sz = bkt->pbkt_tbl_sz;
1431 
1432 	/* make sure caller passes in the first bucket */
1433 	ASSERT(tbl_sz > 0);
1434 
1435 	for (i = 0; i < tbl_sz; i++, bkt++) {
1436 		mutex_enter(&(bkt->pbkt_lock));
1437 		pa = Q2PATTR(bkt->pbkt_pattr_q.ql_next);
1438 		while (pa != Q2PATTR(&(bkt->pbkt_pattr_q))) {
1439 			pattr_t *pa_next = Q2PATTR(pa->pat_next);
1440 
1441 			/* skip if it's removed */
1442 			if (pa->pat_flags & PATTR_REM_DEFER) {
1443 				pa = pa_next;
1444 				continue;
1445 			}
1446 
1447 			pai.type = pa->pat_type;
1448 			pai.len = pa->pat_buflen - sizeof (*pa);
1449 			if (mmd_addpattr(n_mmd, n_pd, &pai, (pa->pat_flags &
1450 			    PATTR_PERSIST) != 0, kmflags) == NULL) {
1451 				mutex_exit(&(bkt->pbkt_lock));
1452 				return (-1);
1453 			}
1454 
1455 			/* copy over the contents */
1456 			if (pai.buf != NULL)
1457 				bcopy(pa + 1, pai.buf, pai.len);
1458 
1459 			pa = pa_next;
1460 		}
1461 		mutex_exit(&(bkt->pbkt_lock));
1462 	}
1463 
1464 	return (0);
1465 }
1466 
1467 /*
1468  * Search for an attribute type within an attribute hash bucket.
1469  */
1470 static pattr_t *
1471 mmd_find_pattr(patbkt_t *bkt, uint_t type)
1472 {
1473 	pattr_t *pa_head, *pa;
1474 
1475 	mutex_enter(&(bkt->pbkt_lock));
1476 	pa_head = Q2PATTR(&(bkt->pbkt_pattr_q));
1477 	pa = Q2PATTR(bkt->pbkt_pattr_q.ql_next);
1478 
1479 	while (pa != pa_head) {
1480 		ASSERT(pa->pat_magic == PATTR_MAGIC);
1481 
1482 		/* return a match; we treat removed entry as non-existent */
1483 		if (pa->pat_type == type && !(pa->pat_flags & PATTR_REM_DEFER))
1484 			break;
1485 		pa = Q2PATTR(pa->pat_next);
1486 	}
1487 	mutex_exit(&(bkt->pbkt_lock));
1488 
1489 	return (pa == pa_head ? NULL : pa);
1490 }
1491 
1492 /*
1493  * Remove an attribute from a Multidata.
1494  */
1495 void
1496 mmd_rempattr(pattr_t *pa)
1497 {
1498 	kmutex_t *pat_lock = pa->pat_lock;
1499 
1500 	ASSERT(pa->pat_magic == PATTR_MAGIC);
1501 
1502 	/* ignore if attribute was marked as persistent */
1503 	if ((pa->pat_flags & PATTR_PERSIST) != 0)
1504 		return;
1505 
1506 	mutex_enter(pat_lock);
1507 	/*
1508 	 * We can't deallocate the associated resources if the Multidata
1509 	 * is shared with other threads, because it's possible that the
1510 	 * attribute handle value is held by those threads.  That's why
1511 	 * we simply mark the entry as "removed".  If there are no other
1512 	 * threads, then we free the attribute.
1513 	 */
1514 	if (pa->pat_mmd->mmd_dp->db_ref > 1) {
1515 		pa->pat_flags |= PATTR_REM_DEFER;
1516 	} else {
1517 		remque(&(pa->pat_next));
1518 		kmem_free(pa, pa->pat_buflen);
1519 	}
1520 	mutex_exit(pat_lock);
1521 }
1522 
1523 /*
1524  * Find an attribute (according to its type) and return its handle.
1525  */
1526 pattr_t *
1527 mmd_getpattr(multidata_t *mmd, pdesc_t *pd, pattrinfo_t *pai)
1528 {
1529 	patbkt_t *tbl, *bkt;
1530 	pattr_t *pa;
1531 
1532 	ASSERT(mmd != NULL);
1533 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1534 	ASSERT(pai != NULL);
1535 
1536 	/* get the right attribute hash table (local or global) */
1537 	tbl = pd != NULL ? pd->pd_pattbl : mmd->mmd_pattbl;
1538 
1539 	/* attribute hash table doesn't exist? */
1540 	if (tbl == NULL)
1541 		return (NULL);
1542 
1543 	ASSERT(tbl->pbkt_tbl_sz > 0);
1544 	bkt = &(tbl[PATTBL_HASH(pai->type, tbl->pbkt_tbl_sz)]);
1545 
1546 	if ((pa = mmd_find_pattr(bkt, pai->type)) != NULL) {
1547 		ASSERT(pa->pat_buflen >= sizeof (*pa));
1548 		pai->len = pa->pat_buflen - sizeof (*pa);
1549 		pai->buf = pai->len > 0 ?
1550 		    (uchar_t *)pa + sizeof (pattr_t) : NULL;
1551 	}
1552 	ASSERT(pa == NULL || pa->pat_magic == PATTR_MAGIC);
1553 	return (pa);
1554 }
1555 
1556 /*
1557  * Return total size of buffers and total size of areas referenced
1558  * by all in-use (unremoved) packet descriptors.
1559  */
1560 void
1561 mmd_getsize(multidata_t *mmd, uint_t *ptotal, uint_t *pinuse)
1562 {
1563 	pdesc_t *pd;
1564 	pdescinfo_t *pdi;
1565 	int i;
1566 
1567 	ASSERT(mmd != NULL);
1568 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1569 
1570 	mutex_enter(&mmd->mmd_pd_slab_lock);
1571 	if (ptotal != NULL) {
1572 		*ptotal = 0;
1573 
1574 		if (mmd->mmd_hbuf != NULL)
1575 			*ptotal += MBLKL(mmd->mmd_hbuf);
1576 
1577 		for (i = 0; i < mmd->mmd_pbuf_cnt; i++) {
1578 			ASSERT(mmd->mmd_pbuf[i] != NULL);
1579 			*ptotal += MBLKL(mmd->mmd_pbuf[i]);
1580 		}
1581 	}
1582 	if (pinuse != NULL) {
1583 		*pinuse = 0;
1584 
1585 		/* first pdesc */
1586 		pd = mmd_getpdesc(mmd, NULL, NULL, 1, B_TRUE);
1587 		while (pd != NULL) {
1588 			pdi = &pd->pd_pdi;
1589 
1590 			/* next pdesc */
1591 			pd = mmd_getpdesc(mmd, pd, NULL, 1, B_TRUE);
1592 
1593 			/* skip over removed descriptor */
1594 			if (pdi->flags & PDESC_REM_DEFER)
1595 				continue;
1596 
1597 			if (pdi->flags & PDESC_HBUF_REF)
1598 				*pinuse += PDESC_HDRL(pdi);
1599 
1600 			if (pdi->flags & PDESC_PBUF_REF) {
1601 				for (i = 0; i < pdi->pld_cnt; i++)
1602 					*pinuse += PDESC_PLDL(pdi, i);
1603 			}
1604 		}
1605 	}
1606 	mutex_exit(&mmd->mmd_pd_slab_lock);
1607 }
1608