xref: /titanic_41/usr/src/uts/common/io/multidata.c (revision 0b6016e6ff70af39f99c9cc28e0c2207c8f5413c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Multidata, as described in the following papers:
31  *
32  * Adi Masputra,
33  * Multidata V.2: VA-Disjoint Packet Extents Framework Interface
34  * Design Specification.  August 2004.
35  * Available as http://sac.sfbay/PSARC/2004/594/materials/mmd2.pdf.
36  *
37  * Adi Masputra,
38  * Multidata Interface Design Specification.  Sep 2002.
39  * Available as http://sac.sfbay/PSARC/2002/276/materials/mmd.pdf.
40  *
41  * Adi Masputra, Frank DiMambro, Kacheong Poon,
42  * An Efficient Networking Transmit Mechanism for Solaris:
43  * Multidata Transmit (MDT).  May 2002.
44  * Available as http://sac.sfbay/PSARC/2002/276/materials/mdt.pdf.
45  */
46 
47 #include <sys/types.h>
48 #include <sys/stream.h>
49 #include <sys/dlpi.h>
50 #include <sys/stropts.h>
51 #include <sys/strsun.h>
52 #include <sys/strlog.h>
53 #include <sys/strsubr.h>
54 #include <sys/sysmacros.h>
55 #include <sys/cmn_err.h>
56 #include <sys/debug.h>
57 #include <sys/kmem.h>
58 #include <sys/atomic.h>
59 
60 #include <sys/multidata.h>
61 #include <sys/multidata_impl.h>
62 
/*
 * Forward declarations for routines private to this file.
 */

/* kmem cache constructor/destructor callbacks registered in mmd_init() */
static int mmd_constructor(void *, void *, int);
static void mmd_destructor(void *, void *);
static int pdslab_constructor(void *, void *, int);
static void pdslab_destructor(void *, void *);
static int pattbl_constructor(void *, void *, int);
static void pattbl_destructor(void *, void *);
/* free routine installed in each metadata buffer's frtn_t */
static void mmd_esballoc_free(caddr_t);
/* used by mmd_copy() to duplicate global and per-descriptor attributes */
static int mmd_copy_pattbl(patbkt_t *, multidata_t *, pdesc_t *, int);

/*
 * The helpers below are each followed by a #pragma inline request.
 */
static boolean_t pbuf_ref_valid(multidata_t *, pdescinfo_t *);
#pragma inline(pbuf_ref_valid)

static boolean_t pdi_in_range(pdescinfo_t *, pdescinfo_t *);
#pragma inline(pdi_in_range)

static pdesc_t *mmd_addpdesc_int(multidata_t *, pdescinfo_t *, int *, int);
#pragma inline(mmd_addpdesc_int)

static void mmd_destroy_pattbl(patbkt_t **);
#pragma inline(mmd_destroy_pattbl)

static pattr_t *mmd_find_pattr(patbkt_t *, uint_t);
#pragma inline(mmd_find_pattr)

static pdesc_t *mmd_destroy_pdesc(multidata_t *, pdesc_t *);
#pragma inline(mmd_destroy_pdesc)

static pdesc_t *mmd_getpdesc(multidata_t *, pdesc_t *, pdescinfo_t *, uint_t,
    boolean_t);
#pragma inline(mmd_getpdesc)
93 
/*
 * kmem caches backing this module; all three are created in mmd_init().
 */
static struct kmem_cache *mmd_cache;		/* "multidata" */
static struct kmem_cache *pd_slab_cache;	/* "multidata_pdslab" */
static struct kmem_cache *pattbl_cache;		/* "multidata_pattbl" */
97 
/*
 * Debug message switch: messages are emitted only while mmd_debug > 0.
 *
 * MMD_DEBUG takes a fully parenthesized cmn_err() argument list, e.g.
 * MMD_DEBUG((CE_WARN, "fmt %d\n", val));  It expands to a single void
 * expression rather than a bare "if", so that it cannot capture the
 * "else" of an enclosing if/else statement.
 */
int mmd_debug = 1;
#define	MMD_DEBUG(s)	((void)((mmd_debug > 0) ? (cmn_err s, 0) : 0))
100 
/*
 * Set this to true to bypass pdesc bounds checking.  While it is
 * B_FALSE (the default), mmd_addpdesc() and mmd_adjpdesc() reject
 * descriptors whose header or payload references fail validation.
 */
boolean_t mmd_speed_over_safety = B_FALSE;

/*
 * Patchable kmem_cache flags; each one is handed to the matching
 * kmem_cache_create() call in mmd_init().
 */
int mmd_kmem_flags = 0;
int pdslab_kmem_flags = 0;
int pattbl_kmem_flags = 0;

/*
 * Alignment (in bytes) of our kmem caches.
 */
#define	MULTIDATA_CACHE_ALIGN	64

/*
 * Default number of packet descriptors per descriptor slab.  Making
 * this too small will trigger more descriptor slab allocation; making
 * it too large will create too many unclaimed descriptors.  mmd_init()
 * raises a value of zero to 1.
 */
#define	PDSLAB_SZ	15
uint_t pdslab_sz = PDSLAB_SZ;

/*
 * Default attribute hash table size.  It's okay to set this to a small
 * value (even to 1) because there aren't that many attributes currently
 * defined, and because we assume there won't be many attributes associated
 * with a Multidata at a given time.  Increasing the size will reduce
 * attribute search time (given a large number of attributes in a Multidata),
 * and decreasing it will reduce the memory footprints and the overhead
 * associated with managing the table.  mmd_init() raises a value of zero
 * to 1.
 */
#define	PATTBL_SZ	1
uint_t pattbl_sz = PATTBL_SZ;

/*
 * Attribute hash key: x modulo the table size sz (sz must be non-zero).
 */
#define	PATTBL_HASH(x, sz)	((x) % (sz))
142 
/*
 * Structure that precedes each Multidata metadata.  Every buffer from
 * mmd_cache starts with this header, immediately followed by the
 * multidata_t itself (see mmd_constructor() and mmd_alloc()).
 */
struct mmd_buf_info {
	frtn_t	frp;		/* free routine */
	uint_t	buf_len;	/* length of kmem buffer */
};

/*
 * The size of each metadata buffer: the header above plus the
 * multidata_t that follows it.
 */
#define	MMD_CACHE_SIZE	\
	(sizeof (struct mmd_buf_info) + sizeof (multidata_t))
156 
157 /*
158  * Called during startup in order to create the Multidata kmem caches.
159  */
160 void
161 mmd_init(void)
162 {
163 	pdslab_sz = MAX(1, pdslab_sz);	/* at least 1 descriptor */
164 	pattbl_sz = MAX(1, pattbl_sz);	/* at least 1 bucket */
165 
166 	mmd_cache = kmem_cache_create("multidata", MMD_CACHE_SIZE,
167 	    MULTIDATA_CACHE_ALIGN, mmd_constructor, mmd_destructor,
168 	    NULL, NULL, NULL, mmd_kmem_flags);
169 
170 	pd_slab_cache = kmem_cache_create("multidata_pdslab",
171 	    PDESC_SLAB_SIZE(pdslab_sz), MULTIDATA_CACHE_ALIGN,
172 	    pdslab_constructor, pdslab_destructor, NULL,
173 	    (void *)(uintptr_t)pdslab_sz, NULL, pdslab_kmem_flags);
174 
175 	pattbl_cache = kmem_cache_create("multidata_pattbl",
176 	    sizeof (patbkt_t) * pattbl_sz, MULTIDATA_CACHE_ALIGN,
177 	    pattbl_constructor, pattbl_destructor, NULL,
178 	    (void *)(uintptr_t)pattbl_sz, NULL, pattbl_kmem_flags);
179 }
180 
181 /*
182  * Create a Multidata message block.
183  */
184 multidata_t *
185 mmd_alloc(mblk_t *hdr_mp, mblk_t **mmd_mp, int kmflags)
186 {
187 	uchar_t *buf;
188 	multidata_t *mmd;
189 	uint_t mmd_mplen;
190 	struct mmd_buf_info *buf_info;
191 
192 	ASSERT(hdr_mp != NULL);
193 	ASSERT(mmd_mp != NULL);
194 
195 	/*
196 	 * Caller should never pass in a chain of mblks since we
197 	 * only care about the first one, hence the assertions.
198 	 */
199 	ASSERT(hdr_mp->b_cont == NULL);
200 
201 	if ((buf = kmem_cache_alloc(mmd_cache, kmflags)) == NULL)
202 		return (NULL);
203 
204 	buf_info = (struct mmd_buf_info *)buf;
205 	buf_info->frp.free_arg = (caddr_t)buf;
206 
207 	mmd = (multidata_t *)(buf_info + 1);
208 	mmd_mplen = sizeof (*mmd);
209 
210 	if ((*mmd_mp = desballoc((uchar_t *)mmd, mmd_mplen, BPRI_HI,
211 	    &(buf_info->frp))) == NULL) {
212 		kmem_cache_free(mmd_cache, buf);
213 		return (NULL);
214 	}
215 
216 	DB_TYPE(*mmd_mp) = M_MULTIDATA;
217 	(*mmd_mp)->b_wptr += mmd_mplen;
218 	mmd->mmd_dp = (*mmd_mp)->b_datap;
219 	mmd->mmd_hbuf = hdr_mp;
220 
221 	return (mmd);
222 }
223 
224 /*
225  * Associate additional payload buffer to the Multidata.
226  */
227 int
228 mmd_addpldbuf(multidata_t *mmd, mblk_t *pld_mp)
229 {
230 	int i;
231 
232 	ASSERT(mmd != NULL);
233 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
234 	ASSERT(pld_mp != NULL);
235 
236 	mutex_enter(&mmd->mmd_pd_slab_lock);
237 	for (i = 0; i < MULTIDATA_MAX_PBUFS &&
238 	    mmd->mmd_pbuf_cnt < MULTIDATA_MAX_PBUFS; i++) {
239 		if (mmd->mmd_pbuf[i] == pld_mp) {
240 			/* duplicate entry */
241 			MMD_DEBUG((CE_WARN, "mmd_addpldbuf: error adding "
242 			    "pld 0x%p to mmd 0x%p since it has been "
243 			    "previously added into slot %d (total %d)\n",
244 			    (void *)pld_mp, (void *)mmd, i, mmd->mmd_pbuf_cnt));
245 			mutex_exit(&mmd->mmd_pd_slab_lock);
246 			return (-1);
247 		} else if (mmd->mmd_pbuf[i] == NULL) {
248 			mmd->mmd_pbuf[i] = pld_mp;
249 			mmd->mmd_pbuf_cnt++;
250 			mutex_exit(&mmd->mmd_pd_slab_lock);
251 			return (i);
252 		}
253 	}
254 
255 	/* all slots are taken */
256 	MMD_DEBUG((CE_WARN, "mmd_addpldbuf: error adding pld 0x%p to mmd 0x%p "
257 	    "since no slot space is left (total %d max %d)\n", (void *)pld_mp,
258 	    (void *)mmd, mmd->mmd_pbuf_cnt, MULTIDATA_MAX_PBUFS));
259 	mutex_exit(&mmd->mmd_pd_slab_lock);
260 
261 	return (-1);
262 }
263 
264 /*
265  * Multidata metadata kmem cache constructor routine.
266  */
267 /* ARGSUSED */
268 static int
269 mmd_constructor(void *buf, void *cdrarg, int kmflags)
270 {
271 	struct mmd_buf_info *buf_info;
272 	multidata_t *mmd;
273 
274 	bzero((void *)buf, MMD_CACHE_SIZE);
275 
276 	buf_info = (struct mmd_buf_info *)buf;
277 	buf_info->frp.free_func = mmd_esballoc_free;
278 	buf_info->buf_len = MMD_CACHE_SIZE;
279 
280 	mmd = (multidata_t *)(buf_info + 1);
281 	mmd->mmd_magic = MULTIDATA_MAGIC;
282 
283 	mutex_init(&(mmd->mmd_pd_slab_lock), NULL, MUTEX_DRIVER, NULL);
284 	QL_INIT(&(mmd->mmd_pd_slab_q));
285 	QL_INIT(&(mmd->mmd_pd_q));
286 
287 	return (0);
288 }
289 
290 /*
291  * Multidata metadata kmem cache destructor routine.
292  */
293 /* ARGSUSED */
294 static void
295 mmd_destructor(void *buf, void *cdrarg)
296 {
297 	multidata_t *mmd;
298 #ifdef DEBUG
299 	int i;
300 #endif
301 
302 	mmd = (multidata_t *)((uchar_t *)buf + sizeof (struct mmd_buf_info));
303 
304 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
305 	ASSERT(mmd->mmd_dp == NULL);
306 	ASSERT(mmd->mmd_hbuf == NULL);
307 	ASSERT(mmd->mmd_pbuf_cnt == 0);
308 #ifdef DEBUG
309 	for (i = 0; i < MULTIDATA_MAX_PBUFS; i++)
310 		ASSERT(mmd->mmd_pbuf[i] == NULL);
311 #endif
312 	ASSERT(mmd->mmd_pattbl == NULL);
313 
314 	mutex_destroy(&(mmd->mmd_pd_slab_lock));
315 	ASSERT(mmd->mmd_pd_slab_q.ql_next == &(mmd->mmd_pd_slab_q));
316 	ASSERT(mmd->mmd_slab_cnt == 0);
317 	ASSERT(mmd->mmd_pd_q.ql_next == &(mmd->mmd_pd_q));
318 	ASSERT(mmd->mmd_pd_cnt == 0);
319 	ASSERT(mmd->mmd_hbuf_ref == 0);
320 	ASSERT(mmd->mmd_pbuf_ref == 0);
321 }
322 
323 /*
324  * Multidata message block free callback routine.
325  */
326 static void
327 mmd_esballoc_free(caddr_t buf)
328 {
329 	multidata_t *mmd;
330 	pdesc_t *pd;
331 	pdesc_slab_t *slab;
332 	int i;
333 
334 	ASSERT(buf != NULL);
335 	ASSERT(((struct mmd_buf_info *)buf)->buf_len == MMD_CACHE_SIZE);
336 
337 	mmd = (multidata_t *)(buf + sizeof (struct mmd_buf_info));
338 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
339 
340 	ASSERT(mmd->mmd_dp != NULL);
341 	ASSERT(mmd->mmd_dp->db_ref == 1);
342 
343 	/* remove all packet descriptors and private attributes */
344 	pd = Q2PD(mmd->mmd_pd_q.ql_next);
345 	while (pd != Q2PD(&(mmd->mmd_pd_q)))
346 		pd = mmd_destroy_pdesc(mmd, pd);
347 
348 	ASSERT(mmd->mmd_pd_q.ql_next == &(mmd->mmd_pd_q));
349 	ASSERT(mmd->mmd_pd_cnt == 0);
350 	ASSERT(mmd->mmd_hbuf_ref == 0);
351 	ASSERT(mmd->mmd_pbuf_ref == 0);
352 
353 	/* remove all global attributes */
354 	if (mmd->mmd_pattbl != NULL)
355 		mmd_destroy_pattbl(&(mmd->mmd_pattbl));
356 
357 	/* remove all descriptor slabs */
358 	slab = Q2PDSLAB(mmd->mmd_pd_slab_q.ql_next);
359 	while (slab != Q2PDSLAB(&(mmd->mmd_pd_slab_q))) {
360 		pdesc_slab_t *slab_next = Q2PDSLAB(slab->pds_next);
361 
362 		remque(&(slab->pds_next));
363 		slab->pds_next = NULL;
364 		slab->pds_prev = NULL;
365 		slab->pds_mmd = NULL;
366 		slab->pds_used = 0;
367 		kmem_cache_free(pd_slab_cache, slab);
368 
369 		ASSERT(mmd->mmd_slab_cnt > 0);
370 		mmd->mmd_slab_cnt--;
371 		slab = slab_next;
372 	}
373 	ASSERT(mmd->mmd_pd_slab_q.ql_next == &(mmd->mmd_pd_slab_q));
374 	ASSERT(mmd->mmd_slab_cnt == 0);
375 
376 	mmd->mmd_dp = NULL;
377 
378 	/* finally, free all associated message blocks */
379 	if (mmd->mmd_hbuf != NULL) {
380 		freeb(mmd->mmd_hbuf);
381 		mmd->mmd_hbuf = NULL;
382 	}
383 
384 	for (i = 0; i < MULTIDATA_MAX_PBUFS; i++) {
385 		if (mmd->mmd_pbuf[i] != NULL) {
386 			freeb(mmd->mmd_pbuf[i]);
387 			mmd->mmd_pbuf[i] = NULL;
388 			ASSERT(mmd->mmd_pbuf_cnt > 0);
389 			mmd->mmd_pbuf_cnt--;
390 		}
391 	}
392 
393 	ASSERT(mmd->mmd_pbuf_cnt == 0);
394 	ASSERT(MUTEX_NOT_HELD(&(mmd->mmd_pd_slab_lock)));
395 	kmem_cache_free(mmd_cache, buf);
396 }
397 
398 /*
399  * Multidata message block copy routine, called by copyb() when it
400  * encounters a M_MULTIDATA data block type.  This routine should
401  * not be called by anyone other than copyb(), since it may go away
402  * (read: become static to this module) once some sort of copy callback
403  * routine is made available.
404  */
405 mblk_t *
406 mmd_copy(mblk_t *bp, int kmflags)
407 {
408 	multidata_t *mmd, *n_mmd;
409 	mblk_t *n_hbuf = NULL, *n_pbuf[MULTIDATA_MAX_PBUFS];
410 	mblk_t **pmp_last = &n_pbuf[MULTIDATA_MAX_PBUFS - 1];
411 	mblk_t **pmp;
412 	mblk_t *n_bp = NULL;
413 	pdesc_t *pd;
414 	uint_t n_pbuf_cnt = 0;
415 	int idx, i;
416 
417 #define	FREE_PBUFS() {					\
418 	for (pmp = &n_pbuf[0]; pmp <= pmp_last; pmp++)	\
419 		if (*pmp != NULL) freeb(*pmp);		\
420 }
421 
422 #define	REL_OFF(p, base, n_base)			\
423 	((uchar_t *)(n_base) + ((uchar_t *)(p) - (uchar_t *)base))
424 
425 	ASSERT(bp != NULL && DB_TYPE(bp) == M_MULTIDATA);
426 	mmd = mmd_getmultidata(bp);
427 
428 	/* copy the header buffer */
429 	if (mmd->mmd_hbuf != NULL && (n_hbuf = copyb(mmd->mmd_hbuf)) == NULL)
430 		return (NULL);
431 
432 	/* copy the payload buffer(s) */
433 	mutex_enter(&mmd->mmd_pd_slab_lock);
434 	bzero((void *)&n_pbuf[0], sizeof (mblk_t *) * MULTIDATA_MAX_PBUFS);
435 	n_pbuf_cnt = mmd->mmd_pbuf_cnt;
436 	for (i = 0; i < n_pbuf_cnt; i++) {
437 		ASSERT(mmd->mmd_pbuf[i] != NULL);
438 		n_pbuf[i] = copyb(mmd->mmd_pbuf[i]);
439 		if (n_pbuf[i] == NULL) {
440 			FREE_PBUFS();
441 			mutex_exit(&mmd->mmd_pd_slab_lock);
442 			return (NULL);
443 		}
444 	}
445 
446 	/* allocate new Multidata */
447 	n_mmd = mmd_alloc(n_hbuf, &n_bp, kmflags);
448 	if (n_mmd == NULL) {
449 		if (n_hbuf != NULL)
450 			freeb(n_hbuf);
451 		if (n_pbuf_cnt != 0)
452 			FREE_PBUFS();
453 		mutex_exit(&mmd->mmd_pd_slab_lock);
454 		return (NULL);
455 	}
456 
457 	/*
458 	 * Add payload buffer(s); upon success, leave n_pbuf array
459 	 * alone, as the newly-created Multidata had already contained
460 	 * the mblk pointers stored in the array.  These will be freed
461 	 * along with the Multidata itself.
462 	 */
463 	for (i = 0, pmp = &n_pbuf[0]; i < n_pbuf_cnt; i++, pmp++) {
464 		idx = mmd_addpldbuf(n_mmd, *pmp);
465 		if (idx < 0) {
466 			FREE_PBUFS();
467 			freeb(n_bp);
468 			mutex_exit(&mmd->mmd_pd_slab_lock);
469 			return (NULL);
470 		}
471 	}
472 
473 	/* copy over global attributes */
474 	if (mmd->mmd_pattbl != NULL &&
475 	    mmd_copy_pattbl(mmd->mmd_pattbl, n_mmd, NULL, kmflags) < 0) {
476 		freeb(n_bp);
477 		mutex_exit(&mmd->mmd_pd_slab_lock);
478 		return (NULL);
479 	}
480 
481 	/* copy over packet descriptors and their atttributes */
482 	pd = mmd_getpdesc(mmd, NULL, NULL, 1, B_TRUE);	/* first pdesc */
483 	while (pd != NULL) {
484 		pdesc_t *n_pd;
485 		pdescinfo_t *pdi, n_pdi;
486 		uchar_t *n_base, *base;
487 		pdesc_t *pd_next;
488 
489 		/* next pdesc */
490 		pd_next = mmd_getpdesc(pd->pd_slab->pds_mmd, pd, NULL,
491 		    1, B_TRUE);
492 
493 		/* skip if already removed */
494 		if (pd->pd_flags & PDESC_REM_DEFER) {
495 			pd = pd_next;
496 			continue;
497 		}
498 
499 		pdi = &(pd->pd_pdi);
500 		bzero(&n_pdi, sizeof (n_pdi));
501 
502 		/*
503 		 * Calculate new descriptor values based on the offset of
504 		 * each pointer relative to the associated buffer(s).
505 		 */
506 		ASSERT(pdi->flags & PDESC_HAS_REF);
507 		if (pdi->flags & PDESC_HBUF_REF) {
508 			n_base = n_mmd->mmd_hbuf->b_rptr;
509 			base = mmd->mmd_hbuf->b_rptr;
510 
511 			n_pdi.flags |= PDESC_HBUF_REF;
512 			n_pdi.hdr_base = REL_OFF(pdi->hdr_base, base, n_base);
513 			n_pdi.hdr_rptr = REL_OFF(pdi->hdr_rptr, base, n_base);
514 			n_pdi.hdr_wptr = REL_OFF(pdi->hdr_wptr, base, n_base);
515 			n_pdi.hdr_lim = REL_OFF(pdi->hdr_lim, base, n_base);
516 		}
517 
518 		if (pdi->flags & PDESC_PBUF_REF) {
519 			n_pdi.flags |= PDESC_PBUF_REF;
520 			n_pdi.pld_cnt = pdi->pld_cnt;
521 
522 			for (i = 0; i < pdi->pld_cnt; i++) {
523 				idx = pdi->pld_ary[i].pld_pbuf_idx;
524 				ASSERT(idx < MULTIDATA_MAX_PBUFS);
525 				ASSERT(n_mmd->mmd_pbuf[idx] != NULL);
526 				ASSERT(mmd->mmd_pbuf[idx] != NULL);
527 
528 				n_base = n_mmd->mmd_pbuf[idx]->b_rptr;
529 				base = mmd->mmd_pbuf[idx]->b_rptr;
530 
531 				n_pdi.pld_ary[i].pld_pbuf_idx = idx;
532 
533 				/*
534 				 * We can't copy the pointers just like that,
535 				 * so calculate the relative offset.
536 				 */
537 				n_pdi.pld_ary[i].pld_rptr =
538 				    REL_OFF(pdi->pld_ary[i].pld_rptr,
539 					base, n_base);
540 				n_pdi.pld_ary[i].pld_wptr =
541 				    REL_OFF(pdi->pld_ary[i].pld_wptr,
542 					base, n_base);
543 			}
544 		}
545 
546 		/* add the new descriptor to the new Multidata */
547 		n_pd = mmd_addpdesc_int(n_mmd, &n_pdi, NULL, kmflags);
548 
549 		if (n_pd == NULL || (pd->pd_pattbl != NULL &&
550 		    mmd_copy_pattbl(pd->pd_pattbl, n_mmd, n_pd, kmflags) < 0)) {
551 			freeb(n_bp);
552 			mutex_exit(&mmd->mmd_pd_slab_lock);
553 			return (NULL);
554 		}
555 
556 		pd = pd_next;
557 	}
558 #undef REL_OFF
559 #undef FREE_PBUFS
560 
561 	mutex_exit(&mmd->mmd_pd_slab_lock);
562 	return (n_bp);
563 }
564 
565 /*
566  * Given a Multidata message block, return the Multidata metadata handle.
567  */
568 multidata_t *
569 mmd_getmultidata(mblk_t *mp)
570 {
571 	multidata_t *mmd;
572 
573 	ASSERT(mp != NULL);
574 
575 	if (DB_TYPE(mp) != M_MULTIDATA)
576 		return (NULL);
577 
578 	mmd = (multidata_t *)mp->b_rptr;
579 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
580 
581 	return (mmd);
582 }
583 
584 /*
585  * Return the start and end addresses of the associated buffer(s).
586  */
587 void
588 mmd_getregions(multidata_t *mmd, mbufinfo_t *mbi)
589 {
590 	int i;
591 
592 	ASSERT(mmd != NULL);
593 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
594 	ASSERT(mbi != NULL);
595 
596 	bzero((void *)mbi, sizeof (mbufinfo_t));
597 
598 	if (mmd->mmd_hbuf != NULL) {
599 		mbi->hbuf_rptr = mmd->mmd_hbuf->b_rptr;
600 		mbi->hbuf_wptr = mmd->mmd_hbuf->b_wptr;
601 	}
602 
603 	mutex_enter(&mmd->mmd_pd_slab_lock);
604 	for (i = 0; i < mmd->mmd_pbuf_cnt; i++) {
605 		ASSERT(mmd->mmd_pbuf[i] != NULL);
606 		mbi->pbuf_ary[i].pbuf_rptr = mmd->mmd_pbuf[i]->b_rptr;
607 		mbi->pbuf_ary[i].pbuf_wptr = mmd->mmd_pbuf[i]->b_wptr;
608 
609 	}
610 	mbi->pbuf_cnt = mmd->mmd_pbuf_cnt;
611 	mutex_exit(&mmd->mmd_pd_slab_lock);
612 }
613 
614 /*
615  * Return the Multidata statistics.
616  */
617 uint_t
618 mmd_getcnt(multidata_t *mmd, uint_t *hbuf_ref, uint_t *pbuf_ref)
619 {
620 	uint_t pd_cnt;
621 
622 	ASSERT(mmd != NULL);
623 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
624 
625 	mutex_enter(&(mmd->mmd_pd_slab_lock));
626 	if (hbuf_ref != NULL)
627 		*hbuf_ref = mmd->mmd_hbuf_ref;
628 	if (pbuf_ref != NULL)
629 		*pbuf_ref = mmd->mmd_pbuf_ref;
630 	pd_cnt = mmd->mmd_pd_cnt;
631 	mutex_exit(&(mmd->mmd_pd_slab_lock));
632 
633 	return (pd_cnt);
634 }
635 
/*
 * Bounds check the header area of a packet descriptor: the Multidata
 * must have a header buffer, all four header pointers must be non-NULL
 * and ordered base <= rptr <= wptr <= lim, and the [base, lim) span must
 * lie within the header buffer's data.  Both arguments are evaluated
 * more than once, so they must be free of side effects.
 */
#define	HBUF_REF_VALID(mmd, pdi)					\
	((mmd)->mmd_hbuf != NULL && (pdi)->hdr_rptr != NULL &&		\
	(pdi)->hdr_wptr != NULL && (pdi)->hdr_base != NULL &&		\
	(pdi)->hdr_lim != NULL && (pdi)->hdr_lim >= (pdi)->hdr_base &&	\
	(pdi)->hdr_wptr >= (pdi)->hdr_rptr &&				\
	(pdi)->hdr_base <= (pdi)->hdr_rptr &&				\
	(pdi)->hdr_lim >= (pdi)->hdr_wptr &&				\
	(pdi)->hdr_base >= (mmd)->mmd_hbuf->b_rptr &&			\
	MBLKIN((mmd)->mmd_hbuf,						\
	((pdi)->hdr_base - (mmd)->mmd_hbuf->b_rptr),			\
	PDESC_HDRSIZE(pdi)))
647 
648 /*
649  * Bounds check payload area(s).
650  */
651 static boolean_t
652 pbuf_ref_valid(multidata_t *mmd, pdescinfo_t *pdi)
653 {
654 	int i = 0, idx;
655 	boolean_t valid = B_TRUE;
656 	struct pld_ary_s *pa;
657 
658 	mutex_enter(&mmd->mmd_pd_slab_lock);
659 	if (pdi->pld_cnt == 0 || pdi->pld_cnt > mmd->mmd_pbuf_cnt) {
660 		mutex_exit(&mmd->mmd_pd_slab_lock);
661 		return (B_FALSE);
662 	}
663 
664 	pa = &pdi->pld_ary[0];
665 	while (valid && i < pdi->pld_cnt) {
666 		valid = (((idx = pa->pld_pbuf_idx) < mmd->mmd_pbuf_cnt) &&
667 		    pa->pld_rptr != NULL && pa->pld_wptr != NULL &&
668 		    pa->pld_wptr >= pa->pld_rptr &&
669 		    pa->pld_rptr >= mmd->mmd_pbuf[idx]->b_rptr &&
670 		    MBLKIN(mmd->mmd_pbuf[idx], (pa->pld_rptr -
671 			mmd->mmd_pbuf[idx]->b_rptr),
672 			PDESC_PLD_SPAN_SIZE(pdi, i)));
673 
674 		if (!valid) {
675 			MMD_DEBUG((CE_WARN,
676 			    "pbuf_ref_valid: pdi 0x%p pld out of bound; "
677 			    "index %d has pld_cnt %d pbuf_idx %d "
678 			    "(mmd_pbuf_cnt %d), "
679 			    "pld_rptr 0x%p pld_wptr 0x%p len %d "
680 			    "(valid 0x%p-0x%p len %d)\n", (void *)pdi,
681 			    i, pdi->pld_cnt, idx, mmd->mmd_pbuf_cnt,
682 			    (void *)pa->pld_rptr,
683 			    (void *)pa->pld_wptr,
684 			    (int)PDESC_PLD_SPAN_SIZE(pdi, i),
685 			    (void *)mmd->mmd_pbuf[idx]->b_rptr,
686 			    (void *)mmd->mmd_pbuf[idx]->b_wptr,
687 			    (int)MBLKL(mmd->mmd_pbuf[idx])));
688 		}
689 
690 		/* advance to next entry */
691 		i++;
692 		pa++;
693 	}
694 
695 	mutex_exit(&mmd->mmd_pd_slab_lock);
696 	return (valid);
697 }
698 
699 /*
700  * Add a packet descriptor to the Multidata.
701  */
702 pdesc_t *
703 mmd_addpdesc(multidata_t *mmd, pdescinfo_t *pdi, int *err, int kmflags)
704 {
705 	ASSERT(mmd != NULL);
706 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
707 	ASSERT(pdi != NULL);
708 	ASSERT(pdi->flags & PDESC_HAS_REF);
709 
710 	/* do the references refer to invalid memory regions? */
711 	if (!mmd_speed_over_safety &&
712 	    (((pdi->flags & PDESC_HBUF_REF) && !HBUF_REF_VALID(mmd, pdi)) ||
713 	    ((pdi->flags & PDESC_PBUF_REF) && !pbuf_ref_valid(mmd, pdi)))) {
714 		if (err != NULL)
715 			*err = EINVAL;
716 		return (NULL);
717 	}
718 
719 	return (mmd_addpdesc_int(mmd, pdi, err, kmflags));
720 }
721 
722 /*
723  * Internal routine to add a packet descriptor, called when mmd_addpdesc
724  * or mmd_copy tries to allocate and add a descriptor to a Multidata.
725  */
726 static pdesc_t *
727 mmd_addpdesc_int(multidata_t *mmd, pdescinfo_t *pdi, int *err, int kmflags)
728 {
729 	pdesc_slab_t *slab, *slab_last;
730 	pdesc_t *pd;
731 
732 	ASSERT(pdi->flags & PDESC_HAS_REF);
733 	ASSERT(!(pdi->flags & PDESC_HBUF_REF) || HBUF_REF_VALID(mmd, pdi));
734 	ASSERT(!(pdi->flags & PDESC_PBUF_REF) || pbuf_ref_valid(mmd, pdi));
735 
736 	if (err != NULL)
737 		*err = 0;
738 
739 	mutex_enter(&(mmd->mmd_pd_slab_lock));
740 	/*
741 	 * Is slab list empty or the last-added slab is full?  If so,
742 	 * allocate new slab for the descriptor; otherwise, use the
743 	 * last-added slab instead.
744 	 */
745 	slab_last = Q2PDSLAB(mmd->mmd_pd_slab_q.ql_prev);
746 	if (mmd->mmd_pd_slab_q.ql_next == &(mmd->mmd_pd_slab_q) ||
747 	    slab_last->pds_used == slab_last->pds_sz) {
748 		slab = kmem_cache_alloc(pd_slab_cache, kmflags);
749 		if (slab == NULL) {
750 			if (err != NULL)
751 				*err = ENOMEM;
752 			mutex_exit(&(mmd->mmd_pd_slab_lock));
753 			return (NULL);
754 		}
755 		slab->pds_mmd = mmd;
756 
757 		ASSERT(slab->pds_used == 0);
758 		ASSERT(slab->pds_next == NULL && slab->pds_prev == NULL);
759 
760 		/* insert slab at end of list */
761 		insque(&(slab->pds_next), mmd->mmd_pd_slab_q.ql_prev);
762 		mmd->mmd_slab_cnt++;
763 	} else {
764 		slab = slab_last;
765 	}
766 	ASSERT(slab->pds_used < slab->pds_sz);
767 	pd = &(slab->pds_free_desc[slab->pds_used++]);
768 	ASSERT(pd->pd_magic == PDESC_MAGIC);
769 	pd->pd_next = NULL;
770 	pd->pd_prev = NULL;
771 	pd->pd_slab = slab;
772 	pd->pd_pattbl = NULL;
773 
774 	/* copy over the descriptor info from caller */
775 	PDI_COPY(pdi, &(pd->pd_pdi));
776 
777 	if (pd->pd_flags & PDESC_HBUF_REF)
778 		mmd->mmd_hbuf_ref++;
779 	if (pd->pd_flags & PDESC_PBUF_REF)
780 		mmd->mmd_pbuf_ref += pd->pd_pdi.pld_cnt;
781 	mmd->mmd_pd_cnt++;
782 
783 	/* insert descriptor at end of list */
784 	insque(&(pd->pd_next), mmd->mmd_pd_q.ql_prev);
785 	mutex_exit(&(mmd->mmd_pd_slab_lock));
786 
787 	return (pd);
788 }
789 
790 /*
791  * Packet descriptor slab kmem cache constructor routine.
792  */
793 /* ARGSUSED */
794 static int
795 pdslab_constructor(void *buf, void *cdrarg, int kmflags)
796 {
797 	pdesc_slab_t *slab;
798 	uint_t cnt = (uint_t)(uintptr_t)cdrarg;
799 	int i;
800 
801 	ASSERT(cnt > 0);	/* slab size can't be zero */
802 
803 	slab = (pdesc_slab_t *)buf;
804 	slab->pds_next = NULL;
805 	slab->pds_prev = NULL;
806 	slab->pds_mmd = NULL;
807 	slab->pds_used = 0;
808 	slab->pds_sz = cnt;
809 
810 	for (i = 0; i < cnt; i++) {
811 		pdesc_t *pd = &(slab->pds_free_desc[i]);
812 		pd->pd_magic = PDESC_MAGIC;
813 	}
814 	return (0);
815 }
816 
817 /*
818  * Packet descriptor slab kmem cache destructor routine.
819  */
820 /* ARGSUSED */
821 static void
822 pdslab_destructor(void *buf, void *cdrarg)
823 {
824 	pdesc_slab_t *slab;
825 
826 	slab = (pdesc_slab_t *)buf;
827 	ASSERT(slab->pds_next == NULL);
828 	ASSERT(slab->pds_prev == NULL);
829 	ASSERT(slab->pds_mmd == NULL);
830 	ASSERT(slab->pds_used == 0);
831 	ASSERT(slab->pds_sz > 0);
832 }
833 
834 /*
835  * Remove a packet descriptor from the in-use descriptor list,
836  * called by mmd_rempdesc or during free.
837  */
838 static pdesc_t *
839 mmd_destroy_pdesc(multidata_t *mmd, pdesc_t *pd)
840 {
841 	pdesc_t *pd_next;
842 
843 	pd_next = Q2PD(pd->pd_next);
844 	remque(&(pd->pd_next));
845 
846 	/* remove all local attributes */
847 	if (pd->pd_pattbl != NULL)
848 		mmd_destroy_pattbl(&(pd->pd_pattbl));
849 
850 	/* don't decrease counts for a removed descriptor */
851 	if (!(pd->pd_flags & PDESC_REM_DEFER)) {
852 		if (pd->pd_flags & PDESC_HBUF_REF) {
853 			ASSERT(mmd->mmd_hbuf_ref > 0);
854 			mmd->mmd_hbuf_ref--;
855 		}
856 		if (pd->pd_flags & PDESC_PBUF_REF) {
857 			ASSERT(mmd->mmd_pbuf_ref > 0);
858 			mmd->mmd_pbuf_ref -= pd->pd_pdi.pld_cnt;
859 		}
860 		ASSERT(mmd->mmd_pd_cnt > 0);
861 		mmd->mmd_pd_cnt--;
862 	}
863 	return (pd_next);
864 }
865 
866 /*
867  * Remove a packet descriptor from the Multidata.
868  */
869 void
870 mmd_rempdesc(pdesc_t *pd)
871 {
872 	multidata_t *mmd;
873 
874 	ASSERT(pd->pd_magic == PDESC_MAGIC);
875 	ASSERT(pd->pd_slab != NULL);
876 
877 	mmd = pd->pd_slab->pds_mmd;
878 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
879 
880 	mutex_enter(&(mmd->mmd_pd_slab_lock));
881 	/*
882 	 * We can't deallocate the associated resources if the Multidata
883 	 * is shared with other threads, because it's possible that the
884 	 * descriptor handle value is held by those threads.  That's why
885 	 * we simply mark the entry as "removed" and decrement the counts.
886 	 * If there are no other threads, then we free the descriptor.
887 	 */
888 	if (mmd->mmd_dp->db_ref > 1) {
889 		pd->pd_flags |= PDESC_REM_DEFER;
890 		if (pd->pd_flags & PDESC_HBUF_REF) {
891 			ASSERT(mmd->mmd_hbuf_ref > 0);
892 			mmd->mmd_hbuf_ref--;
893 		}
894 		if (pd->pd_flags & PDESC_PBUF_REF) {
895 			ASSERT(mmd->mmd_pbuf_ref > 0);
896 			mmd->mmd_pbuf_ref -= pd->pd_pdi.pld_cnt;
897 		}
898 		ASSERT(mmd->mmd_pd_cnt > 0);
899 		mmd->mmd_pd_cnt--;
900 	} else {
901 		(void) mmd_destroy_pdesc(mmd, pd);
902 	}
903 	mutex_exit(&(mmd->mmd_pd_slab_lock));
904 }
905 
906 /*
907  * A generic routine to traverse the packet descriptor in-use list.
908  */
909 static pdesc_t *
910 mmd_getpdesc(multidata_t *mmd, pdesc_t *pd, pdescinfo_t *pdi, uint_t forw,
911     boolean_t mutex_held)
912 {
913 	pdesc_t *pd_head;
914 
915 	ASSERT(pd == NULL || pd->pd_slab->pds_mmd == mmd);
916 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
917 	ASSERT(!mutex_held || MUTEX_HELD(&(mmd->mmd_pd_slab_lock)));
918 
919 	if (!mutex_held)
920 		mutex_enter(&(mmd->mmd_pd_slab_lock));
921 	pd_head = Q2PD(&(mmd->mmd_pd_q));
922 
923 	if (pd == NULL) {
924 		/*
925 		 * We're called by mmd_get{first,last}pdesc, and so
926 		 * return either the first or last list element.
927 		 */
928 		pd = forw ? Q2PD(mmd->mmd_pd_q.ql_next) :
929 		    Q2PD(mmd->mmd_pd_q.ql_prev);
930 	} else {
931 		/*
932 		 * We're called by mmd_get{next,prev}pdesc, and so
933 		 * return either the next or previous list element.
934 		 */
935 		pd = forw ? Q2PD(pd->pd_next) : Q2PD(pd->pd_prev);
936 	}
937 
938 	while (pd != pd_head) {
939 		/* skip element if it has been removed */
940 		if (!(pd->pd_flags & PDESC_REM_DEFER))
941 			break;
942 		pd = forw ? Q2PD(pd->pd_next) : Q2PD(pd->pd_prev);
943 	}
944 	if (!mutex_held)
945 		mutex_exit(&(mmd->mmd_pd_slab_lock));
946 
947 	/* return NULL if we're back at the beginning */
948 	if (pd == pd_head)
949 		pd = NULL;
950 
951 	/* got an entry; copy descriptor info to caller */
952 	if (pd != NULL && pdi != NULL)
953 		PDI_COPY(&(pd->pd_pdi), pdi);
954 
955 	ASSERT(pd == NULL || pd->pd_magic == PDESC_MAGIC);
956 	return (pd);
957 
958 }
959 
960 /*
961  * Return the first packet descriptor in the in-use list.
962  */
963 pdesc_t *
964 mmd_getfirstpdesc(multidata_t *mmd, pdescinfo_t *pdi)
965 {
966 	return (mmd_getpdesc(mmd, NULL, pdi, 1, B_FALSE));
967 }
968 
969 /*
970  * Return the last packet descriptor in the in-use list.
971  */
972 pdesc_t *
973 mmd_getlastpdesc(multidata_t *mmd, pdescinfo_t *pdi)
974 {
975 	return (mmd_getpdesc(mmd, NULL, pdi, 0, B_FALSE));
976 }
977 
978 /*
979  * Return the next packet descriptor in the in-use list.
980  */
981 pdesc_t *
982 mmd_getnextpdesc(pdesc_t *pd, pdescinfo_t *pdi)
983 {
984 	return (mmd_getpdesc(pd->pd_slab->pds_mmd, pd, pdi, 1, B_FALSE));
985 }
986 
987 /*
988  * Return the previous packet descriptor in the in-use list.
989  */
990 pdesc_t *
991 mmd_getprevpdesc(pdesc_t *pd, pdescinfo_t *pdi)
992 {
993 	return (mmd_getpdesc(pd->pd_slab->pds_mmd, pd, pdi, 0, B_FALSE));
994 }
995 
996 /*
997  * Check to see if pdi stretches over c_pdi; used to ensure that a packet
998  * descriptor's header and payload span may not be extended beyond the
999  * current boundaries.
1000  */
1001 static boolean_t
1002 pdi_in_range(pdescinfo_t *pdi, pdescinfo_t *c_pdi)
1003 {
1004 	int i;
1005 	struct pld_ary_s *pa = &pdi->pld_ary[0];
1006 	struct pld_ary_s *c_pa = &c_pdi->pld_ary[0];
1007 
1008 	if (pdi->hdr_base < c_pdi->hdr_base || pdi->hdr_lim > c_pdi->hdr_lim)
1009 		return (B_FALSE);
1010 
1011 	/*
1012 	 * We don't allow the number of span to be reduced, for the sake
1013 	 * of simplicity.  Instead, we provide PDESC_PLD_SPAN_CLEAR() to
1014 	 * clear a packet descriptor.  Note that we allow the span count to
1015 	 * be increased, and the bounds check for the new one happens
1016 	 * in pbuf_ref_valid.
1017 	 */
1018 	if (pdi->pld_cnt < c_pdi->pld_cnt)
1019 		return (B_FALSE);
1020 
1021 	/* compare only those which are currently defined */
1022 	for (i = 0; i < c_pdi->pld_cnt; i++, pa++, c_pa++) {
1023 		if (pa->pld_pbuf_idx != c_pa->pld_pbuf_idx ||
1024 		    pa->pld_rptr < c_pa->pld_rptr ||
1025 		    pa->pld_wptr > c_pa->pld_wptr)
1026 			return (B_FALSE);
1027 	}
1028 	return (B_TRUE);
1029 }
1030 
1031 /*
1032  * Modify the layout of a packet descriptor.
1033  */
1034 pdesc_t *
1035 mmd_adjpdesc(pdesc_t *pd, pdescinfo_t *pdi)
1036 {
1037 	multidata_t *mmd;
1038 	pdescinfo_t *c_pdi;
1039 
1040 	ASSERT(pd != NULL);
1041 	ASSERT(pdi != NULL);
1042 	ASSERT(pd->pd_magic == PDESC_MAGIC);
1043 
1044 	mmd = pd->pd_slab->pds_mmd;
1045 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1046 
1047 	/* entry has been removed */
1048 	if (pd->pd_flags & PDESC_REM_DEFER)
1049 		return (NULL);
1050 
1051 	/* caller doesn't intend to specify any buffer reference? */
1052 	if (!(pdi->flags & PDESC_HAS_REF))
1053 		return (NULL);
1054 
1055 	/* do the references refer to invalid memory regions? */
1056 	if (!mmd_speed_over_safety &&
1057 	    (((pdi->flags & PDESC_HBUF_REF) && !HBUF_REF_VALID(mmd, pdi)) ||
1058 	    ((pdi->flags & PDESC_PBUF_REF) && !pbuf_ref_valid(mmd, pdi))))
1059 		return (NULL);
1060 
1061 	/* they're not subsets of current references? */
1062 	c_pdi = &(pd->pd_pdi);
1063 	if (!pdi_in_range(pdi, c_pdi))
1064 		return (NULL);
1065 
1066 	/* copy over the descriptor info from caller */
1067 	PDI_COPY(pdi, c_pdi);
1068 
1069 	return (pd);
1070 }
1071 
1072 /*
1073  * Copy the contents of a packet descriptor into a new buffer.  If the
1074  * descriptor points to more than one buffer fragments, the contents
1075  * of both fragments will be joined, with the header buffer fragment
1076  * preceding the payload buffer fragment(s).
1077  */
1078 mblk_t *
1079 mmd_transform(pdesc_t *pd)
1080 {
1081 	multidata_t *mmd;
1082 	pdescinfo_t *pdi;
1083 	mblk_t *mp;
1084 	int h_size = 0, p_size = 0;
1085 	int i, len;
1086 
1087 	ASSERT(pd != NULL);
1088 	ASSERT(pd->pd_magic == PDESC_MAGIC);
1089 
1090 	mmd = pd->pd_slab->pds_mmd;
1091 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1092 
1093 	/* entry has been removed */
1094 	if (pd->pd_flags & PDESC_REM_DEFER)
1095 		return (NULL);
1096 
1097 	mutex_enter(&mmd->mmd_pd_slab_lock);
1098 	pdi = &(pd->pd_pdi);
1099 	if (pdi->flags & PDESC_HBUF_REF)
1100 		h_size = PDESC_HDRL(pdi);
1101 	if (pdi->flags & PDESC_PBUF_REF) {
1102 		for (i = 0; i < pdi->pld_cnt; i++)
1103 			p_size += PDESC_PLD_SPAN_SIZE(pdi, i);
1104 	}
1105 
1106 	/* allocate space large enough to hold the fragment(s) */
1107 	ASSERT(h_size + p_size >= 0);
1108 	if ((mp = allocb(h_size + p_size, BPRI_HI)) == NULL) {
1109 		mutex_exit(&mmd->mmd_pd_slab_lock);
1110 		return (NULL);
1111 	}
1112 
1113 	/* copy over the header fragment */
1114 	if ((pdi->flags & PDESC_HBUF_REF) && h_size > 0) {
1115 		bcopy(pdi->hdr_rptr, mp->b_wptr, h_size);
1116 		mp->b_wptr += h_size;
1117 	}
1118 
1119 	/* copy over the payload fragment */
1120 	if ((pdi->flags & PDESC_PBUF_REF) && p_size > 0) {
1121 		for (i = 0; i < pdi->pld_cnt; i++) {
1122 			len = PDESC_PLD_SPAN_SIZE(pdi, i);
1123 			if (len > 0) {
1124 				bcopy(pdi->pld_ary[i].pld_rptr,
1125 				    mp->b_wptr, len);
1126 				mp->b_wptr += len;
1127 			}
1128 		}
1129 	}
1130 
1131 	mutex_exit(&mmd->mmd_pd_slab_lock);
1132 	return (mp);
1133 }
1134 
1135 /*
1136  * Return a chain of mblks representing the Multidata packet.
1137  */
1138 mblk_t *
1139 mmd_transform_link(pdesc_t *pd)
1140 {
1141 	multidata_t *mmd;
1142 	pdescinfo_t *pdi;
1143 	mblk_t *nmp = NULL;
1144 
1145 	ASSERT(pd != NULL);
1146 	ASSERT(pd->pd_magic == PDESC_MAGIC);
1147 
1148 	mmd = pd->pd_slab->pds_mmd;
1149 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1150 
1151 	/* entry has been removed */
1152 	if (pd->pd_flags & PDESC_REM_DEFER)
1153 		return (NULL);
1154 
1155 	pdi = &(pd->pd_pdi);
1156 
1157 	/* duplicate header buffer */
1158 	if ((pdi->flags & PDESC_HBUF_REF)) {
1159 		if ((nmp = dupb(mmd->mmd_hbuf)) == NULL)
1160 			return (NULL);
1161 		nmp->b_rptr = pdi->hdr_rptr;
1162 		nmp->b_wptr = pdi->hdr_wptr;
1163 	}
1164 
1165 	/* duplicate payload buffer(s) */
1166 	if (pdi->flags & PDESC_PBUF_REF) {
1167 		int i;
1168 		mblk_t *mp;
1169 		struct pld_ary_s *pa = &pdi->pld_ary[0];
1170 
1171 		mutex_enter(&mmd->mmd_pd_slab_lock);
1172 		for (i = 0; i < pdi->pld_cnt; i++, pa++) {
1173 			ASSERT(mmd->mmd_pbuf[pa->pld_pbuf_idx] != NULL);
1174 
1175 			/* skip empty ones */
1176 			if (PDESC_PLD_SPAN_SIZE(pdi, i) == 0)
1177 				continue;
1178 
1179 			mp = dupb(mmd->mmd_pbuf[pa->pld_pbuf_idx]);
1180 			if (mp == NULL) {
1181 				if (nmp != NULL)
1182 					freemsg(nmp);
1183 				mutex_exit(&mmd->mmd_pd_slab_lock);
1184 				return (NULL);
1185 			}
1186 			mp->b_rptr = pa->pld_rptr;
1187 			mp->b_wptr = pa->pld_wptr;
1188 			if (nmp == NULL)
1189 				nmp = mp;
1190 			else
1191 				linkb(nmp, mp);
1192 		}
1193 		mutex_exit(&mmd->mmd_pd_slab_lock);
1194 	}
1195 
1196 	return (nmp);
1197 }
1198 
1199 /*
1200  * Return duplicate message block(s) of the associated buffer(s).
1201  */
1202 int
1203 mmd_dupbufs(multidata_t *mmd, mblk_t **hmp, mblk_t **pmp)
1204 {
1205 	ASSERT(mmd != NULL);
1206 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1207 
1208 	if (hmp != NULL) {
1209 		*hmp = NULL;
1210 		if (mmd->mmd_hbuf != NULL &&
1211 		    (*hmp = dupb(mmd->mmd_hbuf)) == NULL)
1212 			return (-1);
1213 	}
1214 
1215 	if (pmp != NULL) {
1216 		int i;
1217 		mblk_t *mp;
1218 
1219 		mutex_enter(&mmd->mmd_pd_slab_lock);
1220 		*pmp = NULL;
1221 		for (i = 0; i < mmd->mmd_pbuf_cnt; i++) {
1222 			ASSERT(mmd->mmd_pbuf[i] != NULL);
1223 			mp = dupb(mmd->mmd_pbuf[i]);
1224 			if (mp == NULL) {
1225 				if (hmp != NULL && *hmp != NULL)
1226 					freeb(*hmp);
1227 				if (*pmp != NULL)
1228 					freemsg(*pmp);
1229 				mutex_exit(&mmd->mmd_pd_slab_lock);
1230 				return (-1);
1231 			}
1232 			if (*pmp == NULL)
1233 				*pmp = mp;
1234 			else
1235 				linkb(*pmp, mp);
1236 		}
1237 		mutex_exit(&mmd->mmd_pd_slab_lock);
1238 	}
1239 
1240 	return (0);
1241 }
1242 
1243 /*
1244  * Return the layout of a packet descriptor.
1245  */
1246 int
1247 mmd_getpdescinfo(pdesc_t *pd, pdescinfo_t *pdi)
1248 {
1249 	ASSERT(pd != NULL);
1250 	ASSERT(pd->pd_magic == PDESC_MAGIC);
1251 	ASSERT(pd->pd_slab != NULL);
1252 	ASSERT(pd->pd_slab->pds_mmd->mmd_magic == MULTIDATA_MAGIC);
1253 	ASSERT(pdi != NULL);
1254 
1255 	/* entry has been removed */
1256 	if (pd->pd_flags & PDESC_REM_DEFER)
1257 		return (-1);
1258 
1259 	/* copy descriptor info to caller */
1260 	PDI_COPY(&(pd->pd_pdi), pdi);
1261 
1262 	return (0);
1263 }
1264 
1265 /*
1266  * Add a global or local attribute to a Multidata.  Global attribute
1267  * association is specified by a NULL packet descriptor.
1268  */
1269 pattr_t *
1270 mmd_addpattr(multidata_t *mmd, pdesc_t *pd, pattrinfo_t *pai,
1271     boolean_t persistent, int kmflags)
1272 {
1273 	patbkt_t **tbl_p;
1274 	patbkt_t *tbl, *o_tbl;
1275 	patbkt_t *bkt;
1276 	pattr_t *pa;
1277 	uint_t size;
1278 
1279 	ASSERT(mmd != NULL);
1280 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1281 	ASSERT(pd == NULL || pd->pd_magic == PDESC_MAGIC);
1282 	ASSERT(pai != NULL);
1283 
1284 	/* pointer to the attribute hash table (local or global) */
1285 	tbl_p = pd != NULL ? &(pd->pd_pattbl) : &(mmd->mmd_pattbl);
1286 
1287 	/*
1288 	 * See if the hash table has not yet been created; if so,
1289 	 * we create the table and store its address atomically.
1290 	 */
1291 	if ((tbl = *tbl_p) == NULL) {
1292 		tbl = kmem_cache_alloc(pattbl_cache, kmflags);
1293 		if (tbl == NULL)
1294 			return (NULL);
1295 
1296 		/* if someone got there first, use his table instead */
1297 		if ((o_tbl = casptr(tbl_p, NULL, tbl)) != NULL) {
1298 			kmem_cache_free(pattbl_cache, tbl);
1299 			tbl = o_tbl;
1300 		}
1301 	}
1302 
1303 	ASSERT(tbl->pbkt_tbl_sz > 0);
1304 	bkt = &(tbl[PATTBL_HASH(pai->type, tbl->pbkt_tbl_sz)]);
1305 
1306 	/* attribute of the same type already exists? */
1307 	if ((pa = mmd_find_pattr(bkt, pai->type)) != NULL)
1308 		return (NULL);
1309 
1310 	size = sizeof (*pa) + pai->len;
1311 	if ((pa = kmem_zalloc(size, kmflags)) == NULL)
1312 		return (NULL);
1313 
1314 	pa->pat_magic = PATTR_MAGIC;
1315 	pa->pat_lock = &(bkt->pbkt_lock);
1316 	pa->pat_mmd = mmd;
1317 	pa->pat_buflen = size;
1318 	pa->pat_type = pai->type;
1319 	pai->buf = pai->len > 0 ? ((uchar_t *)(pa + 1)) : NULL;
1320 
1321 	if (persistent)
1322 		pa->pat_flags = PATTR_PERSIST;
1323 
1324 	/* insert attribute at end of hash chain */
1325 	mutex_enter(&(bkt->pbkt_lock));
1326 	insque(&(pa->pat_next), bkt->pbkt_pattr_q.ql_prev);
1327 	mutex_exit(&(bkt->pbkt_lock));
1328 
1329 	return (pa);
1330 }
1331 
1332 /*
1333  * Attribute hash table kmem cache constructor routine.
1334  */
1335 /* ARGSUSED */
1336 static int
1337 pattbl_constructor(void *buf, void *cdrarg, int kmflags)
1338 {
1339 	patbkt_t *bkt;
1340 	uint_t tbl_sz = (uint_t)(uintptr_t)cdrarg;
1341 	uint_t i;
1342 
1343 	ASSERT(tbl_sz > 0);	/* table size can't be zero */
1344 
1345 	for (i = 0, bkt = (patbkt_t *)buf; i < tbl_sz; i++, bkt++) {
1346 		mutex_init(&(bkt->pbkt_lock), NULL, MUTEX_DRIVER, NULL);
1347 		QL_INIT(&(bkt->pbkt_pattr_q));
1348 
1349 		/* first bucket contains the table size */
1350 		bkt->pbkt_tbl_sz = i == 0 ? tbl_sz : 0;
1351 	}
1352 	return (0);
1353 }
1354 
1355 /*
1356  * Attribute hash table kmem cache destructor routine.
1357  */
1358 /* ARGSUSED */
1359 static void
1360 pattbl_destructor(void *buf, void *cdrarg)
1361 {
1362 	patbkt_t *bkt;
1363 	uint_t tbl_sz = (uint_t)(uintptr_t)cdrarg;
1364 	uint_t i;
1365 
1366 	ASSERT(tbl_sz > 0);	/* table size can't be zero */
1367 
1368 	for (i = 0, bkt = (patbkt_t *)buf; i < tbl_sz; i++, bkt++) {
1369 		mutex_destroy(&(bkt->pbkt_lock));
1370 		ASSERT(bkt->pbkt_pattr_q.ql_next == &(bkt->pbkt_pattr_q));
1371 		ASSERT(i > 0 || bkt->pbkt_tbl_sz == tbl_sz);
1372 	}
1373 }
1374 
1375 /*
1376  * Destroy an attribute hash table, called by mmd_rempdesc or during free.
1377  */
1378 static void
1379 mmd_destroy_pattbl(patbkt_t **tbl)
1380 {
1381 	patbkt_t *bkt;
1382 	pattr_t *pa, *pa_next;
1383 	uint_t i, tbl_sz;
1384 
1385 	ASSERT(tbl != NULL);
1386 	bkt = *tbl;
1387 	tbl_sz = bkt->pbkt_tbl_sz;
1388 
1389 	/* make sure caller passes in the first bucket */
1390 	ASSERT(tbl_sz > 0);
1391 
1392 	/* destroy the contents of each bucket */
1393 	for (i = 0; i < tbl_sz; i++, bkt++) {
1394 		/* we ought to be exclusive at this point */
1395 		ASSERT(MUTEX_NOT_HELD(&(bkt->pbkt_lock)));
1396 
1397 		pa = Q2PATTR(bkt->pbkt_pattr_q.ql_next);
1398 		while (pa != Q2PATTR(&(bkt->pbkt_pattr_q))) {
1399 			ASSERT(pa->pat_magic == PATTR_MAGIC);
1400 			pa_next = Q2PATTR(pa->pat_next);
1401 			remque(&(pa->pat_next));
1402 			kmem_free(pa, pa->pat_buflen);
1403 			pa = pa_next;
1404 		}
1405 	}
1406 
1407 	kmem_cache_free(pattbl_cache, *tbl);
1408 	*tbl = NULL;
1409 
1410 	/* commit all previous stores */
1411 	membar_producer();
1412 }
1413 
1414 /*
1415  * Copy the contents of an attribute hash table, called by mmd_copy.
1416  */
1417 static int
1418 mmd_copy_pattbl(patbkt_t *src_tbl, multidata_t *n_mmd, pdesc_t *n_pd,
1419     int kmflags)
1420 {
1421 	patbkt_t *bkt;
1422 	pattr_t *pa;
1423 	pattrinfo_t pai;
1424 	uint_t i, tbl_sz;
1425 
1426 	ASSERT(src_tbl != NULL);
1427 	bkt = src_tbl;
1428 	tbl_sz = bkt->pbkt_tbl_sz;
1429 
1430 	/* make sure caller passes in the first bucket */
1431 	ASSERT(tbl_sz > 0);
1432 
1433 	for (i = 0; i < tbl_sz; i++, bkt++) {
1434 		mutex_enter(&(bkt->pbkt_lock));
1435 		pa = Q2PATTR(bkt->pbkt_pattr_q.ql_next);
1436 		while (pa != Q2PATTR(&(bkt->pbkt_pattr_q))) {
1437 			pattr_t *pa_next = Q2PATTR(pa->pat_next);
1438 
1439 			/* skip if it's removed */
1440 			if (pa->pat_flags & PATTR_REM_DEFER) {
1441 				pa = pa_next;
1442 				continue;
1443 			}
1444 
1445 			pai.type = pa->pat_type;
1446 			pai.len = pa->pat_buflen - sizeof (*pa);
1447 			if (mmd_addpattr(n_mmd, n_pd, &pai, (pa->pat_flags &
1448 			    PATTR_PERSIST) != 0, kmflags) == NULL) {
1449 				mutex_exit(&(bkt->pbkt_lock));
1450 				return (-1);
1451 			}
1452 
1453 			/* copy over the contents */
1454 			if (pai.buf != NULL)
1455 				bcopy(pa + 1, pai.buf, pai.len);
1456 
1457 			pa = pa_next;
1458 		}
1459 		mutex_exit(&(bkt->pbkt_lock));
1460 	}
1461 
1462 	return (0);
1463 }
1464 
1465 /*
1466  * Search for an attribute type within an attribute hash bucket.
1467  */
1468 static pattr_t *
1469 mmd_find_pattr(patbkt_t *bkt, uint_t type)
1470 {
1471 	pattr_t *pa_head, *pa;
1472 
1473 	mutex_enter(&(bkt->pbkt_lock));
1474 	pa_head = Q2PATTR(&(bkt->pbkt_pattr_q));
1475 	pa = Q2PATTR(bkt->pbkt_pattr_q.ql_next);
1476 
1477 	while (pa != pa_head) {
1478 		ASSERT(pa->pat_magic == PATTR_MAGIC);
1479 
1480 		/* return a match; we treat removed entry as non-existent */
1481 		if (pa->pat_type == type && !(pa->pat_flags & PATTR_REM_DEFER))
1482 			break;
1483 		pa = Q2PATTR(pa->pat_next);
1484 	}
1485 	mutex_exit(&(bkt->pbkt_lock));
1486 
1487 	return (pa == pa_head ? NULL : pa);
1488 }
1489 
1490 /*
1491  * Remove an attribute from a Multidata.
1492  */
1493 void
1494 mmd_rempattr(pattr_t *pa)
1495 {
1496 	kmutex_t *pat_lock = pa->pat_lock;
1497 
1498 	ASSERT(pa->pat_magic == PATTR_MAGIC);
1499 
1500 	/* ignore if attribute was marked as persistent */
1501 	if ((pa->pat_flags & PATTR_PERSIST) != 0)
1502 		return;
1503 
1504 	mutex_enter(pat_lock);
1505 	/*
1506 	 * We can't deallocate the associated resources if the Multidata
1507 	 * is shared with other threads, because it's possible that the
1508 	 * attribute handle value is held by those threads.  That's why
1509 	 * we simply mark the entry as "removed".  If there are no other
1510 	 * threads, then we free the attribute.
1511 	 */
1512 	if (pa->pat_mmd->mmd_dp->db_ref > 1) {
1513 		pa->pat_flags |= PATTR_REM_DEFER;
1514 	} else {
1515 		remque(&(pa->pat_next));
1516 		kmem_free(pa, pa->pat_buflen);
1517 	}
1518 	mutex_exit(pat_lock);
1519 }
1520 
1521 /*
1522  * Find an attribute (according to its type) and return its handle.
1523  */
1524 pattr_t *
1525 mmd_getpattr(multidata_t *mmd, pdesc_t *pd, pattrinfo_t *pai)
1526 {
1527 	patbkt_t *tbl, *bkt;
1528 	pattr_t *pa;
1529 
1530 	ASSERT(mmd != NULL);
1531 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1532 	ASSERT(pai != NULL);
1533 
1534 	/* get the right attribute hash table (local or global) */
1535 	tbl = pd != NULL ? pd->pd_pattbl : mmd->mmd_pattbl;
1536 
1537 	/* attribute hash table doesn't exist? */
1538 	if (tbl == NULL)
1539 		return (NULL);
1540 
1541 	ASSERT(tbl->pbkt_tbl_sz > 0);
1542 	bkt = &(tbl[PATTBL_HASH(pai->type, tbl->pbkt_tbl_sz)]);
1543 
1544 	if ((pa = mmd_find_pattr(bkt, pai->type)) != NULL) {
1545 		ASSERT(pa->pat_buflen >= sizeof (*pa));
1546 		pai->len = pa->pat_buflen - sizeof (*pa);
1547 		pai->buf = pai->len > 0 ?
1548 		    (uchar_t *)pa + sizeof (pattr_t) : NULL;
1549 	}
1550 	ASSERT(pa == NULL || pa->pat_magic == PATTR_MAGIC);
1551 	return (pa);
1552 }
1553 
1554 /*
1555  * Return total size of buffers and total size of areas referenced
1556  * by all in-use (unremoved) packet descriptors.
1557  */
1558 void
1559 mmd_getsize(multidata_t *mmd, uint_t *ptotal, uint_t *pinuse)
1560 {
1561 	pdesc_t *pd;
1562 	pdescinfo_t *pdi;
1563 	int i;
1564 
1565 	ASSERT(mmd != NULL);
1566 	ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1567 
1568 	mutex_enter(&mmd->mmd_pd_slab_lock);
1569 	if (ptotal != NULL) {
1570 		*ptotal = 0;
1571 
1572 		if (mmd->mmd_hbuf != NULL)
1573 			*ptotal += MBLKL(mmd->mmd_hbuf);
1574 
1575 		for (i = 0; i < mmd->mmd_pbuf_cnt; i++) {
1576 			ASSERT(mmd->mmd_pbuf[i] != NULL);
1577 			*ptotal += MBLKL(mmd->mmd_pbuf[i]);
1578 		}
1579 	}
1580 	if (pinuse != NULL) {
1581 		*pinuse = 0;
1582 
1583 		/* first pdesc */
1584 		pd = mmd_getpdesc(mmd, NULL, NULL, 1, B_TRUE);
1585 		while (pd != NULL) {
1586 			pdi = &pd->pd_pdi;
1587 
1588 			/* next pdesc */
1589 			pd = mmd_getpdesc(mmd, pd, NULL, 1, B_TRUE);
1590 
1591 			/* skip over removed descriptor */
1592 			if (pdi->flags & PDESC_REM_DEFER)
1593 				continue;
1594 
1595 			if (pdi->flags & PDESC_HBUF_REF)
1596 				*pinuse += PDESC_HDRL(pdi);
1597 
1598 			if (pdi->flags & PDESC_PBUF_REF) {
1599 				for (i = 0; i < pdi->pld_cnt; i++)
1600 					*pinuse += PDESC_PLDL(pdi, i);
1601 			}
1602 		}
1603 	}
1604 	mutex_exit(&mmd->mmd_pd_slab_lock);
1605 }
1606