xref: /titanic_50/usr/src/uts/common/io/stream.c (revision b9e93c10c0a2a4bb069d38bb311021a9478c4711)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
22 /*	  All Rights Reserved  	*/
23 
24 /*
25  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
26  * Use is subject to license terms.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/thread.h>
32 #include <sys/sysmacros.h>
33 #include <sys/stropts.h>
34 #include <sys/stream.h>
35 #include <sys/strsubr.h>
36 #include <sys/strsun.h>
37 #include <sys/conf.h>
38 #include <sys/debug.h>
39 #include <sys/cmn_err.h>
40 #include <sys/kmem.h>
41 #include <sys/atomic.h>
42 #include <sys/errno.h>
43 #include <sys/vtrace.h>
44 #include <sys/ftrace.h>
45 #include <sys/ontrap.h>
46 #include <sys/multidata.h>
47 #include <sys/multidata_impl.h>
48 #include <sys/sdt.h>
49 #include <sys/strft.h>
50 
51 #ifdef DEBUG
52 #include <sys/kmem_impl.h>
53 #endif
54 
55 /*
56  * This file contains all the STREAMS utility routines that may
57  * be used by modules and drivers.
58  */
59 
60 /*
61  * STREAMS message allocator: principles of operation
62  *
63  * The streams message allocator consists of all the routines that
64  * allocate, dup and free streams messages: allocb(), [d]esballoc[a],
65  * dupb(), freeb() and freemsg().  What follows is a high-level view
66  * of how the allocator works.
67  *
68  * Every streams message consists of one or more mblks, a dblk, and data.
69  * All mblks for all types of messages come from a common mblk_cache.
70  * The dblk and data come in several flavors, depending on how the
71  * message is allocated:
72  *
73  * (1) mblks up to DBLK_MAX_CACHE size are allocated from a collection of
74  *     fixed-size dblk/data caches. For message sizes that are multiples of
75  *     PAGESIZE, dblks are allocated separately from the buffer.
76  *     The associated buffer is allocated by the constructor using kmem_alloc().
77  *     For all other message sizes, dblk and its associated data is allocated
78  *     as a single contiguous chunk of memory.
79  *     Objects in these caches consist of a dblk plus its associated data.
80  *     allocb() determines the nearest-size cache by table lookup:
81  *     the dblk_cache[] array provides the mapping from size to dblk cache.
82  *
83  * (2) Large messages (size > DBLK_MAX_CACHE) are constructed by
84  *     kmem_alloc()'ing a buffer for the data and supplying that
85  *     buffer to gesballoc(), described below.
86  *
87  * (3) The four flavors of [d]esballoc[a] are all implemented by a
88  *     common routine, gesballoc() ("generic esballoc").  gesballoc()
89  *     allocates a dblk from the global dblk_esb_cache and sets db_base,
90  *     db_lim and db_frtnp to describe the caller-supplied buffer.
91  *
92  * While there are several routines to allocate messages, there is only
93  * one routine to free messages: freeb().  freeb() simply invokes the
94  * dblk's free method, dbp->db_free(), which is set at allocation time.
95  *
96  * dupb() creates a new reference to a message by allocating a new mblk,
97  * incrementing the dblk reference count and setting the dblk's free
98  * method to dblk_decref().  The dblk's original free method is retained
99  * in db_lastfree.  dblk_decref() decrements the reference count on each
100  * freeb().  If this is not the last reference it just frees the mblk;
101  * if this *is* the last reference, it restores db_free to db_lastfree,
102  * sets db_mblk to the current mblk (see below), and invokes db_lastfree.
103  *
104  * The implementation makes aggressive use of kmem object caching for
105  * maximum performance.  This makes the code simple and compact, but
106  * also a bit abstruse in some places.  The invariants that constitute a
107  * message's constructed state, described below, are more subtle than usual.
108  *
109  * Every dblk has an "attached mblk" as part of its constructed state.
110  * The mblk is allocated by the dblk's constructor and remains attached
111  * until the message is either dup'ed or pulled up.  In the dupb() case
112  * the mblk association doesn't matter until the last free, at which time
113  * dblk_decref() attaches the last mblk to the dblk.  pullupmsg() affects
114  * the mblk association because it swaps the leading mblks of two messages,
115  * so it is responsible for swapping their db_mblk pointers accordingly.
116  * From a constructed-state viewpoint it doesn't matter that a dblk's
117  * attached mblk can change while the message is allocated; all that
118  * matters is that the dblk has *some* attached mblk when it's freed.
119  *
120  * The sizes of the allocb() small-message caches are not magical.
121  * They represent a good trade-off between internal and external
122  * fragmentation for current workloads.  They should be reevaluated
123  * periodically, especially if allocations larger than DBLK_MAX_CACHE
124  * become common.  We use 64-byte alignment so that dblks don't
125  * straddle cache lines unnecessarily.
126  */
127 #define	DBLK_MAX_CACHE		73728
128 #define	DBLK_CACHE_ALIGN	64
129 #define	DBLK_MIN_SIZE		8
130 #define	DBLK_SIZE_SHIFT		3
131 
132 #ifdef _BIG_ENDIAN
133 #define	DBLK_RTFU_SHIFT(field)	\
134 	(8 * (&((dblk_t *)0)->db_struioflag - &((dblk_t *)0)->field))
135 #else
136 #define	DBLK_RTFU_SHIFT(field)	\
137 	(8 * (&((dblk_t *)0)->field - &((dblk_t *)0)->db_ref))
138 #endif
139 
140 #define	DBLK_RTFU(ref, type, flags, uioflag)	\
141 	(((ref) << DBLK_RTFU_SHIFT(db_ref)) | \
142 	((type) << DBLK_RTFU_SHIFT(db_type)) | \
143 	(((flags) | (ref - 1)) << DBLK_RTFU_SHIFT(db_flags)) | \
144 	((uioflag) << DBLK_RTFU_SHIFT(db_struioflag)))
145 #define	DBLK_RTFU_REF_MASK	(DBLK_REFMAX << DBLK_RTFU_SHIFT(db_ref))
146 #define	DBLK_RTFU_WORD(dbp)	(*((uint32_t *)&(dbp)->db_ref))
147 #define	MBLK_BAND_FLAG_WORD(mp)	(*((uint32_t *)&(mp)->b_band))
148 
149 static size_t dblk_sizes[] = {
150 #ifdef _LP64
151 	16, 80, 144, 208, 272, 336, 528, 1040, 1488, 1936, 2576, 3856,
152 	8192, 12048, 16384, 20240, 24576, 28432, 32768, 36624,
153 	40960, 44816, 49152, 53008, 57344, 61200, 65536, 69392,
154 #else
155 	64, 128, 320, 576, 1088, 1536, 1984, 2624, 3904,
156 	8192, 12096, 16384, 20288, 24576, 28480, 32768, 36672,
157 	40960, 44864, 49152, 53056, 57344, 61248, 65536, 69440,
158 #endif
159 	DBLK_MAX_CACHE, 0
160 };
161 
162 static struct kmem_cache *dblk_cache[DBLK_MAX_CACHE / DBLK_MIN_SIZE];
163 static struct kmem_cache *mblk_cache;
164 static struct kmem_cache *dblk_esb_cache;
165 static struct kmem_cache *fthdr_cache;
166 static struct kmem_cache *ftblk_cache;
167 
168 static void dblk_lastfree(mblk_t *mp, dblk_t *dbp);
169 static mblk_t *allocb_oversize(size_t size, int flags);
170 static int allocb_tryhard_fails;
171 static void frnop_func(void *arg);
172 frtn_t frnop = { frnop_func };
173 static void bcache_dblk_lastfree(mblk_t *mp, dblk_t *dbp);
174 
175 static boolean_t rwnext_enter(queue_t *qp);
176 static void rwnext_exit(queue_t *qp);
177 
178 /*
179  * Patchable mblk/dblk kmem_cache flags.
180  */
181 int dblk_kmem_flags = 0;
182 int mblk_kmem_flags = 0;
183 
184 static int
185 dblk_constructor(void *buf, void *cdrarg, int kmflags)
186 {
187 	dblk_t *dbp = buf;
188 	ssize_t msg_size = (ssize_t)cdrarg;
189 	size_t index;
190 
191 	ASSERT(msg_size != 0);
192 
193 	index = (msg_size - 1) >> DBLK_SIZE_SHIFT;
194 
195 	ASSERT(index < (DBLK_MAX_CACHE >> DBLK_SIZE_SHIFT));
196 
197 	if ((dbp->db_mblk = kmem_cache_alloc(mblk_cache, kmflags)) == NULL)
198 		return (-1);
199 	if ((msg_size & PAGEOFFSET) == 0) {
200 		dbp->db_base = kmem_alloc(msg_size, kmflags);
201 		if (dbp->db_base == NULL) {
202 			kmem_cache_free(mblk_cache, dbp->db_mblk);
203 			return (-1);
204 		}
205 	} else {
206 		dbp->db_base = (unsigned char *)&dbp[1];
207 	}
208 
209 	dbp->db_mblk->b_datap = dbp;
210 	dbp->db_cache = dblk_cache[index];
211 	dbp->db_lim = dbp->db_base + msg_size;
212 	dbp->db_free = dbp->db_lastfree = dblk_lastfree;
213 	dbp->db_frtnp = NULL;
214 	dbp->db_fthdr = NULL;
215 	dbp->db_credp = NULL;
216 	dbp->db_cpid = -1;
217 	dbp->db_struioflag = 0;
218 	dbp->db_struioun.cksum.flags = 0;
219 	return (0);
220 }
221 
222 /*ARGSUSED*/
223 static int
224 dblk_esb_constructor(void *buf, void *cdrarg, int kmflags)
225 {
226 	dblk_t *dbp = buf;
227 
228 	if ((dbp->db_mblk = kmem_cache_alloc(mblk_cache, kmflags)) == NULL)
229 		return (-1);
230 	dbp->db_mblk->b_datap = dbp;
231 	dbp->db_cache = dblk_esb_cache;
232 	dbp->db_fthdr = NULL;
233 	dbp->db_credp = NULL;
234 	dbp->db_cpid = -1;
235 	dbp->db_struioflag = 0;
236 	dbp->db_struioun.cksum.flags = 0;
237 	return (0);
238 }
239 
240 static int
241 bcache_dblk_constructor(void *buf, void *cdrarg, int kmflags)
242 {
243 	dblk_t *dbp = buf;
244 	bcache_t *bcp = cdrarg;
245 
246 	if ((dbp->db_mblk = kmem_cache_alloc(mblk_cache, kmflags)) == NULL)
247 		return (-1);
248 
249 	dbp->db_base = kmem_cache_alloc(bcp->buffer_cache, kmflags);
250 	if (dbp->db_base == NULL) {
251 		kmem_cache_free(mblk_cache, dbp->db_mblk);
252 		return (-1);
253 	}
254 
255 	dbp->db_mblk->b_datap = dbp;
256 	dbp->db_cache = (void *)bcp;
257 	dbp->db_lim = dbp->db_base + bcp->size;
258 	dbp->db_free = dbp->db_lastfree = bcache_dblk_lastfree;
259 	dbp->db_frtnp = NULL;
260 	dbp->db_fthdr = NULL;
261 	dbp->db_credp = NULL;
262 	dbp->db_cpid = -1;
263 	dbp->db_struioflag = 0;
264 	dbp->db_struioun.cksum.flags = 0;
265 	return (0);
266 }
267 
268 /*ARGSUSED*/
269 static void
270 dblk_destructor(void *buf, void *cdrarg)
271 {
272 	dblk_t *dbp = buf;
273 	ssize_t msg_size = (ssize_t)cdrarg;
274 
275 	ASSERT(dbp->db_mblk->b_datap == dbp);
276 	ASSERT(msg_size != 0);
277 	ASSERT(dbp->db_struioflag == 0);
278 	ASSERT(dbp->db_struioun.cksum.flags == 0);
279 
280 	if ((msg_size & PAGEOFFSET) == 0) {
281 		kmem_free(dbp->db_base, msg_size);
282 	}
283 
284 	kmem_cache_free(mblk_cache, dbp->db_mblk);
285 }
286 
287 static void
288 bcache_dblk_destructor(void *buf, void *cdrarg)
289 {
290 	dblk_t *dbp = buf;
291 	bcache_t *bcp = cdrarg;
292 
293 	kmem_cache_free(bcp->buffer_cache, dbp->db_base);
294 
295 	ASSERT(dbp->db_mblk->b_datap == dbp);
296 	ASSERT(dbp->db_struioflag == 0);
297 	ASSERT(dbp->db_struioun.cksum.flags == 0);
298 
299 	kmem_cache_free(mblk_cache, dbp->db_mblk);
300 }
301 
302 /* ARGSUSED */
303 static int
304 ftblk_constructor(void *buf, void *cdrarg, int kmflags)
305 {
306 	ftblk_t *fbp = buf;
307 	int i;
308 
309 	bzero(fbp, sizeof (ftblk_t));
310 	if (str_ftstack != 0) {
311 		for (i = 0; i < FTBLK_EVNTS; i++)
312 			fbp->ev[i].stk = kmem_alloc(sizeof (ftstk_t), kmflags);
313 	}
314 
315 	return (0);
316 }
317 
318 /* ARGSUSED */
319 static void
320 ftblk_destructor(void *buf, void *cdrarg)
321 {
322 	ftblk_t *fbp = buf;
323 	int i;
324 
325 	if (str_ftstack != 0) {
326 		for (i = 0; i < FTBLK_EVNTS; i++) {
327 			if (fbp->ev[i].stk != NULL) {
328 				kmem_free(fbp->ev[i].stk, sizeof (ftstk_t));
329 				fbp->ev[i].stk = NULL;
330 			}
331 		}
332 	}
333 }
334 
335 static int
336 fthdr_constructor(void *buf, void *cdrarg, int kmflags)
337 {
338 	fthdr_t *fhp = buf;
339 
340 	return (ftblk_constructor(&fhp->first, cdrarg, kmflags));
341 }
342 
343 static void
344 fthdr_destructor(void *buf, void *cdrarg)
345 {
346 	fthdr_t *fhp = buf;
347 
348 	ftblk_destructor(&fhp->first, cdrarg);
349 }
350 
351 void
352 streams_msg_init(void)
353 {
354 	char name[40];
355 	size_t size;
356 	size_t lastsize = DBLK_MIN_SIZE;
357 	size_t *sizep;
358 	struct kmem_cache *cp;
359 	size_t tot_size;
360 	int offset;
361 
362 	mblk_cache = kmem_cache_create("streams_mblk", sizeof (mblk_t), 32,
363 	    NULL, NULL, NULL, NULL, NULL, mblk_kmem_flags);
364 
365 	for (sizep = dblk_sizes; (size = *sizep) != 0; sizep++) {
366 
367 		if ((offset = (size & PAGEOFFSET)) != 0) {
368 			/*
369 			 * We are in the middle of a page, dblk should
370 			 * be allocated on the same page
371 			 */
372 			tot_size = size + sizeof (dblk_t);
373 			ASSERT((offset + sizeof (dblk_t) + sizeof (kmem_slab_t))
374 			    < PAGESIZE);
375 			ASSERT((tot_size & (DBLK_CACHE_ALIGN - 1)) == 0);
376 
377 		} else {
378 
379 			/*
380 			 * buf size is multiple of page size, dblk and
381 			 * buffer are allocated separately.
382 			 */
383 
384 			ASSERT((size & (DBLK_CACHE_ALIGN - 1)) == 0);
385 			tot_size = sizeof (dblk_t);
386 		}
387 
388 		(void) sprintf(name, "streams_dblk_%ld", size);
389 		cp = kmem_cache_create(name, tot_size, DBLK_CACHE_ALIGN,
390 		    dblk_constructor, dblk_destructor, NULL, (void *)(size),
391 		    NULL, dblk_kmem_flags);
392 
393 		while (lastsize <= size) {
394 			dblk_cache[(lastsize - 1) >> DBLK_SIZE_SHIFT] = cp;
395 			lastsize += DBLK_MIN_SIZE;
396 		}
397 	}
398 
399 	dblk_esb_cache = kmem_cache_create("streams_dblk_esb", sizeof (dblk_t),
400 	    DBLK_CACHE_ALIGN, dblk_esb_constructor, dblk_destructor, NULL,
401 	    (void *)sizeof (dblk_t), NULL, dblk_kmem_flags);
402 	fthdr_cache = kmem_cache_create("streams_fthdr", sizeof (fthdr_t), 32,
403 	    fthdr_constructor, fthdr_destructor, NULL, NULL, NULL, 0);
404 	ftblk_cache = kmem_cache_create("streams_ftblk", sizeof (ftblk_t), 32,
405 	    ftblk_constructor, ftblk_destructor, NULL, NULL, NULL, 0);
406 
407 	/* Initialize Multidata caches */
408 	mmd_init();
409 
410 	/* initialize throttling queue for esballoc */
411 	esballoc_queue_init();
412 }
413 
414 /*ARGSUSED*/
415 mblk_t *
416 allocb(size_t size, uint_t pri)
417 {
418 	dblk_t *dbp;
419 	mblk_t *mp;
420 	size_t index;
421 
422 	index =  (size - 1)  >> DBLK_SIZE_SHIFT;
423 
424 	if (index >= (DBLK_MAX_CACHE >> DBLK_SIZE_SHIFT)) {
425 		if (size != 0) {
426 			mp = allocb_oversize(size, KM_NOSLEEP);
427 			goto out;
428 		}
429 		index = 0;
430 	}
431 
432 	if ((dbp = kmem_cache_alloc(dblk_cache[index], KM_NOSLEEP)) == NULL) {
433 		mp = NULL;
434 		goto out;
435 	}
436 
437 	mp = dbp->db_mblk;
438 	DBLK_RTFU_WORD(dbp) = DBLK_RTFU(1, M_DATA, 0, 0);
439 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
440 	mp->b_rptr = mp->b_wptr = dbp->db_base;
441 	mp->b_queue = NULL;
442 	MBLK_BAND_FLAG_WORD(mp) = 0;
443 	STR_FTALLOC(&dbp->db_fthdr, FTEV_ALLOCB, size);
444 out:
445 	FTRACE_1("allocb(): mp=0x%p", (uintptr_t)mp);
446 
447 	return (mp);
448 }
449 
450 /*
451  * Allocate an mblk taking db_credp and db_cpid from the template.
452  * Allow the cred to be NULL.
453  */
454 mblk_t *
455 allocb_tmpl(size_t size, const mblk_t *tmpl)
456 {
457 	mblk_t *mp = allocb(size, 0);
458 
459 	if (mp != NULL) {
460 		dblk_t *src = tmpl->b_datap;
461 		dblk_t *dst = mp->b_datap;
462 		cred_t *cr = src->db_credp;
463 
464 		if (cr != NULL)
465 			crhold(dst->db_credp = cr);
466 		dst->db_cpid = src->db_cpid;
467 		dst->db_type = src->db_type;
468 	}
469 	return (mp);
470 }
471 
472 mblk_t *
473 allocb_cred(size_t size, cred_t *cr, pid_t cpid)
474 {
475 	mblk_t *mp = allocb(size, 0);
476 
477 	ASSERT(cr != NULL);
478 	if (mp != NULL) {
479 		dblk_t *dbp = mp->b_datap;
480 
481 		crhold(dbp->db_credp = cr);
482 		dbp->db_cpid = cpid;
483 	}
484 	return (mp);
485 }
486 
487 mblk_t *
488 allocb_cred_wait(size_t size, uint_t flags, int *error, cred_t *cr, pid_t cpid)
489 {
490 	mblk_t *mp = allocb_wait(size, 0, flags, error);
491 
492 	ASSERT(cr != NULL);
493 	if (mp != NULL) {
494 		dblk_t *dbp = mp->b_datap;
495 
496 		crhold(dbp->db_credp = cr);
497 		dbp->db_cpid = cpid;
498 	}
499 
500 	return (mp);
501 }
502 
503 /*
504  * Extract the db_cred (and optionally db_cpid) from a message.
505  * We find the first mblk which has a non-NULL db_cred and use that.
506  * If none found we return NULL.
507  * Does NOT get a hold on the cred.
508  */
509 cred_t *
510 msg_getcred(const mblk_t *mp, pid_t *cpidp)
511 {
512 	cred_t *cr = NULL;
513 	cred_t *cr2;
514 
515 	while (mp != NULL) {
516 		dblk_t *dbp = mp->b_datap;
517 
518 		cr = dbp->db_credp;
519 		if (cr == NULL) {
520 			mp = mp->b_cont;
521 			continue;
522 		}
523 		if (cpidp != NULL)
524 			*cpidp = dbp->db_cpid;
525 
526 #ifdef DEBUG
527 		/*
528 		 * Normally there should at most one db_credp in a message.
529 		 * But if there are multiple (as in the case of some M_IOC*
530 		 * and some internal messages in TCP/IP bind logic) then
531 		 * they must be identical in the normal case.
532 		 * However, a socket can be shared between different uids
533 		 * in which case data queued in TCP would be from different
534 		 * creds. Thus we can only assert for the zoneid being the
535 		 * same. Due to Multi-level Level Ports for TX, some
536 		 * cred_t can have a NULL cr_zone, and we skip the comparison
537 		 * in that case.
538 		 */
539 		cr2 = msg_getcred(mp->b_cont, NULL);
540 		if (cr2 != NULL) {
541 			DTRACE_PROBE2(msg__getcred,
542 			    cred_t *, cr, cred_t *, cr2);
543 			ASSERT(crgetzoneid(cr) == crgetzoneid(cr2) ||
544 			    crgetzone(cr) == NULL ||
545 			    crgetzone(cr2) == NULL);
546 		}
547 #endif
548 		return (cr);
549 	}
550 	if (cpidp != NULL)
551 		*cpidp = NOPID;
552 	return (NULL);
553 }
554 
555 /*
556  * Variant of msg_getcred which, when a cred is found
557  * 1. Returns with a hold on the cred
558  * 2. Clears the first cred in the mblk.
559  * This is more efficient to use than a msg_getcred() + crhold() when
560  * the message is freed after the cred has been extracted.
561  *
562  * The caller is responsible for ensuring that there is no other reference
563  * on the message since db_credp can not be cleared when there are other
564  * references.
565  */
566 cred_t *
567 msg_extractcred(mblk_t *mp, pid_t *cpidp)
568 {
569 	cred_t *cr = NULL;
570 	cred_t *cr2;
571 
572 	while (mp != NULL) {
573 		dblk_t *dbp = mp->b_datap;
574 
575 		cr = dbp->db_credp;
576 		if (cr == NULL) {
577 			mp = mp->b_cont;
578 			continue;
579 		}
580 		ASSERT(dbp->db_ref == 1);
581 		dbp->db_credp = NULL;
582 		if (cpidp != NULL)
583 			*cpidp = dbp->db_cpid;
584 #ifdef DEBUG
585 		/*
586 		 * Normally there should at most one db_credp in a message.
587 		 * But if there are multiple (as in the case of some M_IOC*
588 		 * and some internal messages in TCP/IP bind logic) then
589 		 * they must be identical in the normal case.
590 		 * However, a socket can be shared between different uids
591 		 * in which case data queued in TCP would be from different
592 		 * creds. Thus we can only assert for the zoneid being the
593 		 * same. Due to Multi-level Level Ports for TX, some
594 		 * cred_t can have a NULL cr_zone, and we skip the comparison
595 		 * in that case.
596 		 */
597 		cr2 = msg_getcred(mp->b_cont, NULL);
598 		if (cr2 != NULL) {
599 			DTRACE_PROBE2(msg__extractcred,
600 			    cred_t *, cr, cred_t *, cr2);
601 			ASSERT(crgetzoneid(cr) == crgetzoneid(cr2) ||
602 			    crgetzone(cr) == NULL ||
603 			    crgetzone(cr2) == NULL);
604 		}
605 #endif
606 		return (cr);
607 	}
608 	return (NULL);
609 }
610 /*
611  * Get the label for a message. Uses the first mblk in the message
612  * which has a non-NULL db_credp.
613  * Returns NULL if there is no credp.
614  */
615 extern struct ts_label_s *
616 msg_getlabel(const mblk_t *mp)
617 {
618 	cred_t *cr = msg_getcred(mp, NULL);
619 
620 	if (cr == NULL)
621 		return (NULL);
622 
623 	return (crgetlabel(cr));
624 }
625 
626 void
627 freeb(mblk_t *mp)
628 {
629 	dblk_t *dbp = mp->b_datap;
630 
631 	ASSERT(dbp->db_ref > 0);
632 	ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
633 	FTRACE_1("freeb(): mp=0x%lx", (uintptr_t)mp);
634 
635 	STR_FTEVENT_MBLK(mp, caller(), FTEV_FREEB, dbp->db_ref);
636 
637 	dbp->db_free(mp, dbp);
638 }
639 
640 void
641 freemsg(mblk_t *mp)
642 {
643 	FTRACE_1("freemsg(): mp=0x%lx", (uintptr_t)mp);
644 	while (mp) {
645 		dblk_t *dbp = mp->b_datap;
646 		mblk_t *mp_cont = mp->b_cont;
647 
648 		ASSERT(dbp->db_ref > 0);
649 		ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
650 
651 		STR_FTEVENT_MBLK(mp, caller(), FTEV_FREEB, dbp->db_ref);
652 
653 		dbp->db_free(mp, dbp);
654 		mp = mp_cont;
655 	}
656 }
657 
658 /*
659  * Reallocate a block for another use.  Try hard to use the old block.
660  * If the old data is wanted (copy), leave b_wptr at the end of the data,
661  * otherwise return b_wptr = b_rptr.
662  *
663  * This routine is private and unstable.
664  */
665 mblk_t	*
666 reallocb(mblk_t *mp, size_t size, uint_t copy)
667 {
668 	mblk_t		*mp1;
669 	unsigned char	*old_rptr;
670 	ptrdiff_t	cur_size;
671 
672 	if (mp == NULL)
673 		return (allocb(size, BPRI_HI));
674 
675 	cur_size = mp->b_wptr - mp->b_rptr;
676 	old_rptr = mp->b_rptr;
677 
678 	ASSERT(mp->b_datap->db_ref != 0);
679 
680 	if (mp->b_datap->db_ref == 1 && MBLKSIZE(mp) >= size) {
681 		/*
682 		 * If the data is wanted and it will fit where it is, no
683 		 * work is required.
684 		 */
685 		if (copy && mp->b_datap->db_lim - mp->b_rptr >= size)
686 			return (mp);
687 
688 		mp->b_wptr = mp->b_rptr = mp->b_datap->db_base;
689 		mp1 = mp;
690 	} else if ((mp1 = allocb_tmpl(size, mp)) != NULL) {
691 		/* XXX other mp state could be copied too, db_flags ... ? */
692 		mp1->b_cont = mp->b_cont;
693 	} else {
694 		return (NULL);
695 	}
696 
697 	if (copy) {
698 		bcopy(old_rptr, mp1->b_rptr, cur_size);
699 		mp1->b_wptr = mp1->b_rptr + cur_size;
700 	}
701 
702 	if (mp != mp1)
703 		freeb(mp);
704 
705 	return (mp1);
706 }
707 
708 static void
709 dblk_lastfree(mblk_t *mp, dblk_t *dbp)
710 {
711 	ASSERT(dbp->db_mblk == mp);
712 	if (dbp->db_fthdr != NULL)
713 		str_ftfree(dbp);
714 
715 	/* set credp and projid to be 'unspecified' before returning to cache */
716 	if (dbp->db_credp != NULL) {
717 		crfree(dbp->db_credp);
718 		dbp->db_credp = NULL;
719 	}
720 	dbp->db_cpid = -1;
721 
722 	/* Reset the struioflag and the checksum flag fields */
723 	dbp->db_struioflag = 0;
724 	dbp->db_struioun.cksum.flags = 0;
725 
726 	/* and the COOKED and/or UIOA flag(s) */
727 	dbp->db_flags &= ~(DBLK_COOKED | DBLK_UIOA);
728 
729 	kmem_cache_free(dbp->db_cache, dbp);
730 }
731 
732 static void
733 dblk_decref(mblk_t *mp, dblk_t *dbp)
734 {
735 	if (dbp->db_ref != 1) {
736 		uint32_t rtfu = atomic_add_32_nv(&DBLK_RTFU_WORD(dbp),
737 		    -(1 << DBLK_RTFU_SHIFT(db_ref)));
738 		/*
739 		 * atomic_add_32_nv() just decremented db_ref, so we no longer
740 		 * have a reference to the dblk, which means another thread
741 		 * could free it.  Therefore we cannot examine the dblk to
742 		 * determine whether ours was the last reference.  Instead,
743 		 * we extract the new and minimum reference counts from rtfu.
744 		 * Note that all we're really saying is "if (ref != refmin)".
745 		 */
746 		if (((rtfu >> DBLK_RTFU_SHIFT(db_ref)) & DBLK_REFMAX) !=
747 		    ((rtfu >> DBLK_RTFU_SHIFT(db_flags)) & DBLK_REFMIN)) {
748 			kmem_cache_free(mblk_cache, mp);
749 			return;
750 		}
751 	}
752 	dbp->db_mblk = mp;
753 	dbp->db_free = dbp->db_lastfree;
754 	dbp->db_lastfree(mp, dbp);
755 }
756 
757 mblk_t *
758 dupb(mblk_t *mp)
759 {
760 	dblk_t *dbp = mp->b_datap;
761 	mblk_t *new_mp;
762 	uint32_t oldrtfu, newrtfu;
763 
764 	if ((new_mp = kmem_cache_alloc(mblk_cache, KM_NOSLEEP)) == NULL)
765 		goto out;
766 
767 	new_mp->b_next = new_mp->b_prev = new_mp->b_cont = NULL;
768 	new_mp->b_rptr = mp->b_rptr;
769 	new_mp->b_wptr = mp->b_wptr;
770 	new_mp->b_datap = dbp;
771 	new_mp->b_queue = NULL;
772 	MBLK_BAND_FLAG_WORD(new_mp) = MBLK_BAND_FLAG_WORD(mp);
773 
774 	STR_FTEVENT_MBLK(mp, caller(), FTEV_DUPB, dbp->db_ref);
775 
776 	dbp->db_free = dblk_decref;
777 	do {
778 		ASSERT(dbp->db_ref > 0);
779 		oldrtfu = DBLK_RTFU_WORD(dbp);
780 		newrtfu = oldrtfu + (1 << DBLK_RTFU_SHIFT(db_ref));
781 		/*
782 		 * If db_ref is maxed out we can't dup this message anymore.
783 		 */
784 		if ((oldrtfu & DBLK_RTFU_REF_MASK) == DBLK_RTFU_REF_MASK) {
785 			kmem_cache_free(mblk_cache, new_mp);
786 			new_mp = NULL;
787 			goto out;
788 		}
789 	} while (cas32(&DBLK_RTFU_WORD(dbp), oldrtfu, newrtfu) != oldrtfu);
790 
791 out:
792 	FTRACE_1("dupb(): new_mp=0x%lx", (uintptr_t)new_mp);
793 	return (new_mp);
794 }
795 
796 static void
797 dblk_lastfree_desb(mblk_t *mp, dblk_t *dbp)
798 {
799 	frtn_t *frp = dbp->db_frtnp;
800 
801 	ASSERT(dbp->db_mblk == mp);
802 	frp->free_func(frp->free_arg);
803 	if (dbp->db_fthdr != NULL)
804 		str_ftfree(dbp);
805 
806 	/* set credp and projid to be 'unspecified' before returning to cache */
807 	if (dbp->db_credp != NULL) {
808 		crfree(dbp->db_credp);
809 		dbp->db_credp = NULL;
810 	}
811 	dbp->db_cpid = -1;
812 	dbp->db_struioflag = 0;
813 	dbp->db_struioun.cksum.flags = 0;
814 
815 	kmem_cache_free(dbp->db_cache, dbp);
816 }
817 
818 /*ARGSUSED*/
819 static void
820 frnop_func(void *arg)
821 {
822 }
823 
824 /*
825  * Generic esballoc used to implement the four flavors: [d]esballoc[a].
826  */
827 static mblk_t *
828 gesballoc(unsigned char *base, size_t size, uint32_t db_rtfu, frtn_t *frp,
829 	void (*lastfree)(mblk_t *, dblk_t *), int kmflags)
830 {
831 	dblk_t *dbp;
832 	mblk_t *mp;
833 
834 	ASSERT(base != NULL && frp != NULL);
835 
836 	if ((dbp = kmem_cache_alloc(dblk_esb_cache, kmflags)) == NULL) {
837 		mp = NULL;
838 		goto out;
839 	}
840 
841 	mp = dbp->db_mblk;
842 	dbp->db_base = base;
843 	dbp->db_lim = base + size;
844 	dbp->db_free = dbp->db_lastfree = lastfree;
845 	dbp->db_frtnp = frp;
846 	DBLK_RTFU_WORD(dbp) = db_rtfu;
847 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
848 	mp->b_rptr = mp->b_wptr = base;
849 	mp->b_queue = NULL;
850 	MBLK_BAND_FLAG_WORD(mp) = 0;
851 
852 out:
853 	FTRACE_1("gesballoc(): mp=0x%lx", (uintptr_t)mp);
854 	return (mp);
855 }
856 
857 /*ARGSUSED*/
858 mblk_t *
859 esballoc(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
860 {
861 	mblk_t *mp;
862 
863 	/*
864 	 * Note that this is structured to allow the common case (i.e.
865 	 * STREAMS flowtracing disabled) to call gesballoc() with tail
866 	 * call optimization.
867 	 */
868 	if (!str_ftnever) {
869 		mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
870 		    frp, freebs_enqueue, KM_NOSLEEP);
871 
872 		if (mp != NULL)
873 			STR_FTALLOC(&DB_FTHDR(mp), FTEV_ESBALLOC, size);
874 		return (mp);
875 	}
876 
877 	return (gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
878 	    frp, freebs_enqueue, KM_NOSLEEP));
879 }
880 
881 /*
882  * Same as esballoc() but sleeps waiting for memory.
883  */
884 /*ARGSUSED*/
885 mblk_t *
886 esballoc_wait(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
887 {
888 	mblk_t *mp;
889 
890 	/*
891 	 * Note that this is structured to allow the common case (i.e.
892 	 * STREAMS flowtracing disabled) to call gesballoc() with tail
893 	 * call optimization.
894 	 */
895 	if (!str_ftnever) {
896 		mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
897 		    frp, freebs_enqueue, KM_SLEEP);
898 
899 		STR_FTALLOC(&DB_FTHDR(mp), FTEV_ESBALLOC, size);
900 		return (mp);
901 	}
902 
903 	return (gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
904 	    frp, freebs_enqueue, KM_SLEEP));
905 }
906 
907 /*ARGSUSED*/
908 mblk_t *
909 desballoc(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
910 {
911 	mblk_t *mp;
912 
913 	/*
914 	 * Note that this is structured to allow the common case (i.e.
915 	 * STREAMS flowtracing disabled) to call gesballoc() with tail
916 	 * call optimization.
917 	 */
918 	if (!str_ftnever) {
919 		mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
920 		    frp, dblk_lastfree_desb, KM_NOSLEEP);
921 
922 		if (mp != NULL)
923 			STR_FTALLOC(&DB_FTHDR(mp), FTEV_DESBALLOC, size);
924 		return (mp);
925 	}
926 
927 	return (gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
928 	    frp, dblk_lastfree_desb, KM_NOSLEEP));
929 }
930 
931 /*ARGSUSED*/
932 mblk_t *
933 esballoca(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
934 {
935 	mblk_t *mp;
936 
937 	/*
938 	 * Note that this is structured to allow the common case (i.e.
939 	 * STREAMS flowtracing disabled) to call gesballoc() with tail
940 	 * call optimization.
941 	 */
942 	if (!str_ftnever) {
943 		mp = gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
944 		    frp, freebs_enqueue, KM_NOSLEEP);
945 
946 		if (mp != NULL)
947 			STR_FTALLOC(&DB_FTHDR(mp), FTEV_ESBALLOCA, size);
948 		return (mp);
949 	}
950 
951 	return (gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
952 	    frp, freebs_enqueue, KM_NOSLEEP));
953 }
954 
955 /*ARGSUSED*/
956 mblk_t *
957 desballoca(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
958 {
959 	mblk_t *mp;
960 
961 	/*
962 	 * Note that this is structured to allow the common case (i.e.
963 	 * STREAMS flowtracing disabled) to call gesballoc() with tail
964 	 * call optimization.
965 	 */
966 	if (!str_ftnever) {
967 		mp = gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
968 		    frp, dblk_lastfree_desb, KM_NOSLEEP);
969 
970 		if (mp != NULL)
971 			STR_FTALLOC(&DB_FTHDR(mp), FTEV_DESBALLOCA, size);
972 		return (mp);
973 	}
974 
975 	return (gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
976 	    frp, dblk_lastfree_desb, KM_NOSLEEP));
977 }
978 
979 static void
980 bcache_dblk_lastfree(mblk_t *mp, dblk_t *dbp)
981 {
982 	bcache_t *bcp = dbp->db_cache;
983 
984 	ASSERT(dbp->db_mblk == mp);
985 	if (dbp->db_fthdr != NULL)
986 		str_ftfree(dbp);
987 
988 	/* set credp and projid to be 'unspecified' before returning to cache */
989 	if (dbp->db_credp != NULL) {
990 		crfree(dbp->db_credp);
991 		dbp->db_credp = NULL;
992 	}
993 	dbp->db_cpid = -1;
994 	dbp->db_struioflag = 0;
995 	dbp->db_struioun.cksum.flags = 0;
996 
997 	mutex_enter(&bcp->mutex);
998 	kmem_cache_free(bcp->dblk_cache, dbp);
999 	bcp->alloc--;
1000 
1001 	if (bcp->alloc == 0 && bcp->destroy != 0) {
1002 		kmem_cache_destroy(bcp->dblk_cache);
1003 		kmem_cache_destroy(bcp->buffer_cache);
1004 		mutex_exit(&bcp->mutex);
1005 		mutex_destroy(&bcp->mutex);
1006 		kmem_free(bcp, sizeof (bcache_t));
1007 	} else {
1008 		mutex_exit(&bcp->mutex);
1009 	}
1010 }
1011 
1012 bcache_t *
1013 bcache_create(char *name, size_t size, uint_t align)
1014 {
1015 	bcache_t *bcp;
1016 	char buffer[255];
1017 
1018 	ASSERT((align & (align - 1)) == 0);
1019 
1020 	if ((bcp = kmem_alloc(sizeof (bcache_t), KM_NOSLEEP)) == NULL)
1021 		return (NULL);
1022 
1023 	bcp->size = size;
1024 	bcp->align = align;
1025 	bcp->alloc = 0;
1026 	bcp->destroy = 0;
1027 
1028 	mutex_init(&bcp->mutex, NULL, MUTEX_DRIVER, NULL);
1029 
1030 	(void) sprintf(buffer, "%s_buffer_cache", name);
1031 	bcp->buffer_cache = kmem_cache_create(buffer, size, align, NULL, NULL,
1032 	    NULL, NULL, NULL, 0);
1033 	(void) sprintf(buffer, "%s_dblk_cache", name);
1034 	bcp->dblk_cache = kmem_cache_create(buffer, sizeof (dblk_t),
1035 	    DBLK_CACHE_ALIGN, bcache_dblk_constructor, bcache_dblk_destructor,
1036 	    NULL, (void *)bcp, NULL, 0);
1037 
1038 	return (bcp);
1039 }
1040 
1041 void
1042 bcache_destroy(bcache_t *bcp)
1043 {
1044 	ASSERT(bcp != NULL);
1045 
1046 	mutex_enter(&bcp->mutex);
1047 	if (bcp->alloc == 0) {
1048 		kmem_cache_destroy(bcp->dblk_cache);
1049 		kmem_cache_destroy(bcp->buffer_cache);
1050 		mutex_exit(&bcp->mutex);
1051 		mutex_destroy(&bcp->mutex);
1052 		kmem_free(bcp, sizeof (bcache_t));
1053 	} else {
1054 		bcp->destroy++;
1055 		mutex_exit(&bcp->mutex);
1056 	}
1057 }
1058 
1059 /*ARGSUSED*/
1060 mblk_t *
1061 bcache_allocb(bcache_t *bcp, uint_t pri)
1062 {
1063 	dblk_t *dbp;
1064 	mblk_t *mp = NULL;
1065 
1066 	ASSERT(bcp != NULL);
1067 
1068 	mutex_enter(&bcp->mutex);
1069 	if (bcp->destroy != 0) {
1070 		mutex_exit(&bcp->mutex);
1071 		goto out;
1072 	}
1073 
1074 	if ((dbp = kmem_cache_alloc(bcp->dblk_cache, KM_NOSLEEP)) == NULL) {
1075 		mutex_exit(&bcp->mutex);
1076 		goto out;
1077 	}
1078 	bcp->alloc++;
1079 	mutex_exit(&bcp->mutex);
1080 
1081 	ASSERT(((uintptr_t)(dbp->db_base) & (bcp->align - 1)) == 0);
1082 
1083 	mp = dbp->db_mblk;
1084 	DBLK_RTFU_WORD(dbp) = DBLK_RTFU(1, M_DATA, 0, 0);
1085 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
1086 	mp->b_rptr = mp->b_wptr = dbp->db_base;
1087 	mp->b_queue = NULL;
1088 	MBLK_BAND_FLAG_WORD(mp) = 0;
1089 	STR_FTALLOC(&dbp->db_fthdr, FTEV_BCALLOCB, bcp->size);
1090 out:
1091 	FTRACE_1("bcache_allocb(): mp=0x%p", (uintptr_t)mp);
1092 
1093 	return (mp);
1094 }
1095 
1096 static void
1097 dblk_lastfree_oversize(mblk_t *mp, dblk_t *dbp)
1098 {
1099 	ASSERT(dbp->db_mblk == mp);
1100 	if (dbp->db_fthdr != NULL)
1101 		str_ftfree(dbp);
1102 
1103 	/* set credp and projid to be 'unspecified' before returning to cache */
1104 	if (dbp->db_credp != NULL) {
1105 		crfree(dbp->db_credp);
1106 		dbp->db_credp = NULL;
1107 	}
1108 	dbp->db_cpid = -1;
1109 	dbp->db_struioflag = 0;
1110 	dbp->db_struioun.cksum.flags = 0;
1111 
1112 	kmem_free(dbp->db_base, dbp->db_lim - dbp->db_base);
1113 	kmem_cache_free(dbp->db_cache, dbp);
1114 }
1115 
1116 static mblk_t *
1117 allocb_oversize(size_t size, int kmflags)
1118 {
1119 	mblk_t *mp;
1120 	void *buf;
1121 
1122 	size = P2ROUNDUP(size, DBLK_CACHE_ALIGN);
1123 	if ((buf = kmem_alloc(size, kmflags)) == NULL)
1124 		return (NULL);
1125 	if ((mp = gesballoc(buf, size, DBLK_RTFU(1, M_DATA, 0, 0),
1126 	    &frnop, dblk_lastfree_oversize, kmflags)) == NULL)
1127 		kmem_free(buf, size);
1128 
1129 	if (mp != NULL)
1130 		STR_FTALLOC(&DB_FTHDR(mp), FTEV_ALLOCBIG, size);
1131 
1132 	return (mp);
1133 }
1134 
1135 mblk_t *
1136 allocb_tryhard(size_t target_size)
1137 {
1138 	size_t size;
1139 	mblk_t *bp;
1140 
1141 	for (size = target_size; size < target_size + 512;
1142 	    size += DBLK_CACHE_ALIGN)
1143 		if ((bp = allocb(size, BPRI_HI)) != NULL)
1144 			return (bp);
1145 	allocb_tryhard_fails++;
1146 	return (NULL);
1147 }
1148 
1149 /*
1150  * This routine is consolidation private for STREAMS internal use
1151  * This routine may only be called from sync routines (i.e., not
1152  * from put or service procedures).  It is located here (rather
1153  * than strsubr.c) so that we don't have to expose all of the
1154  * allocb() implementation details in header files.
1155  */
1156 mblk_t *
1157 allocb_wait(size_t size, uint_t pri, uint_t flags, int *error)
1158 {
1159 	dblk_t *dbp;
1160 	mblk_t *mp;
1161 	size_t index;
1162 
1163 	index = (size -1) >> DBLK_SIZE_SHIFT;
1164 
1165 	if (flags & STR_NOSIG) {
1166 		if (index >= (DBLK_MAX_CACHE >> DBLK_SIZE_SHIFT)) {
1167 			if (size != 0) {
1168 				mp = allocb_oversize(size, KM_SLEEP);
1169 				FTRACE_1("allocb_wait (NOSIG): mp=0x%lx",
1170 				    (uintptr_t)mp);
1171 				return (mp);
1172 			}
1173 			index = 0;
1174 		}
1175 
1176 		dbp = kmem_cache_alloc(dblk_cache[index], KM_SLEEP);
1177 		mp = dbp->db_mblk;
1178 		DBLK_RTFU_WORD(dbp) = DBLK_RTFU(1, M_DATA, 0, 0);
1179 		mp->b_next = mp->b_prev = mp->b_cont = NULL;
1180 		mp->b_rptr = mp->b_wptr = dbp->db_base;
1181 		mp->b_queue = NULL;
1182 		MBLK_BAND_FLAG_WORD(mp) = 0;
1183 		STR_FTALLOC(&DB_FTHDR(mp), FTEV_ALLOCBW, size);
1184 
1185 		FTRACE_1("allocb_wait (NOSIG): mp=0x%lx", (uintptr_t)mp);
1186 
1187 	} else {
1188 		while ((mp = allocb(size, pri)) == NULL) {
1189 			if ((*error = strwaitbuf(size, BPRI_HI)) != 0)
1190 				return (NULL);
1191 		}
1192 	}
1193 
1194 	return (mp);
1195 }
1196 
1197 /*
1198  * Call function 'func' with 'arg' when a class zero block can
1199  * be allocated with priority 'pri'.
1200  */
1201 bufcall_id_t
1202 esbbcall(uint_t pri, void (*func)(void *), void *arg)
1203 {
1204 	return (bufcall(1, pri, func, arg));
1205 }
1206 
1207 /*
1208  * Allocates an iocblk (M_IOCTL) block. Properly sets the credentials
1209  * ioc_id, rval and error of the struct ioctl to set up an ioctl call.
1210  * This provides consistency for all internal allocators of ioctl.
1211  */
1212 mblk_t *
1213 mkiocb(uint_t cmd)
1214 {
1215 	struct iocblk	*ioc;
1216 	mblk_t		*mp;
1217 
1218 	/*
1219 	 * Allocate enough space for any of the ioctl related messages.
1220 	 */
1221 	if ((mp = allocb(sizeof (union ioctypes), BPRI_MED)) == NULL)
1222 		return (NULL);
1223 
1224 	bzero(mp->b_rptr, sizeof (union ioctypes));
1225 
1226 	/*
1227 	 * Set the mblk_t information and ptrs correctly.
1228 	 */
1229 	mp->b_wptr += sizeof (struct iocblk);
1230 	mp->b_datap->db_type = M_IOCTL;
1231 
1232 	/*
1233 	 * Fill in the fields.
1234 	 */
1235 	ioc		= (struct iocblk *)mp->b_rptr;
1236 	ioc->ioc_cmd	= cmd;
1237 	ioc->ioc_cr	= kcred;
1238 	ioc->ioc_id	= getiocseqno();
1239 	ioc->ioc_flag	= IOC_NATIVE;
1240 	return (mp);
1241 }
1242 
1243 /*
1244  * test if block of given size can be allocated with a request of
1245  * the given priority.
1246  * 'pri' is no longer used, but is retained for compatibility.
1247  */
1248 /* ARGSUSED */
1249 int
1250 testb(size_t size, uint_t pri)
1251 {
1252 	return ((size + sizeof (dblk_t)) <= kmem_avail());
1253 }
1254 
1255 /*
1256  * Call function 'func' with argument 'arg' when there is a reasonably
1257  * good chance that a block of size 'size' can be allocated.
1258  * 'pri' is no longer used, but is retained for compatibility.
1259  */
1260 /* ARGSUSED */
1261 bufcall_id_t
1262 bufcall(size_t size, uint_t pri, void (*func)(void *), void *arg)
1263 {
1264 	static long bid = 1;	/* always odd to save checking for zero */
1265 	bufcall_id_t bc_id;
1266 	struct strbufcall *bcp;
1267 
1268 	if ((bcp = kmem_alloc(sizeof (strbufcall_t), KM_NOSLEEP)) == NULL)
1269 		return (0);
1270 
1271 	bcp->bc_func = func;
1272 	bcp->bc_arg = arg;
1273 	bcp->bc_size = size;
1274 	bcp->bc_next = NULL;
1275 	bcp->bc_executor = NULL;
1276 
1277 	mutex_enter(&strbcall_lock);
1278 	/*
1279 	 * After bcp is linked into strbcalls and strbcall_lock is dropped there
1280 	 * should be no references to bcp since it may be freed by
1281 	 * runbufcalls(). Since bcp_id field is returned, we save its value in
1282 	 * the local var.
1283 	 */
1284 	bc_id = bcp->bc_id = (bufcall_id_t)(bid += 2);	/* keep it odd */
1285 
1286 	/*
1287 	 * add newly allocated stream event to existing
1288 	 * linked list of events.
1289 	 */
1290 	if (strbcalls.bc_head == NULL) {
1291 		strbcalls.bc_head = strbcalls.bc_tail = bcp;
1292 	} else {
1293 		strbcalls.bc_tail->bc_next = bcp;
1294 		strbcalls.bc_tail = bcp;
1295 	}
1296 
1297 	cv_signal(&strbcall_cv);
1298 	mutex_exit(&strbcall_lock);
1299 	return (bc_id);
1300 }
1301 
1302 /*
1303  * Cancel a bufcall request.
1304  */
1305 void
1306 unbufcall(bufcall_id_t id)
1307 {
1308 	strbufcall_t *bcp, *pbcp;
1309 
1310 	mutex_enter(&strbcall_lock);
1311 again:
1312 	pbcp = NULL;
1313 	for (bcp = strbcalls.bc_head; bcp; bcp = bcp->bc_next) {
1314 		if (id == bcp->bc_id)
1315 			break;
1316 		pbcp = bcp;
1317 	}
1318 	if (bcp) {
1319 		if (bcp->bc_executor != NULL) {
1320 			if (bcp->bc_executor != curthread) {
1321 				cv_wait(&bcall_cv, &strbcall_lock);
1322 				goto again;
1323 			}
1324 		} else {
1325 			if (pbcp)
1326 				pbcp->bc_next = bcp->bc_next;
1327 			else
1328 				strbcalls.bc_head = bcp->bc_next;
1329 			if (bcp == strbcalls.bc_tail)
1330 				strbcalls.bc_tail = pbcp;
1331 			kmem_free(bcp, sizeof (strbufcall_t));
1332 		}
1333 	}
1334 	mutex_exit(&strbcall_lock);
1335 }
1336 
1337 /*
1338  * Duplicate a message block by block (uses dupb), returning
1339  * a pointer to the duplicate message.
1340  * Returns a non-NULL value only if the entire message
1341  * was dup'd.
1342  */
1343 mblk_t *
1344 dupmsg(mblk_t *bp)
1345 {
1346 	mblk_t *head, *nbp;
1347 
1348 	if (!bp || !(nbp = head = dupb(bp)))
1349 		return (NULL);
1350 
1351 	while (bp->b_cont) {
1352 		if (!(nbp->b_cont = dupb(bp->b_cont))) {
1353 			freemsg(head);
1354 			return (NULL);
1355 		}
1356 		nbp = nbp->b_cont;
1357 		bp = bp->b_cont;
1358 	}
1359 	return (head);
1360 }
1361 
1362 #define	DUPB_NOLOAN(bp) \
1363 	((((bp)->b_datap->db_struioflag & STRUIO_ZC) != 0) ? \
1364 	copyb((bp)) : dupb((bp)))
1365 
1366 mblk_t *
1367 dupmsg_noloan(mblk_t *bp)
1368 {
1369 	mblk_t *head, *nbp;
1370 
1371 	if (bp == NULL || DB_TYPE(bp) != M_DATA ||
1372 	    ((nbp = head = DUPB_NOLOAN(bp)) == NULL))
1373 		return (NULL);
1374 
1375 	while (bp->b_cont) {
1376 		if ((nbp->b_cont = DUPB_NOLOAN(bp->b_cont)) == NULL) {
1377 			freemsg(head);
1378 			return (NULL);
1379 		}
1380 		nbp = nbp->b_cont;
1381 		bp = bp->b_cont;
1382 	}
1383 	return (head);
1384 }
1385 
1386 /*
1387  * Copy data from message and data block to newly allocated message and
1388  * data block. Returns new message block pointer, or NULL if error.
1389  * The alignment of rptr (w.r.t. word alignment) will be the same in the copy
1390  * as in the original even when db_base is not word aligned. (bug 1052877)
1391  */
1392 mblk_t *
1393 copyb(mblk_t *bp)
1394 {
1395 	mblk_t	*nbp;
1396 	dblk_t	*dp, *ndp;
1397 	uchar_t *base;
1398 	size_t	size;
1399 	size_t	unaligned;
1400 
1401 	ASSERT(bp->b_wptr >= bp->b_rptr);
1402 
1403 	dp = bp->b_datap;
1404 	if (dp->db_fthdr != NULL)
1405 		STR_FTEVENT_MBLK(bp, caller(), FTEV_COPYB, 0);
1406 
1407 	/*
1408 	 * Special handling for Multidata message; this should be
1409 	 * removed once a copy-callback routine is made available.
1410 	 */
1411 	if (dp->db_type == M_MULTIDATA) {
1412 		cred_t *cr;
1413 
1414 		if ((nbp = mmd_copy(bp, KM_NOSLEEP)) == NULL)
1415 			return (NULL);
1416 
1417 		nbp->b_flag = bp->b_flag;
1418 		nbp->b_band = bp->b_band;
1419 		ndp = nbp->b_datap;
1420 
1421 		/* See comments below on potential issues. */
1422 		STR_FTEVENT_MBLK(nbp, caller(), FTEV_COPYB, 1);
1423 
1424 		ASSERT(ndp->db_type == dp->db_type);
1425 		cr = dp->db_credp;
1426 		if (cr != NULL)
1427 			crhold(ndp->db_credp = cr);
1428 		ndp->db_cpid = dp->db_cpid;
1429 		return (nbp);
1430 	}
1431 
1432 	size = dp->db_lim - dp->db_base;
1433 	unaligned = P2PHASE((uintptr_t)dp->db_base, sizeof (uint_t));
1434 	if ((nbp = allocb_tmpl(size + unaligned, bp)) == NULL)
1435 		return (NULL);
1436 	nbp->b_flag = bp->b_flag;
1437 	nbp->b_band = bp->b_band;
1438 	ndp = nbp->b_datap;
1439 
1440 	/*
1441 	 * Well, here is a potential issue.  If we are trying to
1442 	 * trace a flow, and we copy the message, we might lose
1443 	 * information about where this message might have been.
1444 	 * So we should inherit the FT data.  On the other hand,
1445 	 * a user might be interested only in alloc to free data.
1446 	 * So I guess the real answer is to provide a tunable.
1447 	 */
1448 	STR_FTEVENT_MBLK(nbp, caller(), FTEV_COPYB, 1);
1449 
1450 	base = ndp->db_base + unaligned;
1451 	bcopy(dp->db_base, ndp->db_base + unaligned, size);
1452 
1453 	nbp->b_rptr = base + (bp->b_rptr - dp->db_base);
1454 	nbp->b_wptr = nbp->b_rptr + MBLKL(bp);
1455 
1456 	return (nbp);
1457 }
1458 
1459 /*
1460  * Copy data from message to newly allocated message using new
1461  * data blocks.  Returns a pointer to the new message, or NULL if error.
1462  */
1463 mblk_t *
1464 copymsg(mblk_t *bp)
1465 {
1466 	mblk_t *head, *nbp;
1467 
1468 	if (!bp || !(nbp = head = copyb(bp)))
1469 		return (NULL);
1470 
1471 	while (bp->b_cont) {
1472 		if (!(nbp->b_cont = copyb(bp->b_cont))) {
1473 			freemsg(head);
1474 			return (NULL);
1475 		}
1476 		nbp = nbp->b_cont;
1477 		bp = bp->b_cont;
1478 	}
1479 	return (head);
1480 }
1481 
1482 /*
1483  * link a message block to tail of message
1484  */
1485 void
1486 linkb(mblk_t *mp, mblk_t *bp)
1487 {
1488 	ASSERT(mp && bp);
1489 
1490 	for (; mp->b_cont; mp = mp->b_cont)
1491 		;
1492 	mp->b_cont = bp;
1493 }
1494 
1495 /*
1496  * unlink a message block from head of message
1497  * return pointer to new message.
1498  * NULL if message becomes empty.
1499  */
1500 mblk_t *
1501 unlinkb(mblk_t *bp)
1502 {
1503 	mblk_t *bp1;
1504 
1505 	bp1 = bp->b_cont;
1506 	bp->b_cont = NULL;
1507 	return (bp1);
1508 }
1509 
1510 /*
1511  * remove a message block "bp" from message "mp"
1512  *
1513  * Return pointer to new message or NULL if no message remains.
1514  * Return -1 if bp is not found in message.
1515  */
1516 mblk_t *
1517 rmvb(mblk_t *mp, mblk_t *bp)
1518 {
1519 	mblk_t *tmp;
1520 	mblk_t *lastp = NULL;
1521 
1522 	ASSERT(mp && bp);
1523 	for (tmp = mp; tmp; tmp = tmp->b_cont) {
1524 		if (tmp == bp) {
1525 			if (lastp)
1526 				lastp->b_cont = tmp->b_cont;
1527 			else
1528 				mp = tmp->b_cont;
1529 			tmp->b_cont = NULL;
1530 			return (mp);
1531 		}
1532 		lastp = tmp;
1533 	}
1534 	return ((mblk_t *)-1);
1535 }
1536 
1537 /*
1538  * Concatenate and align first len bytes of common
1539  * message type.  Len == -1, means concat everything.
1540  * Returns 1 on success, 0 on failure
1541  * After the pullup, mp points to the pulled up data.
1542  */
1543 int
1544 pullupmsg(mblk_t *mp, ssize_t len)
1545 {
1546 	mblk_t *bp, *b_cont;
1547 	dblk_t *dbp;
1548 	ssize_t n;
1549 
1550 	ASSERT(mp->b_datap->db_ref > 0);
1551 	ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
1552 
1553 	/*
1554 	 * We won't handle Multidata message, since it contains
1555 	 * metadata which this function has no knowledge of; we
1556 	 * assert on DEBUG, and return failure otherwise.
1557 	 */
1558 	ASSERT(mp->b_datap->db_type != M_MULTIDATA);
1559 	if (mp->b_datap->db_type == M_MULTIDATA)
1560 		return (0);
1561 
1562 	if (len == -1) {
1563 		if (mp->b_cont == NULL && str_aligned(mp->b_rptr))
1564 			return (1);
1565 		len = xmsgsize(mp);
1566 	} else {
1567 		ssize_t first_mblk_len = mp->b_wptr - mp->b_rptr;
1568 		ASSERT(first_mblk_len >= 0);
1569 		/*
1570 		 * If the length is less than that of the first mblk,
1571 		 * we want to pull up the message into an aligned mblk.
1572 		 * Though not part of the spec, some callers assume it.
1573 		 */
1574 		if (len <= first_mblk_len) {
1575 			if (str_aligned(mp->b_rptr))
1576 				return (1);
1577 			len = first_mblk_len;
1578 		} else if (xmsgsize(mp) < len)
1579 			return (0);
1580 	}
1581 
1582 	if ((bp = allocb_tmpl(len, mp)) == NULL)
1583 		return (0);
1584 
1585 	dbp = bp->b_datap;
1586 	*bp = *mp;		/* swap mblks so bp heads the old msg... */
1587 	mp->b_datap = dbp;	/* ... and mp heads the new message */
1588 	mp->b_datap->db_mblk = mp;
1589 	bp->b_datap->db_mblk = bp;
1590 	mp->b_rptr = mp->b_wptr = dbp->db_base;
1591 
1592 	do {
1593 		ASSERT(bp->b_datap->db_ref > 0);
1594 		ASSERT(bp->b_wptr >= bp->b_rptr);
1595 		n = MIN(bp->b_wptr - bp->b_rptr, len);
1596 		bcopy(bp->b_rptr, mp->b_wptr, (size_t)n);
1597 		mp->b_wptr += n;
1598 		bp->b_rptr += n;
1599 		len -= n;
1600 		if (bp->b_rptr != bp->b_wptr)
1601 			break;
1602 		b_cont = bp->b_cont;
1603 		freeb(bp);
1604 		bp = b_cont;
1605 	} while (len && bp);
1606 
1607 	mp->b_cont = bp;	/* tack on whatever wasn't pulled up */
1608 
1609 	return (1);
1610 }
1611 
1612 /*
1613  * Concatenate and align at least the first len bytes of common message
1614  * type.  Len == -1 means concatenate everything.  The original message is
1615  * unaltered.  Returns a pointer to a new message on success, otherwise
1616  * returns NULL.
1617  */
1618 mblk_t *
1619 msgpullup(mblk_t *mp, ssize_t len)
1620 {
1621 	mblk_t	*newmp;
1622 	ssize_t	totlen;
1623 	ssize_t	n;
1624 
1625 	/*
1626 	 * We won't handle Multidata message, since it contains
1627 	 * metadata which this function has no knowledge of; we
1628 	 * assert on DEBUG, and return failure otherwise.
1629 	 */
1630 	ASSERT(mp->b_datap->db_type != M_MULTIDATA);
1631 	if (mp->b_datap->db_type == M_MULTIDATA)
1632 		return (NULL);
1633 
1634 	totlen = xmsgsize(mp);
1635 
1636 	if ((len > 0) && (len > totlen))
1637 		return (NULL);
1638 
1639 	/*
1640 	 * Copy all of the first msg type into one new mblk, then dupmsg
1641 	 * and link the rest onto this.
1642 	 */
1643 
1644 	len = totlen;
1645 
1646 	if ((newmp = allocb_tmpl(len, mp)) == NULL)
1647 		return (NULL);
1648 
1649 	newmp->b_flag = mp->b_flag;
1650 	newmp->b_band = mp->b_band;
1651 
1652 	while (len > 0) {
1653 		n = mp->b_wptr - mp->b_rptr;
1654 		ASSERT(n >= 0);		/* allow zero-length mblk_t's */
1655 		if (n > 0)
1656 			bcopy(mp->b_rptr, newmp->b_wptr, n);
1657 		newmp->b_wptr += n;
1658 		len -= n;
1659 		mp = mp->b_cont;
1660 	}
1661 
1662 	if (mp != NULL) {
1663 		newmp->b_cont = dupmsg(mp);
1664 		if (newmp->b_cont == NULL) {
1665 			freemsg(newmp);
1666 			return (NULL);
1667 		}
1668 	}
1669 
1670 	return (newmp);
1671 }
1672 
1673 /*
1674  * Trim bytes from message
1675  *  len > 0, trim from head
1676  *  len < 0, trim from tail
1677  * Returns 1 on success, 0 on failure.
1678  */
1679 int
1680 adjmsg(mblk_t *mp, ssize_t len)
1681 {
1682 	mblk_t *bp;
1683 	mblk_t *save_bp = NULL;
1684 	mblk_t *prev_bp;
1685 	mblk_t *bcont;
1686 	unsigned char type;
1687 	ssize_t n;
1688 	int fromhead;
1689 	int first;
1690 
1691 	ASSERT(mp != NULL);
1692 	/*
1693 	 * We won't handle Multidata message, since it contains
1694 	 * metadata which this function has no knowledge of; we
1695 	 * assert on DEBUG, and return failure otherwise.
1696 	 */
1697 	ASSERT(mp->b_datap->db_type != M_MULTIDATA);
1698 	if (mp->b_datap->db_type == M_MULTIDATA)
1699 		return (0);
1700 
1701 	if (len < 0) {
1702 		fromhead = 0;
1703 		len = -len;
1704 	} else {
1705 		fromhead = 1;
1706 	}
1707 
1708 	if (xmsgsize(mp) < len)
1709 		return (0);
1710 
1711 	if (fromhead) {
1712 		first = 1;
1713 		while (len) {
1714 			ASSERT(mp->b_wptr >= mp->b_rptr);
1715 			n = MIN(mp->b_wptr - mp->b_rptr, len);
1716 			mp->b_rptr += n;
1717 			len -= n;
1718 
1719 			/*
1720 			 * If this is not the first zero length
1721 			 * message remove it
1722 			 */
1723 			if (!first && (mp->b_wptr == mp->b_rptr)) {
1724 				bcont = mp->b_cont;
1725 				freeb(mp);
1726 				mp = save_bp->b_cont = bcont;
1727 			} else {
1728 				save_bp = mp;
1729 				mp = mp->b_cont;
1730 			}
1731 			first = 0;
1732 		}
1733 	} else {
1734 		type = mp->b_datap->db_type;
1735 		while (len) {
1736 			bp = mp;
1737 			save_bp = NULL;
1738 
1739 			/*
1740 			 * Find the last message of same type
1741 			 */
1742 			while (bp && bp->b_datap->db_type == type) {
1743 				ASSERT(bp->b_wptr >= bp->b_rptr);
1744 				prev_bp = save_bp;
1745 				save_bp = bp;
1746 				bp = bp->b_cont;
1747 			}
1748 			if (save_bp == NULL)
1749 				break;
1750 			n = MIN(save_bp->b_wptr - save_bp->b_rptr, len);
1751 			save_bp->b_wptr -= n;
1752 			len -= n;
1753 
1754 			/*
1755 			 * If this is not the first message
1756 			 * and we have taken away everything
1757 			 * from this message, remove it
1758 			 */
1759 
1760 			if ((save_bp != mp) &&
1761 			    (save_bp->b_wptr == save_bp->b_rptr)) {
1762 				bcont = save_bp->b_cont;
1763 				freeb(save_bp);
1764 				prev_bp->b_cont = bcont;
1765 			}
1766 		}
1767 	}
1768 	return (1);
1769 }
1770 
1771 /*
1772  * get number of data bytes in message
1773  */
1774 size_t
1775 msgdsize(mblk_t *bp)
1776 {
1777 	size_t count = 0;
1778 
1779 	for (; bp; bp = bp->b_cont)
1780 		if (bp->b_datap->db_type == M_DATA) {
1781 			ASSERT(bp->b_wptr >= bp->b_rptr);
1782 			count += bp->b_wptr - bp->b_rptr;
1783 		}
1784 	return (count);
1785 }
1786 
1787 /*
1788  * Get a message off head of queue
1789  *
1790  * If queue has no buffers then mark queue
1791  * with QWANTR. (queue wants to be read by
1792  * someone when data becomes available)
1793  *
1794  * If there is something to take off then do so.
1795  * If queue falls below hi water mark turn off QFULL
1796  * flag.  Decrement weighted count of queue.
1797  * Also turn off QWANTR because queue is being read.
1798  *
1799  * The queue count is maintained on a per-band basis.
1800  * Priority band 0 (normal messages) uses q_count,
1801  * q_lowat, etc.  Non-zero priority bands use the
1802  * fields in their respective qband structures
1803  * (qb_count, qb_lowat, etc.)  All messages appear
1804  * on the same list, linked via their b_next pointers.
1805  * q_first is the head of the list.  q_count does
1806  * not reflect the size of all the messages on the
1807  * queue.  It only reflects those messages in the
1808  * normal band of flow.  The one exception to this
1809  * deals with high priority messages.  They are in
1810  * their own conceptual "band", but are accounted
1811  * against q_count.
1812  *
1813  * If queue count is below the lo water mark and QWANTW
1814  * is set, enable the closest backq which has a service
1815  * procedure and turn off the QWANTW flag.
1816  *
1817  * getq could be built on top of rmvq, but isn't because
1818  * of performance considerations.
1819  *
1820  * A note on the use of q_count and q_mblkcnt:
1821  *   q_count is the traditional byte count for messages that
1822  *   have been put on a queue.  Documentation tells us that
1823  *   we shouldn't rely on that count, but some drivers/modules
1824  *   do.  What was needed, however, is a mechanism to prevent
1825  *   runaway streams from consuming all of the resources,
1826  *   and particularly be able to flow control zero-length
1827  *   messages.  q_mblkcnt is used for this purpose.  It
1828  *   counts the number of mblk's that are being put on
1829  *   the queue.  The intention here, is that each mblk should
1830  *   contain one byte of data and, for the purpose of
1831  *   flow-control, logically does.  A queue will become
1832  *   full when EITHER of these values (q_count and q_mblkcnt)
1833  *   reach the highwater mark.  It will clear when BOTH
1834  *   of them drop below the highwater mark.  And it will
1835  *   backenable when BOTH of them drop below the lowwater
1836  *   mark.
1837  *   With this algorithm, a driver/module might be able
1838  *   to find a reasonably accurate q_count, and the
1839  *   framework can still try and limit resource usage.
1840  */
1841 mblk_t *
1842 getq(queue_t *q)
1843 {
1844 	mblk_t *bp;
1845 	uchar_t band = 0;
1846 
1847 	bp = getq_noenab(q, 0);
1848 	if (bp != NULL)
1849 		band = bp->b_band;
1850 
1851 	/*
1852 	 * Inlined from qbackenable().
1853 	 * Quick check without holding the lock.
1854 	 */
1855 	if (band == 0 && (q->q_flag & (QWANTW|QWANTWSYNC)) == 0)
1856 		return (bp);
1857 
1858 	qbackenable(q, band);
1859 	return (bp);
1860 }
1861 
1862 /*
1863  * Calculate number of data bytes in a single data message block taking
1864  * multidata messages into account.
1865  */
1866 
1867 #define	ADD_MBLK_SIZE(mp, size) 					\
1868 	if (DB_TYPE(mp) != M_MULTIDATA) {				\
1869 		(size) += MBLKL(mp);					\
1870 	} else {							\
1871 		uint_t	pinuse;						\
1872 									\
1873 		mmd_getsize(mmd_getmultidata(mp), NULL, &pinuse);	\
1874 		(size) += pinuse;					\
1875 	}
1876 
1877 /*
1878  * Returns the number of bytes in a message (a message is defined as a
1879  * chain of mblks linked by b_cont). If a non-NULL mblkcnt is supplied we
1880  * also return the number of distinct mblks in the message.
1881  */
1882 int
1883 mp_cont_len(mblk_t *bp, int *mblkcnt)
1884 {
1885 	mblk_t	*mp;
1886 	int	mblks = 0;
1887 	int	bytes = 0;
1888 
1889 	for (mp = bp; mp != NULL; mp = mp->b_cont) {
1890 		ADD_MBLK_SIZE(mp, bytes);
1891 		mblks++;
1892 	}
1893 
1894 	if (mblkcnt != NULL)
1895 		*mblkcnt = mblks;
1896 
1897 	return (bytes);
1898 }
1899 
1900 /*
1901  * Like getq() but does not backenable.  This is used by the stream
1902  * head when a putback() is likely.  The caller must call qbackenable()
1903  * after it is done with accessing the queue.
1904  * The rbytes arguments to getq_noneab() allows callers to specify a
1905  * the maximum number of bytes to return. If the current amount on the
1906  * queue is less than this then the entire message will be returned.
1907  * A value of 0 returns the entire message and is equivalent to the old
1908  * default behaviour prior to the addition of the rbytes argument.
1909  */
1910 mblk_t *
1911 getq_noenab(queue_t *q, ssize_t rbytes)
1912 {
1913 	mblk_t *bp, *mp1;
1914 	mblk_t *mp2 = NULL;
1915 	qband_t *qbp;
1916 	kthread_id_t freezer;
1917 	int	bytecnt = 0, mblkcnt = 0;
1918 
1919 	/* freezestr should allow its caller to call getq/putq */
1920 	freezer = STREAM(q)->sd_freezer;
1921 	if (freezer == curthread) {
1922 		ASSERT(frozenstr(q));
1923 		ASSERT(MUTEX_HELD(QLOCK(q)));
1924 	} else
1925 		mutex_enter(QLOCK(q));
1926 
1927 	if ((bp = q->q_first) == 0) {
1928 		q->q_flag |= QWANTR;
1929 	} else {
1930 		/*
1931 		 * If the caller supplied a byte threshold and there is
1932 		 * more than this amount on the queue then break up the
1933 		 * the message appropriately.  We can only safely do
1934 		 * this for M_DATA messages.
1935 		 */
1936 		if ((DB_TYPE(bp) == M_DATA) && (rbytes > 0) &&
1937 		    (q->q_count > rbytes)) {
1938 			/*
1939 			 * Inline version of mp_cont_len() which terminates
1940 			 * when we meet or exceed rbytes.
1941 			 */
1942 			for (mp1 = bp; mp1 != NULL; mp1 = mp1->b_cont) {
1943 				mblkcnt++;
1944 				ADD_MBLK_SIZE(mp1, bytecnt);
1945 				if (bytecnt  >= rbytes)
1946 					break;
1947 			}
1948 			/*
1949 			 * We need to account for the following scenarios:
1950 			 *
1951 			 * 1) Too much data in the first message:
1952 			 *	mp1 will be the mblk which puts us over our
1953 			 *	byte limit.
1954 			 * 2) Not enough data in the first message:
1955 			 *	mp1 will be NULL.
1956 			 * 3) Exactly the right amount of data contained within
1957 			 *    whole mblks:
1958 			 *	mp1->b_cont will be where we break the message.
1959 			 */
1960 			if (bytecnt > rbytes) {
1961 				/*
1962 				 * Dup/copy mp1 and put what we don't need
1963 				 * back onto the queue. Adjust the read/write
1964 				 * and continuation pointers appropriately
1965 				 * and decrement the current mblk count to
1966 				 * reflect we are putting an mblk back onto
1967 				 * the queue.
1968 				 * When adjusting the message pointers, it's
1969 				 * OK to use the existing bytecnt and the
1970 				 * requested amount (rbytes) to calculate the
1971 				 * the new write offset (b_wptr) of what we
1972 				 * are taking. However, we  cannot use these
1973 				 * values when calculating the read offset of
1974 				 * the mblk we are putting back on the queue.
1975 				 * This is because the begining (b_rptr) of the
1976 				 * mblk represents some arbitrary point within
1977 				 * the message.
1978 				 * It's simplest to do this by advancing b_rptr
1979 				 * by the new length of mp1 as we don't have to
1980 				 * remember any intermediate state.
1981 				 */
1982 				ASSERT(mp1 != NULL);
1983 				mblkcnt--;
1984 				if ((mp2 = dupb(mp1)) == NULL &&
1985 				    (mp2 = copyb(mp1)) == NULL) {
1986 					bytecnt = mblkcnt = 0;
1987 					goto dup_failed;
1988 				}
1989 				mp2->b_cont = mp1->b_cont;
1990 				mp1->b_wptr -= bytecnt - rbytes;
1991 				mp2->b_rptr += mp1->b_wptr - mp1->b_rptr;
1992 				mp1->b_cont = NULL;
1993 				bytecnt = rbytes;
1994 			} else {
1995 				/*
1996 				 * Either there is not enough data in the first
1997 				 * message or there is no excess data to deal
1998 				 * with. If mp1 is NULL, we are taking the
1999 				 * whole message. No need to do anything.
2000 				 * Otherwise we assign mp1->b_cont to mp2 as
2001 				 * we will be putting this back onto the head of
2002 				 * the queue.
2003 				 */
2004 				if (mp1 != NULL) {
2005 					mp2 = mp1->b_cont;
2006 					mp1->b_cont = NULL;
2007 				}
2008 			}
2009 			/*
2010 			 * If mp2 is not NULL then we have part of the message
2011 			 * to put back onto the queue.
2012 			 */
2013 			if (mp2 != NULL) {
2014 				if ((mp2->b_next = bp->b_next) == NULL)
2015 					q->q_last = mp2;
2016 				else
2017 					bp->b_next->b_prev = mp2;
2018 				q->q_first = mp2;
2019 			} else {
2020 				if ((q->q_first = bp->b_next) == NULL)
2021 					q->q_last = NULL;
2022 				else
2023 					q->q_first->b_prev = NULL;
2024 			}
2025 		} else {
2026 			/*
2027 			 * Either no byte threshold was supplied, there is
2028 			 * not enough on the queue or we failed to
2029 			 * duplicate/copy a data block. In these cases we
2030 			 * just take the entire first message.
2031 			 */
2032 dup_failed:
2033 			bytecnt = mp_cont_len(bp, &mblkcnt);
2034 			if ((q->q_first = bp->b_next) == NULL)
2035 				q->q_last = NULL;
2036 			else
2037 				q->q_first->b_prev = NULL;
2038 		}
2039 		if (bp->b_band == 0) {
2040 			q->q_count -= bytecnt;
2041 			q->q_mblkcnt -= mblkcnt;
2042 			if (q->q_mblkcnt == 0 || ((q->q_count < q->q_hiwat) &&
2043 			    (q->q_mblkcnt < q->q_hiwat))) {
2044 				q->q_flag &= ~QFULL;
2045 			}
2046 		} else {
2047 			int i;
2048 
2049 			ASSERT(bp->b_band <= q->q_nband);
2050 			ASSERT(q->q_bandp != NULL);
2051 			ASSERT(MUTEX_HELD(QLOCK(q)));
2052 			qbp = q->q_bandp;
2053 			i = bp->b_band;
2054 			while (--i > 0)
2055 				qbp = qbp->qb_next;
2056 			if (qbp->qb_first == qbp->qb_last) {
2057 				qbp->qb_first = NULL;
2058 				qbp->qb_last = NULL;
2059 			} else {
2060 				qbp->qb_first = bp->b_next;
2061 			}
2062 			qbp->qb_count -= bytecnt;
2063 			qbp->qb_mblkcnt -= mblkcnt;
2064 			if (qbp->qb_mblkcnt == 0 ||
2065 			    ((qbp->qb_count < qbp->qb_hiwat) &&
2066 			    (qbp->qb_mblkcnt < qbp->qb_hiwat))) {
2067 				qbp->qb_flag &= ~QB_FULL;
2068 			}
2069 		}
2070 		q->q_flag &= ~QWANTR;
2071 		bp->b_next = NULL;
2072 		bp->b_prev = NULL;
2073 	}
2074 	if (freezer != curthread)
2075 		mutex_exit(QLOCK(q));
2076 
2077 	STR_FTEVENT_MSG(bp, q, FTEV_GETQ, NULL);
2078 
2079 	return (bp);
2080 }
2081 
2082 /*
2083  * Determine if a backenable is needed after removing a message in the
2084  * specified band.
2085  * NOTE: This routine assumes that something like getq_noenab() has been
2086  * already called.
2087  *
2088  * For the read side it is ok to hold sd_lock across calling this (and the
2089  * stream head often does).
2090  * But for the write side strwakeq might be invoked and it acquires sd_lock.
2091  */
2092 void
2093 qbackenable(queue_t *q, uchar_t band)
2094 {
2095 	int backenab = 0;
2096 	qband_t *qbp;
2097 	kthread_id_t freezer;
2098 
2099 	ASSERT(q);
2100 	ASSERT((q->q_flag & QREADR) || MUTEX_NOT_HELD(&STREAM(q)->sd_lock));
2101 
2102 	/*
2103 	 * Quick check without holding the lock.
2104 	 * OK since after getq() has lowered the q_count these flags
2105 	 * would not change unless either the qbackenable() is done by
2106 	 * another thread (which is ok) or the queue has gotten QFULL
2107 	 * in which case another backenable will take place when the queue
2108 	 * drops below q_lowat.
2109 	 */
2110 	if (band == 0 && (q->q_flag & (QWANTW|QWANTWSYNC)) == 0)
2111 		return;
2112 
2113 	/* freezestr should allow its caller to call getq/putq */
2114 	freezer = STREAM(q)->sd_freezer;
2115 	if (freezer == curthread) {
2116 		ASSERT(frozenstr(q));
2117 		ASSERT(MUTEX_HELD(QLOCK(q)));
2118 	} else
2119 		mutex_enter(QLOCK(q));
2120 
2121 	if (band == 0) {
2122 		if (q->q_lowat == 0 || (q->q_count < q->q_lowat &&
2123 		    q->q_mblkcnt < q->q_lowat)) {
2124 			backenab = q->q_flag & (QWANTW|QWANTWSYNC);
2125 		}
2126 	} else {
2127 		int i;
2128 
2129 		ASSERT((unsigned)band <= q->q_nband);
2130 		ASSERT(q->q_bandp != NULL);
2131 
2132 		qbp = q->q_bandp;
2133 		i = band;
2134 		while (--i > 0)
2135 			qbp = qbp->qb_next;
2136 
2137 		if (qbp->qb_lowat == 0 || (qbp->qb_count < qbp->qb_lowat &&
2138 		    qbp->qb_mblkcnt < qbp->qb_lowat)) {
2139 			backenab = qbp->qb_flag & QB_WANTW;
2140 		}
2141 	}
2142 
2143 	if (backenab == 0) {
2144 		if (freezer != curthread)
2145 			mutex_exit(QLOCK(q));
2146 		return;
2147 	}
2148 
2149 	/* Have to drop the lock across strwakeq and backenable */
2150 	if (backenab & QWANTWSYNC)
2151 		q->q_flag &= ~QWANTWSYNC;
2152 	if (backenab & (QWANTW|QB_WANTW)) {
2153 		if (band != 0)
2154 			qbp->qb_flag &= ~QB_WANTW;
2155 		else {
2156 			q->q_flag &= ~QWANTW;
2157 		}
2158 	}
2159 
2160 	if (freezer != curthread)
2161 		mutex_exit(QLOCK(q));
2162 
2163 	if (backenab & QWANTWSYNC)
2164 		strwakeq(q, QWANTWSYNC);
2165 	if (backenab & (QWANTW|QB_WANTW))
2166 		backenable(q, band);
2167 }
2168 
2169 /*
2170  * Remove a message from a queue.  The queue count and other
2171  * flow control parameters are adjusted and the back queue
2172  * enabled if necessary.
2173  *
2174  * rmvq can be called with the stream frozen, but other utility functions
2175  * holding QLOCK, and by streams modules without any locks/frozen.
2176  */
2177 void
2178 rmvq(queue_t *q, mblk_t *mp)
2179 {
2180 	ASSERT(mp != NULL);
2181 
2182 	rmvq_noenab(q, mp);
2183 	if (curthread != STREAM(q)->sd_freezer && MUTEX_HELD(QLOCK(q))) {
2184 		/*
2185 		 * qbackenable can handle a frozen stream but not a "random"
2186 		 * qlock being held. Drop lock across qbackenable.
2187 		 */
2188 		mutex_exit(QLOCK(q));
2189 		qbackenable(q, mp->b_band);
2190 		mutex_enter(QLOCK(q));
2191 	} else {
2192 		qbackenable(q, mp->b_band);
2193 	}
2194 }
2195 
2196 /*
2197  * Like rmvq() but without any backenabling.
2198  * This exists to handle SR_CONSOL_DATA in strrput().
2199  */
2200 void
2201 rmvq_noenab(queue_t *q, mblk_t *mp)
2202 {
2203 	int i;
2204 	qband_t *qbp = NULL;
2205 	kthread_id_t freezer;
2206 	int	bytecnt = 0, mblkcnt = 0;
2207 
2208 	freezer = STREAM(q)->sd_freezer;
2209 	if (freezer == curthread) {
2210 		ASSERT(frozenstr(q));
2211 		ASSERT(MUTEX_HELD(QLOCK(q)));
2212 	} else if (MUTEX_HELD(QLOCK(q))) {
2213 		/* Don't drop lock on exit */
2214 		freezer = curthread;
2215 	} else
2216 		mutex_enter(QLOCK(q));
2217 
2218 	ASSERT(mp->b_band <= q->q_nband);
2219 	if (mp->b_band != 0) {		/* Adjust band pointers */
2220 		ASSERT(q->q_bandp != NULL);
2221 		qbp = q->q_bandp;
2222 		i = mp->b_band;
2223 		while (--i > 0)
2224 			qbp = qbp->qb_next;
2225 		if (mp == qbp->qb_first) {
2226 			if (mp->b_next && mp->b_band == mp->b_next->b_band)
2227 				qbp->qb_first = mp->b_next;
2228 			else
2229 				qbp->qb_first = NULL;
2230 		}
2231 		if (mp == qbp->qb_last) {
2232 			if (mp->b_prev && mp->b_band == mp->b_prev->b_band)
2233 				qbp->qb_last = mp->b_prev;
2234 			else
2235 				qbp->qb_last = NULL;
2236 		}
2237 	}
2238 
2239 	/*
2240 	 * Remove the message from the list.
2241 	 */
2242 	if (mp->b_prev)
2243 		mp->b_prev->b_next = mp->b_next;
2244 	else
2245 		q->q_first = mp->b_next;
2246 	if (mp->b_next)
2247 		mp->b_next->b_prev = mp->b_prev;
2248 	else
2249 		q->q_last = mp->b_prev;
2250 	mp->b_next = NULL;
2251 	mp->b_prev = NULL;
2252 
2253 	/* Get the size of the message for q_count accounting */
2254 	bytecnt = mp_cont_len(mp, &mblkcnt);
2255 
2256 	if (mp->b_band == 0) {		/* Perform q_count accounting */
2257 		q->q_count -= bytecnt;
2258 		q->q_mblkcnt -= mblkcnt;
2259 		if (q->q_mblkcnt == 0 || ((q->q_count < q->q_hiwat) &&
2260 		    (q->q_mblkcnt < q->q_hiwat))) {
2261 			q->q_flag &= ~QFULL;
2262 		}
2263 	} else {			/* Perform qb_count accounting */
2264 		qbp->qb_count -= bytecnt;
2265 		qbp->qb_mblkcnt -= mblkcnt;
2266 		if (qbp->qb_mblkcnt == 0 || ((qbp->qb_count < qbp->qb_hiwat) &&
2267 		    (qbp->qb_mblkcnt < qbp->qb_hiwat))) {
2268 			qbp->qb_flag &= ~QB_FULL;
2269 		}
2270 	}
2271 	if (freezer != curthread)
2272 		mutex_exit(QLOCK(q));
2273 
2274 	STR_FTEVENT_MSG(mp, q, FTEV_RMVQ, NULL);
2275 }
2276 
2277 /*
2278  * Empty a queue.
2279  * If flag is set, remove all messages.  Otherwise, remove
2280  * only non-control messages.  If queue falls below its low
2281  * water mark, and QWANTW is set, enable the nearest upstream
2282  * service procedure.
2283  *
2284  * Historical note: when merging the M_FLUSH code in strrput with this
2285  * code one difference was discovered. flushq did not have a check
2286  * for q_lowat == 0 in the backenabling test.
2287  *
2288  * pcproto_flag specifies whether or not a M_PCPROTO message should be flushed
2289  * if one exists on the queue.
2290  */
2291 void
2292 flushq_common(queue_t *q, int flag, int pcproto_flag)
2293 {
2294 	mblk_t *mp, *nmp;
2295 	qband_t *qbp;
2296 	int backenab = 0;
2297 	unsigned char bpri;
2298 	unsigned char	qbf[NBAND];	/* band flushing backenable flags */
2299 
2300 	if (q->q_first == NULL)
2301 		return;
2302 
2303 	mutex_enter(QLOCK(q));
2304 	mp = q->q_first;
2305 	q->q_first = NULL;
2306 	q->q_last = NULL;
2307 	q->q_count = 0;
2308 	q->q_mblkcnt = 0;
2309 	for (qbp = q->q_bandp; qbp; qbp = qbp->qb_next) {
2310 		qbp->qb_first = NULL;
2311 		qbp->qb_last = NULL;
2312 		qbp->qb_count = 0;
2313 		qbp->qb_mblkcnt = 0;
2314 		qbp->qb_flag &= ~QB_FULL;
2315 	}
2316 	q->q_flag &= ~QFULL;
2317 	mutex_exit(QLOCK(q));
2318 	while (mp) {
2319 		nmp = mp->b_next;
2320 		mp->b_next = mp->b_prev = NULL;
2321 
2322 		STR_FTEVENT_MBLK(mp, q, FTEV_FLUSHQ, NULL);
2323 
2324 		if (pcproto_flag && (mp->b_datap->db_type == M_PCPROTO))
2325 			(void) putq(q, mp);
2326 		else if (flag || datamsg(mp->b_datap->db_type))
2327 			freemsg(mp);
2328 		else
2329 			(void) putq(q, mp);
2330 		mp = nmp;
2331 	}
2332 	bpri = 1;
2333 	mutex_enter(QLOCK(q));
2334 	for (qbp = q->q_bandp; qbp; qbp = qbp->qb_next) {
2335 		if ((qbp->qb_flag & QB_WANTW) &&
2336 		    (((qbp->qb_count < qbp->qb_lowat) &&
2337 		    (qbp->qb_mblkcnt < qbp->qb_lowat)) ||
2338 		    qbp->qb_lowat == 0)) {
2339 			qbp->qb_flag &= ~QB_WANTW;
2340 			backenab = 1;
2341 			qbf[bpri] = 1;
2342 		} else
2343 			qbf[bpri] = 0;
2344 		bpri++;
2345 	}
2346 	ASSERT(bpri == (unsigned char)(q->q_nband + 1));
2347 	if ((q->q_flag & QWANTW) &&
2348 	    (((q->q_count < q->q_lowat) &&
2349 	    (q->q_mblkcnt < q->q_lowat)) || q->q_lowat == 0)) {
2350 		q->q_flag &= ~QWANTW;
2351 		backenab = 1;
2352 		qbf[0] = 1;
2353 	} else
2354 		qbf[0] = 0;
2355 
2356 	/*
2357 	 * If any band can now be written to, and there is a writer
2358 	 * for that band, then backenable the closest service procedure.
2359 	 */
2360 	if (backenab) {
2361 		mutex_exit(QLOCK(q));
2362 		for (bpri = q->q_nband; bpri != 0; bpri--)
2363 			if (qbf[bpri])
2364 				backenable(q, bpri);
2365 		if (qbf[0])
2366 			backenable(q, 0);
2367 	} else
2368 		mutex_exit(QLOCK(q));
2369 }
2370 
2371 /*
2372  * The real flushing takes place in flushq_common. This is done so that
2373  * a flag which specifies whether or not M_PCPROTO messages should be flushed
2374  * or not. Currently the only place that uses this flag is the stream head.
2375  */
2376 void
2377 flushq(queue_t *q, int flag)
2378 {
2379 	flushq_common(q, flag, 0);
2380 }
2381 
2382 /*
2383  * Flush the queue of messages of the given priority band.
2384  * There is some duplication of code between flushq and flushband.
2385  * This is because we want to optimize the code as much as possible.
2386  * The assumption is that there will be more messages in the normal
2387  * (priority 0) band than in any other.
2388  *
2389  * Historical note: when merging the M_FLUSH code in strrput with this
2390  * code one difference was discovered. flushband had an extra check for
2391  * did not have a check for (mp->b_datap->db_type < QPCTL) in the band 0
2392  * case. That check does not match the man page for flushband and was not
2393  * in the strrput flush code hence it was removed.
2394  */
2395 void
2396 flushband(queue_t *q, unsigned char pri, int flag)
2397 {
2398 	mblk_t *mp;
2399 	mblk_t *nmp;
2400 	mblk_t *last;
2401 	qband_t *qbp;
2402 	int band;
2403 
2404 	ASSERT((flag == FLUSHDATA) || (flag == FLUSHALL));
2405 	if (pri > q->q_nband) {
2406 		return;
2407 	}
2408 	mutex_enter(QLOCK(q));
2409 	if (pri == 0) {
2410 		mp = q->q_first;
2411 		q->q_first = NULL;
2412 		q->q_last = NULL;
2413 		q->q_count = 0;
2414 		q->q_mblkcnt = 0;
2415 		for (qbp = q->q_bandp; qbp; qbp = qbp->qb_next) {
2416 			qbp->qb_first = NULL;
2417 			qbp->qb_last = NULL;
2418 			qbp->qb_count = 0;
2419 			qbp->qb_mblkcnt = 0;
2420 			qbp->qb_flag &= ~QB_FULL;
2421 		}
2422 		q->q_flag &= ~QFULL;
2423 		mutex_exit(QLOCK(q));
2424 		while (mp) {
2425 			nmp = mp->b_next;
2426 			mp->b_next = mp->b_prev = NULL;
2427 			if ((mp->b_band == 0) &&
2428 			    ((flag == FLUSHALL) ||
2429 			    datamsg(mp->b_datap->db_type)))
2430 				freemsg(mp);
2431 			else
2432 				(void) putq(q, mp);
2433 			mp = nmp;
2434 		}
2435 		mutex_enter(QLOCK(q));
2436 		if ((q->q_flag & QWANTW) &&
2437 		    (((q->q_count < q->q_lowat) &&
2438 		    (q->q_mblkcnt < q->q_lowat)) || q->q_lowat == 0)) {
2439 			q->q_flag &= ~QWANTW;
2440 			mutex_exit(QLOCK(q));
2441 
2442 			backenable(q, pri);
2443 		} else
2444 			mutex_exit(QLOCK(q));
2445 	} else {	/* pri != 0 */
2446 		boolean_t flushed = B_FALSE;
2447 		band = pri;
2448 
2449 		ASSERT(MUTEX_HELD(QLOCK(q)));
2450 		qbp = q->q_bandp;
2451 		while (--band > 0)
2452 			qbp = qbp->qb_next;
2453 		mp = qbp->qb_first;
2454 		if (mp == NULL) {
2455 			mutex_exit(QLOCK(q));
2456 			return;
2457 		}
2458 		last = qbp->qb_last->b_next;
2459 		/*
2460 		 * rmvq_noenab() and freemsg() are called for each mblk that
2461 		 * meets the criteria.  The loop is executed until the last
2462 		 * mblk has been processed.
2463 		 */
2464 		while (mp != last) {
2465 			ASSERT(mp->b_band == pri);
2466 			nmp = mp->b_next;
2467 			if (flag == FLUSHALL || datamsg(mp->b_datap->db_type)) {
2468 				rmvq_noenab(q, mp);
2469 				freemsg(mp);
2470 				flushed = B_TRUE;
2471 			}
2472 			mp = nmp;
2473 		}
2474 		mutex_exit(QLOCK(q));
2475 
2476 		/*
2477 		 * If any mblk(s) has been freed, we know that qbackenable()
2478 		 * will need to be called.
2479 		 */
2480 		if (flushed)
2481 			qbackenable(q, pri);
2482 	}
2483 }
2484 
2485 /*
2486  * Return 1 if the queue is not full.  If the queue is full, return
2487  * 0 (may not put message) and set QWANTW flag (caller wants to write
2488  * to the queue).
2489  */
2490 int
2491 canput(queue_t *q)
2492 {
2493 	TRACE_1(TR_FAC_STREAMS_FR, TR_CANPUT_IN, "canput:%p", q);
2494 
2495 	/* this is for loopback transports, they should not do a canput */
2496 	ASSERT(STRMATED(q->q_stream) || STREAM(q) == STREAM(q->q_nfsrv));
2497 
2498 	/* Find next forward module that has a service procedure */
2499 	q = q->q_nfsrv;
2500 
2501 	if (!(q->q_flag & QFULL)) {
2502 		TRACE_2(TR_FAC_STREAMS_FR, TR_CANPUT_OUT, "canput:%p %d", q, 1);
2503 		return (1);
2504 	}
2505 	mutex_enter(QLOCK(q));
2506 	if (q->q_flag & QFULL) {
2507 		q->q_flag |= QWANTW;
2508 		mutex_exit(QLOCK(q));
2509 		TRACE_2(TR_FAC_STREAMS_FR, TR_CANPUT_OUT, "canput:%p %d", q, 0);
2510 		return (0);
2511 	}
2512 	mutex_exit(QLOCK(q));
2513 	TRACE_2(TR_FAC_STREAMS_FR, TR_CANPUT_OUT, "canput:%p %d", q, 1);
2514 	return (1);
2515 }
2516 
2517 /*
2518  * This is the new canput for use with priority bands.  Return 1 if the
2519  * band is not full.  If the band is full, return 0 (may not put message)
2520  * and set QWANTW(QB_WANTW) flag for zero(non-zero) band (caller wants to
2521  * write to the queue).
2522  */
2523 int
2524 bcanput(queue_t *q, unsigned char pri)
2525 {
2526 	qband_t *qbp;
2527 
2528 	TRACE_2(TR_FAC_STREAMS_FR, TR_BCANPUT_IN, "bcanput:%p %p", q, pri);
2529 	if (!q)
2530 		return (0);
2531 
2532 	/* Find next forward module that has a service procedure */
2533 	q = q->q_nfsrv;
2534 
2535 	mutex_enter(QLOCK(q));
2536 	if (pri == 0) {
2537 		if (q->q_flag & QFULL) {
2538 			q->q_flag |= QWANTW;
2539 			mutex_exit(QLOCK(q));
2540 			TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
2541 			    "bcanput:%p %X %d", q, pri, 0);
2542 			return (0);
2543 		}
2544 	} else {	/* pri != 0 */
2545 		if (pri > q->q_nband) {
2546 			/*
2547 			 * No band exists yet, so return success.
2548 			 */
2549 			mutex_exit(QLOCK(q));
2550 			TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
2551 			    "bcanput:%p %X %d", q, pri, 1);
2552 			return (1);
2553 		}
2554 		qbp = q->q_bandp;
2555 		while (--pri)
2556 			qbp = qbp->qb_next;
2557 		if (qbp->qb_flag & QB_FULL) {
2558 			qbp->qb_flag |= QB_WANTW;
2559 			mutex_exit(QLOCK(q));
2560 			TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
2561 			    "bcanput:%p %X %d", q, pri, 0);
2562 			return (0);
2563 		}
2564 	}
2565 	mutex_exit(QLOCK(q));
2566 	TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
2567 	    "bcanput:%p %X %d", q, pri, 1);
2568 	return (1);
2569 }
2570 
2571 /*
2572  * Put a message on a queue.
2573  *
2574  * Messages are enqueued on a priority basis.  The priority classes
2575  * are HIGH PRIORITY (type >= QPCTL), PRIORITY (type < QPCTL && band > 0),
2576  * and B_NORMAL (type < QPCTL && band == 0).
2577  *
2578  * Add appropriate weighted data block sizes to queue count.
2579  * If queue hits high water mark then set QFULL flag.
2580  *
2581  * If QNOENAB is not set (putq is allowed to enable the queue),
2582  * enable the queue only if the message is PRIORITY,
2583  * or the QWANTR flag is set (indicating that the service procedure
2584  * is ready to read the queue.  This implies that a service
2585  * procedure must NEVER put a high priority message back on its own
2586  * queue, as this would result in an infinite loop (!).
2587  */
2588 int
2589 putq(queue_t *q, mblk_t *bp)
2590 {
2591 	mblk_t *tmp;
2592 	qband_t *qbp = NULL;
2593 	int mcls = (int)queclass(bp);
2594 	kthread_id_t freezer;
2595 	int	bytecnt = 0, mblkcnt = 0;
2596 
2597 	freezer = STREAM(q)->sd_freezer;
2598 	if (freezer == curthread) {
2599 		ASSERT(frozenstr(q));
2600 		ASSERT(MUTEX_HELD(QLOCK(q)));
2601 	} else
2602 		mutex_enter(QLOCK(q));
2603 
2604 	/*
2605 	 * Make sanity checks and if qband structure is not yet
2606 	 * allocated, do so.
2607 	 */
2608 	if (mcls == QPCTL) {
2609 		if (bp->b_band != 0)
2610 			bp->b_band = 0;		/* force to be correct */
2611 	} else if (bp->b_band != 0) {
2612 		int i;
2613 		qband_t **qbpp;
2614 
2615 		if (bp->b_band > q->q_nband) {
2616 
2617 			/*
2618 			 * The qband structure for this priority band is
2619 			 * not on the queue yet, so we have to allocate
2620 			 * one on the fly.  It would be wasteful to
2621 			 * associate the qband structures with every
2622 			 * queue when the queues are allocated.  This is
2623 			 * because most queues will only need the normal
2624 			 * band of flow which can be described entirely
2625 			 * by the queue itself.
2626 			 */
2627 			qbpp = &q->q_bandp;
2628 			while (*qbpp)
2629 				qbpp = &(*qbpp)->qb_next;
2630 			while (bp->b_band > q->q_nband) {
2631 				if ((*qbpp = allocband()) == NULL) {
2632 					if (freezer != curthread)
2633 						mutex_exit(QLOCK(q));
2634 					return (0);
2635 				}
2636 				(*qbpp)->qb_hiwat = q->q_hiwat;
2637 				(*qbpp)->qb_lowat = q->q_lowat;
2638 				q->q_nband++;
2639 				qbpp = &(*qbpp)->qb_next;
2640 			}
2641 		}
2642 		ASSERT(MUTEX_HELD(QLOCK(q)));
2643 		qbp = q->q_bandp;
2644 		i = bp->b_band;
2645 		while (--i)
2646 			qbp = qbp->qb_next;
2647 	}
2648 
2649 	/*
2650 	 * If queue is empty, add the message and initialize the pointers.
2651 	 * Otherwise, adjust message pointers and queue pointers based on
2652 	 * the type of the message and where it belongs on the queue.  Some
2653 	 * code is duplicated to minimize the number of conditionals and
2654 	 * hopefully minimize the amount of time this routine takes.
2655 	 */
2656 	if (!q->q_first) {
2657 		bp->b_next = NULL;
2658 		bp->b_prev = NULL;
2659 		q->q_first = bp;
2660 		q->q_last = bp;
2661 		if (qbp) {
2662 			qbp->qb_first = bp;
2663 			qbp->qb_last = bp;
2664 		}
2665 	} else if (!qbp) {	/* bp->b_band == 0 */
2666 
2667 		/*
2668 		 * If queue class of message is less than or equal to
2669 		 * that of the last one on the queue, tack on to the end.
2670 		 */
2671 		tmp = q->q_last;
2672 		if (mcls <= (int)queclass(tmp)) {
2673 			bp->b_next = NULL;
2674 			bp->b_prev = tmp;
2675 			tmp->b_next = bp;
2676 			q->q_last = bp;
2677 		} else {
2678 			tmp = q->q_first;
2679 			while ((int)queclass(tmp) >= mcls)
2680 				tmp = tmp->b_next;
2681 
2682 			/*
2683 			 * Insert bp before tmp.
2684 			 */
2685 			bp->b_next = tmp;
2686 			bp->b_prev = tmp->b_prev;
2687 			if (tmp->b_prev)
2688 				tmp->b_prev->b_next = bp;
2689 			else
2690 				q->q_first = bp;
2691 			tmp->b_prev = bp;
2692 		}
2693 	} else {		/* bp->b_band != 0 */
2694 		if (qbp->qb_first) {
2695 			tmp = qbp->qb_last;
2696 
2697 			/*
2698 			 * Insert bp after the last message in this band.
2699 			 */
2700 			bp->b_next = tmp->b_next;
2701 			if (tmp->b_next)
2702 				tmp->b_next->b_prev = bp;
2703 			else
2704 				q->q_last = bp;
2705 			bp->b_prev = tmp;
2706 			tmp->b_next = bp;
2707 		} else {
2708 			tmp = q->q_last;
2709 			if ((mcls < (int)queclass(tmp)) ||
2710 			    (bp->b_band <= tmp->b_band)) {
2711 
2712 				/*
2713 				 * Tack bp on end of queue.
2714 				 */
2715 				bp->b_next = NULL;
2716 				bp->b_prev = tmp;
2717 				tmp->b_next = bp;
2718 				q->q_last = bp;
2719 			} else {
2720 				tmp = q->q_first;
2721 				while (tmp->b_datap->db_type >= QPCTL)
2722 					tmp = tmp->b_next;
2723 				while (tmp->b_band >= bp->b_band)
2724 					tmp = tmp->b_next;
2725 
2726 				/*
2727 				 * Insert bp before tmp.
2728 				 */
2729 				bp->b_next = tmp;
2730 				bp->b_prev = tmp->b_prev;
2731 				if (tmp->b_prev)
2732 					tmp->b_prev->b_next = bp;
2733 				else
2734 					q->q_first = bp;
2735 				tmp->b_prev = bp;
2736 			}
2737 			qbp->qb_first = bp;
2738 		}
2739 		qbp->qb_last = bp;
2740 	}
2741 
2742 	/* Get message byte count for q_count accounting */
2743 	bytecnt = mp_cont_len(bp, &mblkcnt);
2744 
2745 	if (qbp) {
2746 		qbp->qb_count += bytecnt;
2747 		qbp->qb_mblkcnt += mblkcnt;
2748 		if ((qbp->qb_count >= qbp->qb_hiwat) ||
2749 		    (qbp->qb_mblkcnt >= qbp->qb_hiwat)) {
2750 			qbp->qb_flag |= QB_FULL;
2751 		}
2752 	} else {
2753 		q->q_count += bytecnt;
2754 		q->q_mblkcnt += mblkcnt;
2755 		if ((q->q_count >= q->q_hiwat) ||
2756 		    (q->q_mblkcnt >= q->q_hiwat)) {
2757 			q->q_flag |= QFULL;
2758 		}
2759 	}
2760 
2761 	STR_FTEVENT_MSG(bp, q, FTEV_PUTQ, NULL);
2762 
2763 	if ((mcls > QNORM) ||
2764 	    (canenable(q) && (q->q_flag & QWANTR || bp->b_band)))
2765 		qenable_locked(q);
2766 	ASSERT(MUTEX_HELD(QLOCK(q)));
2767 	if (freezer != curthread)
2768 		mutex_exit(QLOCK(q));
2769 
2770 	return (1);
2771 }
2772 
2773 /*
2774  * Put stuff back at beginning of Q according to priority order.
2775  * See comment on putq above for details.
2776  */
2777 int
2778 putbq(queue_t *q, mblk_t *bp)
2779 {
2780 	mblk_t *tmp;
2781 	qband_t *qbp = NULL;
2782 	int mcls = (int)queclass(bp);
2783 	kthread_id_t freezer;
2784 	int	bytecnt = 0, mblkcnt = 0;
2785 
2786 	ASSERT(q && bp);
2787 	ASSERT(bp->b_next == NULL);
2788 	freezer = STREAM(q)->sd_freezer;
2789 	if (freezer == curthread) {
2790 		ASSERT(frozenstr(q));
2791 		ASSERT(MUTEX_HELD(QLOCK(q)));
2792 	} else
2793 		mutex_enter(QLOCK(q));
2794 
2795 	/*
2796 	 * Make sanity checks and if qband structure is not yet
2797 	 * allocated, do so.
2798 	 */
2799 	if (mcls == QPCTL) {
2800 		if (bp->b_band != 0)
2801 			bp->b_band = 0;		/* force to be correct */
2802 	} else if (bp->b_band != 0) {
2803 		int i;
2804 		qband_t **qbpp;
2805 
2806 		if (bp->b_band > q->q_nband) {
2807 			qbpp = &q->q_bandp;
2808 			while (*qbpp)
2809 				qbpp = &(*qbpp)->qb_next;
2810 			while (bp->b_band > q->q_nband) {
2811 				if ((*qbpp = allocband()) == NULL) {
2812 					if (freezer != curthread)
2813 						mutex_exit(QLOCK(q));
2814 					return (0);
2815 				}
2816 				(*qbpp)->qb_hiwat = q->q_hiwat;
2817 				(*qbpp)->qb_lowat = q->q_lowat;
2818 				q->q_nband++;
2819 				qbpp = &(*qbpp)->qb_next;
2820 			}
2821 		}
2822 		qbp = q->q_bandp;
2823 		i = bp->b_band;
2824 		while (--i)
2825 			qbp = qbp->qb_next;
2826 	}
2827 
2828 	/*
2829 	 * If queue is empty or if message is high priority,
2830 	 * place on the front of the queue.
2831 	 */
2832 	tmp = q->q_first;
2833 	if ((!tmp) || (mcls == QPCTL)) {
2834 		bp->b_next = tmp;
2835 		if (tmp)
2836 			tmp->b_prev = bp;
2837 		else
2838 			q->q_last = bp;
2839 		q->q_first = bp;
2840 		bp->b_prev = NULL;
2841 		if (qbp) {
2842 			qbp->qb_first = bp;
2843 			qbp->qb_last = bp;
2844 		}
2845 	} else if (qbp) {	/* bp->b_band != 0 */
2846 		tmp = qbp->qb_first;
2847 		if (tmp) {
2848 
2849 			/*
2850 			 * Insert bp before the first message in this band.
2851 			 */
2852 			bp->b_next = tmp;
2853 			bp->b_prev = tmp->b_prev;
2854 			if (tmp->b_prev)
2855 				tmp->b_prev->b_next = bp;
2856 			else
2857 				q->q_first = bp;
2858 			tmp->b_prev = bp;
2859 		} else {
2860 			tmp = q->q_last;
2861 			if ((mcls < (int)queclass(tmp)) ||
2862 			    (bp->b_band < tmp->b_band)) {
2863 
2864 				/*
2865 				 * Tack bp on end of queue.
2866 				 */
2867 				bp->b_next = NULL;
2868 				bp->b_prev = tmp;
2869 				tmp->b_next = bp;
2870 				q->q_last = bp;
2871 			} else {
2872 				tmp = q->q_first;
2873 				while (tmp->b_datap->db_type >= QPCTL)
2874 					tmp = tmp->b_next;
2875 				while (tmp->b_band > bp->b_band)
2876 					tmp = tmp->b_next;
2877 
2878 				/*
2879 				 * Insert bp before tmp.
2880 				 */
2881 				bp->b_next = tmp;
2882 				bp->b_prev = tmp->b_prev;
2883 				if (tmp->b_prev)
2884 					tmp->b_prev->b_next = bp;
2885 				else
2886 					q->q_first = bp;
2887 				tmp->b_prev = bp;
2888 			}
2889 			qbp->qb_last = bp;
2890 		}
2891 		qbp->qb_first = bp;
2892 	} else {		/* bp->b_band == 0 && !QPCTL */
2893 
2894 		/*
2895 		 * If the queue class or band is less than that of the last
2896 		 * message on the queue, tack bp on the end of the queue.
2897 		 */
2898 		tmp = q->q_last;
2899 		if ((mcls < (int)queclass(tmp)) || (bp->b_band < tmp->b_band)) {
2900 			bp->b_next = NULL;
2901 			bp->b_prev = tmp;
2902 			tmp->b_next = bp;
2903 			q->q_last = bp;
2904 		} else {
2905 			tmp = q->q_first;
2906 			while (tmp->b_datap->db_type >= QPCTL)
2907 				tmp = tmp->b_next;
2908 			while (tmp->b_band > bp->b_band)
2909 				tmp = tmp->b_next;
2910 
2911 			/*
2912 			 * Insert bp before tmp.
2913 			 */
2914 			bp->b_next = tmp;
2915 			bp->b_prev = tmp->b_prev;
2916 			if (tmp->b_prev)
2917 				tmp->b_prev->b_next = bp;
2918 			else
2919 				q->q_first = bp;
2920 			tmp->b_prev = bp;
2921 		}
2922 	}
2923 
2924 	/* Get message byte count for q_count accounting */
2925 	bytecnt = mp_cont_len(bp, &mblkcnt);
2926 
2927 	if (qbp) {
2928 		qbp->qb_count += bytecnt;
2929 		qbp->qb_mblkcnt += mblkcnt;
2930 		if ((qbp->qb_count >= qbp->qb_hiwat) ||
2931 		    (qbp->qb_mblkcnt >= qbp->qb_hiwat)) {
2932 			qbp->qb_flag |= QB_FULL;
2933 		}
2934 	} else {
2935 		q->q_count += bytecnt;
2936 		q->q_mblkcnt += mblkcnt;
2937 		if ((q->q_count >= q->q_hiwat) ||
2938 		    (q->q_mblkcnt >= q->q_hiwat)) {
2939 			q->q_flag |= QFULL;
2940 		}
2941 	}
2942 
2943 	STR_FTEVENT_MSG(bp, q, FTEV_PUTBQ, NULL);
2944 
2945 	if ((mcls > QNORM) || (canenable(q) && (q->q_flag & QWANTR)))
2946 		qenable_locked(q);
2947 	ASSERT(MUTEX_HELD(QLOCK(q)));
2948 	if (freezer != curthread)
2949 		mutex_exit(QLOCK(q));
2950 
2951 	return (1);
2952 }
2953 
2954 /*
2955  * Insert a message before an existing message on the queue.  If the
2956  * existing message is NULL, the new messages is placed on the end of
2957  * the queue.  The queue class of the new message is ignored.  However,
2958  * the priority band of the new message must adhere to the following
2959  * ordering:
2960  *
2961  *	emp->b_prev->b_band >= mp->b_band >= emp->b_band.
2962  *
2963  * All flow control parameters are updated.
2964  *
2965  * insq can be called with the stream frozen, but other utility functions
2966  * holding QLOCK, and by streams modules without any locks/frozen.
2967  */
2968 int
2969 insq(queue_t *q, mblk_t *emp, mblk_t *mp)
2970 {
2971 	mblk_t *tmp;
2972 	qband_t *qbp = NULL;
2973 	int mcls = (int)queclass(mp);
2974 	kthread_id_t freezer;
2975 	int	bytecnt = 0, mblkcnt = 0;
2976 
2977 	freezer = STREAM(q)->sd_freezer;
2978 	if (freezer == curthread) {
2979 		ASSERT(frozenstr(q));
2980 		ASSERT(MUTEX_HELD(QLOCK(q)));
2981 	} else if (MUTEX_HELD(QLOCK(q))) {
2982 		/* Don't drop lock on exit */
2983 		freezer = curthread;
2984 	} else
2985 		mutex_enter(QLOCK(q));
2986 
2987 	if (mcls == QPCTL) {
2988 		if (mp->b_band != 0)
2989 			mp->b_band = 0;		/* force to be correct */
2990 		if (emp && emp->b_prev &&
2991 		    (emp->b_prev->b_datap->db_type < QPCTL))
2992 			goto badord;
2993 	}
2994 	if (emp) {
2995 		if (((mcls == QNORM) && (mp->b_band < emp->b_band)) ||
2996 		    (emp->b_prev && (emp->b_prev->b_datap->db_type < QPCTL) &&
2997 		    (emp->b_prev->b_band < mp->b_band))) {
2998 			goto badord;
2999 		}
3000 	} else {
3001 		tmp = q->q_last;
3002 		if (tmp && (mcls == QNORM) && (mp->b_band > tmp->b_band)) {
3003 badord:
3004 			cmn_err(CE_WARN,
3005 			    "insq: attempt to insert message out of order "
3006 			    "on q %p", (void *)q);
3007 			if (freezer != curthread)
3008 				mutex_exit(QLOCK(q));
3009 			return (0);
3010 		}
3011 	}
3012 
3013 	if (mp->b_band != 0) {
3014 		int i;
3015 		qband_t **qbpp;
3016 
3017 		if (mp->b_band > q->q_nband) {
3018 			qbpp = &q->q_bandp;
3019 			while (*qbpp)
3020 				qbpp = &(*qbpp)->qb_next;
3021 			while (mp->b_band > q->q_nband) {
3022 				if ((*qbpp = allocband()) == NULL) {
3023 					if (freezer != curthread)
3024 						mutex_exit(QLOCK(q));
3025 					return (0);
3026 				}
3027 				(*qbpp)->qb_hiwat = q->q_hiwat;
3028 				(*qbpp)->qb_lowat = q->q_lowat;
3029 				q->q_nband++;
3030 				qbpp = &(*qbpp)->qb_next;
3031 			}
3032 		}
3033 		qbp = q->q_bandp;
3034 		i = mp->b_band;
3035 		while (--i)
3036 			qbp = qbp->qb_next;
3037 	}
3038 
3039 	if ((mp->b_next = emp) != NULL) {
3040 		if ((mp->b_prev = emp->b_prev) != NULL)
3041 			emp->b_prev->b_next = mp;
3042 		else
3043 			q->q_first = mp;
3044 		emp->b_prev = mp;
3045 	} else {
3046 		if ((mp->b_prev = q->q_last) != NULL)
3047 			q->q_last->b_next = mp;
3048 		else
3049 			q->q_first = mp;
3050 		q->q_last = mp;
3051 	}
3052 
3053 	/* Get mblk and byte count for q_count accounting */
3054 	bytecnt = mp_cont_len(mp, &mblkcnt);
3055 
3056 	if (qbp) {	/* adjust qband pointers and count */
3057 		if (!qbp->qb_first) {
3058 			qbp->qb_first = mp;
3059 			qbp->qb_last = mp;
3060 		} else {
3061 			if (mp->b_prev == NULL || (mp->b_prev != NULL &&
3062 			    (mp->b_prev->b_band != mp->b_band)))
3063 				qbp->qb_first = mp;
3064 			else if (mp->b_next == NULL || (mp->b_next != NULL &&
3065 			    (mp->b_next->b_band != mp->b_band)))
3066 				qbp->qb_last = mp;
3067 		}
3068 		qbp->qb_count += bytecnt;
3069 		qbp->qb_mblkcnt += mblkcnt;
3070 		if ((qbp->qb_count >= qbp->qb_hiwat) ||
3071 		    (qbp->qb_mblkcnt >= qbp->qb_hiwat)) {
3072 			qbp->qb_flag |= QB_FULL;
3073 		}
3074 	} else {
3075 		q->q_count += bytecnt;
3076 		q->q_mblkcnt += mblkcnt;
3077 		if ((q->q_count >= q->q_hiwat) ||
3078 		    (q->q_mblkcnt >= q->q_hiwat)) {
3079 			q->q_flag |= QFULL;
3080 		}
3081 	}
3082 
3083 	STR_FTEVENT_MSG(mp, q, FTEV_INSQ, NULL);
3084 
3085 	if (canenable(q) && (q->q_flag & QWANTR))
3086 		qenable_locked(q);
3087 
3088 	ASSERT(MUTEX_HELD(QLOCK(q)));
3089 	if (freezer != curthread)
3090 		mutex_exit(QLOCK(q));
3091 
3092 	return (1);
3093 }
3094 
3095 /*
3096  * Create and put a control message on queue.
3097  */
3098 int
3099 putctl(queue_t *q, int type)
3100 {
3101 	mblk_t *bp;
3102 
3103 	if ((datamsg(type) && (type != M_DELAY)) ||
3104 	    (bp = allocb_tryhard(0)) == NULL)
3105 		return (0);
3106 	bp->b_datap->db_type = (unsigned char) type;
3107 
3108 	put(q, bp);
3109 
3110 	return (1);
3111 }
3112 
3113 /*
3114  * Control message with a single-byte parameter
3115  */
3116 int
3117 putctl1(queue_t *q, int type, int param)
3118 {
3119 	mblk_t *bp;
3120 
3121 	if ((datamsg(type) && (type != M_DELAY)) ||
3122 	    (bp = allocb_tryhard(1)) == NULL)
3123 		return (0);
3124 	bp->b_datap->db_type = (unsigned char)type;
3125 	*bp->b_wptr++ = (unsigned char)param;
3126 
3127 	put(q, bp);
3128 
3129 	return (1);
3130 }
3131 
3132 int
3133 putnextctl1(queue_t *q, int type, int param)
3134 {
3135 	mblk_t *bp;
3136 
3137 	if ((datamsg(type) && (type != M_DELAY)) ||
3138 	    ((bp = allocb_tryhard(1)) == NULL))
3139 		return (0);
3140 
3141 	bp->b_datap->db_type = (unsigned char)type;
3142 	*bp->b_wptr++ = (unsigned char)param;
3143 
3144 	putnext(q, bp);
3145 
3146 	return (1);
3147 }
3148 
3149 int
3150 putnextctl(queue_t *q, int type)
3151 {
3152 	mblk_t *bp;
3153 
3154 	if ((datamsg(type) && (type != M_DELAY)) ||
3155 	    ((bp = allocb_tryhard(0)) == NULL))
3156 		return (0);
3157 	bp->b_datap->db_type = (unsigned char)type;
3158 
3159 	putnext(q, bp);
3160 
3161 	return (1);
3162 }
3163 
3164 /*
3165  * Return the queue upstream from this one
3166  */
3167 queue_t *
3168 backq(queue_t *q)
3169 {
3170 	q = _OTHERQ(q);
3171 	if (q->q_next) {
3172 		q = q->q_next;
3173 		return (_OTHERQ(q));
3174 	}
3175 	return (NULL);
3176 }
3177 
3178 /*
3179  * Send a block back up the queue in reverse from this
3180  * one (e.g. to respond to ioctls)
3181  */
3182 void
3183 qreply(queue_t *q, mblk_t *bp)
3184 {
3185 	ASSERT(q && bp);
3186 
3187 	putnext(_OTHERQ(q), bp);
3188 }
3189 
3190 /*
3191  * Streams Queue Scheduling
3192  *
3193  * Queues are enabled through qenable() when they have messages to
3194  * process.  They are serviced by queuerun(), which runs each enabled
3195  * queue's service procedure.  The call to queuerun() is processor
3196  * dependent - the general principle is that it be run whenever a queue
3197  * is enabled but before returning to user level.  For system calls,
3198  * the function runqueues() is called if their action causes a queue
3199  * to be enabled.  For device interrupts, queuerun() should be
3200  * called before returning from the last level of interrupt.  Beyond
3201  * this, no timing assumptions should be made about queue scheduling.
3202  */
3203 
3204 /*
3205  * Enable a queue: put it on list of those whose service procedures are
3206  * ready to run and set up the scheduling mechanism.
3207  * The broadcast is done outside the mutex -> to avoid the woken thread
3208  * from contending with the mutex. This is OK 'cos the queue has been
3209  * enqueued on the runlist and flagged safely at this point.
3210  */
3211 void
3212 qenable(queue_t *q)
3213 {
3214 	mutex_enter(QLOCK(q));
3215 	qenable_locked(q);
3216 	mutex_exit(QLOCK(q));
3217 }
3218 /*
3219  * Return number of messages on queue
3220  */
3221 int
3222 qsize(queue_t *qp)
3223 {
3224 	int count = 0;
3225 	mblk_t *mp;
3226 
3227 	mutex_enter(QLOCK(qp));
3228 	for (mp = qp->q_first; mp; mp = mp->b_next)
3229 		count++;
3230 	mutex_exit(QLOCK(qp));
3231 	return (count);
3232 }
3233 
3234 /*
3235  * noenable - set queue so that putq() will not enable it.
3236  * enableok - set queue so that putq() can enable it.
3237  */
3238 void
3239 noenable(queue_t *q)
3240 {
3241 	mutex_enter(QLOCK(q));
3242 	q->q_flag |= QNOENB;
3243 	mutex_exit(QLOCK(q));
3244 }
3245 
3246 void
3247 enableok(queue_t *q)
3248 {
3249 	mutex_enter(QLOCK(q));
3250 	q->q_flag &= ~QNOENB;
3251 	mutex_exit(QLOCK(q));
3252 }
3253 
3254 /*
3255  * Set queue fields.
3256  */
3257 int
3258 strqset(queue_t *q, qfields_t what, unsigned char pri, intptr_t val)
3259 {
3260 	qband_t *qbp = NULL;
3261 	queue_t	*wrq;
3262 	int error = 0;
3263 	kthread_id_t freezer;
3264 
3265 	freezer = STREAM(q)->sd_freezer;
3266 	if (freezer == curthread) {
3267 		ASSERT(frozenstr(q));
3268 		ASSERT(MUTEX_HELD(QLOCK(q)));
3269 	} else
3270 		mutex_enter(QLOCK(q));
3271 
3272 	if (what >= QBAD) {
3273 		error = EINVAL;
3274 		goto done;
3275 	}
3276 	if (pri != 0) {
3277 		int i;
3278 		qband_t **qbpp;
3279 
3280 		if (pri > q->q_nband) {
3281 			qbpp = &q->q_bandp;
3282 			while (*qbpp)
3283 				qbpp = &(*qbpp)->qb_next;
3284 			while (pri > q->q_nband) {
3285 				if ((*qbpp = allocband()) == NULL) {
3286 					error = EAGAIN;
3287 					goto done;
3288 				}
3289 				(*qbpp)->qb_hiwat = q->q_hiwat;
3290 				(*qbpp)->qb_lowat = q->q_lowat;
3291 				q->q_nband++;
3292 				qbpp = &(*qbpp)->qb_next;
3293 			}
3294 		}
3295 		qbp = q->q_bandp;
3296 		i = pri;
3297 		while (--i)
3298 			qbp = qbp->qb_next;
3299 	}
3300 	switch (what) {
3301 
3302 	case QHIWAT:
3303 		if (qbp)
3304 			qbp->qb_hiwat = (size_t)val;
3305 		else
3306 			q->q_hiwat = (size_t)val;
3307 		break;
3308 
3309 	case QLOWAT:
3310 		if (qbp)
3311 			qbp->qb_lowat = (size_t)val;
3312 		else
3313 			q->q_lowat = (size_t)val;
3314 		break;
3315 
3316 	case QMAXPSZ:
3317 		if (qbp)
3318 			error = EINVAL;
3319 		else
3320 			q->q_maxpsz = (ssize_t)val;
3321 
3322 		/*
3323 		 * Performance concern, strwrite looks at the module below
3324 		 * the stream head for the maxpsz each time it does a write
3325 		 * we now cache it at the stream head.  Check to see if this
3326 		 * queue is sitting directly below the stream head.
3327 		 */
3328 		wrq = STREAM(q)->sd_wrq;
3329 		if (q != wrq->q_next)
3330 			break;
3331 
3332 		/*
3333 		 * If the stream is not frozen drop the current QLOCK and
3334 		 * acquire the sd_wrq QLOCK which protects sd_qn_*
3335 		 */
3336 		if (freezer != curthread) {
3337 			mutex_exit(QLOCK(q));
3338 			mutex_enter(QLOCK(wrq));
3339 		}
3340 		ASSERT(MUTEX_HELD(QLOCK(wrq)));
3341 
3342 		if (strmsgsz != 0) {
3343 			if (val == INFPSZ)
3344 				val = strmsgsz;
3345 			else  {
3346 				if (STREAM(q)->sd_vnode->v_type == VFIFO)
3347 					val = MIN(PIPE_BUF, val);
3348 				else
3349 					val = MIN(strmsgsz, val);
3350 			}
3351 		}
3352 		STREAM(q)->sd_qn_maxpsz = val;
3353 		if (freezer != curthread) {
3354 			mutex_exit(QLOCK(wrq));
3355 			mutex_enter(QLOCK(q));
3356 		}
3357 		break;
3358 
3359 	case QMINPSZ:
3360 		if (qbp)
3361 			error = EINVAL;
3362 		else
3363 			q->q_minpsz = (ssize_t)val;
3364 
3365 		/*
3366 		 * Performance concern, strwrite looks at the module below
3367 		 * the stream head for the maxpsz each time it does a write
3368 		 * we now cache it at the stream head.  Check to see if this
3369 		 * queue is sitting directly below the stream head.
3370 		 */
3371 		wrq = STREAM(q)->sd_wrq;
3372 		if (q != wrq->q_next)
3373 			break;
3374 
3375 		/*
3376 		 * If the stream is not frozen drop the current QLOCK and
3377 		 * acquire the sd_wrq QLOCK which protects sd_qn_*
3378 		 */
3379 		if (freezer != curthread) {
3380 			mutex_exit(QLOCK(q));
3381 			mutex_enter(QLOCK(wrq));
3382 		}
3383 		STREAM(q)->sd_qn_minpsz = (ssize_t)val;
3384 
3385 		if (freezer != curthread) {
3386 			mutex_exit(QLOCK(wrq));
3387 			mutex_enter(QLOCK(q));
3388 		}
3389 		break;
3390 
3391 	case QSTRUIOT:
3392 		if (qbp)
3393 			error = EINVAL;
3394 		else
3395 			q->q_struiot = (ushort_t)val;
3396 		break;
3397 
3398 	case QCOUNT:
3399 	case QFIRST:
3400 	case QLAST:
3401 	case QFLAG:
3402 		error = EPERM;
3403 		break;
3404 
3405 	default:
3406 		error = EINVAL;
3407 		break;
3408 	}
3409 done:
3410 	if (freezer != curthread)
3411 		mutex_exit(QLOCK(q));
3412 	return (error);
3413 }
3414 
3415 /*
3416  * Get queue fields.
3417  */
3418 int
3419 strqget(queue_t *q, qfields_t what, unsigned char pri, void *valp)
3420 {
3421 	qband_t 	*qbp = NULL;
3422 	int 		error = 0;
3423 	kthread_id_t 	freezer;
3424 
3425 	freezer = STREAM(q)->sd_freezer;
3426 	if (freezer == curthread) {
3427 		ASSERT(frozenstr(q));
3428 		ASSERT(MUTEX_HELD(QLOCK(q)));
3429 	} else
3430 		mutex_enter(QLOCK(q));
3431 	if (what >= QBAD) {
3432 		error = EINVAL;
3433 		goto done;
3434 	}
3435 	if (pri != 0) {
3436 		int i;
3437 		qband_t **qbpp;
3438 
3439 		if (pri > q->q_nband) {
3440 			qbpp = &q->q_bandp;
3441 			while (*qbpp)
3442 				qbpp = &(*qbpp)->qb_next;
3443 			while (pri > q->q_nband) {
3444 				if ((*qbpp = allocband()) == NULL) {
3445 					error = EAGAIN;
3446 					goto done;
3447 				}
3448 				(*qbpp)->qb_hiwat = q->q_hiwat;
3449 				(*qbpp)->qb_lowat = q->q_lowat;
3450 				q->q_nband++;
3451 				qbpp = &(*qbpp)->qb_next;
3452 			}
3453 		}
3454 		qbp = q->q_bandp;
3455 		i = pri;
3456 		while (--i)
3457 			qbp = qbp->qb_next;
3458 	}
3459 	switch (what) {
3460 	case QHIWAT:
3461 		if (qbp)
3462 			*(size_t *)valp = qbp->qb_hiwat;
3463 		else
3464 			*(size_t *)valp = q->q_hiwat;
3465 		break;
3466 
3467 	case QLOWAT:
3468 		if (qbp)
3469 			*(size_t *)valp = qbp->qb_lowat;
3470 		else
3471 			*(size_t *)valp = q->q_lowat;
3472 		break;
3473 
3474 	case QMAXPSZ:
3475 		if (qbp)
3476 			error = EINVAL;
3477 		else
3478 			*(ssize_t *)valp = q->q_maxpsz;
3479 		break;
3480 
3481 	case QMINPSZ:
3482 		if (qbp)
3483 			error = EINVAL;
3484 		else
3485 			*(ssize_t *)valp = q->q_minpsz;
3486 		break;
3487 
3488 	case QCOUNT:
3489 		if (qbp)
3490 			*(size_t *)valp = qbp->qb_count;
3491 		else
3492 			*(size_t *)valp = q->q_count;
3493 		break;
3494 
3495 	case QFIRST:
3496 		if (qbp)
3497 			*(mblk_t **)valp = qbp->qb_first;
3498 		else
3499 			*(mblk_t **)valp = q->q_first;
3500 		break;
3501 
3502 	case QLAST:
3503 		if (qbp)
3504 			*(mblk_t **)valp = qbp->qb_last;
3505 		else
3506 			*(mblk_t **)valp = q->q_last;
3507 		break;
3508 
3509 	case QFLAG:
3510 		if (qbp)
3511 			*(uint_t *)valp = qbp->qb_flag;
3512 		else
3513 			*(uint_t *)valp = q->q_flag;
3514 		break;
3515 
3516 	case QSTRUIOT:
3517 		if (qbp)
3518 			error = EINVAL;
3519 		else
3520 			*(short *)valp = q->q_struiot;
3521 		break;
3522 
3523 	default:
3524 		error = EINVAL;
3525 		break;
3526 	}
3527 done:
3528 	if (freezer != curthread)
3529 		mutex_exit(QLOCK(q));
3530 	return (error);
3531 }
3532 
3533 /*
3534  * Function awakes all in cvwait/sigwait/pollwait, on one of:
3535  *	QWANTWSYNC or QWANTR or QWANTW,
3536  *
3537  * Note: for QWANTWSYNC/QWANTW and QWANTR, if no WSLEEPer or RSLEEPer then a
3538  *	 deferred wakeup will be done. Also if strpoll() in progress then a
3539  *	 deferred pollwakeup will be done.
3540  */
3541 void
3542 strwakeq(queue_t *q, int flag)
3543 {
3544 	stdata_t 	*stp = STREAM(q);
3545 	pollhead_t 	*pl;
3546 
3547 	mutex_enter(&stp->sd_lock);
3548 	pl = &stp->sd_pollist;
3549 	if (flag & QWANTWSYNC) {
3550 		ASSERT(!(q->q_flag & QREADR));
3551 		if (stp->sd_flag & WSLEEP) {
3552 			stp->sd_flag &= ~WSLEEP;
3553 			cv_broadcast(&stp->sd_wrq->q_wait);
3554 		} else {
3555 			stp->sd_wakeq |= WSLEEP;
3556 		}
3557 
3558 		mutex_exit(&stp->sd_lock);
3559 		pollwakeup(pl, POLLWRNORM);
3560 		mutex_enter(&stp->sd_lock);
3561 
3562 		if (stp->sd_sigflags & S_WRNORM)
3563 			strsendsig(stp->sd_siglist, S_WRNORM, 0, 0);
3564 	} else if (flag & QWANTR) {
3565 		if (stp->sd_flag & RSLEEP) {
3566 			stp->sd_flag &= ~RSLEEP;
3567 			cv_broadcast(&_RD(stp->sd_wrq)->q_wait);
3568 		} else {
3569 			stp->sd_wakeq |= RSLEEP;
3570 		}
3571 
3572 		mutex_exit(&stp->sd_lock);
3573 		pollwakeup(pl, POLLIN | POLLRDNORM);
3574 		mutex_enter(&stp->sd_lock);
3575 
3576 		{
3577 			int events = stp->sd_sigflags & (S_INPUT | S_RDNORM);
3578 
3579 			if (events)
3580 				strsendsig(stp->sd_siglist, events, 0, 0);
3581 		}
3582 	} else {
3583 		if (stp->sd_flag & WSLEEP) {
3584 			stp->sd_flag &= ~WSLEEP;
3585 			cv_broadcast(&stp->sd_wrq->q_wait);
3586 		}
3587 
3588 		mutex_exit(&stp->sd_lock);
3589 		pollwakeup(pl, POLLWRNORM);
3590 		mutex_enter(&stp->sd_lock);
3591 
3592 		if (stp->sd_sigflags & S_WRNORM)
3593 			strsendsig(stp->sd_siglist, S_WRNORM, 0, 0);
3594 	}
3595 	mutex_exit(&stp->sd_lock);
3596 }
3597 
3598 int
3599 struioget(queue_t *q, mblk_t *mp, struiod_t *dp, int noblock)
3600 {
3601 	stdata_t *stp = STREAM(q);
3602 	int typ  = STRUIOT_STANDARD;
3603 	uio_t	 *uiop = &dp->d_uio;
3604 	dblk_t	 *dbp;
3605 	ssize_t	 uiocnt;
3606 	ssize_t	 cnt;
3607 	unsigned char *ptr;
3608 	ssize_t	 resid;
3609 	int	 error = 0;
3610 	on_trap_data_t otd;
3611 	queue_t	*stwrq;
3612 
3613 	/*
3614 	 * Plumbing may change while taking the type so store the
3615 	 * queue in a temporary variable. It doesn't matter even
3616 	 * if the we take the type from the previous plumbing,
3617 	 * that's because if the plumbing has changed when we were
3618 	 * holding the queue in a temporary variable, we can continue
3619 	 * processing the message the way it would have been processed
3620 	 * in the old plumbing, without any side effects but a bit
3621 	 * extra processing for partial ip header checksum.
3622 	 *
3623 	 * This has been done to avoid holding the sd_lock which is
3624 	 * very hot.
3625 	 */
3626 
3627 	stwrq = stp->sd_struiowrq;
3628 	if (stwrq)
3629 		typ = stwrq->q_struiot;
3630 
3631 	for (; (resid = uiop->uio_resid) > 0 && mp; mp = mp->b_cont) {
3632 		dbp = mp->b_datap;
3633 		ptr = (uchar_t *)(mp->b_rptr + dbp->db_cksumstuff);
3634 		uiocnt = dbp->db_cksumend - dbp->db_cksumstuff;
3635 		cnt = MIN(uiocnt, uiop->uio_resid);
3636 		if (!(dbp->db_struioflag & STRUIO_SPEC) ||
3637 		    (dbp->db_struioflag & STRUIO_DONE) || cnt == 0) {
3638 			/*
3639 			 * Either this mblk has already been processed
3640 			 * or there is no more room in this mblk (?).
3641 			 */
3642 			continue;
3643 		}
3644 		switch (typ) {
3645 		case STRUIOT_STANDARD:
3646 			if (noblock) {
3647 				if (on_trap(&otd, OT_DATA_ACCESS)) {
3648 					no_trap();
3649 					error = EWOULDBLOCK;
3650 					goto out;
3651 				}
3652 			}
3653 			if (error = uiomove(ptr, cnt, UIO_WRITE, uiop)) {
3654 				if (noblock)
3655 					no_trap();
3656 				goto out;
3657 			}
3658 			if (noblock)
3659 				no_trap();
3660 			break;
3661 
3662 		default:
3663 			error = EIO;
3664 			goto out;
3665 		}
3666 		dbp->db_struioflag |= STRUIO_DONE;
3667 		dbp->db_cksumstuff += cnt;
3668 	}
3669 out:
3670 	if (error == EWOULDBLOCK && (resid -= uiop->uio_resid) > 0) {
3671 		/*
3672 		 * A fault has occured and some bytes were moved to the
3673 		 * current mblk, the uio_t has already been updated by
3674 		 * the appropriate uio routine, so also update the mblk
3675 		 * to reflect this in case this same mblk chain is used
3676 		 * again (after the fault has been handled).
3677 		 */
3678 		uiocnt = dbp->db_cksumend - dbp->db_cksumstuff;
3679 		if (uiocnt >= resid)
3680 			dbp->db_cksumstuff += resid;
3681 	}
3682 	return (error);
3683 }
3684 
3685 /*
3686  * Try to enter queue synchronously. Any attempt to enter a closing queue will
3687  * fails. The qp->q_rwcnt keeps track of the number of successful entries so
3688  * that removeq() will not try to close the queue while a thread is inside the
3689  * queue.
3690  */
3691 static boolean_t
3692 rwnext_enter(queue_t *qp)
3693 {
3694 	mutex_enter(QLOCK(qp));
3695 	if (qp->q_flag & QWCLOSE) {
3696 		mutex_exit(QLOCK(qp));
3697 		return (B_FALSE);
3698 	}
3699 	qp->q_rwcnt++;
3700 	ASSERT(qp->q_rwcnt != 0);
3701 	mutex_exit(QLOCK(qp));
3702 	return (B_TRUE);
3703 }
3704 
3705 /*
3706  * Decrease the count of threads running in sync stream queue and wake up any
3707  * threads blocked in removeq().
3708  */
3709 static void
3710 rwnext_exit(queue_t *qp)
3711 {
3712 	mutex_enter(QLOCK(qp));
3713 	qp->q_rwcnt--;
3714 	if (qp->q_flag & QWANTRMQSYNC) {
3715 		qp->q_flag &= ~QWANTRMQSYNC;
3716 		cv_broadcast(&qp->q_wait);
3717 	}
3718 	mutex_exit(QLOCK(qp));
3719 }
3720 
3721 /*
3722  * The purpose of rwnext() is to call the rw procedure of the next
3723  * (downstream) modules queue.
3724  *
3725  * treated as put entrypoint for perimeter syncronization.
3726  *
3727  * There's no need to grab sq_putlocks here (which only exist for CIPUT
3728  * sync queues). If it is CIPUT sync queue sq_count is incremented and it does
3729  * not matter if any regular put entrypoints have been already entered. We
3730  * can't increment one of the sq_putcounts (instead of sq_count) because
3731  * qwait_rw won't know which counter to decrement.
3732  *
3733  * It would be reasonable to add the lockless FASTPUT logic.
3734  */
3735 int
3736 rwnext(queue_t *qp, struiod_t *dp)
3737 {
3738 	queue_t		*nqp;
3739 	syncq_t		*sq;
3740 	uint16_t	count;
3741 	uint16_t	flags;
3742 	struct qinit	*qi;
3743 	int		(*proc)();
3744 	struct stdata	*stp;
3745 	int		isread;
3746 	int		rval;
3747 
3748 	stp = STREAM(qp);
3749 	/*
3750 	 * Prevent q_next from changing by holding sd_lock until acquiring
3751 	 * SQLOCK. Note that a read-side rwnext from the streamhead will
3752 	 * already have sd_lock acquired. In either case sd_lock is always
3753 	 * released after acquiring SQLOCK.
3754 	 *
3755 	 * The streamhead read-side holding sd_lock when calling rwnext is
3756 	 * required to prevent a race condition were M_DATA mblks flowing
3757 	 * up the read-side of the stream could be bypassed by a rwnext()
3758 	 * down-call. In this case sd_lock acts as the streamhead perimeter.
3759 	 */
3760 	if ((nqp = _WR(qp)) == qp) {
3761 		isread = 0;
3762 		mutex_enter(&stp->sd_lock);
3763 		qp = nqp->q_next;
3764 	} else {
3765 		isread = 1;
3766 		if (nqp != stp->sd_wrq)
3767 			/* Not streamhead */
3768 			mutex_enter(&stp->sd_lock);
3769 		qp = _RD(nqp->q_next);
3770 	}
3771 	qi = qp->q_qinfo;
3772 	if (qp->q_struiot == STRUIOT_NONE || ! (proc = qi->qi_rwp)) {
3773 		/*
3774 		 * Not a synchronous module or no r/w procedure for this
3775 		 * queue, so just return EINVAL and let the caller handle it.
3776 		 */
3777 		mutex_exit(&stp->sd_lock);
3778 		return (EINVAL);
3779 	}
3780 
3781 	if (rwnext_enter(qp) == B_FALSE) {
3782 		mutex_exit(&stp->sd_lock);
3783 		return (EINVAL);
3784 	}
3785 
3786 	sq = qp->q_syncq;
3787 	mutex_enter(SQLOCK(sq));
3788 	mutex_exit(&stp->sd_lock);
3789 	count = sq->sq_count;
3790 	flags = sq->sq_flags;
3791 	ASSERT(sq->sq_ciputctrl == NULL || (flags & SQ_CIPUT));
3792 
3793 	while ((flags & SQ_GOAWAY) || (!(flags & SQ_CIPUT) && count != 0)) {
3794 		/*
3795 		 * if this queue is being closed, return.
3796 		 */
3797 		if (qp->q_flag & QWCLOSE) {
3798 			mutex_exit(SQLOCK(sq));
3799 			rwnext_exit(qp);
3800 			return (EINVAL);
3801 		}
3802 
3803 		/*
3804 		 * Wait until we can enter the inner perimeter.
3805 		 */
3806 		sq->sq_flags = flags | SQ_WANTWAKEUP;
3807 		cv_wait(&sq->sq_wait, SQLOCK(sq));
3808 		count = sq->sq_count;
3809 		flags = sq->sq_flags;
3810 	}
3811 
3812 	if (isread == 0 && stp->sd_struiowrq == NULL ||
3813 	    isread == 1 && stp->sd_struiordq == NULL) {
3814 		/*
3815 		 * Stream plumbing changed while waiting for inner perimeter
3816 		 * so just return EINVAL and let the caller handle it.
3817 		 */
3818 		mutex_exit(SQLOCK(sq));
3819 		rwnext_exit(qp);
3820 		return (EINVAL);
3821 	}
3822 	if (!(flags & SQ_CIPUT))
3823 		sq->sq_flags = flags | SQ_EXCL;
3824 	sq->sq_count = count + 1;
3825 	ASSERT(sq->sq_count != 0);		/* Wraparound */
3826 	/*
3827 	 * Note: The only message ordering guarantee that rwnext() makes is
3828 	 *	 for the write queue flow-control case. All others (r/w queue
3829 	 *	 with q_count > 0 (or q_first != 0)) are the resposibilty of
3830 	 *	 the queue's rw procedure. This could be genralized here buy
3831 	 *	 running the queue's service procedure, but that wouldn't be
3832 	 *	 the most efficent for all cases.
3833 	 */
3834 	mutex_exit(SQLOCK(sq));
3835 	if (! isread && (qp->q_flag & QFULL)) {
3836 		/*
3837 		 * Write queue may be flow controlled. If so,
3838 		 * mark the queue for wakeup when it's not.
3839 		 */
3840 		mutex_enter(QLOCK(qp));
3841 		if (qp->q_flag & QFULL) {
3842 			qp->q_flag |= QWANTWSYNC;
3843 			mutex_exit(QLOCK(qp));
3844 			rval = EWOULDBLOCK;
3845 			goto out;
3846 		}
3847 		mutex_exit(QLOCK(qp));
3848 	}
3849 
3850 	if (! isread && dp->d_mp)
3851 		STR_FTEVENT_MSG(dp->d_mp, nqp, FTEV_RWNEXT, dp->d_mp->b_rptr -
3852 		    dp->d_mp->b_datap->db_base);
3853 
3854 	rval = (*proc)(qp, dp);
3855 
3856 	if (isread && dp->d_mp)
3857 		STR_FTEVENT_MSG(dp->d_mp, _RD(nqp), FTEV_RWNEXT,
3858 		    dp->d_mp->b_rptr - dp->d_mp->b_datap->db_base);
3859 out:
3860 	/*
3861 	 * The queue is protected from being freed by sq_count, so it is
3862 	 * safe to call rwnext_exit and reacquire SQLOCK(sq).
3863 	 */
3864 	rwnext_exit(qp);
3865 
3866 	mutex_enter(SQLOCK(sq));
3867 	flags = sq->sq_flags;
3868 	ASSERT(sq->sq_count != 0);
3869 	sq->sq_count--;
3870 	if (flags & SQ_TAIL) {
3871 		putnext_tail(sq, qp, flags);
3872 		/*
3873 		 * The only purpose of this ASSERT is to preserve calling stack
3874 		 * in DEBUG kernel.
3875 		 */
3876 		ASSERT(flags & SQ_TAIL);
3877 		return (rval);
3878 	}
3879 	ASSERT(flags & (SQ_EXCL|SQ_CIPUT));
3880 	/*
3881 	 * Safe to always drop SQ_EXCL:
3882 	 *	Not SQ_CIPUT means we set SQ_EXCL above
3883 	 *	For SQ_CIPUT SQ_EXCL will only be set if the put procedure
3884 	 *	did a qwriter(INNER) in which case nobody else
3885 	 *	is in the inner perimeter and we are exiting.
3886 	 *
3887 	 * I would like to make the following assertion:
3888 	 *
3889 	 * ASSERT((flags & (SQ_EXCL|SQ_CIPUT)) != (SQ_EXCL|SQ_CIPUT) ||
3890 	 * 	sq->sq_count == 0);
3891 	 *
3892 	 * which indicates that if we are both putshared and exclusive,
3893 	 * we became exclusive while executing the putproc, and the only
3894 	 * claim on the syncq was the one we dropped a few lines above.
3895 	 * But other threads that enter putnext while the syncq is exclusive
3896 	 * need to make a claim as they may need to drop SQLOCK in the
3897 	 * has_writers case to avoid deadlocks.  If these threads are
3898 	 * delayed or preempted, it is possible that the writer thread can
3899 	 * find out that there are other claims making the (sq_count == 0)
3900 	 * test invalid.
3901 	 */
3902 
3903 	sq->sq_flags = flags & ~SQ_EXCL;
3904 	if (sq->sq_flags & SQ_WANTWAKEUP) {
3905 		sq->sq_flags &= ~SQ_WANTWAKEUP;
3906 		cv_broadcast(&sq->sq_wait);
3907 	}
3908 	mutex_exit(SQLOCK(sq));
3909 	return (rval);
3910 }
3911 
3912 /*
3913  * The purpose of infonext() is to call the info procedure of the next
3914  * (downstream) modules queue.
3915  *
3916  * treated as put entrypoint for perimeter syncronization.
3917  *
3918  * There's no need to grab sq_putlocks here (which only exist for CIPUT
3919  * sync queues). If it is CIPUT sync queue regular sq_count is incremented and
3920  * it does not matter if any regular put entrypoints have been already
3921  * entered.
3922  */
3923 int
3924 infonext(queue_t *qp, infod_t *idp)
3925 {
3926 	queue_t		*nqp;
3927 	syncq_t		*sq;
3928 	uint16_t	count;
3929 	uint16_t 	flags;
3930 	struct qinit	*qi;
3931 	int		(*proc)();
3932 	struct stdata	*stp;
3933 	int		rval;
3934 
3935 	stp = STREAM(qp);
3936 	/*
3937 	 * Prevent q_next from changing by holding sd_lock until
3938 	 * acquiring SQLOCK.
3939 	 */
3940 	mutex_enter(&stp->sd_lock);
3941 	if ((nqp = _WR(qp)) == qp) {
3942 		qp = nqp->q_next;
3943 	} else {
3944 		qp = _RD(nqp->q_next);
3945 	}
3946 	qi = qp->q_qinfo;
3947 	if (qp->q_struiot == STRUIOT_NONE || ! (proc = qi->qi_infop)) {
3948 		mutex_exit(&stp->sd_lock);
3949 		return (EINVAL);
3950 	}
3951 	sq = qp->q_syncq;
3952 	mutex_enter(SQLOCK(sq));
3953 	mutex_exit(&stp->sd_lock);
3954 	count = sq->sq_count;
3955 	flags = sq->sq_flags;
3956 	ASSERT(sq->sq_ciputctrl == NULL || (flags & SQ_CIPUT));
3957 
3958 	while ((flags & SQ_GOAWAY) || (!(flags & SQ_CIPUT) && count != 0)) {
3959 		/*
3960 		 * Wait until we can enter the inner perimeter.
3961 		 */
3962 		sq->sq_flags = flags | SQ_WANTWAKEUP;
3963 		cv_wait(&sq->sq_wait, SQLOCK(sq));
3964 		count = sq->sq_count;
3965 		flags = sq->sq_flags;
3966 	}
3967 
3968 	if (! (flags & SQ_CIPUT))
3969 		sq->sq_flags = flags | SQ_EXCL;
3970 	sq->sq_count = count + 1;
3971 	ASSERT(sq->sq_count != 0);		/* Wraparound */
3972 	mutex_exit(SQLOCK(sq));
3973 
3974 	rval = (*proc)(qp, idp);
3975 
3976 	mutex_enter(SQLOCK(sq));
3977 	flags = sq->sq_flags;
3978 	ASSERT(sq->sq_count != 0);
3979 	sq->sq_count--;
3980 	if (flags & SQ_TAIL) {
3981 		putnext_tail(sq, qp, flags);
3982 		/*
3983 		 * The only purpose of this ASSERT is to preserve calling stack
3984 		 * in DEBUG kernel.
3985 		 */
3986 		ASSERT(flags & SQ_TAIL);
3987 		return (rval);
3988 	}
3989 	ASSERT(flags & (SQ_EXCL|SQ_CIPUT));
3990 /*
3991  * XXXX
3992  * I am not certain the next comment is correct here.  I need to consider
3993  * why the infonext is called, and if dropping SQ_EXCL unless non-CIPUT
3994  * might cause other problems.  It just might be safer to drop it if
3995  * !SQ_CIPUT because that is when we set it.
3996  */
3997 	/*
3998 	 * Safe to always drop SQ_EXCL:
3999 	 *	Not SQ_CIPUT means we set SQ_EXCL above
4000 	 *	For SQ_CIPUT SQ_EXCL will only be set if the put procedure
4001 	 *	did a qwriter(INNER) in which case nobody else
4002 	 *	is in the inner perimeter and we are exiting.
4003 	 *
4004 	 * I would like to make the following assertion:
4005 	 *
4006 	 * ASSERT((flags & (SQ_EXCL|SQ_CIPUT)) != (SQ_EXCL|SQ_CIPUT) ||
4007 	 *	sq->sq_count == 0);
4008 	 *
4009 	 * which indicates that if we are both putshared and exclusive,
4010 	 * we became exclusive while executing the putproc, and the only
4011 	 * claim on the syncq was the one we dropped a few lines above.
4012 	 * But other threads that enter putnext while the syncq is exclusive
4013 	 * need to make a claim as they may need to drop SQLOCK in the
4014 	 * has_writers case to avoid deadlocks.  If these threads are
4015 	 * delayed or preempted, it is possible that the writer thread can
4016 	 * find out that there are other claims making the (sq_count == 0)
4017 	 * test invalid.
4018 	 */
4019 
4020 	sq->sq_flags = flags & ~SQ_EXCL;
4021 	mutex_exit(SQLOCK(sq));
4022 	return (rval);
4023 }
4024 
4025 /*
4026  * Return nonzero if the queue is responsible for struio(), else return 0.
4027  */
4028 int
4029 isuioq(queue_t *q)
4030 {
4031 	if (q->q_flag & QREADR)
4032 		return (STREAM(q)->sd_struiordq == q);
4033 	else
4034 		return (STREAM(q)->sd_struiowrq == q);
4035 }
4036 
4037 #if defined(__sparc)
4038 int disable_putlocks = 0;
4039 #else
4040 int disable_putlocks = 1;
4041 #endif
4042 
4043 /*
4044  * called by create_putlock.
4045  */
4046 static void
4047 create_syncq_putlocks(queue_t *q)
4048 {
4049 	syncq_t	*sq = q->q_syncq;
4050 	ciputctrl_t *cip;
4051 	int i;
4052 
4053 	ASSERT(sq != NULL);
4054 
4055 	ASSERT(disable_putlocks == 0);
4056 	ASSERT(n_ciputctrl >= min_n_ciputctrl);
4057 	ASSERT(ciputctrl_cache != NULL);
4058 
4059 	if (!(sq->sq_type & SQ_CIPUT))
4060 		return;
4061 
4062 	for (i = 0; i <= 1; i++) {
4063 		if (sq->sq_ciputctrl == NULL) {
4064 			cip = kmem_cache_alloc(ciputctrl_cache, KM_SLEEP);
4065 			SUMCHECK_CIPUTCTRL_COUNTS(cip, n_ciputctrl - 1, 0);
4066 			mutex_enter(SQLOCK(sq));
4067 			if (sq->sq_ciputctrl != NULL) {
4068 				mutex_exit(SQLOCK(sq));
4069 				kmem_cache_free(ciputctrl_cache, cip);
4070 			} else {
4071 				ASSERT(sq->sq_nciputctrl == 0);
4072 				sq->sq_nciputctrl = n_ciputctrl - 1;
4073 				/*
4074 				 * putnext checks sq_ciputctrl without holding
4075 				 * SQLOCK. if it is not NULL putnext assumes
4076 				 * sq_nciputctrl is initialized. membar below
4077 				 * insures that.
4078 				 */
4079 				membar_producer();
4080 				sq->sq_ciputctrl = cip;
4081 				mutex_exit(SQLOCK(sq));
4082 			}
4083 		}
4084 		ASSERT(sq->sq_nciputctrl == n_ciputctrl - 1);
4085 		if (i == 1)
4086 			break;
4087 		q = _OTHERQ(q);
4088 		if (!(q->q_flag & QPERQ)) {
4089 			ASSERT(sq == q->q_syncq);
4090 			break;
4091 		}
4092 		ASSERT(q->q_syncq != NULL);
4093 		ASSERT(sq != q->q_syncq);
4094 		sq = q->q_syncq;
4095 		ASSERT(sq->sq_type & SQ_CIPUT);
4096 	}
4097 }
4098 
4099 /*
4100  * If stream argument is 0 only create per cpu sq_putlocks/sq_putcounts for
4101  * syncq of q. If stream argument is not 0 create per cpu stream_putlocks for
4102  * the stream of q and per cpu sq_putlocks/sq_putcounts for all syncq's
4103  * starting from q and down to the driver.
4104  *
4105  * This should be called after the affected queues are part of stream
4106  * geometry. It should be called from driver/module open routine after
4107  * qprocson() call. It is also called from nfs syscall where it is known that
4108  * stream is configured and won't change its geometry during create_putlock
4109  * call.
4110  *
4111  * caller normally uses 0 value for the stream argument to speed up MT putnext
4112  * into the perimeter of q for example because its perimeter is per module
4113  * (e.g. IP).
4114  *
4115  * caller normally uses non 0 value for the stream argument to hint the system
4116  * that the stream of q is a very contended global system stream
4117  * (e.g. NFS/UDP) and the part of the stream from q to the driver is
4118  * particularly MT hot.
4119  *
4120  * Caller insures stream plumbing won't happen while we are here and therefore
4121  * q_next can be safely used.
4122  */
4123 
4124 void
4125 create_putlocks(queue_t *q, int stream)
4126 {
4127 	ciputctrl_t	*cip;
4128 	struct stdata	*stp = STREAM(q);
4129 
4130 	q = _WR(q);
4131 	ASSERT(stp != NULL);
4132 
4133 	if (disable_putlocks != 0)
4134 		return;
4135 
4136 	if (n_ciputctrl < min_n_ciputctrl)
4137 		return;
4138 
4139 	ASSERT(ciputctrl_cache != NULL);
4140 
4141 	if (stream != 0 && stp->sd_ciputctrl == NULL) {
4142 		cip = kmem_cache_alloc(ciputctrl_cache, KM_SLEEP);
4143 		SUMCHECK_CIPUTCTRL_COUNTS(cip, n_ciputctrl - 1, 0);
4144 		mutex_enter(&stp->sd_lock);
4145 		if (stp->sd_ciputctrl != NULL) {
4146 			mutex_exit(&stp->sd_lock);
4147 			kmem_cache_free(ciputctrl_cache, cip);
4148 		} else {
4149 			ASSERT(stp->sd_nciputctrl == 0);
4150 			stp->sd_nciputctrl = n_ciputctrl - 1;
4151 			/*
4152 			 * putnext checks sd_ciputctrl without holding
4153 			 * sd_lock. if it is not NULL putnext assumes
4154 			 * sd_nciputctrl is initialized. membar below
4155 			 * insures that.
4156 			 */
4157 			membar_producer();
4158 			stp->sd_ciputctrl = cip;
4159 			mutex_exit(&stp->sd_lock);
4160 		}
4161 	}
4162 
4163 	ASSERT(stream == 0 || stp->sd_nciputctrl == n_ciputctrl - 1);
4164 
4165 	while (_SAMESTR(q)) {
4166 		create_syncq_putlocks(q);
4167 		if (stream == 0)
4168 			return;
4169 		q = q->q_next;
4170 	}
4171 	ASSERT(q != NULL);
4172 	create_syncq_putlocks(q);
4173 }
4174 
4175 /*
4176  * STREAMS Flow Trace - record STREAMS Flow Trace events as an mblk flows
4177  * through a stream.
4178  *
4179  * Data currently record per-event is a timestamp, module/driver name,
4180  * downstream module/driver name, optional callstack, event type and a per
4181  * type datum.  Much of the STREAMS framework is instrumented for automatic
4182  * flow tracing (when enabled).  Events can be defined and used by STREAMS
4183  * modules and drivers.
4184  *
4185  * Global objects:
4186  *
4187  *	str_ftevent() - Add a flow-trace event to a dblk.
4188  *	str_ftfree() - Free flow-trace data
4189  *
4190  * Local objects:
4191  *
4192  *	fthdr_cache - pointer to the kmem cache for trace header.
4193  *	ftblk_cache - pointer to the kmem cache for trace data blocks.
4194  */
4195 
4196 int str_ftnever = 1;	/* Don't do STREAMS flow tracing */
4197 int str_ftstack = 0;	/* Don't record event call stacks */
4198 
4199 void
4200 str_ftevent(fthdr_t *hp, void *p, ushort_t evnt, ushort_t data)
4201 {
4202 	ftblk_t *bp = hp->tail;
4203 	ftblk_t *nbp;
4204 	ftevnt_t *ep;
4205 	int ix, nix;
4206 
4207 	ASSERT(hp != NULL);
4208 
4209 	for (;;) {
4210 		if ((ix = bp->ix) == FTBLK_EVNTS) {
4211 			/*
4212 			 * Tail doesn't have room, so need a new tail.
4213 			 *
4214 			 * To make this MT safe, first, allocate a new
4215 			 * ftblk, and initialize it.  To make life a
4216 			 * little easier, reserve the first slot (mostly
4217 			 * by making ix = 1).  When we are finished with
4218 			 * the initialization, CAS this pointer to the
4219 			 * tail.  If this succeeds, this is the new
4220 			 * "next" block.  Otherwise, another thread
4221 			 * got here first, so free the block and start
4222 			 * again.
4223 			 */
4224 			nbp = kmem_cache_alloc(ftblk_cache, KM_NOSLEEP);
4225 			if (nbp == NULL) {
4226 				/* no mem, so punt */
4227 				str_ftnever++;
4228 				/* free up all flow data? */
4229 				return;
4230 			}
4231 			nbp->nxt = NULL;
4232 			nbp->ix = 1;
4233 			/*
4234 			 * Just in case there is another thread about
4235 			 * to get the next index, we need to make sure
4236 			 * the value is there for it.
4237 			 */
4238 			membar_producer();
4239 			if (casptr(&hp->tail, bp, nbp) == bp) {
4240 				/* CAS was successful */
4241 				bp->nxt = nbp;
4242 				membar_producer();
4243 				bp = nbp;
4244 				ix = 0;
4245 				goto cas_good;
4246 			} else {
4247 				kmem_cache_free(ftblk_cache, nbp);
4248 				bp = hp->tail;
4249 				continue;
4250 			}
4251 		}
4252 		nix = ix + 1;
4253 		if (cas32((uint32_t *)&bp->ix, ix, nix) == ix) {
4254 		cas_good:
4255 			if (curthread != hp->thread) {
4256 				hp->thread = curthread;
4257 				evnt |= FTEV_CS;
4258 			}
4259 			if (CPU->cpu_seqid != hp->cpu_seqid) {
4260 				hp->cpu_seqid = CPU->cpu_seqid;
4261 				evnt |= FTEV_PS;
4262 			}
4263 			ep = &bp->ev[ix];
4264 			break;
4265 		}
4266 	}
4267 
4268 	if (evnt & FTEV_QMASK) {
4269 		queue_t *qp = p;
4270 
4271 		if (!(qp->q_flag & QREADR))
4272 			evnt |= FTEV_ISWR;
4273 
4274 		ep->mid = Q2NAME(qp);
4275 
4276 		/*
4277 		 * We only record the next queue name for FTEV_PUTNEXT since
4278 		 * that's the only time we *really* need it, and the putnext()
4279 		 * code ensures that qp->q_next won't vanish.  (We could use
4280 		 * claimstr()/releasestr() but at a performance cost.)
4281 		 */
4282 		if ((evnt & FTEV_MASK) == FTEV_PUTNEXT && qp->q_next != NULL)
4283 			ep->midnext = Q2NAME(qp->q_next);
4284 		else
4285 			ep->midnext = NULL;
4286 	} else {
4287 		ep->mid = p;
4288 		ep->midnext = NULL;
4289 	}
4290 
4291 	if (ep->stk != NULL)
4292 		ep->stk->fs_depth = getpcstack(ep->stk->fs_stk, FTSTK_DEPTH);
4293 
4294 	ep->ts = gethrtime();
4295 	ep->evnt = evnt;
4296 	ep->data = data;
4297 	hp->hash = (hp->hash << 9) + hp->hash;
4298 	hp->hash += (evnt << 16) | data;
4299 	hp->hash += (uintptr_t)ep->mid;
4300 }
4301 
4302 /*
4303  * Free flow-trace data.
4304  */
4305 void
4306 str_ftfree(dblk_t *dbp)
4307 {
4308 	fthdr_t *hp = dbp->db_fthdr;
4309 	ftblk_t *bp = &hp->first;
4310 	ftblk_t *nbp;
4311 
4312 	if (bp != hp->tail || bp->ix != 0) {
4313 		/*
4314 		 * Clear out the hash, have the tail point to itself, and free
4315 		 * any continuation blocks.
4316 		 */
4317 		bp = hp->first.nxt;
4318 		hp->tail = &hp->first;
4319 		hp->hash = 0;
4320 		hp->first.nxt = NULL;
4321 		hp->first.ix = 0;
4322 		while (bp != NULL) {
4323 			nbp = bp->nxt;
4324 			kmem_cache_free(ftblk_cache, bp);
4325 			bp = nbp;
4326 		}
4327 	}
4328 	kmem_cache_free(fthdr_cache, hp);
4329 	dbp->db_fthdr = NULL;
4330 }
4331