xref: /titanic_44/usr/src/uts/common/rpc/rpcmod.c (revision 952d685ebe0e34acfa6e0842e7484f982f38b74c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 /*
34  * Kernel RPC filtering module
35  */
36 
37 #include <sys/param.h>
38 #include <sys/types.h>
39 #include <sys/stream.h>
40 #include <sys/stropts.h>
41 #include <sys/tihdr.h>
42 #include <sys/timod.h>
43 #include <sys/tiuser.h>
44 #include <sys/debug.h>
45 #include <sys/signal.h>
46 #include <sys/pcb.h>
47 #include <sys/user.h>
48 #include <sys/errno.h>
49 #include <sys/cred.h>
50 #include <sys/policy.h>
51 #include <sys/inline.h>
52 #include <sys/cmn_err.h>
53 #include <sys/kmem.h>
54 #include <sys/file.h>
55 #include <sys/sysmacros.h>
56 #include <sys/systm.h>
57 #include <sys/t_lock.h>
58 #include <sys/ddi.h>
59 #include <sys/vtrace.h>
60 #include <sys/callb.h>
61 
62 #include <sys/strlog.h>
63 #include <rpc/rpc_com.h>
64 #include <inet/common.h>
65 #include <rpc/types.h>
66 #include <sys/time.h>
67 #include <rpc/xdr.h>
68 #include <rpc/auth.h>
69 #include <rpc/clnt.h>
70 #include <rpc/rpc_msg.h>
71 #include <rpc/clnt.h>
72 #include <rpc/svc.h>
73 #include <rpc/rpcsys.h>
74 #include <rpc/rpc_rdma.h>
75 
76 /*
77  * This is the loadable module wrapper.
78  */
79 #include <sys/conf.h>
80 #include <sys/modctl.h>
81 #include <sys/syscall.h>
82 
/*
 * rpcinfo is this module's streamtab.  It is defined further down in the
 * file and forward-declared here so the fmodsw entry can reference it.
 */
extern struct streamtab rpcinfo;

/*
 * STREAMS module switch entry: registers the module under the name
 * "rpcmod" with rpcinfo as its streamtab.
 */
static struct fmodsw fsw = {
	"rpcmod",
	&rpcinfo,
	D_NEW|D_MP,
};

/*
 * Module linkage information for the kernel.
 */

static struct modlstrmod modlstrmod = {
	&mod_strmodops, "rpc interface str mod", &fsw
};

/*
 * For the RPC system call.  rpcsys is the handler; the same sysent is
 * shared by the native linkage and, when _SYSCALL32_IMPL is defined, by
 * the 32-bit linkage below.
 */
static struct sysent rpcsysent = {
	2,
	SE_32RVAL1 | SE_ARGC | SE_NOUNLOAD,
	rpcsys
};

static struct modlsys modlsys = {
	&mod_syscallops,
	"RPC syscall",
	&rpcsysent
};

#ifdef _SYSCALL32_IMPL
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"32-bit RPC syscall",
	&rpcsysent
};
#endif /* _SYSCALL32_IMPL */

/*
 * A single modlinkage ties everything together: the system call linkage
 * (native and, optionally, 32-bit) and the STREAMS module linkage.  The
 * list is NULL terminated.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	{
		&modlsys,
#ifdef _SYSCALL32_IMPL
		&modlsys32,
#endif
		&modlstrmod,
		NULL
	}
};
133 
134 int
135 _init(void)
136 {
137 	int error = 0;
138 	callb_id_t cid;
139 	int status;
140 
141 	svc_init();
142 	clnt_init();
143 	cid = callb_add(connmgr_cpr_reset, 0, CB_CL_CPR_RPC, "rpc");
144 
145 	if (error = mod_install(&modlinkage)) {
146 		/*
147 		 * Could not install module, cleanup previous
148 		 * initialization work.
149 		 */
150 		clnt_fini();
151 		if (cid != NULL)
152 			(void) callb_delete(cid);
153 
154 		return (error);
155 	}
156 
157 	/*
158 	 * Load up the RDMA plugins and initialize the stats. Even if the
159 	 * plugins loadup fails, but rpcmod was successfully installed the
160 	 * counters still get initialized.
161 	 */
162 	rw_init(&rdma_lock, NULL, RW_DEFAULT, NULL);
163 	mutex_init(&rdma_modload_lock, NULL, MUTEX_DEFAULT, NULL);
164 	mt_kstat_init();
165 
166 	/*
167 	 * Get our identification into ldi.  This is used for loading
168 	 * other modules, e.g. rpcib.
169 	 */
170 	status = ldi_ident_from_mod(&modlinkage, &rpcmod_li);
171 	if (status != 0) {
172 		cmn_err(CE_WARN, "ldi_ident_from_mod fails with %d", status);
173 		rpcmod_li = NULL;
174 	}
175 
176 	return (error);
177 }
178 
179 /*
180  * The unload entry point fails, because we advertise entry points into
181  * rpcmod from the rest of kRPC: rpcmod_release().
182  */
183 int
184 _fini(void)
185 {
186 	return (EBUSY);
187 }
188 
189 int
190 _info(struct modinfo *modinfop)
191 {
192 	return (mod_info(&modlinkage, modinfop));
193 }
194 
extern int nulldev();

/* Module id number placed in rpcmod_info below. */
#define	RPCMOD_ID	2049

/*
 * The rmm_* routines are the entry points registered with STREAMS (see the
 * qinit structures below).  Each one dispatches through the ops vector
 * found via q->q_ptr.
 */
int rmm_open(), rmm_close();

/*
 * To save instructions, since STREAMS ignores the return value
 * from these functions, they are defined as void here. Kind of icky, but...
 */
void rmm_rput(queue_t *, mblk_t *);
void rmm_wput(queue_t *, mblk_t *);
void rmm_rsrv(queue_t *);
void rmm_wsrv(queue_t *);

/*
 * rpcmod* routines: used for the connectionless ops vector
 * (xprt_clts_ops).
 * NOTE(review): rpcmodrsrv is declared here but is not referenced by
 * xprt_clts_ops, which uses NULL for its read service slot.
 */
int rpcmodopen(), rpcmodclose();
void rpcmodrput(), rpcmodwput();
void rpcmodrsrv(), rpcmodwsrv();

/*
 * mir_* routines: used for the connection-oriented ops vector
 * (xprt_cots_ops).
 */
static	void	rpcmodwput_other(queue_t *, mblk_t *);
static	int	mir_close(queue_t *q);
static	int	mir_open(queue_t *q, dev_t *devp, int flag, int sflag,
		    cred_t *credp);
static	void	mir_rput(queue_t *q, mblk_t *mp);
static	void	mir_rsrv(queue_t *q);
static	void	mir_wput(queue_t *q, mblk_t *mp);
static	void	mir_wsrv(queue_t *q);
222 
/*
 * Module information shared by the read and write queues (id, name and the
 * packet size / water mark values; see module_info(9S) for field order).
 */
static struct module_info rpcmod_info =
	{RPCMOD_ID, "rpcmod", 0, INFPSZ, 256*1024, 1024};

/*
 * Read side queue initialization.  Both a put procedure (rmm_rput) and a
 * service procedure (rmm_rsrv) are registered; each dispatches through the
 * per-stream ops vector.  Note that the connectionless ops vector
 * (xprt_clts_ops) supplies no read service handler.
 */
static struct qinit rpcmodrinit = {
	(int (*)())rmm_rput,
	(int (*)())rmm_rsrv,
	rmm_open,
	rmm_close,
	nulldev,
	&rpcmod_info,
	NULL
};

/*
 * Write side queue initialization.  The put procedure (rmm_wput) and the
 * service procedure (rmm_wsrv) dispatch through the per-stream ops vector.
 * For the connectionless transport the write service procedure is used to
 * synchronize with flow control: rpcmodwput() queues a message only when
 * the downstream queue cannot accept it, and rpcmodwsrv() drains the queue.
 */
static struct qinit rpcmodwinit = {
	(int (*)())rmm_wput,
	(int (*)())rmm_wsrv,
	rmm_open,
	rmm_close,
	nulldev,
	&rpcmod_info,
	NULL
};
/* The streamtab referenced by the fmodsw entry: read init, write init. */
struct streamtab rpcinfo = { &rpcmodrinit, &rpcmodwinit, NULL, NULL };
254 
/*
 * Per-transport-style operations vector.  The rmm_* entry points registered
 * with STREAMS call through one of these, found via the structure that
 * q->q_ptr points to.  Entries may be NULL in a given vector (see
 * xprt_clts_ops and tmpops).
 */
struct xprt_style_ops {
	int (*xo_open)();	/* open (module push) */
	int (*xo_close)();	/* close (module pop) */
	void (*xo_wput)();	/* write side put procedure */
	void (*xo_wsrv)();	/* write side service procedure */
	void (*xo_rput)();	/* read side put procedure */
	void (*xo_rsrv)();	/* read side service procedure */
};
263 
/*
 * Operations used when the transport reports a service type of T_CLTS
 * (selected in rmm_open()).  There is no read side service handler.
 */
static struct xprt_style_ops xprt_clts_ops = {
	rpcmodopen,
	rpcmodclose,
	rpcmodwput,
	rpcmodwsrv,
	rpcmodrput,
	NULL
};

/*
 * Operations used for every other service type (selected in rmm_open()),
 * i.e. the connection-oriented mir_* implementation.
 */
static struct xprt_style_ops xprt_cots_ops = {
	mir_open,
	mir_close,
	mir_wput,
	mir_wsrv,
	mir_rput,
	mir_rsrv
};
281 
/*
 * Per rpcmod "slot" data structure. q->q_ptr points to one of these.
 *
 * The first two members (cell pointer, ops pointer) are laid out the same
 * way as in struct temp_slot and mir_t; the rmm_* dispatchers rely on this
 * when they cast q->q_ptr to struct temp_slot * to reach the ops vector.
 * Do not reorder them.
 */
struct rpcm {
	void		*rm_krpc_cell;	/* Reserved for use by KRPC */
	struct		xprt_style_ops	*rm_ops;
	int		rm_type;	/* Client or server side stream */
#define	RM_CLOSING	0x1		/* somebody is trying to close slot */
	uint_t		rm_state;	/* state of the slot. see above */
	uint_t		rm_ref;		/* cnt of external references to slot */
	kmutex_t	rm_lock;	/* mutex protecting above fields */
	kcondvar_t	rm_cwait;	/* condition for closing */
	zoneid_t	rm_zoneid;	/* zone which pushed rpcmod */
};
296 
/*
 * Temporary slot used by rmm_open() while it waits for the transport's
 * T_INFO_ACK.  rmm_open() keeps one of these on its own stack.  The
 * leading members (cell, ops, type) mirror the start of struct rpcm and
 * mir_t so that the rmm_* dispatchers can find the ops vector through a
 * struct temp_slot pointer regardless of which structure q->q_ptr really
 * refers to.
 */
struct temp_slot {
	void *cell;
	struct xprt_style_ops *ops;	/* ops vector used for dispatch */
	int type;
	mblk_t *info_ack;		/* T_INFO_ACK stored by tmp_rput() */
	kmutex_t lock;			/* protects info_ack */
	kcondvar_t wait;		/* signalled when info_ack is set */
};
305 
void tmp_rput(queue_t *q, mblk_t *mp);

/*
 * Ops vector installed by rmm_open() while the transport type is still
 * unknown: writes are passed straight downstream with putnext and reads go
 * to tmp_rput().  Open, close and both service slots are NULL.
 */
struct xprt_style_ops tmpops = {
	NULL,
	NULL,
	putnext,
	NULL,
	tmp_rput,
	NULL
};
316 
317 void
318 tmp_rput(queue_t *q, mblk_t *mp)
319 {
320 	struct temp_slot *t = (struct temp_slot *)(q->q_ptr);
321 	struct T_info_ack *pptr;
322 
323 	switch (mp->b_datap->db_type) {
324 	case M_PCPROTO:
325 		pptr = (struct T_info_ack *)mp->b_rptr;
326 		switch (pptr->PRIM_type) {
327 		case T_INFO_ACK:
328 			mutex_enter(&t->lock);
329 			t->info_ack = mp;
330 			cv_signal(&t->wait);
331 			mutex_exit(&t->lock);
332 			return;
333 		default:
334 			break;
335 		}
336 	default:
337 		break;
338 	}
339 
340 	/*
341 	 * Not an info-ack, so free it. This is ok because we should
342 	 * not be receiving data until the open finishes: rpcmod
343 	 * is pushed well before the end-point is bound to an address.
344 	 */
345 	freemsg(mp);
346 }
347 
348 int
349 rmm_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *crp)
350 {
351 	mblk_t *bp;
352 	struct temp_slot ts, *t;
353 	struct T_info_ack *pptr;
354 	int error = 0;
355 	int procson = 0;
356 
357 	ASSERT(q != NULL);
358 	/*
359 	 * Check for re-opens.
360 	 */
361 	if (q->q_ptr) {
362 		TRACE_1(TR_FAC_KRPC, TR_RPCMODOPEN_END,
363 		    "rpcmodopen_end:(%s)", "q->qptr");
364 		return (0);
365 	}
366 
367 	t = &ts;
368 	bzero(t, sizeof (*t));
369 	q->q_ptr = (void *)t;
370 	/* WR(q)->q_ptr = (void *)t; */
371 
372 	/*
373 	 * Allocate the required messages upfront.
374 	 */
375 	if ((bp = allocb(sizeof (struct T_info_req) +
376 	    sizeof (struct T_info_ack), BPRI_LO)) == (mblk_t *)NULL) {
377 		return (ENOBUFS);
378 	}
379 
380 	mutex_init(&t->lock, NULL, MUTEX_DEFAULT, NULL);
381 	cv_init(&t->wait, NULL, CV_DEFAULT, NULL);
382 
383 	t->ops = &tmpops;
384 
385 	qprocson(q);
386 	procson = 1;
387 	bp->b_datap->db_type = M_PCPROTO;
388 	*(int32_t *)bp->b_wptr = (int32_t)T_INFO_REQ;
389 	bp->b_wptr += sizeof (struct T_info_req);
390 	putnext(WR(q), bp);
391 
392 	mutex_enter(&t->lock);
393 	while ((bp = t->info_ack) == NULL) {
394 		if (cv_wait_sig(&t->wait, &t->lock) == 0) {
395 			error = EINTR;
396 			break;
397 		}
398 	}
399 	mutex_exit(&t->lock);
400 	mutex_destroy(&t->lock);
401 	cv_destroy(&t->wait);
402 	if (error)
403 		goto out;
404 
405 	pptr = (struct T_info_ack *)t->info_ack->b_rptr;
406 
407 	if (pptr->SERV_type == T_CLTS) {
408 		error = rpcmodopen(q, devp, flag, sflag, crp);
409 		if (error == 0) {
410 			t = (struct temp_slot *)q->q_ptr;
411 			t->ops = &xprt_clts_ops;
412 		}
413 	} else {
414 		error = mir_open(q, devp, flag, sflag, crp);
415 		if (error == 0) {
416 			t = (struct temp_slot *)q->q_ptr;
417 			t->ops = &xprt_cots_ops;
418 		}
419 	}
420 
421 out:
422 	freemsg(bp);
423 
424 	if (error && procson)
425 		qprocsoff(q);
426 
427 	return (error);
428 }
429 
430 void
431 rmm_rput(queue_t *q, mblk_t  *mp)
432 {
433 	(*((struct temp_slot *)q->q_ptr)->ops->xo_rput)(q, mp);
434 }
435 
436 void
437 rmm_rsrv(queue_t *q)
438 {
439 	(*((struct temp_slot *)q->q_ptr)->ops->xo_rsrv)(q);
440 }
441 
442 void
443 rmm_wput(queue_t *q, mblk_t *mp)
444 {
445 	(*((struct temp_slot *)q->q_ptr)->ops->xo_wput)(q, mp);
446 }
447 
448 void
449 rmm_wsrv(queue_t *q)
450 {
451 	(*((struct temp_slot *)q->q_ptr)->ops->xo_wsrv)(q);
452 }
453 
454 int
455 rmm_close(queue_t *q, int flag, cred_t *crp)
456 {
457 	return ((*((struct temp_slot *)q->q_ptr)->ops->xo_close)(q, flag, crp));
458 }
459 
460 /*
461  * rpcmodopen -	open routine gets called when the module gets pushed
462  *		onto the stream.
463  */
464 /*ARGSUSED*/
465 int
466 rpcmodopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *crp)
467 {
468 	struct rpcm *rmp;
469 
470 	extern void (*rpc_rele)(queue_t *, mblk_t *);
471 	static void rpcmod_release(queue_t *, mblk_t *);
472 
473 	TRACE_0(TR_FAC_KRPC, TR_RPCMODOPEN_START, "rpcmodopen_start:");
474 
475 	/*
476 	 * Initialize entry points to release a rpcmod slot (and an input
477 	 * message if supplied) and to send an output message to the module
478 	 * below rpcmod.
479 	 */
480 	if (rpc_rele == NULL)
481 		rpc_rele = rpcmod_release;
482 
483 	/*
484 	 * Only sufficiently privileged users can use this module, and it
485 	 * is assumed that they will use this module properly, and NOT send
486 	 * bulk data from downstream.
487 	 */
488 	if (secpolicy_rpcmod_open(crp) != 0)
489 		return (EPERM);
490 
491 	/*
492 	 * Allocate slot data structure.
493 	 */
494 	rmp = kmem_zalloc(sizeof (*rmp), KM_SLEEP);
495 
496 	mutex_init(&rmp->rm_lock, NULL, MUTEX_DEFAULT, NULL);
497 	cv_init(&rmp->rm_cwait, NULL, CV_DEFAULT, NULL);
498 	rmp->rm_zoneid = rpc_zoneid();
499 	/*
500 	 * slot type will be set by kRPC client and server ioctl's
501 	 */
502 	rmp->rm_type = 0;
503 
504 	q->q_ptr = (void *)rmp;
505 	WR(q)->q_ptr = (void *)rmp;
506 
507 	TRACE_1(TR_FAC_KRPC, TR_RPCMODOPEN_END, "rpcmodopen_end:(%s)", "end");
508 	return (0);
509 }
510 
511 /*
512  * rpcmodclose - This routine gets called when the module gets popped
513  * off of the stream.
514  */
515 /*ARGSUSED*/
516 int
517 rpcmodclose(queue_t *q, int flag, cred_t *crp)
518 {
519 	struct rpcm *rmp;
520 
521 	ASSERT(q != NULL);
522 	rmp = (struct rpcm *)q->q_ptr;
523 
524 	/*
525 	 * Mark our state as closing.
526 	 */
527 	mutex_enter(&rmp->rm_lock);
528 	rmp->rm_state |= RM_CLOSING;
529 
530 	/*
531 	 * Check and see if there are any messages on the queue.  If so, send
532 	 * the messages, regardless whether the downstream module is ready to
533 	 * accept data.
534 	 */
535 	if (rmp->rm_type == RPC_SERVER) {
536 		flushq(q, FLUSHDATA);
537 
538 		qenable(WR(q));
539 
540 		if (rmp->rm_ref) {
541 			mutex_exit(&rmp->rm_lock);
542 			/*
543 			 * call into SVC to clean the queue
544 			 */
545 			svc_queueclean(q);
546 			mutex_enter(&rmp->rm_lock);
547 
548 			/*
549 			 * Block while there are kRPC threads with a reference
550 			 * to this message.
551 			 */
552 			while (rmp->rm_ref)
553 				cv_wait(&rmp->rm_cwait, &rmp->rm_lock);
554 		}
555 
556 		mutex_exit(&rmp->rm_lock);
557 
558 		/*
559 		 * It is now safe to remove this queue from the stream. No kRPC
560 		 * threads have a reference to the stream, and none ever will,
561 		 * because RM_CLOSING is set.
562 		 */
563 		qprocsoff(q);
564 
565 		/* Notify kRPC that this stream is going away. */
566 		svc_queueclose(q);
567 	} else {
568 		mutex_exit(&rmp->rm_lock);
569 		qprocsoff(q);
570 	}
571 
572 	q->q_ptr = NULL;
573 	WR(q)->q_ptr = NULL;
574 	mutex_destroy(&rmp->rm_lock);
575 	cv_destroy(&rmp->rm_cwait);
576 	kmem_free(rmp, sizeof (*rmp));
577 	return (0);
578 }
579 
#ifdef	DEBUG
/*
 * Fault-injection knobs read by rpcmodrput() on DEBUG kernels.
 */
/* while > 0: copy each inbound message upstream, decrementing per copy */
int	rpcmod_send_msg_up = 0;
/* while > 0: fabricate a T_UDERROR_IND (ENETDOWN) from a T_UNITDATA_IND */
int	rpcmod_send_uderr = 0;
/* when non-zero: duplicate server requests unless the count is a multiple */
int	rpcmod_send_dup = 0;
/* running count of server requests seen while rpcmod_send_dup is set */
int	rpcmod_send_dup_cnt = 0;
#endif
586 
587 /*
588  * rpcmodrput -	Module read put procedure.  This is called from
589  *		the module, driver, or stream head downstream.
590  */
591 void
592 rpcmodrput(queue_t *q, mblk_t *mp)
593 {
594 	struct rpcm *rmp;
595 	union T_primitives *pptr;
596 	int hdrsz;
597 
598 	TRACE_0(TR_FAC_KRPC, TR_RPCMODRPUT_START, "rpcmodrput_start:");
599 
600 	ASSERT(q != NULL);
601 	rmp = (struct rpcm *)q->q_ptr;
602 
603 	if (rmp->rm_type == 0) {
604 		freemsg(mp);
605 		return;
606 	}
607 
608 #ifdef DEBUG
609 	if (rpcmod_send_msg_up > 0) {
610 		mblk_t *nmp = copymsg(mp);
611 		if (nmp) {
612 			putnext(q, nmp);
613 			rpcmod_send_msg_up--;
614 		}
615 	}
616 	if ((rpcmod_send_uderr > 0) && mp->b_datap->db_type == M_PROTO) {
617 		mblk_t *nmp;
618 		struct T_unitdata_ind *data;
619 		struct T_uderror_ind *ud;
620 		int d;
621 		data = (struct T_unitdata_ind *)mp->b_rptr;
622 		if (data->PRIM_type == T_UNITDATA_IND) {
623 			d = sizeof (*ud) - sizeof (*data);
624 			nmp = allocb(mp->b_wptr - mp->b_rptr + d, BPRI_HI);
625 			if (nmp) {
626 				ud = (struct T_uderror_ind *)nmp->b_rptr;
627 				ud->PRIM_type = T_UDERROR_IND;
628 				ud->DEST_length = data->SRC_length;
629 				ud->DEST_offset = data->SRC_offset + d;
630 				ud->OPT_length = data->OPT_length;
631 				ud->OPT_offset = data->OPT_offset + d;
632 				ud->ERROR_type = ENETDOWN;
633 				if (data->SRC_length) {
634 					bcopy(mp->b_rptr +
635 					    data->SRC_offset,
636 					    nmp->b_rptr +
637 					    ud->DEST_offset,
638 					    data->SRC_length);
639 				}
640 				if (data->OPT_length) {
641 					bcopy(mp->b_rptr +
642 					    data->OPT_offset,
643 					    nmp->b_rptr +
644 					    ud->OPT_offset,
645 					    data->OPT_length);
646 				}
647 				nmp->b_wptr += d;
648 				nmp->b_wptr += (mp->b_wptr - mp->b_rptr);
649 				nmp->b_datap->db_type = M_PROTO;
650 				putnext(q, nmp);
651 				rpcmod_send_uderr--;
652 			}
653 		}
654 	}
655 #endif
656 	switch (mp->b_datap->db_type) {
657 	default:
658 		putnext(q, mp);
659 		break;
660 
661 	case M_PROTO:
662 	case M_PCPROTO:
663 		ASSERT((mp->b_wptr - mp->b_rptr) >= sizeof (int32_t));
664 		pptr = (union T_primitives *)mp->b_rptr;
665 
666 		/*
667 		 * Forward this message to krpc if it is data.
668 		 */
669 		if (pptr->type == T_UNITDATA_IND) {
670 		    mblk_t *nmp;
671 
672 		/*
673 		 * Check if the module is being popped.
674 		 */
675 		    mutex_enter(&rmp->rm_lock);
676 		    if (rmp->rm_state & RM_CLOSING) {
677 			mutex_exit(&rmp->rm_lock);
678 			putnext(q, mp);
679 			break;
680 		    }
681 
682 		    switch (rmp->rm_type) {
683 		    case RPC_CLIENT:
684 			mutex_exit(&rmp->rm_lock);
685 			hdrsz = mp->b_wptr - mp->b_rptr;
686 
687 			/*
688 			 * Make sure the header is sane.
689 			 */
690 			if (hdrsz < TUNITDATAINDSZ ||
691 				hdrsz < (pptr->unitdata_ind.OPT_length +
692 					pptr->unitdata_ind.OPT_offset) ||
693 				hdrsz < (pptr->unitdata_ind.SRC_length +
694 					pptr->unitdata_ind.SRC_offset)) {
695 					freemsg(mp);
696 					return;
697 			}
698 
699 			/*
700 			 * Call clnt_clts_dispatch_notify, so that it can
701 			 * pass the message to the proper caller.  Don't
702 			 * discard the header just yet since the client may
703 			 * need the sender's address.
704 			 */
705 			clnt_clts_dispatch_notify(mp, hdrsz, rmp->rm_zoneid);
706 			return;
707 		    case RPC_SERVER:
708 			/*
709 			 * rm_krpc_cell is exclusively used by the kRPC
710 			 * CLTS server
711 			 */
712 			if (rmp->rm_krpc_cell) {
713 #ifdef DEBUG
714 				/*
715 				 * Test duplicate request cache and
716 				 * rm_ref count handling by sending a
717 				 * duplicate every so often, if
718 				 * desired.
719 				 */
720 				if (rpcmod_send_dup &&
721 				    rpcmod_send_dup_cnt++ %
722 				    rpcmod_send_dup)
723 					nmp = copymsg(mp);
724 				else
725 					nmp = NULL;
726 #endif
727 				/*
728 				 * Raise the reference count on this
729 				 * module to prevent it from being
730 				 * popped before krpc generates the
731 				 * reply.
732 				 */
733 				rmp->rm_ref++;
734 				mutex_exit(&rmp->rm_lock);
735 
736 				/*
737 				 * Submit the message to krpc.
738 				 */
739 				svc_queuereq(q, mp);
740 #ifdef DEBUG
741 				/*
742 				 * Send duplicate if we created one.
743 				 */
744 				if (nmp) {
745 					mutex_enter(&rmp->rm_lock);
746 					rmp->rm_ref++;
747 					mutex_exit(&rmp->rm_lock);
748 					svc_queuereq(q, nmp);
749 				}
750 #endif
751 			} else {
752 				mutex_exit(&rmp->rm_lock);
753 				freemsg(mp);
754 			}
755 			return;
756 		    default:
757 			mutex_exit(&rmp->rm_lock);
758 			freemsg(mp);
759 			return;
760 		    } /* end switch(rmp->rm_type) */
761 		} else if (pptr->type == T_UDERROR_IND) {
762 		    mutex_enter(&rmp->rm_lock);
763 		    hdrsz = mp->b_wptr - mp->b_rptr;
764 
765 		/*
766 		 * Make sure the header is sane
767 		 */
768 		    if (hdrsz < TUDERRORINDSZ ||
769 			hdrsz < (pptr->uderror_ind.OPT_length +
770 				pptr->uderror_ind.OPT_offset) ||
771 			hdrsz < (pptr->uderror_ind.DEST_length +
772 				pptr->uderror_ind.DEST_offset)) {
773 			    mutex_exit(&rmp->rm_lock);
774 			    freemsg(mp);
775 			    return;
776 		    }
777 
778 		/*
779 		 * In the case where a unit data error has been
780 		 * received, all we need to do is clear the message from
781 		 * the queue.
782 		 */
783 		    mutex_exit(&rmp->rm_lock);
784 		    freemsg(mp);
785 		    RPCLOG(32, "rpcmodrput: unitdata error received at "
786 				"%ld\n", gethrestime_sec());
787 		    return;
788 		} /* end else if (pptr->type == T_UDERROR_IND) */
789 
790 		putnext(q, mp);
791 		break;
792 	} /* end switch (mp->b_datap->db_type) */
793 
794 	TRACE_0(TR_FAC_KRPC, TR_RPCMODRPUT_END,
795 		"rpcmodrput_end:");
796 	/*
797 	 * Return codes are not looked at by the STREAMS framework.
798 	 */
799 }
800 
801 /*
802  * write put procedure
803  */
804 void
805 rpcmodwput(queue_t *q, mblk_t *mp)
806 {
807 	struct rpcm	*rmp;
808 
809 	ASSERT(q != NULL);
810 
811 	switch (mp->b_datap->db_type) {
812 	    case M_PROTO:
813 	    case M_PCPROTO:
814 		    break;
815 	    default:
816 		    rpcmodwput_other(q, mp);
817 		    return;
818 	}
819 
820 	/*
821 	 * Check to see if we can send the message downstream.
822 	 */
823 	if (canputnext(q)) {
824 		putnext(q, mp);
825 		return;
826 	}
827 
828 	rmp = (struct rpcm *)q->q_ptr;
829 	ASSERT(rmp != NULL);
830 
831 	/*
832 	 * The first canputnext failed.  Try again except this time with the
833 	 * lock held, so that we can check the state of the stream to see if
834 	 * it is closing.  If either of these conditions evaluate to true
835 	 * then send the meesage.
836 	 */
837 	mutex_enter(&rmp->rm_lock);
838 	if (canputnext(q) || (rmp->rm_state & RM_CLOSING)) {
839 		mutex_exit(&rmp->rm_lock);
840 		putnext(q, mp);
841 	} else {
842 		/*
843 		 * canputnext failed again and the stream is not closing.
844 		 * Place the message on the queue and let the service
845 		 * procedure handle the message.
846 		 */
847 		mutex_exit(&rmp->rm_lock);
848 		(void) putq(q, mp);
849 	}
850 }
851 
852 static void
853 rpcmodwput_other(queue_t *q, mblk_t *mp)
854 {
855 	struct rpcm	*rmp;
856 	struct iocblk	*iocp;
857 
858 	rmp = (struct rpcm *)q->q_ptr;
859 	ASSERT(rmp != NULL);
860 
861 	switch (mp->b_datap->db_type) {
862 		case M_IOCTL:
863 			iocp = (struct iocblk *)mp->b_rptr;
864 			ASSERT(iocp != NULL);
865 			switch (iocp->ioc_cmd) {
866 			    case RPC_CLIENT:
867 			    case RPC_SERVER:
868 				    mutex_enter(&rmp->rm_lock);
869 				    rmp->rm_type = iocp->ioc_cmd;
870 				    mutex_exit(&rmp->rm_lock);
871 				    mp->b_datap->db_type = M_IOCACK;
872 				    qreply(q, mp);
873 				    return;
874 			    default:
875 				/*
876 				 * pass the ioctl downstream and hope someone
877 				 * down there knows how to handle it.
878 				 */
879 				    putnext(q, mp);
880 				    return;
881 			}
882 		default:
883 			break;
884 	}
885 	/*
886 	 * This is something we definitely do not know how to handle, just
887 	 * pass the message downstream
888 	 */
889 	putnext(q, mp);
890 }
891 
892 /*
893  * Module write service procedure. This is called by downstream modules
894  * for back enabling during flow control.
895  */
896 void
897 rpcmodwsrv(queue_t *q)
898 {
899 	struct rpcm	*rmp;
900 	mblk_t		*mp = NULL;
901 
902 	rmp = (struct rpcm *)q->q_ptr;
903 	ASSERT(rmp != NULL);
904 
905 	/*
906 	 * Get messages that may be queued and send them down stream
907 	 */
908 	while ((mp = getq(q)) != NULL) {
909 		/*
910 		 * Optimize the service procedure for the server-side, by
911 		 * avoiding a call to canputnext().
912 		 */
913 		if (rmp->rm_type == RPC_SERVER || canputnext(q)) {
914 			putnext(q, mp);
915 			continue;
916 		}
917 		(void) putbq(q, mp);
918 		return;
919 	}
920 }
921 
922 static void
923 rpcmod_release(queue_t *q, mblk_t *bp)
924 {
925 	struct rpcm *rmp;
926 
927 	/*
928 	 * For now, just free the message.
929 	 */
930 	if (bp)
931 		freemsg(bp);
932 	rmp = (struct rpcm *)q->q_ptr;
933 
934 	mutex_enter(&rmp->rm_lock);
935 	rmp->rm_ref--;
936 
937 	if (rmp->rm_ref == 0 && (rmp->rm_state & RM_CLOSING)) {
938 		cv_broadcast(&rmp->rm_cwait);
939 	}
940 
941 	mutex_exit(&rmp->rm_lock);
942 }
943 
944 /*
945  * This part of rpcmod is pushed on a connection-oriented transport for use
946  * by RPC.  It serves to bypass the Stream head, implements
947  * the record marking protocol, and dispatches incoming RPC messages.
948  */
949 
/* Default idle timer values, in milliseconds */
#define	MIR_CLNT_IDLE_TIMEOUT	(5 * (60 * 1000L))	/* 5 minutes */
#define	MIR_SVC_IDLE_TIMEOUT	(6 * (60 * 1000L))	/* 6 minutes */
#define	MIR_SVC_ORDREL_TIMEOUT	(10 * (60 * 1000L))	/* 10 minutes */
#define	MIR_LASTFRAG	0x80000000	/* Record marker */

/*
 * DLEN - number of data bytes in a message: msgdsize() when the message
 * has continuation blocks, otherwise the length of the single block.
 * The argument is parenthesized so that any pointer expression may be
 * passed; note that it is still evaluated more than once, so it must not
 * have side effects.
 */
#define	DLEN(mp) \
	((mp)->b_cont ? msgdsize(mp) : ((mp)->b_wptr - (mp)->b_rptr))
957 
/*
 * Per-stream state for the connection-oriented (mir_*) side of rpcmod.
 * The first two members (cell pointer, ops pointer) are laid out like the
 * start of struct rpcm and struct temp_slot; the rmm_* dispatchers reach
 * the ops vector by casting q->q_ptr to struct temp_slot *.
 */
typedef struct mir_s {
	void	*mir_krpc_cell;	/* Reserved for KRPC use. This field */
					/* must be first in the structure. */
	struct xprt_style_ops	*rm_ops;
	int	mir_type;		/* Client or server side stream */

	mblk_t	*mir_head_mp;		/* RPC msg in progress */
		/*
		 * mir_head_mp points the first mblk being collected in
		 * the current RPC message.  Record headers are removed
		 * before data is linked into mir_head_mp.
		 */
	mblk_t	*mir_tail_mp;		/* Last mblk in mir_head_mp */
		/*
		 * mir_tail_mp points to the last mblk in the message
		 * chain starting at mir_head_mp.  It is only valid
		 * if mir_head_mp is non-NULL and is used to add new
		 * data blocks to the end of chain quickly.
		 */

	int32_t	mir_frag_len;		/* Bytes seen in the current frag */
		/*
		 * mir_frag_len starts at -4 for beginning of each fragment.
		 * When this length is negative, it indicates the number of
		 * bytes that rpcmod needs to complete the record marker
		 * header.  When it is positive or zero, it holds the number
		 * of bytes that have arrived for the current fragment and
		 * are held in mir_head_mp.
		 */

	int32_t	mir_frag_header;
		/*
		 * Fragment header as collected for the current fragment.
		 * It holds the last-fragment indicator and the number
		 * of bytes in the fragment.
		 */

	unsigned int
		mir_ordrel_pending : 1,	/* Sent T_ORDREL_REQ */
		mir_hold_inbound : 1,	/* Hold inbound messages on server */
					/* side until outbound flow control */
					/* is relieved. */
		mir_closing : 1,	/* The stream is being closed */
		mir_inrservice : 1,	/* data queued or rd srv proc running */
		mir_inwservice : 1,	/* data queued or wr srv proc running */
		mir_inwflushdata : 1,	/* flush M_DATAs when srv runs */
		/*
		 * On client streams, mir_clntreq is 0 or 1; it is set
		 * to 1 whenever a new request is sent out (mir_wput)
		 * and cleared when the timer fires (mir_timer).  If
		 * the timer fires with this value equal to 0, then the
		 * stream is considered idle and KRPC is notified.
		 */
		mir_clntreq : 1,
		/*
		 * On server streams, stop accepting messages
		 */
		mir_svc_no_more_msgs : 1,
		mir_listen_stream : 1,	/* listen end point */
		mir_unused : 1,	/* no longer used */
		/*
		 * Set while mir_timer_start()/mir_timer_stop() is changing
		 * the timer; other callers wait on mir_timer_cv.
		 */
		mir_timer_call : 1,
		mir_junk_fill_thru_bit_31 : 21;

	int	mir_setup_complete;	/* server has initialized everything */
	timeout_id_t mir_timer_id;	/* Timer for idle checks */
	clock_t	mir_idle_timeout;	/* Allowed idle time before shutdown */
		/*
		 * This value is copied from clnt_idle_timeout or
		 * svc_idle_timeout during the appropriate ioctl.
		 * Kept in milliseconds
		 */
	clock_t	mir_use_timestamp;	/* updated on client with each use */
		/*
		 * This value is set to lbolt
		 * every time a client stream sends or receives data.
		 * Even if the timer message arrives, we don't shutdown
		 * client unless:
		 *    lbolt >= MSEC_TO_TICK(mir_idle_timeout)+mir_use_timestamp.
		 * This value is kept in HZ.
		 */

	uint_t	*mir_max_msg_sizep;	/* Reference to sanity check size */
		/*
		 * This pointer is set to &clnt_max_msg_size or
		 * &svc_max_msg_size during the appropriate ioctl.
		 */
	zoneid_t mir_zoneid;	/* zone which pushed rpcmod */
	/* Server-side fields. */
	int	mir_ref_cnt;		/* Reference count: server side only */
					/* counts the number of references */
					/* that a kernel RPC server thread */
					/* (see svc_run()) has on this rpcmod */
					/* slot. Effectively, it is the */
					/* number of unprocessed messages */
					/* that have been passed up to the */
					/* KRPC layer */

	mblk_t	*mir_svc_pend_mp;	/* Pending T_ORDREL_IND or */
					/* T_DISCON_IND */

	/*
	 * these fields are for both client and server, but for debugging,
	 * it is easier to have these last in the structure.
	 */
	kmutex_t	mir_mutex;	/* Mutex and condvar for close */
	kcondvar_t	mir_condvar;	/* synchronization. */
	kcondvar_t	mir_timer_cv;	/* Timer routine sync. */
} mir_t;
1066 
/*
 * True when a server stream has no outstanding kRPC references and no
 * read-side service activity.  The argument is parenthesized so that any
 * pointer expression may be passed; it is evaluated twice.
 */
#define	MIR_SVC_QUIESCED(mir)	\
	((mir)->mir_ref_cnt == 0 && (mir)->mir_inrservice == 0)

/*
 * Clear the read-service flag and, on a closing server stream, wake the
 * thread waiting on mir_condvar.  This expands to a brace block, so it
 * must not be used as the unbraced body of an if that has an else.
 */
#define	MIR_CLEAR_INRSRV(mir_ptr)	{	\
	(mir_ptr)->mir_inrservice = 0;	\
	if ((mir_ptr)->mir_type == RPC_SERVER &&	\
		(mir_ptr)->mir_closing)	\
		cv_signal(&(mir_ptr)->mir_condvar);	\
}

/*
 * Don't block service procedure (and mir_close) if
 * we are in the process of closing.
 */
#define	MIR_WCANPUTNEXT(mir_ptr, write_q)	\
	(canputnext(write_q) || ((mir_ptr)->mir_svc_no_more_msgs == 1))
1083 
/*
 * Forward declarations for the connection-oriented (mir_*) routines.
 */
static int	mir_clnt_dup_request(queue_t *q, mblk_t *mp);
static void	mir_rput_proto(queue_t *q, mblk_t *mp);
static int	mir_svc_policy_notify(queue_t *q, int event);
static void	mir_svc_release(queue_t *wq, mblk_t *mp);
static void	mir_svc_start(queue_t *wq);
static void	mir_svc_idle_start(queue_t *, mir_t *);
static void	mir_svc_idle_stop(queue_t *, mir_t *);
static void	mir_svc_start_close(queue_t *, mir_t *);
static void	mir_clnt_idle_do_stop(queue_t *);
static void	mir_clnt_idle_stop(queue_t *, mir_t *);
static void	mir_clnt_idle_start(queue_t *, mir_t *);
static void	mir_wput(queue_t *q, mblk_t *mp);
static void	mir_wput_other(queue_t *q, mblk_t *mp);
static void	mir_wsrv(queue_t *q);
static	void	mir_disconnect(queue_t *, mir_t *ir);
static	int	mir_check_len(queue_t *, int32_t, mblk_t *);
static	void	mir_timer(void *);

/*
 * Function pointers defined elsewhere in kRPC.
 * NOTE(review): presumably filled in by this module the same way
 * rpcmodopen() fills in rpc_rele -- confirm against mir_open().
 */
extern void	(*mir_rele)(queue_t *, mblk_t *);
extern void	(*mir_start)(queue_t *);
extern void	(*clnt_stop_idle)(queue_t *);
1105 
/*
 * Idle timeouts for client and server streams, initialized to the default
 * values defined above (milliseconds).
 */
clock_t	clnt_idle_timeout = MIR_CLNT_IDLE_TIMEOUT;
clock_t	svc_idle_timeout = MIR_SVC_IDLE_TIMEOUT;

/*
 * Timeout for subsequent notifications of idle connection.  This is
 * typically used to clean up after a wedged orderly release.
 */
clock_t	svc_ordrel_timeout = MIR_SVC_ORDREL_TIMEOUT; /* milliseconds */

/*
 * Maximum message sizes used as a sanity check; a stream's
 * mir_max_msg_sizep points at one of these.
 */
extern	uint_t	*clnt_max_msg_sizep;
extern	uint_t	*svc_max_msg_sizep;
uint_t	clnt_max_msg_size = RPC_MAXDATASIZE;
uint_t	svc_max_msg_size = RPC_MAXDATASIZE;
/* NOTE(review): not used in this part of the file; looks like a counter. */
uint_t	mir_krpc_cell_null;
1120 
1121 static void
1122 mir_timer_stop(mir_t *mir)
1123 {
1124 	timeout_id_t tid;
1125 
1126 	ASSERT(MUTEX_HELD(&mir->mir_mutex));
1127 
1128 	/*
1129 	 * Since the mir_mutex lock needs to be released to call
1130 	 * untimeout(), we need to make sure that no other thread
1131 	 * can start/stop the timer (changing mir_timer_id) during
1132 	 * that time.  The mir_timer_call bit and the mir_timer_cv
1133 	 * condition variable are used to synchronize this.  Setting
1134 	 * mir_timer_call also tells mir_timer() (refer to the comments
1135 	 * in mir_timer()) that it does not need to do anything.
1136 	 */
1137 	while (mir->mir_timer_call)
1138 		cv_wait(&mir->mir_timer_cv, &mir->mir_mutex);
1139 	mir->mir_timer_call = B_TRUE;
1140 
1141 	if ((tid = mir->mir_timer_id) != 0) {
1142 		mir->mir_timer_id = 0;
1143 		mutex_exit(&mir->mir_mutex);
1144 		(void) untimeout(tid);
1145 		mutex_enter(&mir->mir_mutex);
1146 	}
1147 	mir->mir_timer_call = B_FALSE;
1148 	cv_broadcast(&mir->mir_timer_cv);
1149 }
1150 
1151 static void
1152 mir_timer_start(queue_t *q, mir_t *mir, clock_t intrvl)
1153 {
1154 	timeout_id_t tid;
1155 
1156 	ASSERT(MUTEX_HELD(&mir->mir_mutex));
1157 
1158 	while (mir->mir_timer_call)
1159 		cv_wait(&mir->mir_timer_cv, &mir->mir_mutex);
1160 	mir->mir_timer_call = B_TRUE;
1161 
1162 	if ((tid = mir->mir_timer_id) != 0) {
1163 		mutex_exit(&mir->mir_mutex);
1164 		(void) untimeout(tid);
1165 		mutex_enter(&mir->mir_mutex);
1166 	}
1167 	/* Only start the timer when it is not closing. */
1168 	if (!mir->mir_closing) {
1169 		mir->mir_timer_id = timeout(mir_timer, q,
1170 		    MSEC_TO_TICK(intrvl));
1171 	}
1172 	mir->mir_timer_call = B_FALSE;
1173 	cv_broadcast(&mir->mir_timer_cv);
1174 }
1175 
1176 static int
1177 mir_clnt_dup_request(queue_t *q, mblk_t *mp)
1178 {
1179 	mblk_t  *mp1;
1180 	uint32_t  new_xid;
1181 	uint32_t  old_xid;
1182 
1183 	ASSERT(MUTEX_HELD(&((mir_t *)q->q_ptr)->mir_mutex));
1184 	new_xid = BE32_TO_U32(&mp->b_rptr[4]);
1185 	/*
1186 	 * This loop is a bit tacky -- it walks the STREAMS list of
1187 	 * flow-controlled messages.
1188 	 */
1189 	if ((mp1 = q->q_first) != NULL) {
1190 		do {
1191 			old_xid = BE32_TO_U32(&mp1->b_rptr[4]);
1192 			if (new_xid == old_xid)
1193 				return (1);
1194 		} while ((mp1 = mp1->b_next) != NULL);
1195 	}
1196 	return (0);
1197 }
1198 
1199 static int
1200 mir_close(queue_t *q)
1201 {
1202 	mir_t	*mir;
1203 	mblk_t	*mp;
1204 	bool_t queue_cleaned = FALSE;
1205 
1206 	RPCLOG(32, "rpcmod: mir_close of q 0x%p\n", (void *)q);
1207 	mir = (mir_t *)q->q_ptr;
1208 	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
1209 	mutex_enter(&mir->mir_mutex);
1210 	if ((mp = mir->mir_head_mp) != NULL) {
1211 		mir->mir_head_mp = (mblk_t *)0;
1212 		freemsg(mp);
1213 	}
1214 	/*
1215 	 * Set mir_closing so we get notified when MIR_SVC_QUIESCED()
1216 	 * is TRUE.  And mir_timer_start() won't start the timer again.
1217 	 */
1218 	mir->mir_closing = B_TRUE;
1219 	mir_timer_stop(mir);
1220 
1221 	if (mir->mir_type == RPC_SERVER) {
1222 		flushq(q, FLUSHDATA);	/* Ditch anything waiting on read q */
1223 
1224 		/*
1225 		 * This will prevent more requests from arriving and
1226 		 * will force rpcmod to ignore flow control.
1227 		 */
1228 		mir_svc_start_close(WR(q), mir);
1229 
1230 		while ((!MIR_SVC_QUIESCED(mir)) || mir->mir_inwservice == 1) {
1231 
1232 			if (mir->mir_ref_cnt && !mir->mir_inrservice &&
1233 					(queue_cleaned == FALSE)) {
1234 				/*
1235 				 * call into SVC to clean the queue
1236 				 */
1237 				mutex_exit(&mir->mir_mutex);
1238 				svc_queueclean(q);
1239 				queue_cleaned = TRUE;
1240 				mutex_enter(&mir->mir_mutex);
1241 				continue;
1242 			}
1243 
1244 			/*
1245 			 * Bugid 1253810 - Force the write service
1246 			 * procedure to send its messages, regardless
1247 			 * whether the downstream  module is ready
1248 			 * to accept data.
1249 			 */
1250 			if (mir->mir_inwservice == 1)
1251 				qenable(WR(q));
1252 
1253 			cv_wait(&mir->mir_condvar, &mir->mir_mutex);
1254 		}
1255 
1256 		mutex_exit(&mir->mir_mutex);
1257 		qprocsoff(q);
1258 
1259 		/* Notify KRPC that this stream is going away. */
1260 		svc_queueclose(q);
1261 	} else {
1262 		mutex_exit(&mir->mir_mutex);
1263 		qprocsoff(q);
1264 	}
1265 
1266 	mutex_destroy(&mir->mir_mutex);
1267 	cv_destroy(&mir->mir_condvar);
1268 	cv_destroy(&mir->mir_timer_cv);
1269 	kmem_free(mir, sizeof (mir_t));
1270 	return (0);
1271 }
1272 
1273 /*
1274  * This is server side only (RPC_SERVER).
1275  *
1276  * Exit idle mode.
1277  */
1278 static void
1279 mir_svc_idle_stop(queue_t *q, mir_t *mir)
1280 {
1281 	ASSERT(MUTEX_HELD(&mir->mir_mutex));
1282 	ASSERT((q->q_flag & QREADR) == 0);
1283 	ASSERT(mir->mir_type == RPC_SERVER);
1284 	RPCLOG(16, "rpcmod: mir_svc_idle_stop of q 0x%p\n", (void *)q);
1285 
1286 	mir_timer_stop(mir);
1287 }
1288 
1289 /*
1290  * This is server side only (RPC_SERVER).
1291  *
1292  * Start idle processing, which will include setting idle timer if the
1293  * stream is not being closed.
1294  */
1295 static void
1296 mir_svc_idle_start(queue_t *q, mir_t *mir)
1297 {
1298 	ASSERT(MUTEX_HELD(&mir->mir_mutex));
1299 	ASSERT((q->q_flag & QREADR) == 0);
1300 	ASSERT(mir->mir_type == RPC_SERVER);
1301 	RPCLOG(16, "rpcmod: mir_svc_idle_start q 0x%p\n", (void *)q);
1302 
1303 	/*
1304 	 * Don't re-start idle timer if we are closing queues.
1305 	 */
1306 	if (mir->mir_closing) {
1307 		RPCLOG(16, "mir_svc_idle_start - closing: 0x%p\n",
1308 			(void *)q);
1309 
1310 		/*
1311 		 * We will call mir_svc_idle_start() whenever MIR_SVC_QUIESCED()
1312 		 * is true.  When it is true, and we are in the process of
1313 		 * closing the stream, signal any thread waiting in
1314 		 * mir_close().
1315 		 */
1316 		if (mir->mir_inwservice == 0)
1317 			cv_signal(&mir->mir_condvar);
1318 
1319 	} else {
1320 		RPCLOG(16, "mir_svc_idle_start - reset %s timer\n",
1321 			mir->mir_ordrel_pending ? "ordrel" : "normal");
1322 		/*
1323 		 * Normal condition, start the idle timer.  If an orderly
1324 		 * release has been sent, set the timeout to wait for the
1325 		 * client to close its side of the connection.  Otherwise,
1326 		 * use the normal idle timeout.
1327 		 */
1328 		mir_timer_start(q, mir, mir->mir_ordrel_pending ?
1329 		    svc_ordrel_timeout : mir->mir_idle_timeout);
1330 	}
1331 }
1332 
1333 /* ARGSUSED */
1334 static int
1335 mir_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1336 {
1337 	mir_t	*mir;
1338 
1339 	RPCLOG(32, "rpcmod: mir_open of q 0x%p\n", (void *)q);
1340 	/* Set variables used directly by KRPC. */
1341 	if (!mir_rele)
1342 		mir_rele = mir_svc_release;
1343 	if (!mir_start)
1344 		mir_start = mir_svc_start;
1345 	if (!clnt_stop_idle)
1346 		clnt_stop_idle = mir_clnt_idle_do_stop;
1347 	if (!clnt_max_msg_sizep)
1348 		clnt_max_msg_sizep = &clnt_max_msg_size;
1349 	if (!svc_max_msg_sizep)
1350 		svc_max_msg_sizep = &svc_max_msg_size;
1351 
1352 	/* Allocate a zero'ed out mir structure for this stream. */
1353 	mir = kmem_zalloc(sizeof (mir_t), KM_SLEEP);
1354 
1355 	/*
1356 	 * We set hold inbound here so that incoming messages will
1357 	 * be held on the read-side queue until the stream is completely
1358 	 * initialized with a RPC_CLIENT or RPC_SERVER ioctl.  During
1359 	 * the ioctl processing, the flag is cleared and any messages that
1360 	 * arrived between the open and the ioctl are delivered to KRPC.
1361 	 *
1362 	 * Early data should never arrive on a client stream since
1363 	 * servers only respond to our requests and we do not send any.
1364 	 * until after the stream is initialized.  Early data is
1365 	 * very common on a server stream where the client will start
1366 	 * sending data as soon as the connection is made (and this
1367 	 * is especially true with TCP where the protocol accepts the
1368 	 * connection before nfsd or KRPC is notified about it).
1369 	 */
1370 
1371 	mir->mir_hold_inbound = 1;
1372 
1373 	/*
1374 	 * Start the record marker looking for a 4-byte header.  When
1375 	 * this length is negative, it indicates that rpcmod is looking
1376 	 * for bytes to consume for the record marker header.  When it
1377 	 * is positive, it holds the number of bytes that have arrived
1378 	 * for the current fragment and are being held in mir_header_mp.
1379 	 */
1380 
1381 	mir->mir_frag_len = -(int32_t)sizeof (uint32_t);
1382 
1383 	mir->mir_zoneid = rpc_zoneid();
1384 	mutex_init(&mir->mir_mutex, NULL, MUTEX_DEFAULT, NULL);
1385 	cv_init(&mir->mir_condvar, NULL, CV_DRIVER, NULL);
1386 	cv_init(&mir->mir_timer_cv, NULL, CV_DRIVER, NULL);
1387 
1388 	q->q_ptr = (char *)mir;
1389 	WR(q)->q_ptr = (char *)mir;
1390 
1391 	/*
1392 	 * We noenable the read-side queue because we don't want it
1393 	 * automatically enabled by putq.  We enable it explicitly
1394 	 * in mir_wsrv when appropriate. (See additional comments on
1395 	 * flow control at the beginning of mir_rsrv.)
1396 	 */
1397 	noenable(q);
1398 
1399 	qprocson(q);
1400 	return (0);
1401 }
1402 
1403 /*
1404  * Read-side put routine for both the client and server side.  Does the
1405  * record marking for incoming RPC messages, and when complete, dispatches
1406  * the message to either the client or server.
1407  */
1408 static void
1409 mir_do_rput(queue_t *q, mblk_t *mp, int srv)
1410 {
1411 	mblk_t	*cont_mp;
1412 	int	excess;
1413 	int32_t	frag_len;
1414 	int32_t	frag_header;
1415 	mblk_t	*head_mp;
1416 	int	len;
1417 	mir_t	*mir;
1418 	mblk_t	*mp1;
1419 	unsigned char	*rptr;
1420 	mblk_t	*tail_mp;
1421 	unsigned char	*wptr;
1422 	boolean_t	stop_timer = B_FALSE;
1423 
1424 	mir = (mir_t *)q->q_ptr;
1425 	ASSERT(mir != NULL);
1426 
1427 	/*
1428 	 * If the stream has not been set up as a RPC_CLIENT or RPC_SERVER
1429 	 * with the corresponding ioctl, then don't accept
1430 	 * any inbound data.  This should never happen for streams
1431 	 * created by nfsd or client-side KRPC because they are careful
1432 	 * to set the mode of the stream before doing anything else.
1433 	 */
1434 	if (mir->mir_type == 0) {
1435 		freemsg(mp);
1436 		return;
1437 	}
1438 
1439 	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
1440 
1441 	switch (mp->b_datap->db_type) {
1442 	case M_DATA:
1443 		break;
1444 	case M_PROTO:
1445 	case M_PCPROTO:
1446 		rptr = mp->b_rptr;
1447 		if (mp->b_wptr - rptr < sizeof (uint32_t)) {
1448 			RPCLOG(1, "mir_rput: runt TPI message (%d bytes)\n",
1449 			    (int)(mp->b_wptr - rptr));
1450 			freemsg(mp);
1451 			return;
1452 		}
1453 		if (((union T_primitives *)rptr)->type != T_DATA_IND) {
1454 			mir_rput_proto(q, mp);
1455 			return;
1456 		}
1457 
1458 		/* Throw away the T_DATA_IND block and continue with data. */
1459 		mp1 = mp;
1460 		mp = mp->b_cont;
1461 		freeb(mp1);
1462 		break;
1463 	case M_SETOPTS:
1464 		/*
1465 		 * If a module on the stream is trying set the Stream head's
1466 		 * high water mark, then set our hiwater to the requested
1467 		 * value.  We are the "stream head" for all inbound
1468 		 * data messages since messages are passed directly to KRPC.
1469 		 */
1470 		if ((mp->b_wptr - mp->b_rptr) >= sizeof (struct stroptions)) {
1471 			struct stroptions	*stropts;
1472 
1473 			stropts = (struct stroptions *)mp->b_rptr;
1474 			if ((stropts->so_flags & SO_HIWAT) &&
1475 				!(stropts->so_flags & SO_BAND)) {
1476 				(void) strqset(q, QHIWAT, 0, stropts->so_hiwat);
1477 			}
1478 		}
1479 		putnext(q, mp);
1480 		return;
1481 	case M_FLUSH:
1482 		RPCLOG(32, "mir_do_rput: ignoring M_FLUSH on q 0x%p. ",
1483 		    (void *)q);
1484 		RPCLOG(32, "M_FLUSH is %x\n", (uint_t)*mp->b_rptr);
1485 
1486 		putnext(q, mp);
1487 		return;
1488 	default:
1489 		putnext(q, mp);
1490 		return;
1491 	}
1492 
1493 	mutex_enter(&mir->mir_mutex);
1494 
1495 	/*
1496 	 * If this connection is closing, don't accept any new messages.
1497 	 */
1498 	if (mir->mir_svc_no_more_msgs) {
1499 		ASSERT(mir->mir_type == RPC_SERVER);
1500 		mutex_exit(&mir->mir_mutex);
1501 		freemsg(mp);
1502 		return;
1503 	}
1504 
1505 	/* Get local copies for quicker access. */
1506 	frag_len = mir->mir_frag_len;
1507 	frag_header = mir->mir_frag_header;
1508 	head_mp = mir->mir_head_mp;
1509 	tail_mp = mir->mir_tail_mp;
1510 
1511 	/* Loop, processing each message block in the mp chain separately. */
1512 	do {
1513 		/*
1514 		 * cont_mp is used in the do/while condition below to
1515 		 * walk to the next block in the STREAMS message.
1516 		 * mp->b_cont may be nil'ed during processing so we
1517 		 * can't rely on it to find the next block.
1518 		 */
1519 		cont_mp = mp->b_cont;
1520 
1521 		/*
1522 		 * Get local copies of rptr and wptr for our processing.
1523 		 * These always point into "mp" (the current block being
1524 		 * processed), but rptr is updated as we consume any
1525 		 * record header in this message, and wptr is updated to
1526 		 * point to the end of the data for the current fragment,
1527 		 * if it ends in this block.  The main point is that
1528 		 * they are not always the same as b_rptr and b_wptr.
1529 		 * b_rptr and b_wptr will be updated when appropriate.
1530 		 */
1531 		rptr = mp->b_rptr;
1532 		wptr = mp->b_wptr;
1533 same_mblk:;
1534 		len = (int)(wptr - rptr);
1535 		if (len <= 0) {
1536 			/*
1537 			 * If we have processed all of the data in the message
1538 			 * or the block is empty to begin with, then we're
1539 			 * done with this block and can go on to cont_mp,
1540 			 * if there is one.
1541 			 *
1542 			 * First, we check to see if the current block is
1543 			 * now zero-length and, if so, we free it.
1544 			 * This happens when either the block was empty
1545 			 * to begin with or we consumed all of the data
1546 			 * for the record marking header.
1547 			 */
1548 			if (rptr <= mp->b_rptr) {
1549 				/*
1550 				 * If head_mp is non-NULL, add cont_mp to the
1551 				 * mblk list. XXX But there is a possibility
1552 				 * that tail_mp = mp or even head_mp = mp XXX
1553 				 */
1554 				if (head_mp) {
1555 					if (head_mp == mp)
1556 						head_mp = NULL;
1557 					else if (tail_mp != mp) {
1558 		ASSERT((tail_mp->b_cont == NULL) || (tail_mp->b_cont == mp));
1559 						tail_mp->b_cont = cont_mp;
1560 						/*
1561 						 * It's possible that, because
1562 						 * of a very short mblk (0-3
1563 						 * bytes), we've ended up here
1564 						 * and that cont_mp could be
1565 						 * NULL (if we're at the end
1566 						 * of an mblk chain). If so,
1567 						 * don't set tail_mp to
1568 						 * cont_mp, because the next
1569 						 * time we access it, we'll
1570 						 * dereference a NULL pointer
1571 						 * and crash. Just leave
1572 						 * tail_mp pointing at the
1573 						 * current end of chain.
1574 						 */
1575 						if (cont_mp)
1576 							tail_mp = cont_mp;
1577 					} else {
1578 						mblk_t *smp = head_mp;
1579 
1580 						while ((smp->b_cont != NULL) &&
1581 							(smp->b_cont != mp))
1582 							smp = smp->b_cont;
1583 						smp->b_cont = cont_mp;
1584 						/*
1585 						 * Don't set tail_mp to cont_mp
1586 						 * if it's NULL. Instead, set
1587 						 * tail_mp to smp, which is the
1588 						 * end of the chain starting
1589 						 * at head_mp.
1590 						 */
1591 						if (cont_mp)
1592 							tail_mp = cont_mp;
1593 						else
1594 							tail_mp = smp;
1595 					}
1596 				}
1597 				freeb(mp);
1598 			}
1599 			continue;
1600 		}
1601 
1602 		/*
1603 		 * frag_len starts at -4 and is incremented past the record
1604 		 * marking header to 0, and then becomes positive as real data
1605 		 * bytes are received for the message.  While frag_len is less
1606 		 * than zero, we need more bytes for the record marking
1607 		 * header.
1608 		 */
1609 		if (frag_len < 0) {
1610 			uchar_t	*up = rptr;
1611 			/*
1612 			 * Collect as many bytes as we need for the record
1613 			 * marking header and that are available in this block.
1614 			 */
1615 			do {
1616 				--len;
1617 				frag_len++;
1618 				frag_header <<= 8;
1619 				frag_header += (*up++ & 0xFF);
1620 			} while (len > 0 && frag_len < 0);
1621 
1622 			if (rptr == mp->b_rptr) {
1623 				/*
1624 				 * The record header is located at the
1625 				 * beginning of the block, so just walk
1626 				 * b_rptr past it.
1627 				 */
1628 				mp->b_rptr = rptr = up;
1629 			} else {
1630 				/*
1631 				 * The record header is located in the middle
1632 				 * of a block, so copy any remaining data up.
1633 				 * This happens when an RPC message is
1634 				 * fragmented into multiple pieces and
1635 				 * a middle (or end) fragment immediately
1636 				 * follows a previous fragment in the same
1637 				 * message block.
1638 				 */
1639 				wptr = &rptr[len];
1640 				mp->b_wptr = wptr;
1641 				if (len) {
1642 					RPCLOG(32, "mir_do_rput: copying %d "
1643 					    "bytes of data up", len);
1644 					RPCLOG(32, " db_ref %d\n",
1645 					    (uint_t)mp->b_datap->db_ref);
1646 					bcopy(up, rptr, len);
1647 				}
1648 			}
1649 
1650 			/*
1651 			 * If we haven't received the complete record header
1652 			 * yet, then loop around to get the next block in the
1653 			 * STREAMS message. The logic at same_mblk label will
1654 			 * free the current block if it has become empty.
1655 			 */
1656 			if (frag_len < 0) {
1657 				RPCLOG(32, "mir_do_rput: frag_len is still < 0 "
1658 				"(%d)", len);
1659 				goto same_mblk;
1660 			}
1661 
1662 #ifdef	RPCDEBUG
1663 			if ((frag_header & MIR_LASTFRAG) == 0) {
1664 				RPCLOG0(32, "mir_do_rput: multi-fragment "
1665 				    "record\n");
1666 			}
1667 			{
1668 				uint_t l = frag_header & ~MIR_LASTFRAG;
1669 
1670 				if (l != 0 && mir->mir_max_msg_sizep &&
1671 				    l >= *mir->mir_max_msg_sizep) {
1672 					RPCLOG(32, "mir_do_rput: fragment size"
1673 					    " (%d) > maximum", l);
1674 					RPCLOG(32, " (%u)\n",
1675 					    *mir->mir_max_msg_sizep);
1676 				}
1677 			}
1678 #endif
1679 			/*
1680 			 * At this point we have retrieved the complete record
1681 			 * header for this fragment.  If the current block is
1682 			 * empty, then we need to free it and walk to the next
1683 			 * block.
1684 			 */
1685 			if (mp->b_rptr >= wptr) {
1686 				/*
1687 				 * If this is not the last fragment or if we
1688 				 * have not received all the data for this
1689 				 * RPC message, then loop around to the next
1690 				 * block.
1691 				 */
1692 				if (!(frag_header & MIR_LASTFRAG) ||
1693 					(frag_len -
1694 					(frag_header & ~MIR_LASTFRAG)) ||
1695 					!head_mp)
1696 					goto same_mblk;
1697 
1698 				/*
1699 				 * Quick walk to next block in the
1700 				 * STREAMS message.
1701 				 */
1702 				freeb(mp);
1703 				continue;
1704 			}
1705 		}
1706 
1707 		/*
1708 		 * We've collected the complete record header.  The data
1709 		 * in the current block is added to the end of the RPC
1710 		 * message.  Note that tail_mp is the same as mp after
1711 		 * this linkage.
1712 		 */
1713 		if (!head_mp)
1714 			head_mp = mp;
1715 		else if (tail_mp != mp) {
1716 			ASSERT((tail_mp->b_cont == NULL) ||
1717 			    (tail_mp->b_cont == mp));
1718 			tail_mp->b_cont = mp;
1719 		}
1720 		tail_mp = mp;
1721 
1722 		/*
1723 		 * Add the length of this block to the accumulated
1724 		 * fragment length.
1725 		 */
1726 		frag_len += len;
1727 		excess = frag_len - (frag_header & ~MIR_LASTFRAG);
1728 		/*
1729 		 * If we have not received all the data for this fragment,
1730 		 * then walk to the next block.
1731 		 */
1732 		if (excess < 0)
1733 			continue;
1734 
1735 		/*
1736 		 * We've received a complete fragment, so reset frag_len
1737 		 * for the next one.
1738 		 */
1739 		frag_len = -(int32_t)sizeof (uint32_t);
1740 
1741 		/*
1742 		 * Update rptr to point to the beginning of the next
1743 		 * fragment in this block.  If there are no more bytes
1744 		 * in the block (excess is 0), then rptr will be equal
1745 		 * to wptr.
1746 		 */
1747 		rptr = wptr - excess;
1748 
1749 		/*
1750 		 * Now we check to see if this fragment is the last one in
1751 		 * the RPC message.
1752 		 */
1753 		if (!(frag_header & MIR_LASTFRAG)) {
1754 			/*
1755 			 * This isn't the last one, so start processing the
1756 			 * next fragment.
1757 			 */
1758 			frag_header = 0;
1759 
1760 			/*
1761 			 * If excess is 0, the next fragment
1762 			 * starts at the beginning of the next block --
1763 			 * we "continue" to the end of the while loop and
1764 			 * walk to cont_mp.
1765 			 */
1766 			if (excess == 0)
1767 				continue;
1768 			RPCLOG0(32, "mir_do_rput: multi-fragment message with "
1769 			    "two or more fragments in one mblk\n");
1770 
1771 			/*
1772 			 * If excess is non-0, then the next fragment starts
1773 			 * in this block.  rptr points to the beginning
1774 			 * of the next fragment and we "goto same_mblk"
1775 			 * to continue processing.
1776 			 */
1777 			goto same_mblk;
1778 		}
1779 
1780 		/*
1781 		 * We've got a complete RPC message.  Before passing it
1782 		 * upstream, check to see if there is extra data in this
1783 		 * message block. If so, then we separate the excess
1784 		 * from the complete message. The excess data is processed
1785 		 * after the current message goes upstream.
1786 		 */
1787 		if (excess > 0) {
1788 			RPCLOG(32, "mir_do_rput: end of record, but excess "
1789 			    "data (%d bytes) in this mblk. dupb/copyb "
1790 			    "needed\n", excess);
1791 
1792 			/* Duplicate only the overlapping block. */
1793 			mp1 = dupb(tail_mp);
1794 
1795 			/*
1796 			 * dupb() might have failed due to ref count wrap around
1797 			 * so try a copyb().
1798 			 */
1799 			if (mp1 == NULL)
1800 				mp1 = copyb(tail_mp);
1801 
1802 			/*
1803 			 * Do not use bufcall() to schedule a "buffer
1804 			 * availability event."  The reason is that
1805 			 * bufcall() has problems.  For example, if memory
1806 			 * runs out, bufcall() itself will fail since it
1807 			 * needs to allocate memory.  The most appropriate
1808 			 * action right now is to disconnect this connection
1809 			 * as the system is under stress.  We should try to
1810 			 * free up resources.
1811 			 */
1812 			if (mp1 == NULL) {
1813 				freemsg(head_mp);
1814 				RPCLOG0(1, "mir_do_rput: dupb/copyb failed\n");
1815 				mir->mir_frag_header = 0;
1816 				mir->mir_frag_len = -(int)sizeof (uint32_t);
1817 				mir->mir_head_mp = NULL;
1818 				mir->mir_tail_mp = NULL;
1819 
1820 				mir_disconnect(q, mir);
1821 				return;
1822 			}
1823 
1824 			/*
1825 			 * The new message block is linked with the
1826 			 * continuation block in cont_mp.  We then point
1827 			 * cont_mp to the new block so that we will
1828 			 * process it next.
1829 			 */
1830 			mp1->b_cont = cont_mp;
1831 			cont_mp = mp1;
1832 			/*
1833 			 * Data in the new block begins at the
1834 			 * next fragment (rptr).
1835 			 */
1836 			cont_mp->b_rptr += (rptr - tail_mp->b_rptr);
1837 			ASSERT(cont_mp->b_rptr >= cont_mp->b_datap->db_base);
1838 			ASSERT(cont_mp->b_rptr <= cont_mp->b_wptr);
1839 
1840 			/* Data in the current fragment ends at rptr. */
1841 			tail_mp->b_wptr = rptr;
1842 			ASSERT(tail_mp->b_wptr <= tail_mp->b_datap->db_lim);
1843 			ASSERT(tail_mp->b_wptr >= tail_mp->b_rptr);
1844 
1845 		}
1846 
1847 		/* tail_mp is the last block with data for this RPC message. */
1848 		tail_mp->b_cont = NULL;
1849 
1850 		/* Pass the RPC message to the current consumer. */
1851 		switch (mir->mir_type) {
1852 		case RPC_CLIENT:
1853 			if (clnt_dispatch_notify(head_mp, mir->mir_zoneid)) {
1854 				/*
1855 				 * Mark this stream as active.  This marker
1856 				 * is used in mir_timer().
1857 				 */
1858 
1859 				mir->mir_clntreq = 1;
1860 				mir->mir_use_timestamp = lbolt;
1861 			} else
1862 				freemsg(head_mp);
1863 			break;
1864 
1865 		case RPC_SERVER:
1866 			/*
1867 			 * Check for flow control before passing the
1868 			 * message to KRPC.
1869 			 */
1870 
1871 			if (!mir->mir_hold_inbound) {
1872 			    if (mir->mir_krpc_cell) {
1873 				/*
1874 				 * If the reference count is 0
1875 				 * (not including this request),
1876 				 * then the stream is transitioning
1877 				 * from idle to non-idle.  In this case,
1878 				 * we cancel the idle timer.
1879 				 */
1880 				if (mir->mir_ref_cnt++ == 0)
1881 					stop_timer = B_TRUE;
1882 				if (mir_check_len(q,
1883 					(int32_t)msgdsize(mp), mp))
1884 						return;
1885 				svc_queuereq(q, head_mp); /* to KRPC */
1886 			    } else {
1887 				/*
1888 				 * Count # of times this happens. Should be
1889 				 * never, but experience shows otherwise.
1890 				 */
1891 				mir_krpc_cell_null++;
1892 				freemsg(head_mp);
1893 			    }
1894 
1895 			} else {
1896 				/*
1897 				 * If the outbound side of the stream is
1898 				 * flow controlled, then hold this message
1899 				 * until client catches up. mir_hold_inbound
1900 				 * is set in mir_wput and cleared in mir_wsrv.
1901 				 */
1902 				if (srv)
1903 					(void) putbq(q, head_mp);
1904 				else
1905 					(void) putq(q, head_mp);
1906 				mir->mir_inrservice = B_TRUE;
1907 			}
1908 			break;
1909 		default:
1910 			RPCLOG(1, "mir_rput: unknown mir_type %d\n",
1911 				mir->mir_type);
1912 			freemsg(head_mp);
1913 			break;
1914 		}
1915 
1916 		/*
1917 		 * Reset head_mp and frag_header since we're starting on a
1918 		 * new RPC fragment and message.
1919 		 */
1920 		head_mp = NULL;
1921 		tail_mp = NULL;
1922 		frag_header = 0;
1923 	} while ((mp = cont_mp) != NULL);
1924 
1925 	/*
1926 	 * Do a sanity check on the message length.  If this message is
1927 	 * getting excessively large, shut down the connection.
1928 	 */
1929 	if (head_mp != NULL && mir->mir_setup_complete &&
1930 		mir_check_len(q, frag_len, head_mp))
1931 		return;
1932 
1933 	/* Save our local copies back in the mir structure. */
1934 	mir->mir_frag_header = frag_header;
1935 	mir->mir_frag_len = frag_len;
1936 	mir->mir_head_mp = head_mp;
1937 	mir->mir_tail_mp = tail_mp;
1938 
1939 	/*
1940 	 * The timer is stopped after the whole message chain is processed.
1941 	 * The reason is that stopping the timer releases the mir_mutex
1942 	 * lock temporarily.  This means that the request can be serviced
1943 	 * while we are still processing the message chain.  This is not
1944 	 * good.  So we stop the timer here instead.
1945 	 *
1946 	 * Note that if the timer fires before we stop it, it will not
1947 	 * do any harm as MIR_SVC_QUIESCED() is false and mir_timer()
1948 	 * will just return;
1949 	 */
1950 	if (stop_timer) {
1951 		RPCLOG(16, "mir_do_rput stopping idle timer on 0x%p because "
1952 		    "ref cnt going to non zero\n", (void *) WR(q));
1953 		mir_svc_idle_stop(WR(q), mir);
1954 	}
1955 	mutex_exit(&mir->mir_mutex);
1956 }
1957 
1958 static void
1959 mir_rput(queue_t *q, mblk_t *mp)
1960 {
1961 	mir_do_rput(q, mp, 0);
1962 }
1963 
1964 static void
1965 mir_rput_proto(queue_t *q, mblk_t *mp)
1966 {
1967 	mir_t	*mir = (mir_t *)q->q_ptr;
1968 	uint32_t	type;
1969 	uint32_t reason = 0;
1970 
1971 	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
1972 
1973 	type = ((union T_primitives *)mp->b_rptr)->type;
1974 	switch (mir->mir_type) {
1975 	case RPC_CLIENT:
1976 		switch (type) {
1977 		case T_DISCON_IND:
1978 		    reason =
1979 			((struct T_discon_ind *)(mp->b_rptr))->DISCON_reason;
1980 		    /*FALLTHROUGH*/
1981 		case T_ORDREL_IND:
1982 			mutex_enter(&mir->mir_mutex);
1983 			if (mir->mir_head_mp) {
1984 				freemsg(mir->mir_head_mp);
1985 				mir->mir_head_mp = (mblk_t *)0;
1986 				mir->mir_tail_mp = (mblk_t *)0;
1987 			}
1988 			/*
1989 			 * We are disconnecting, but not necessarily
1990 			 * closing. By not closing, we will fail to
1991 			 * pick up a possibly changed global timeout value,
1992 			 * unless we store it now.
1993 			 */
1994 			mir->mir_idle_timeout = clnt_idle_timeout;
1995 			mir_clnt_idle_stop(WR(q), mir);
1996 
1997 			/*
1998 			 * Even though we are unconnected, we still
1999 			 * leave the idle timer going on the client. The
2000 			 * reason for is that if we've disconnected due
2001 			 * to a server-side disconnect, reset, or connection
2002 			 * timeout, there is a possibility the client may
2003 			 * retry the RPC request. This retry needs to done on
2004 			 * the same bound address for the server to interpret
2005 			 * it as such. However, we don't want
2006 			 * to wait forever for that possibility. If the
2007 			 * end-point stays unconnected for mir_idle_timeout
2008 			 * units of time, then that is a signal to the
2009 			 * connection manager to give up waiting for the
2010 			 * application (eg. NFS) to send a retry.
2011 			 */
2012 			mir_clnt_idle_start(WR(q), mir);
2013 			mutex_exit(&mir->mir_mutex);
2014 			clnt_dispatch_notifyall(WR(q), type, reason);
2015 			freemsg(mp);
2016 			return;
2017 		case T_ERROR_ACK:
2018 		{
2019 			struct T_error_ack	*terror;
2020 
2021 			terror = (struct T_error_ack *)mp->b_rptr;
2022 			RPCLOG(1, "mir_rput_proto T_ERROR_ACK for queue 0x%p",
2023 				(void *)q);
2024 			RPCLOG(1, " ERROR_prim: %s,",
2025 				rpc_tpiprim2name(terror->ERROR_prim));
2026 			RPCLOG(1, " TLI_error: %s,",
2027 				rpc_tpierr2name(terror->TLI_error));
2028 			RPCLOG(1, " UNIX_error: %d\n", terror->UNIX_error);
2029 			if (terror->ERROR_prim == T_DISCON_REQ)  {
2030 				clnt_dispatch_notifyall(WR(q), type, reason);
2031 				freemsg(mp);
2032 				return;
2033 			} else {
2034 				if (clnt_dispatch_notifyconn(WR(q), mp))
2035 					return;
2036 			}
2037 			break;
2038 		}
2039 		case T_OK_ACK:
2040 		{
2041 			struct T_ok_ack	*tok = (struct T_ok_ack *)mp->b_rptr;
2042 
2043 			if (tok->CORRECT_prim == T_DISCON_REQ) {
2044 				clnt_dispatch_notifyall(WR(q), type, reason);
2045 				freemsg(mp);
2046 				return;
2047 			} else {
2048 				if (clnt_dispatch_notifyconn(WR(q), mp))
2049 					return;
2050 			}
2051 			break;
2052 		}
2053 		case T_CONN_CON:
2054 		case T_INFO_ACK:
2055 		case T_OPTMGMT_ACK:
2056 			if (clnt_dispatch_notifyconn(WR(q), mp))
2057 				return;
2058 			break;
2059 		case T_BIND_ACK:
2060 			break;
2061 		default:
2062 			RPCLOG(1, "mir_rput: unexpected message %d "
2063 			    "for KRPC client\n",
2064 			    ((union T_primitives *)mp->b_rptr)->type);
2065 			break;
2066 		}
2067 		break;
2068 
2069 	case RPC_SERVER:
2070 		switch (type) {
2071 		case T_BIND_ACK:
2072 		{
2073 			struct T_bind_ack	*tbind;
2074 
2075 			/*
2076 			 * If this is a listening stream, then shut
2077 			 * off the idle timer.
2078 			 */
2079 			tbind = (struct T_bind_ack *)mp->b_rptr;
2080 			if (tbind->CONIND_number > 0) {
2081 				mutex_enter(&mir->mir_mutex);
2082 				mir_svc_idle_stop(WR(q), mir);
2083 
2084 				/*
2085 				 * mark this as a listen endpoint
2086 				 * for special handling.
2087 				 */
2088 
2089 				mir->mir_listen_stream = 1;
2090 				mutex_exit(&mir->mir_mutex);
2091 			}
2092 			break;
2093 		}
2094 		case T_DISCON_IND:
2095 		case T_ORDREL_IND:
2096 			RPCLOG(16, "mir_rput_proto: got %s indication\n",
2097 				type == T_DISCON_IND ? "disconnect"
2098 				: "orderly release");
2099 
2100 			/*
2101 			 * For listen endpoint just pass
2102 			 * on the message.
2103 			 */
2104 
2105 			if (mir->mir_listen_stream)
2106 				break;
2107 
2108 			mutex_enter(&mir->mir_mutex);
2109 
2110 			/*
2111 			 * If client wants to break off connection, record
2112 			 * that fact.
2113 			 */
2114 			mir_svc_start_close(WR(q), mir);
2115 
2116 			/*
2117 			 * If we are idle, then send the orderly release
2118 			 * or disconnect indication to nfsd.
2119 			 */
2120 			if (MIR_SVC_QUIESCED(mir)) {
2121 				mutex_exit(&mir->mir_mutex);
2122 				break;
2123 			}
2124 
2125 			RPCLOG(16, "mir_rput_proto: not idle, so "
2126 				"disconnect/ord rel indication not passed "
2127 				"upstream on 0x%p\n", (void *)q);
2128 
2129 			/*
2130 			 * Hold the indication until we get idle
2131 			 * If there already is an indication stored,
2132 			 * replace it if the new one is a disconnect. The
2133 			 * reasoning is that disconnection takes less time
2134 			 * to process, and once a client decides to
2135 			 * disconnect, we should do that.
2136 			 */
2137 			if (mir->mir_svc_pend_mp) {
2138 				if (type == T_DISCON_IND) {
2139 					RPCLOG(16, "mir_rput_proto: replacing"
2140 					    " held disconnect/ord rel"
2141 					    " indication with disconnect on"
2142 					    " 0x%p\n", (void *)q);
2143 
2144 					freemsg(mir->mir_svc_pend_mp);
2145 					mir->mir_svc_pend_mp = mp;
2146 				} else {
2147 					RPCLOG(16, "mir_rput_proto: already "
2148 					    "held a disconnect/ord rel "
2149 					    "indication. freeing ord rel "
2150 					    "ind on 0x%p\n", (void *)q);
2151 					freemsg(mp);
2152 				}
2153 			} else
2154 				mir->mir_svc_pend_mp = mp;
2155 
2156 			mutex_exit(&mir->mir_mutex);
2157 			return;
2158 
2159 		default:
2160 			/* nfsd handles server-side non-data messages. */
2161 			break;
2162 		}
2163 		break;
2164 
2165 	default:
2166 		break;
2167 	}
2168 
2169 	putnext(q, mp);
2170 }
2171 
2172 /*
2173  * The server-side read queues are used to hold inbound messages while
2174  * outbound flow control is exerted.  When outbound flow control is
2175  * relieved, mir_wsrv qenables the read-side queue.  Read-side queues
2176  * are not enabled by STREAMS and are explicitly noenable'ed in mir_open.
2177  *
2178  * For the server side,  we have two types of messages queued. The first type
2179  * are messages that are ready to be XDR decoded and and then sent to the
2180  * RPC program's dispatch routine. The second type are "raw" messages that
2181  * haven't been processed, i.e. assembled from rpc record fragements into
2182  * full requests. The only time we will see the second type of message
2183  * queued is if we have a memory allocation failure while processing a
2184  * a raw message. The field mir_first_non_processed_mblk will mark the
2185  * first such raw message. So the flow for server side is:
2186  *
2187  *	- send processed queued messages to kRPC until we run out or find
2188  *	  one that needs additional processing because we were short on memory
2189  *	  earlier
2190  *	- process a message that was deferred because of lack of
2191  *	  memory
2192  *	- continue processing messages until the queue empties or we
2193  *	  have to stop because of lack of memory
2194  *	- during each of the above phase, if the queue is empty and
2195  *	  there are no pending messages that were passed to the RPC
2196  *	  layer, send upstream the pending disconnect/ordrel indication if
2197  *	  there is one
2198  *
2199  * The read-side queue is also enabled by a bufcall callback if dupmsg
2200  * fails in mir_rput.
2201  */
2202 static void
2203 mir_rsrv(queue_t *q)
2204 {
2205 	mir_t	*mir;
2206 	mblk_t	*mp;
2207 	mblk_t	*cmp = NULL;
2208 	boolean_t stop_timer = B_FALSE;
2209 
2210 	mir = (mir_t *)q->q_ptr;
2211 	mutex_enter(&mir->mir_mutex);
2212 
2213 	mp = NULL;
2214 	switch (mir->mir_type) {
2215 	case RPC_SERVER:
2216 		if (mir->mir_ref_cnt == 0)
2217 			mir->mir_hold_inbound = 0;
2218 		if (mir->mir_hold_inbound) {
2219 
2220 			ASSERT(cmp == NULL);
2221 			if (q->q_first == NULL) {
2222 
2223 				MIR_CLEAR_INRSRV(mir);
2224 
2225 				if (MIR_SVC_QUIESCED(mir)) {
2226 					cmp = mir->mir_svc_pend_mp;
2227 					mir->mir_svc_pend_mp = NULL;
2228 				}
2229 			}
2230 
2231 			mutex_exit(&mir->mir_mutex);
2232 
2233 			if (cmp != NULL) {
2234 				RPCLOG(16, "mir_rsrv: line %d: sending a held "
2235 				    "disconnect/ord rel indication upstream\n",
2236 				    __LINE__);
2237 				putnext(q, cmp);
2238 			}
2239 
2240 			return;
2241 		}
2242 		while (mp = getq(q)) {
2243 			if (mir->mir_krpc_cell) {
2244 				/*
2245 				 * If we were idle, turn off idle timer since
2246 				 * we aren't idle any more.
2247 				 */
2248 				if (mir->mir_ref_cnt++ == 0)
2249 					stop_timer = B_TRUE;
2250 				if (mir_check_len(q,
2251 					(int32_t)msgdsize(mp), mp))
2252 						return;
2253 				svc_queuereq(q, mp);
2254 			} else {
2255 				/*
2256 				 * Count # of times this happens. Should be
2257 				 * never, but experience shows otherwise.
2258 				 */
2259 				mir_krpc_cell_null++;
2260 				freemsg(mp);
2261 			}
2262 		}
2263 		break;
2264 	case RPC_CLIENT:
2265 		break;
2266 	default:
2267 		RPCLOG(1, "mir_rsrv: unexpected mir_type %d\n", mir->mir_type);
2268 
2269 		if (q->q_first == NULL)
2270 			MIR_CLEAR_INRSRV(mir);
2271 
2272 		mutex_exit(&mir->mir_mutex);
2273 
2274 		return;
2275 	}
2276 
2277 	/*
2278 	 * The timer is stopped after all the messages are processed.
2279 	 * The reason is that stopping the timer releases the mir_mutex
2280 	 * lock temporarily.  This means that the request can be serviced
2281 	 * while we are still processing the message queue.  This is not
2282 	 * good.  So we stop the timer here instead.
2283 	 */
2284 	if (stop_timer)  {
2285 		RPCLOG(16, "mir_rsrv stopping idle timer on 0x%p because ref "
2286 		    "cnt going to non zero\n", (void *)WR(q));
2287 		mir_svc_idle_stop(WR(q), mir);
2288 	}
2289 
2290 	if (q->q_first == NULL) {
2291 
2292 		MIR_CLEAR_INRSRV(mir);
2293 
2294 		ASSERT(cmp == NULL);
2295 		if (mir->mir_type == RPC_SERVER && MIR_SVC_QUIESCED(mir)) {
2296 			cmp = mir->mir_svc_pend_mp;
2297 			mir->mir_svc_pend_mp = NULL;
2298 		}
2299 
2300 		mutex_exit(&mir->mir_mutex);
2301 
2302 		if (cmp != NULL) {
2303 			RPCLOG(16, "mir_rsrv: line %d: sending a held "
2304 				"disconnect/ord rel indication upstream\n",
2305 				__LINE__);
2306 			putnext(q, cmp);
2307 		}
2308 
2309 		return;
2310 	}
2311 	mutex_exit(&mir->mir_mutex);
2312 }
2313 
2314 static int mir_svc_policy_fails;
2315 
2316 /*
2317  * Called to send an event code to nfsd/lockd so that it initiates
2318  * connection close.
2319  */
2320 static int
2321 mir_svc_policy_notify(queue_t *q, int event)
2322 {
2323 	mblk_t	*mp;
2324 #ifdef DEBUG
2325 	mir_t *mir = (mir_t *)q->q_ptr;
2326 	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
2327 #endif
2328 	ASSERT(q->q_flag & QREADR);
2329 
2330 	/*
2331 	 * Create an M_DATA message with the event code and pass it to the
2332 	 * Stream head (nfsd or whoever created the stream will consume it).
2333 	 */
2334 	mp = allocb(sizeof (int), BPRI_HI);
2335 
2336 	if (!mp) {
2337 
2338 		mir_svc_policy_fails++;
2339 		RPCLOG(16, "mir_svc_policy_notify: could not allocate event "
2340 			"%d\n", event);
2341 		return (ENOMEM);
2342 	}
2343 
2344 	U32_TO_BE32(event, mp->b_rptr);
2345 	mp->b_wptr = mp->b_rptr + sizeof (int);
2346 	putnext(q, mp);
2347 	return (0);
2348 }
2349 
2350 /*
2351  * Server side: start the close phase. We want to get this rpcmod slot in an
2352  * idle state before mir_close() is called.
2353  */
2354 static void
2355 mir_svc_start_close(queue_t *wq, mir_t *mir)
2356 {
2357 	ASSERT(MUTEX_HELD(&mir->mir_mutex));
2358 	ASSERT((wq->q_flag & QREADR) == 0);
2359 	ASSERT(mir->mir_type == RPC_SERVER);
2360 
2361 
2362 	/*
2363 	 * Do not accept any more messages.
2364 	 */
2365 	mir->mir_svc_no_more_msgs = 1;
2366 
2367 	/*
2368 	 * Next two statements will make the read service procedure invoke
2369 	 * svc_queuereq() on everything stuck in the streams read queue.
2370 	 * It's not necessary because enabling the write queue will
2371 	 * have the same effect, but why not speed the process along?
2372 	 */
2373 	mir->mir_hold_inbound = 0;
2374 	qenable(RD(wq));
2375 
2376 	/*
2377 	 * Meanwhile force the write service procedure to send the
2378 	 * responses downstream, regardless of flow control.
2379 	 */
2380 	qenable(wq);
2381 }
2382 
2383 /*
2384  * This routine is called directly by KRPC after a request is completed,
2385  * whether a reply was sent or the request was dropped.
2386  */
2387 static void
2388 mir_svc_release(queue_t *wq, mblk_t *mp)
2389 {
2390 	mir_t   *mir = (mir_t *)wq->q_ptr;
2391 	mblk_t	*cmp = NULL;
2392 
2393 	ASSERT((wq->q_flag & QREADR) == 0);
2394 	if (mp)
2395 		freemsg(mp);
2396 
2397 	mutex_enter(&mir->mir_mutex);
2398 
2399 	/*
2400 	 * Start idle processing if this is the last reference.
2401 	 */
2402 	if ((mir->mir_ref_cnt == 1) && (mir->mir_inrservice == 0)) {
2403 
2404 		RPCLOG(16, "mir_svc_release starting idle timer on 0x%p "
2405 		    "because ref cnt is zero\n", (void *) wq);
2406 
2407 		cmp = mir->mir_svc_pend_mp;
2408 		mir->mir_svc_pend_mp = NULL;
2409 		mir_svc_idle_start(wq, mir);
2410 	}
2411 
2412 	mir->mir_ref_cnt--;
2413 	ASSERT(mir->mir_ref_cnt >= 0);
2414 
2415 	/*
2416 	 * Wake up the thread waiting to close.
2417 	 */
2418 
2419 	if ((mir->mir_ref_cnt == 0) && mir->mir_closing)
2420 		cv_signal(&mir->mir_condvar);
2421 
2422 	mutex_exit(&mir->mir_mutex);
2423 
2424 	if (cmp) {
2425 		RPCLOG(16, "mir_svc_release: sending a held "
2426 		    "disconnect/ord rel indication upstream on queue 0x%p\n",
2427 		    (void *)RD(wq));
2428 
2429 		putnext(RD(wq), cmp);
2430 	}
2431 }
2432 
2433 /*
2434  * This routine is called by server-side KRPC when it is ready to
2435  * handle inbound messages on the stream.
2436  */
2437 static void
2438 mir_svc_start(queue_t *wq)
2439 {
2440 	mir_t   *mir = (mir_t *)wq->q_ptr;
2441 
2442 	/*
2443 	 * no longer need to take the mir_mutex because the
2444 	 * mir_setup_complete field has been moved out of
2445 	 * the binary field protected by the mir_mutex.
2446 	 */
2447 
2448 	mir->mir_setup_complete = 1;
2449 	qenable(RD(wq));
2450 }
2451 
2452 /*
2453  * client side wrapper for stopping timer with normal idle timeout.
2454  */
2455 static void
2456 mir_clnt_idle_stop(queue_t *wq, mir_t *mir)
2457 {
2458 	ASSERT(MUTEX_HELD(&mir->mir_mutex));
2459 	ASSERT((wq->q_flag & QREADR) == 0);
2460 	ASSERT(mir->mir_type == RPC_CLIENT);
2461 
2462 	mir_timer_stop(mir);
2463 }
2464 
2465 /*
2466  * client side wrapper for stopping timer with normal idle timeout.
2467  */
2468 static void
2469 mir_clnt_idle_start(queue_t *wq, mir_t *mir)
2470 {
2471 	ASSERT(MUTEX_HELD(&mir->mir_mutex));
2472 	ASSERT((wq->q_flag & QREADR) == 0);
2473 	ASSERT(mir->mir_type == RPC_CLIENT);
2474 
2475 	mir_timer_start(wq, mir, mir->mir_idle_timeout);
2476 }
2477 
2478 /*
2479  * client side only. Forces rpcmod to stop sending T_ORDREL_REQs on
2480  * end-points that aren't connected.
2481  */
2482 static void
2483 mir_clnt_idle_do_stop(queue_t *wq)
2484 {
2485 	mir_t   *mir = (mir_t *)wq->q_ptr;
2486 
2487 	RPCLOG(1, "mir_clnt_idle_do_stop: wq 0x%p\n", (void *)wq);
2488 	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
2489 	mutex_enter(&mir->mir_mutex);
2490 	mir_clnt_idle_stop(wq, mir);
2491 	mutex_exit(&mir->mir_mutex);
2492 }
2493 
2494 /*
2495  * Timer handler.  It handles idle timeout and memory shortage problem.
2496  */
2497 static void
2498 mir_timer(void *arg)
2499 {
2500 	queue_t *wq = (queue_t *)arg;
2501 	mir_t *mir = (mir_t *)wq->q_ptr;
2502 	boolean_t notify;
2503 
2504 	mutex_enter(&mir->mir_mutex);
2505 
2506 	/*
2507 	 * mir_timer_call is set only when either mir_timer_[start|stop]
2508 	 * is progressing.  And mir_timer() can only be run while they
2509 	 * are progressing if the timer is being stopped.  So just
2510 	 * return.
2511 	 */
2512 	if (mir->mir_timer_call) {
2513 		mutex_exit(&mir->mir_mutex);
2514 		return;
2515 	}
2516 	mir->mir_timer_id = 0;
2517 
2518 	switch (mir->mir_type) {
2519 	case RPC_CLIENT:
2520 
2521 		/*
2522 		 * For clients, the timer fires at clnt_idle_timeout
2523 		 * intervals.  If the activity marker (mir_clntreq) is
2524 		 * zero, then the stream has been idle since the last
2525 		 * timer event and we notify KRPC.  If mir_clntreq is
2526 		 * non-zero, then the stream is active and we just
2527 		 * restart the timer for another interval.  mir_clntreq
2528 		 * is set to 1 in mir_wput for every request passed
2529 		 * downstream.
2530 		 *
2531 		 * If this was a memory shortage timer reset the idle
2532 		 * timeout regardless; the mir_clntreq will not be a
2533 		 * valid indicator.
2534 		 *
2535 		 * The timer is initially started in mir_wput during
2536 		 * RPC_CLIENT ioctl processing.
2537 		 *
2538 		 * The timer interval can be changed for individual
2539 		 * streams with the ND variable "mir_idle_timeout".
2540 		 */
2541 		if (mir->mir_clntreq > 0 && mir->mir_use_timestamp +
2542 		    MSEC_TO_TICK(mir->mir_idle_timeout) - lbolt >= 0) {
2543 			clock_t tout;
2544 
2545 			tout = mir->mir_idle_timeout -
2546 				TICK_TO_MSEC(lbolt - mir->mir_use_timestamp);
2547 			if (tout < 0)
2548 				tout = 1000;
2549 #if 0
2550 printf("mir_timer[%d < %d + %d]: reset client timer to %d (ms)\n",
2551 TICK_TO_MSEC(lbolt), TICK_TO_MSEC(mir->mir_use_timestamp),
2552 mir->mir_idle_timeout, tout);
2553 #endif
2554 			mir->mir_clntreq = 0;
2555 			mir_timer_start(wq, mir, tout);
2556 			mutex_exit(&mir->mir_mutex);
2557 			return;
2558 		}
2559 #if 0
2560 printf("mir_timer[%d]: doing client timeout\n", lbolt / hz);
2561 #endif
2562 		/*
2563 		 * We are disconnecting, but not necessarily
2564 		 * closing. By not closing, we will fail to
2565 		 * pick up a possibly changed global timeout value,
2566 		 * unless we store it now.
2567 		 */
2568 		mir->mir_idle_timeout = clnt_idle_timeout;
2569 		mir_clnt_idle_start(wq, mir);
2570 
2571 		mutex_exit(&mir->mir_mutex);
2572 		/*
2573 		 * We pass T_ORDREL_REQ as an integer value
2574 		 * to KRPC as the indication that the stream
2575 		 * is idle.  This is not a T_ORDREL_REQ message,
2576 		 * it is just a convenient value since we call
2577 		 * the same KRPC routine for T_ORDREL_INDs and
2578 		 * T_DISCON_INDs.
2579 		 */
2580 		clnt_dispatch_notifyall(wq, T_ORDREL_REQ, 0);
2581 		return;
2582 
2583 	case RPC_SERVER:
2584 
2585 		/*
2586 		 * For servers, the timer is only running when the stream
2587 		 * is really idle or memory is short.  The timer is started
2588 		 * by mir_wput when mir_type is set to RPC_SERVER and
2589 		 * by mir_svc_idle_start whenever the stream goes idle
2590 		 * (mir_ref_cnt == 0).  The timer is cancelled in
2591 		 * mir_rput whenever a new inbound request is passed to KRPC
2592 		 * and the stream was previously idle.
2593 		 *
2594 		 * The timer interval can be changed for individual
2595 		 * streams with the ND variable "mir_idle_timeout".
2596 		 *
2597 		 * If the stream is not idle do nothing.
2598 		 */
2599 		if (!MIR_SVC_QUIESCED(mir)) {
2600 			mutex_exit(&mir->mir_mutex);
2601 			return;
2602 		}
2603 
2604 		notify = !mir->mir_inrservice;
2605 		mutex_exit(&mir->mir_mutex);
2606 
2607 		/*
2608 		 * If there is no packet queued up in read queue, the stream
2609 		 * is really idle so notify nfsd to close it.
2610 		 */
2611 		if (notify) {
2612 			RPCLOG(16, "mir_timer: telling stream head listener "
2613 			    "to close stream (0x%p)\n", (void *) RD(wq));
2614 			(void) mir_svc_policy_notify(RD(wq), 1);
2615 		}
2616 		return;
2617 	default:
2618 		RPCLOG(1, "mir_timer: unexpected mir_type %d\n",
2619 			mir->mir_type);
2620 		mutex_exit(&mir->mir_mutex);
2621 		return;
2622 	}
2623 }
2624 
2625 /*
2626  * Called by the RPC package to send either a call or a return, or a
2627  * transport connection request.  Adds the record marking header.
2628  */
2629 static void
2630 mir_wput(queue_t *q, mblk_t *mp)
2631 {
2632 	uint_t	frag_header;
2633 	mir_t	*mir = (mir_t *)q->q_ptr;
2634 	uchar_t	*rptr = mp->b_rptr;
2635 
2636 	if (!mir) {
2637 		freemsg(mp);
2638 		return;
2639 	}
2640 
2641 	if (mp->b_datap->db_type != M_DATA) {
2642 		mir_wput_other(q, mp);
2643 		return;
2644 	}
2645 
2646 	if (mir->mir_ordrel_pending == 1) {
2647 		freemsg(mp);
2648 		RPCLOG(16, "mir_wput wq 0x%p: got data after T_ORDREL_REQ\n",
2649 			(void *)q);
2650 		return;
2651 	}
2652 
2653 	frag_header = (uint_t)DLEN(mp);
2654 	frag_header |= MIR_LASTFRAG;
2655 
2656 	/* Stick in the 4 byte record marking header. */
2657 	if ((rptr - mp->b_datap->db_base) < sizeof (uint32_t) ||
2658 	    !IS_P2ALIGNED(mp->b_rptr, sizeof (uint32_t))) {
2659 		/*
2660 		 * Since we know that M_DATA messages are created exclusively
2661 		 * by KRPC, we expect that KRPC will leave room for our header
2662 		 * and 4 byte align which is normal for XDR.
2663 		 * If KRPC (or someone else) does not cooperate, then we
2664 		 * just throw away the message.
2665 		 */
2666 		RPCLOG(1, "mir_wput: KRPC did not leave space for record "
2667 		    "fragment header (%d bytes left)\n",
2668 		    (int)(rptr - mp->b_datap->db_base));
2669 		freemsg(mp);
2670 		return;
2671 	}
2672 	rptr -= sizeof (uint32_t);
2673 	*(uint32_t *)rptr = htonl(frag_header);
2674 	mp->b_rptr = rptr;
2675 
2676 	mutex_enter(&mir->mir_mutex);
2677 	if (mir->mir_type == RPC_CLIENT) {
2678 		/*
2679 		 * For the client, set mir_clntreq to indicate that the
2680 		 * connection is active.
2681 		 */
2682 		mir->mir_clntreq = 1;
2683 		mir->mir_use_timestamp = lbolt;
2684 	}
2685 
2686 	/*
2687 	 * If we haven't already queued some data and the downstream module
2688 	 * can accept more data, send it on, otherwise we queue the message
2689 	 * and take other actions depending on mir_type.
2690 	 */
2691 	if (!mir->mir_inwservice && MIR_WCANPUTNEXT(mir, q)) {
2692 		mutex_exit(&mir->mir_mutex);
2693 
2694 		/*
2695 		 * Now we pass the RPC message downstream.
2696 		 */
2697 		putnext(q, mp);
2698 		return;
2699 	}
2700 
2701 	switch (mir->mir_type) {
2702 	case RPC_CLIENT:
2703 		/*
2704 		 * Check for a previous duplicate request on the
2705 		 * queue.  If there is one, then we throw away
2706 		 * the current message and let the previous one
2707 		 * go through.  If we can't find a duplicate, then
2708 		 * send this one.  This tap dance is an effort
2709 		 * to reduce traffic and processing requirements
2710 		 * under load conditions.
2711 		 */
2712 		if (mir_clnt_dup_request(q, mp)) {
2713 			mutex_exit(&mir->mir_mutex);
2714 			freemsg(mp);
2715 			return;
2716 		}
2717 		break;
2718 	case RPC_SERVER:
2719 		/*
2720 		 * Set mir_hold_inbound so that new inbound RPC
2721 		 * messages will be held until the client catches
2722 		 * up on the earlier replies.  This flag is cleared
2723 		 * in mir_wsrv after flow control is relieved;
2724 		 * the read-side queue is also enabled at that time.
2725 		 */
2726 		mir->mir_hold_inbound = 1;
2727 		break;
2728 	default:
2729 		RPCLOG(1, "mir_wput: unexpected mir_type %d\n", mir->mir_type);
2730 		break;
2731 	}
2732 	mir->mir_inwservice = 1;
2733 	(void) putq(q, mp);
2734 	mutex_exit(&mir->mir_mutex);
2735 }
2736 
2737 static void
2738 mir_wput_other(queue_t *q, mblk_t *mp)
2739 {
2740 	mir_t	*mir = (mir_t *)q->q_ptr;
2741 	struct iocblk	*iocp;
2742 	uchar_t	*rptr = mp->b_rptr;
2743 	bool_t	flush_in_svc = FALSE;
2744 
2745 	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
2746 	switch (mp->b_datap->db_type) {
2747 	case M_IOCTL:
2748 		iocp = (struct iocblk *)rptr;
2749 		switch (iocp->ioc_cmd) {
2750 		case RPC_CLIENT:
2751 			mutex_enter(&mir->mir_mutex);
2752 			if (mir->mir_type != 0 &&
2753 			    mir->mir_type != iocp->ioc_cmd) {
2754 ioc_eperm:
2755 				mutex_exit(&mir->mir_mutex);
2756 				iocp->ioc_error = EPERM;
2757 				iocp->ioc_count = 0;
2758 				mp->b_datap->db_type = M_IOCACK;
2759 				qreply(q, mp);
2760 				return;
2761 			}
2762 
2763 			mir->mir_type = iocp->ioc_cmd;
2764 
2765 			/*
2766 			 * Clear mir_hold_inbound which was set to 1 by
2767 			 * mir_open.  This flag is not used on client
2768 			 * streams.
2769 			 */
2770 			mir->mir_hold_inbound = 0;
2771 			mir->mir_max_msg_sizep = &clnt_max_msg_size;
2772 
2773 			/*
2774 			 * Start the idle timer.  See mir_timer() for more
2775 			 * information on how client timers work.
2776 			 */
2777 			mir->mir_idle_timeout = clnt_idle_timeout;
2778 			mir_clnt_idle_start(q, mir);
2779 			mutex_exit(&mir->mir_mutex);
2780 
2781 			mp->b_datap->db_type = M_IOCACK;
2782 			qreply(q, mp);
2783 			return;
2784 		case RPC_SERVER:
2785 			mutex_enter(&mir->mir_mutex);
2786 			if (mir->mir_type != 0 &&
2787 			    mir->mir_type != iocp->ioc_cmd)
2788 				goto ioc_eperm;
2789 
2790 			/*
2791 			 * We don't clear mir_hold_inbound here because
2792 			 * mir_hold_inbound is used in the flow control
2793 			 * model. If we cleared it here, then we'd commit
2794 			 * a small violation to the model where the transport
2795 			 * might immediately block downstream flow.
2796 			 */
2797 
2798 			mir->mir_type = iocp->ioc_cmd;
2799 			mir->mir_max_msg_sizep = &svc_max_msg_size;
2800 
2801 			/*
2802 			 * Start the idle timer.  See mir_timer() for more
2803 			 * information on how server timers work.
2804 			 *
2805 			 * Note that it is important to start the idle timer
2806 			 * here so that connections time out even if we
2807 			 * never receive any data on them.
2808 			 */
2809 			mir->mir_idle_timeout = svc_idle_timeout;
2810 			RPCLOG(16, "mir_wput_other starting idle timer on 0x%p "
2811 			    "because we got RPC_SERVER ioctl\n", (void *)q);
2812 			mir_svc_idle_start(q, mir);
2813 			mutex_exit(&mir->mir_mutex);
2814 
2815 			mp->b_datap->db_type = M_IOCACK;
2816 			qreply(q, mp);
2817 			return;
2818 		default:
2819 			break;
2820 		}
2821 		break;
2822 
2823 	case M_PROTO:
2824 		if (mir->mir_type == RPC_CLIENT) {
2825 			/*
2826 			 * We are likely being called from the context of a
2827 			 * service procedure. So we need to enqueue. However
2828 			 * enqueing may put our message behind data messages.
2829 			 * So flush the data first.
2830 			 */
2831 			flush_in_svc = TRUE;
2832 		}
2833 		if ((mp->b_wptr - rptr) < sizeof (uint32_t) ||
2834 				!IS_P2ALIGNED(rptr, sizeof (uint32_t)))
2835 			break;
2836 
2837 		switch (((union T_primitives *)rptr)->type) {
2838 		case T_DATA_REQ:
2839 			/* Don't pass T_DATA_REQ messages downstream. */
2840 			freemsg(mp);
2841 			return;
2842 		case T_ORDREL_REQ:
2843 			RPCLOG(8, "mir_wput_other wq 0x%p: got T_ORDREL_REQ\n",
2844 			    (void *)q);
2845 			mutex_enter(&mir->mir_mutex);
2846 			if (mir->mir_type != RPC_SERVER) {
2847 				/*
2848 				 * We are likely being called from
2849 				 * clnt_dispatch_notifyall(). Sending
2850 				 * a T_ORDREL_REQ will result in
2851 				 * a some kind of _IND message being sent,
2852 				 * will be another call to
2853 				 * clnt_dispatch_notifyall(). To keep the stack
2854 				 * lean, queue this message.
2855 				 */
2856 				mir->mir_inwservice = 1;
2857 				(void) putq(q, mp);
2858 				mutex_exit(&mir->mir_mutex);
2859 				return;
2860 			}
2861 
2862 			/*
2863 			 * Mark the structure such that we don't accept any
2864 			 * more requests from client. We could defer this
2865 			 * until we actually send the orderly release
2866 			 * request downstream, but all that does is delay
2867 			 * the closing of this stream.
2868 			 */
2869 			RPCLOG(16, "mir_wput_other wq 0x%p: got T_ORDREL_REQ "
2870 			    " so calling mir_svc_start_close\n", (void *)q);
2871 
2872 			mir_svc_start_close(q, mir);
2873 
2874 			/*
2875 			 * If we have sent down a T_ORDREL_REQ, don't send
2876 			 * any more.
2877 			 */
2878 			if (mir->mir_ordrel_pending) {
2879 				freemsg(mp);
2880 				mutex_exit(&mir->mir_mutex);
2881 				return;
2882 			}
2883 
2884 			/*
2885 			 * If the stream is not idle, then we hold the
2886 			 * orderly release until it becomes idle.  This
2887 			 * ensures that KRPC will be able to reply to
2888 			 * all requests that we have passed to it.
2889 			 *
2890 			 * We also queue the request if there is data already
2891 			 * queued, because we cannot allow the T_ORDREL_REQ
2892 			 * to go before data. When we had a separate reply
2893 			 * count, this was not a problem, because the
2894 			 * reply count was reconciled when mir_wsrv()
2895 			 * completed.
2896 			 */
2897 			if (!MIR_SVC_QUIESCED(mir) ||
2898 			    mir->mir_inwservice == 1) {
2899 				mir->mir_inwservice = 1;
2900 				(void) putq(q, mp);
2901 
2902 				RPCLOG(16, "mir_wput_other: queuing "
2903 				    "T_ORDREL_REQ on 0x%p\n", (void *)q);
2904 
2905 				mutex_exit(&mir->mir_mutex);
2906 				return;
2907 			}
2908 
2909 			/*
2910 			 * Mark the structure so that we know we sent
2911 			 * an orderly release request, and reset the idle timer.
2912 			 */
2913 			mir->mir_ordrel_pending = 1;
2914 
2915 			RPCLOG(16, "mir_wput_other: calling mir_svc_idle_start"
2916 			    " on 0x%p because we got T_ORDREL_REQ\n",
2917 			    (void *)q);
2918 
2919 			mir_svc_idle_start(q, mir);
2920 			mutex_exit(&mir->mir_mutex);
2921 
2922 			/*
2923 			 * When we break, we will putnext the T_ORDREL_REQ.
2924 			 */
2925 			break;
2926 
2927 		case T_CONN_REQ:
2928 			mutex_enter(&mir->mir_mutex);
2929 			if (mir->mir_head_mp != NULL) {
2930 				freemsg(mir->mir_head_mp);
2931 				mir->mir_head_mp = NULL;
2932 				mir->mir_tail_mp = NULL;
2933 			}
2934 			mir->mir_frag_len = -(int32_t)sizeof (uint32_t);
2935 			/*
2936 			 * Restart timer in case mir_clnt_idle_do_stop() was
2937 			 * called.
2938 			 */
2939 			mir->mir_idle_timeout = clnt_idle_timeout;
2940 			mir_clnt_idle_stop(q, mir);
2941 			mir_clnt_idle_start(q, mir);
2942 			mutex_exit(&mir->mir_mutex);
2943 			break;
2944 
2945 		default:
2946 			/*
2947 			 * T_DISCON_REQ is one of the interesting default
2948 			 * cases here. Ideally, an M_FLUSH is done before
2949 			 * T_DISCON_REQ is done. However, that is somewhat
2950 			 * cumbersome for clnt_cots.c to do. So we queue
2951 			 * T_DISCON_REQ, and let the service procedure
2952 			 * flush all M_DATA.
2953 			 */
2954 			break;
2955 		}
2956 		/* fallthru */;
2957 	default:
2958 		if (mp->b_datap->db_type >= QPCTL) {
2959 			if (mp->b_datap->db_type == M_FLUSH) {
2960 				if (mir->mir_type == RPC_CLIENT &&
2961 				    *mp->b_rptr & FLUSHW) {
2962 					RPCLOG(32, "mir_wput_other: flushing "
2963 					    "wq 0x%p\n", (void *)q);
2964 					if (*mp->b_rptr & FLUSHBAND) {
2965 						flushband(q, *(mp->b_rptr + 1),
2966 							FLUSHDATA);
2967 					} else {
2968 						flushq(q, FLUSHDATA);
2969 					}
2970 				} else {
2971 					RPCLOG(32, "mir_wput_other: ignoring "
2972 					    "M_FLUSH on wq 0x%p\n", (void *)q);
2973 				}
2974 			}
2975 			break;
2976 		}
2977 
2978 		mutex_enter(&mir->mir_mutex);
2979 		if (mir->mir_inwservice == 0 && MIR_WCANPUTNEXT(mir, q)) {
2980 			mutex_exit(&mir->mir_mutex);
2981 			break;
2982 		}
2983 		mir->mir_inwservice = 1;
2984 		mir->mir_inwflushdata = flush_in_svc;
2985 		(void) putq(q, mp);
2986 		mutex_exit(&mir->mir_mutex);
2987 		qenable(q);
2988 
2989 		return;
2990 	}
2991 	putnext(q, mp);
2992 }
2993 
2994 static void
2995 mir_wsrv(queue_t *q)
2996 {
2997 	mblk_t	*mp;
2998 	mir_t	*mir;
2999 	bool_t flushdata;
3000 
3001 	mir = (mir_t *)q->q_ptr;
3002 	mutex_enter(&mir->mir_mutex);
3003 
3004 	flushdata = mir->mir_inwflushdata;
3005 	mir->mir_inwflushdata = 0;
3006 
3007 	while (mp = getq(q)) {
3008 		if (mp->b_datap->db_type == M_DATA) {
3009 			/*
3010 			 * Do not send any more data if we have sent
3011 			 * a T_ORDREL_REQ.
3012 			 */
3013 			if (flushdata || mir->mir_ordrel_pending == 1) {
3014 				freemsg(mp);
3015 				continue;
3016 			}
3017 
3018 			/*
3019 			 * Make sure that the stream can really handle more
3020 			 * data.
3021 			 */
3022 			if (!MIR_WCANPUTNEXT(mir, q)) {
3023 				(void) putbq(q, mp);
3024 				mutex_exit(&mir->mir_mutex);
3025 				return;
3026 			}
3027 
3028 			/*
3029 			 * Now we pass the RPC message downstream.
3030 			 */
3031 			mutex_exit(&mir->mir_mutex);
3032 			putnext(q, mp);
3033 			mutex_enter(&mir->mir_mutex);
3034 			continue;
3035 		}
3036 
3037 		/*
3038 		 * This is not an RPC message, pass it downstream
3039 		 * (ignoring flow control) if the server side is not sending a
3040 		 * T_ORDREL_REQ downstream.
3041 		 */
3042 		if (mir->mir_type != RPC_SERVER ||
3043 			    ((union T_primitives *)mp->b_rptr)->type !=
3044 			    T_ORDREL_REQ) {
3045 			mutex_exit(&mir->mir_mutex);
3046 			putnext(q, mp);
3047 			mutex_enter(&mir->mir_mutex);
3048 			continue;
3049 		}
3050 
3051 		if (mir->mir_ordrel_pending == 1) {
3052 			/*
3053 			 * Don't send two T_ORDRELs
3054 			 */
3055 			freemsg(mp);
3056 			continue;
3057 		}
3058 
3059 		/*
3060 		 * Mark the structure so that we know we sent an orderly
3061 		 * release request.  We will check to see slot is idle at the
3062 		 * end of this routine, and if so, reset the idle timer to
3063 		 * handle orderly release timeouts.
3064 		 */
3065 		mir->mir_ordrel_pending = 1;
3066 		RPCLOG(16, "mir_wsrv: sending ordrel req on q 0x%p\n",
3067 								(void *)q);
3068 		/*
3069 		 * Send the orderly release downstream. If there are other
3070 		 * pending replies we won't be able to send them.  However,
3071 		 * the only reason we should send the orderly release is if
3072 		 * we were idle, or if an unusual event occurred.
3073 		 */
3074 		mutex_exit(&mir->mir_mutex);
3075 		putnext(q, mp);
3076 		mutex_enter(&mir->mir_mutex);
3077 	}
3078 
3079 	if (q->q_first == NULL)
3080 		/*
3081 		 * If we call mir_svc_idle_start() below, then
3082 		 * clearing mir_inwservice here will also result in
3083 		 * any thread waiting in mir_close() to be signaled.
3084 		 */
3085 		mir->mir_inwservice = 0;
3086 
3087 	if (mir->mir_type != RPC_SERVER) {
3088 		mutex_exit(&mir->mir_mutex);
3089 		return;
3090 	}
3091 
3092 	/*
3093 	 * If idle we call mir_svc_idle_start to start the timer (or wakeup
3094 	 * a close). Also make sure not to start the idle timer on the
3095 	 * listener stream. This can cause nfsd to send an orderly release
3096 	 * command on the listener stream.
3097 	 */
3098 	if (MIR_SVC_QUIESCED(mir) && !(mir->mir_listen_stream)) {
3099 		RPCLOG(16, "mir_wsrv: calling mir_svc_idle_start on 0x%p "
3100 		    "because mir slot is idle\n", (void *)q);
3101 		mir_svc_idle_start(q, mir);
3102 	}
3103 
3104 	/*
3105 	 * If outbound flow control has been relieved, then allow new
3106 	 * inbound requests to be processed.
3107 	 */
3108 	if (mir->mir_hold_inbound) {
3109 		mir->mir_hold_inbound = 0;
3110 		qenable(RD(q));
3111 	}
3112 	mutex_exit(&mir->mir_mutex);
3113 }
3114 
3115 static void
3116 mir_disconnect(queue_t *q, mir_t *mir)
3117 {
3118 	ASSERT(MUTEX_HELD(&mir->mir_mutex));
3119 
3120 	switch (mir->mir_type) {
3121 	case RPC_CLIENT:
3122 		/*
3123 		 * We are disconnecting, but not necessarily
3124 		 * closing. By not closing, we will fail to
3125 		 * pick up a possibly changed global timeout value,
3126 		 * unless we store it now.
3127 		 */
3128 		mir->mir_idle_timeout = clnt_idle_timeout;
3129 		mir_clnt_idle_start(WR(q), mir);
3130 		mutex_exit(&mir->mir_mutex);
3131 
3132 		/*
3133 		 * T_DISCON_REQ is passed to KRPC as an integer value
3134 		 * (this is not a TPI message).  It is used as a
3135 		 * convenient value to indicate a sanity check
3136 		 * failure -- the same KRPC routine is also called
3137 		 * for T_DISCON_INDs and T_ORDREL_INDs.
3138 		 */
3139 		clnt_dispatch_notifyall(WR(q), T_DISCON_REQ, 0);
3140 		break;
3141 
3142 	case RPC_SERVER:
3143 		mir->mir_svc_no_more_msgs = 1;
3144 		mir_svc_idle_stop(WR(q), mir);
3145 		mutex_exit(&mir->mir_mutex);
3146 		RPCLOG(16, "mir_disconnect: telling "
3147 			"stream head listener to disconnect stream "
3148 			"(0x%p)\n", (void *) q);
3149 		(void) mir_svc_policy_notify(q, 2);
3150 		break;
3151 
3152 	default:
3153 		mutex_exit(&mir->mir_mutex);
3154 		break;
3155 	}
3156 }
3157 
3158 /*
3159  * do a sanity check on the length of the fragment.
3160  * returns 1 if bad else 0.
3161  */
3162 static int
3163 mir_check_len(queue_t *q, int32_t frag_len,
3164     mblk_t *head_mp)
3165 {
3166 	mir_t   *mir;
3167 
3168 	mir = (mir_t *)q->q_ptr;
3169 
3170 	/*
3171 	 * Do a sanity check on the message length.  If this message is
3172 	 * getting excessively large, shut down the connection.
3173 	 */
3174 
3175 	if ((frag_len <= 0) || (mir->mir_max_msg_sizep == NULL) ||
3176 		(frag_len <= *mir->mir_max_msg_sizep)) {
3177 		return (0);
3178 	}
3179 
3180 	freemsg(head_mp);
3181 	mir->mir_head_mp = (mblk_t *)0;
3182 	mir->mir_frag_len = -(int)sizeof (uint32_t);
3183 	if (mir->mir_type != RPC_SERVER || mir->mir_setup_complete) {
3184 		cmn_err(CE_NOTE,
3185 		"KRPC: record fragment from %s of size(%d) exceeds "
3186 		"maximum (%u). Disconnecting",
3187 		(mir->mir_type == RPC_CLIENT) ? "server" :
3188 		(mir->mir_type == RPC_SERVER) ? "client" :
3189 		"test tool",
3190 		frag_len, *mir->mir_max_msg_sizep);
3191 	}
3192 
3193 	mir_disconnect(q, mir);
3194 	return (1);
3195 }
3196