xref: /titanic_50/usr/src/uts/common/rpc/rpcmod.c (revision f875b4ebb1dd9fdbeb043557cab38ab3bf7f6e01)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 /*
33  * Kernel RPC filtering module
34  */
35 
36 #include <sys/param.h>
37 #include <sys/types.h>
38 #include <sys/stream.h>
39 #include <sys/stropts.h>
40 #include <sys/tihdr.h>
41 #include <sys/timod.h>
42 #include <sys/tiuser.h>
43 #include <sys/debug.h>
44 #include <sys/signal.h>
45 #include <sys/pcb.h>
46 #include <sys/user.h>
47 #include <sys/errno.h>
48 #include <sys/cred.h>
49 #include <sys/policy.h>
50 #include <sys/inline.h>
51 #include <sys/cmn_err.h>
52 #include <sys/kmem.h>
53 #include <sys/file.h>
54 #include <sys/sysmacros.h>
55 #include <sys/systm.h>
56 #include <sys/t_lock.h>
57 #include <sys/ddi.h>
58 #include <sys/vtrace.h>
59 #include <sys/callb.h>
60 
61 #include <sys/strlog.h>
62 #include <rpc/rpc_com.h>
63 #include <inet/common.h>
64 #include <rpc/types.h>
65 #include <sys/time.h>
66 #include <rpc/xdr.h>
67 #include <rpc/auth.h>
68 #include <rpc/clnt.h>
69 #include <rpc/rpc_msg.h>
70 #include <rpc/clnt.h>
71 #include <rpc/svc.h>
72 #include <rpc/rpcsys.h>
73 #include <rpc/rpc_rdma.h>
74 
75 /*
76  * This is the loadable module wrapper.
77  */
78 #include <sys/conf.h>
79 #include <sys/modctl.h>
80 #include <sys/syscall.h>
81 
82 extern struct streamtab rpcinfo;
83 
/*
 * STREAMS module switch entry: registers this module under the name
 * "rpcmod" with rpcinfo as its streamtab.
 */
static struct fmodsw fsw = {
	"rpcmod",
	&rpcinfo,
	D_NEW|D_MP,
};

/*
 * Module linkage information for the kernel.
 */

/* Linkage element describing the STREAMS module side of rpcmod. */
static struct modlstrmod modlstrmod = {
	&mod_strmodops, "rpc interface str mod", &fsw
};

/*
 * For the RPC system call.  rpcsys is the handler; the same sysent is
 * shared by the native and (when built) 32-bit syscall linkage below.
 */
static struct sysent rpcsysent = {
	2,
	SE_32RVAL1 | SE_ARGC | SE_NOUNLOAD,
	rpcsys
};

/* Linkage element for the native RPC system call. */
static struct modlsys modlsys = {
	&mod_syscallops,
	"RPC syscall",
	&rpcsysent
};

#ifdef _SYSCALL32_IMPL
/* Linkage element for the 32-bit flavour of the RPC system call. */
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"32-bit RPC syscall",
	&rpcsysent
};
#endif /* _SYSCALL32_IMPL */

/*
 * Top-level linkage: a NULL-terminated list holding the syscall
 * element(s) followed by the STREAMS module element.  This is what
 * _init() hands to mod_install() and _info() hands to mod_info().
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	{
		&modlsys,
#ifdef _SYSCALL32_IMPL
		&modlsys32,
#endif
		&modlstrmod,
		NULL
	}
};
132 
133 int
134 _init(void)
135 {
136 	int error = 0;
137 	callb_id_t cid;
138 	int status;
139 
140 	svc_init();
141 	clnt_init();
142 	cid = callb_add(connmgr_cpr_reset, 0, CB_CL_CPR_RPC, "rpc");
143 
144 	if (error = mod_install(&modlinkage)) {
145 		/*
146 		 * Could not install module, cleanup previous
147 		 * initialization work.
148 		 */
149 		clnt_fini();
150 		if (cid != NULL)
151 			(void) callb_delete(cid);
152 
153 		return (error);
154 	}
155 
156 	/*
157 	 * Load up the RDMA plugins and initialize the stats. Even if the
158 	 * plugins loadup fails, but rpcmod was successfully installed the
159 	 * counters still get initialized.
160 	 */
161 	rw_init(&rdma_lock, NULL, RW_DEFAULT, NULL);
162 	mutex_init(&rdma_modload_lock, NULL, MUTEX_DEFAULT, NULL);
163 	mt_kstat_init();
164 
165 	/*
166 	 * Get our identification into ldi.  This is used for loading
167 	 * other modules, e.g. rpcib.
168 	 */
169 	status = ldi_ident_from_mod(&modlinkage, &rpcmod_li);
170 	if (status != 0) {
171 		cmn_err(CE_WARN, "ldi_ident_from_mod fails with %d", status);
172 		rpcmod_li = NULL;
173 	}
174 
175 	return (error);
176 }
177 
/*
 * _fini - module unload entry point.
 *
 * Unloading always fails with EBUSY, because we advertise entry points
 * into rpcmod to the rest of kRPC (e.g. rpcmod_release(), which
 * rpcmodopen() installs in rpc_rele); those pointers would dangle if the
 * module text went away.
 */
int
_fini(void)
{
	return (EBUSY);
}
187 
/*
 * _info - module information entry point.
 *
 * Hands the request straight to mod_info() with this module's linkage
 * and returns whatever mod_info() returns.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
193 
/* Used to fill the unused slot of the qinit tables below. */
extern int nulldev();

/* Module id placed in rpcmod_info. */
#define	RPCMOD_ID	2049

/*
 * Top-level STREAMS entry points.  The rmm_* routines are what the qinit
 * tables reference; they dispatch through the xprt_style_ops vector found
 * via q->q_ptr.
 */
int rmm_open(), rmm_close();

/*
 * To save instructions, since STREAMS ignores the return value
 * from these functions, they are defined as void here. Kind of icky, but...
 * (The qinit tables cast them to int (*)() for that reason.)
 */
void rmm_rput(queue_t *, mblk_t *);
void rmm_wput(queue_t *, mblk_t *);
void rmm_rsrv(queue_t *);
void rmm_wsrv(queue_t *);

/*
 * Entry points referenced by xprt_clts_ops.  Note that rpcmodrsrv is
 * declared here but xprt_clts_ops carries NULL in its xo_rsrv slot.
 */
int rpcmodopen(), rpcmodclose();
void rpcmodrput(), rpcmodwput();
void rpcmodrsrv(), rpcmodwsrv();

/* Helper for rpcmodwput() and the entry points used by xprt_cots_ops. */
static	void	rpcmodwput_other(queue_t *, mblk_t *);
static	int	mir_close(queue_t *q);
static	int	mir_open(queue_t *q, dev_t *devp, int flag, int sflag,
		    cred_t *credp);
static	void	mir_rput(queue_t *q, mblk_t *mp);
static	void	mir_rsrv(queue_t *q);
static	void	mir_wput(queue_t *q, mblk_t *mp);
static	void	mir_wsrv(queue_t *q);
221 
/*
 * module_info shared by the read and write queues.  The name matches the
 * one registered in fsw.  The numeric fields are presumably, in order,
 * minimum/maximum packet size and high/low water marks -- see
 * module_info(9S) to confirm.
 */
static struct module_info rpcmod_info =
	{RPCMOD_ID, "rpcmod", 0, INFPSZ, 256*1024, 1024};

/*
 * Read side.  Both the put and the service slot are filled with rmm_*
 * dispatchers, which forward to the xo_rput/xo_rsrv members of the ops
 * vector reachable through q->q_ptr.  Of the ops vectors in this file only
 * xprt_cots_ops supplies a read service routine; xprt_clts_ops and tmpops
 * carry NULL there.
 */
static struct qinit rpcmodrinit = {
	(int (*)())rmm_rput,
	(int (*)())rmm_rsrv,
	rmm_open,
	rmm_close,
	nulldev,
	&rpcmod_info,
	NULL
};

/*
 * Write side.  As on the read side, the put and service slots are rmm_*
 * dispatchers.  While the module is still being opened the ops vector is
 * tmpops, whose write put routine is simply putnext.
 * The write service procedure is not used to queue data, but instead to
 * synchronize with flow control.
 */
static struct qinit rpcmodwinit = {
	(int (*)())rmm_wput,
	(int (*)())rmm_wsrv,
	rmm_open,
	rmm_close,
	nulldev,
	&rpcmod_info,
	NULL
};
/* The streamtab referenced by fsw: read and write qinit, no mux sides. */
struct streamtab rpcinfo = { &rpcmodrinit, &rpcmodwinit, NULL, NULL };
253 
/*
 * Per-transport-style entry point vector.  The rmm_* STREAMS entry points
 * call through the matching member; rmm_open() chooses which vector to
 * install after looking at the transport's T_INFO_ACK.
 *
 * The members are declared without prototypes, so argument lists are not
 * checked at the call sites.
 */
struct xprt_style_ops {
	int (*xo_open)();
	int (*xo_close)();
	void (*xo_wput)();
	void (*xo_wsrv)();
	void (*xo_rput)();
	void (*xo_rsrv)();
};

/*
 * Vector installed by rmm_open() when the transport reports T_CLTS.
 * There is no read service routine (last slot is NULL).
 */
static struct xprt_style_ops xprt_clts_ops = {
	rpcmodopen,
	rpcmodclose,
	rpcmodwput,
	rpcmodwsrv,
	rpcmodrput,
	NULL
};

/*
 * Vector installed by rmm_open() for every service type other than
 * T_CLTS.
 */
static struct xprt_style_ops xprt_cots_ops = {
	mir_open,
	mir_close,
	mir_wput,
	mir_wsrv,
	mir_rput,
	mir_rsrv
};
280 
/*
 * Per rpcmod "slot" data structure. q->q_ptr points to one of these
 * once rpcmodopen() has run.
 *
 * The rmm_* dispatchers read the ops pointer through a struct temp_slot
 * cast regardless of what q->q_ptr really points at, so the first two
 * members here must stay in the same position as the first two members of
 * struct temp_slot (and of mir_t).
 */
struct rpcm {
	void		*rm_krpc_cell;	/* Reserved for use by KRPC */
	struct		xprt_style_ops	*rm_ops;
	int		rm_type;	/* Client or server side stream */
#define	RM_CLOSING	0x1		/* somebody is trying to close slot */
	uint_t		rm_state;	/* state of the slot. see above */
	uint_t		rm_ref;		/* cnt of external references to slot */
	kmutex_t	rm_lock;	/* mutex protecting above fields */
	kcondvar_t	rm_cwait;	/* condition for closing */
	zoneid_t	rm_zoneid;	/* zone which pushed rpcmod */
};
295 
/*
 * Temporary slot that rmm_open() keeps on its own stack and hangs off
 * q->q_ptr while it waits for the transport's T_INFO_ACK.  The rmm_*
 * dispatchers also use this type as the common "view" of q->q_ptr to find
 * the ops vector, so cell/ops must line up with the leading members of
 * struct rpcm and mir_t.
 */
struct temp_slot {
	void *cell;			/* placeholder; mirrors the krpc cell */
	struct xprt_style_ops *ops;	/* entry points to dispatch through */
	int type;
	mblk_t *info_ack;		/* T_INFO_ACK stored by tmp_rput() */
	kmutex_t lock;			/* protects info_ack */
	kcondvar_t wait;		/* signalled when info_ack is set */
};
304 
/*
 * Per-stream state for the connection-oriented flavour of rpcmod;
 * rmm_open() casts q->q_ptr to this type after mir_open() succeeds.
 * The first two members must line up with struct temp_slot, because the
 * rmm_* dispatchers fetch the ops pointer through that type.
 */
typedef struct mir_s {
	void	*mir_krpc_cell;	/* Reserved for KRPC use. This field */
					/* must be first in the structure. */
	struct xprt_style_ops	*rm_ops;
	int	mir_type;		/* Client or server side stream */

	mblk_t	*mir_head_mp;		/* RPC msg in progress */
		/*
		 * mir_head_mp points to the first mblk being collected in
		 * the current RPC message.  Record headers are removed
		 * before data is linked into mir_head_mp.
		 */
	mblk_t	*mir_tail_mp;		/* Last mblk in mir_head_mp */
		/*
		 * mir_tail_mp points to the last mblk in the message
		 * chain starting at mir_head_mp.  It is only valid
		 * if mir_head_mp is non-NULL and is used to add new
		 * data blocks to the end of chain quickly.
		 */

	int32_t	mir_frag_len;		/* Bytes seen in the current frag */
		/*
		 * mir_frag_len starts at -4 for beginning of each fragment.
		 * When this length is negative, it indicates the number of
		 * bytes that rpcmod needs to complete the record marker
		 * header.  When it is positive or zero, it holds the number
		 * of bytes that have arrived for the current fragment and
		 * are held in the message chain (the original text named
		 * "mir_header_mp", which is not a member of this structure;
		 * presumably mir_head_mp is meant).
		 */

	int32_t	mir_frag_header;
		/*
		 * Fragment header as collected for the current fragment.
		 * It holds the last-fragment indicator and the number
		 * of bytes in the fragment.
		 */

	unsigned int
		mir_ordrel_pending : 1,	/* Sent T_ORDREL_REQ */
		mir_hold_inbound : 1,	/* Hold inbound messages on server */
					/* side until outbound flow control */
					/* is relieved. */
		mir_closing : 1,	/* The stream is being closed */
		mir_inrservice : 1,	/* data queued or rd srv proc running */
		mir_inwservice : 1,	/* data queued or wr srv proc running */
		mir_inwflushdata : 1,	/* flush M_DATAs when srv runs */
		/*
		 * On client streams, mir_clntreq is 0 or 1; it is set
		 * to 1 whenever a new request is sent out (mir_wput)
		 * and cleared when the timer fires (mir_timer).  If
		 * the timer fires with this value equal to 0, then the
		 * stream is considered idle and KRPC is notified.
		 */
		mir_clntreq : 1,
		/*
		 * On server streams, stop accepting messages
		 */
		mir_svc_no_more_msgs : 1,
		mir_listen_stream : 1,	/* listen end point */
		mir_unused : 1,	/* no longer used */
		/*
		 * Set while mir_timer_start()/mir_timer_stop() is in
		 * progress; other callers wait on mir_timer_cv.
		 */
		mir_timer_call : 1,
		mir_junk_fill_thru_bit_31 : 21;

	int	mir_setup_complete;	/* server has initialized everything */
	timeout_id_t mir_timer_id;	/* Timer for idle checks */
	clock_t	mir_idle_timeout;	/* Allowed idle time before shutdown */
		/*
		 * This value is copied from clnt_idle_timeout or
		 * svc_idle_timeout during the appropriate ioctl.
		 * Kept in milliseconds
		 */
	clock_t	mir_use_timestamp;	/* updated on client with each use */
		/*
		 * This value is set to lbolt
		 * every time a client stream sends or receives data.
		 * Even if the timer message arrives, we don't shutdown
		 * client unless:
		 *    lbolt >= MSEC_TO_TICK(mir_idle_timeout)+mir_use_timestamp.
		 * This value is kept in HZ.
		 */

	uint_t	*mir_max_msg_sizep;	/* Reference to sanity check size */
		/*
		 * This pointer is set to &clnt_max_msg_size or
		 * &svc_max_msg_size during the appropriate ioctl.
		 */
	zoneid_t mir_zoneid;	/* zone which pushed rpcmod */
	/* Server-side fields. */
	int	mir_ref_cnt;		/* Reference count: server side only */
					/* counts the number of references */
					/* that a kernel RPC server thread */
					/* (see svc_run()) has on this rpcmod */
					/* slot. Effectively, it is the */
					/* number of unprocessed messages */
					/* that have been passed up to the */
					/* KRPC layer */

	mblk_t	*mir_svc_pend_mp;	/* Pending T_ORDREL_IND or */
					/* T_DISCON_IND */

	/*
	 * these fields are for both client and server, but for debugging,
	 * it is easier to have these last in the structure.
	 */
	kmutex_t	mir_mutex;	/* Mutex and condvar for close */
	kcondvar_t	mir_condvar;	/* synchronization. */
	kcondvar_t	mir_timer_cv;	/* Timer routine sync. */
} mir_t;
413 
void tmp_rput(queue_t *q, mblk_t *mp);

/*
 * Ops vector used only while rmm_open() is waiting for the T_INFO_ACK:
 * writes are passed straight down with putnext and reads go to tmp_rput().
 * The open, close and both service slots are NULL, so the corresponding
 * rmm_* dispatchers must not be reached while this vector is installed.
 */
struct xprt_style_ops tmpops = {
	NULL,
	NULL,
	putnext,
	NULL,
	tmp_rput,
	NULL
};
424 
425 void
426 tmp_rput(queue_t *q, mblk_t *mp)
427 {
428 	struct temp_slot *t = (struct temp_slot *)(q->q_ptr);
429 	struct T_info_ack *pptr;
430 
431 	switch (mp->b_datap->db_type) {
432 	case M_PCPROTO:
433 		pptr = (struct T_info_ack *)mp->b_rptr;
434 		switch (pptr->PRIM_type) {
435 		case T_INFO_ACK:
436 			mutex_enter(&t->lock);
437 			t->info_ack = mp;
438 			cv_signal(&t->wait);
439 			mutex_exit(&t->lock);
440 			return;
441 		default:
442 			break;
443 		}
444 	default:
445 		break;
446 	}
447 
448 	/*
449 	 * Not an info-ack, so free it. This is ok because we should
450 	 * not be receiving data until the open finishes: rpcmod
451 	 * is pushed well before the end-point is bound to an address.
452 	 */
453 	freemsg(mp);
454 }
455 
456 int
457 rmm_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *crp)
458 {
459 	mblk_t *bp;
460 	struct temp_slot ts, *t;
461 	struct T_info_ack *pptr;
462 	int error = 0;
463 
464 	ASSERT(q != NULL);
465 	/*
466 	 * Check for re-opens.
467 	 */
468 	if (q->q_ptr) {
469 		TRACE_1(TR_FAC_KRPC, TR_RPCMODOPEN_END,
470 		    "rpcmodopen_end:(%s)", "q->qptr");
471 		return (0);
472 	}
473 
474 	t = &ts;
475 	bzero(t, sizeof (*t));
476 	q->q_ptr = (void *)t;
477 	WR(q)->q_ptr = (void *)t;
478 
479 	/*
480 	 * Allocate the required messages upfront.
481 	 */
482 	if ((bp = allocb(sizeof (struct T_info_req) +
483 	    sizeof (struct T_info_ack), BPRI_LO)) == (mblk_t *)NULL) {
484 		return (ENOBUFS);
485 	}
486 
487 	mutex_init(&t->lock, NULL, MUTEX_DEFAULT, NULL);
488 	cv_init(&t->wait, NULL, CV_DEFAULT, NULL);
489 
490 	t->ops = &tmpops;
491 
492 	qprocson(q);
493 	bp->b_datap->db_type = M_PCPROTO;
494 	*(int32_t *)bp->b_wptr = (int32_t)T_INFO_REQ;
495 	bp->b_wptr += sizeof (struct T_info_req);
496 	putnext(WR(q), bp);
497 
498 	mutex_enter(&t->lock);
499 	while (t->info_ack == NULL) {
500 		if (cv_wait_sig(&t->wait, &t->lock) == 0) {
501 			error = EINTR;
502 			break;
503 		}
504 	}
505 	mutex_exit(&t->lock);
506 
507 	if (error)
508 		goto out;
509 
510 	pptr = (struct T_info_ack *)t->info_ack->b_rptr;
511 
512 	if (pptr->SERV_type == T_CLTS) {
513 		if ((error = rpcmodopen(q, devp, flag, sflag, crp)) == 0)
514 			((struct rpcm *)q->q_ptr)->rm_ops = &xprt_clts_ops;
515 	} else {
516 		if ((error = mir_open(q, devp, flag, sflag, crp)) == 0)
517 			((mir_t *)q->q_ptr)->rm_ops = &xprt_cots_ops;
518 	}
519 
520 out:
521 	if (error)
522 		qprocsoff(q);
523 
524 	freemsg(t->info_ack);
525 	mutex_destroy(&t->lock);
526 	cv_destroy(&t->wait);
527 
528 	return (error);
529 }
530 
531 void
532 rmm_rput(queue_t *q, mblk_t  *mp)
533 {
534 	(*((struct temp_slot *)q->q_ptr)->ops->xo_rput)(q, mp);
535 }
536 
537 void
538 rmm_rsrv(queue_t *q)
539 {
540 	(*((struct temp_slot *)q->q_ptr)->ops->xo_rsrv)(q);
541 }
542 
543 void
544 rmm_wput(queue_t *q, mblk_t *mp)
545 {
546 	(*((struct temp_slot *)q->q_ptr)->ops->xo_wput)(q, mp);
547 }
548 
549 void
550 rmm_wsrv(queue_t *q)
551 {
552 	(*((struct temp_slot *)q->q_ptr)->ops->xo_wsrv)(q);
553 }
554 
555 int
556 rmm_close(queue_t *q, int flag, cred_t *crp)
557 {
558 	return ((*((struct temp_slot *)q->q_ptr)->ops->xo_close)(q, flag, crp));
559 }
560 
561 /*
562  * rpcmodopen -	open routine gets called when the module gets pushed
563  *		onto the stream.
564  */
565 /*ARGSUSED*/
566 int
567 rpcmodopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *crp)
568 {
569 	struct rpcm *rmp;
570 
571 	extern void (*rpc_rele)(queue_t *, mblk_t *);
572 	static void rpcmod_release(queue_t *, mblk_t *);
573 
574 	TRACE_0(TR_FAC_KRPC, TR_RPCMODOPEN_START, "rpcmodopen_start:");
575 
576 	/*
577 	 * Initialize entry points to release a rpcmod slot (and an input
578 	 * message if supplied) and to send an output message to the module
579 	 * below rpcmod.
580 	 */
581 	if (rpc_rele == NULL)
582 		rpc_rele = rpcmod_release;
583 
584 	/*
585 	 * Only sufficiently privileged users can use this module, and it
586 	 * is assumed that they will use this module properly, and NOT send
587 	 * bulk data from downstream.
588 	 */
589 	if (secpolicy_rpcmod_open(crp) != 0)
590 		return (EPERM);
591 
592 	/*
593 	 * Allocate slot data structure.
594 	 */
595 	rmp = kmem_zalloc(sizeof (*rmp), KM_SLEEP);
596 
597 	mutex_init(&rmp->rm_lock, NULL, MUTEX_DEFAULT, NULL);
598 	cv_init(&rmp->rm_cwait, NULL, CV_DEFAULT, NULL);
599 	rmp->rm_zoneid = rpc_zoneid();
600 	/*
601 	 * slot type will be set by kRPC client and server ioctl's
602 	 */
603 	rmp->rm_type = 0;
604 
605 	q->q_ptr = (void *)rmp;
606 	WR(q)->q_ptr = (void *)rmp;
607 
608 	TRACE_1(TR_FAC_KRPC, TR_RPCMODOPEN_END, "rpcmodopen_end:(%s)", "end");
609 	return (0);
610 }
611 
612 /*
613  * rpcmodclose - This routine gets called when the module gets popped
614  * off of the stream.
615  */
616 /*ARGSUSED*/
617 int
618 rpcmodclose(queue_t *q, int flag, cred_t *crp)
619 {
620 	struct rpcm *rmp;
621 
622 	ASSERT(q != NULL);
623 	rmp = (struct rpcm *)q->q_ptr;
624 
625 	/*
626 	 * Mark our state as closing.
627 	 */
628 	mutex_enter(&rmp->rm_lock);
629 	rmp->rm_state |= RM_CLOSING;
630 
631 	/*
632 	 * Check and see if there are any messages on the queue.  If so, send
633 	 * the messages, regardless whether the downstream module is ready to
634 	 * accept data.
635 	 */
636 	if (rmp->rm_type == RPC_SERVER) {
637 		flushq(q, FLUSHDATA);
638 
639 		qenable(WR(q));
640 
641 		if (rmp->rm_ref) {
642 			mutex_exit(&rmp->rm_lock);
643 			/*
644 			 * call into SVC to clean the queue
645 			 */
646 			svc_queueclean(q);
647 			mutex_enter(&rmp->rm_lock);
648 
649 			/*
650 			 * Block while there are kRPC threads with a reference
651 			 * to this message.
652 			 */
653 			while (rmp->rm_ref)
654 				cv_wait(&rmp->rm_cwait, &rmp->rm_lock);
655 		}
656 
657 		mutex_exit(&rmp->rm_lock);
658 
659 		/*
660 		 * It is now safe to remove this queue from the stream. No kRPC
661 		 * threads have a reference to the stream, and none ever will,
662 		 * because RM_CLOSING is set.
663 		 */
664 		qprocsoff(q);
665 
666 		/* Notify kRPC that this stream is going away. */
667 		svc_queueclose(q);
668 	} else {
669 		mutex_exit(&rmp->rm_lock);
670 		qprocsoff(q);
671 	}
672 
673 	q->q_ptr = NULL;
674 	WR(q)->q_ptr = NULL;
675 	mutex_destroy(&rmp->rm_lock);
676 	cv_destroy(&rmp->rm_cwait);
677 	kmem_free(rmp, sizeof (*rmp));
678 	return (0);
679 }
680 
#ifdef	DEBUG
/*
 * Fault-injection knobs consulted by rpcmodrput() in DEBUG kernels.
 */
int	rpcmod_send_msg_up = 0;	/* > 0: pass a copy of the next msgs up */
int	rpcmod_send_uderr = 0;	/* > 0: fabricate T_UDERROR_INDs */
int	rpcmod_send_dup = 0;	/* != 0: duplicate server requests */
int	rpcmod_send_dup_cnt = 0;	/* running count used with send_dup */
#endif
687 
/*
 * rpcmodrput -	Module read put procedure for the connectionless flavour.
 *		Reached through rmm_rput() for messages travelling up the
 *		stream.
 *
 * Summary of what happens to mp:
 *  - slot type not yet set (rm_type == 0): freed.
 *  - not M_PROTO/M_PCPROTO: passed upstream.
 *  - T_UNITDATA_IND: passed upstream if the slot is closing; otherwise
 *    handed to the kRPC client (clnt_clts_dispatch_notify) or server
 *    (svc_queuereq) according to rm_type, or freed if it cannot be
 *    delivered.
 *  - T_UDERROR_IND: freed (and logged).
 *  - any other primitive: passed upstream.
 */
void
rpcmodrput(queue_t *q, mblk_t *mp)
{
	struct rpcm *rmp;
	union T_primitives *pptr;
	int hdrsz;

	TRACE_0(TR_FAC_KRPC, TR_RPCMODRPUT_START, "rpcmodrput_start:");

	ASSERT(q != NULL);
	rmp = (struct rpcm *)q->q_ptr;

	/* Neither the client nor the server ioctl has been issued yet. */
	if (rmp->rm_type == 0) {
		freemsg(mp);
		return;
	}

#ifdef DEBUG
	/* Fault injection: send a copy of the message to the stream head. */
	if (rpcmod_send_msg_up > 0) {
		mblk_t *nmp = copymsg(mp);
		if (nmp) {
			putnext(q, nmp);
			rpcmod_send_msg_up--;
		}
	}
	/*
	 * Fault injection: build a T_UDERROR_IND (ENETDOWN) out of an
	 * incoming T_UNITDATA_IND and send it upstream.  d is the size
	 * difference between the two headers; address and option offsets
	 * are shifted by it.
	 */
	if ((rpcmod_send_uderr > 0) && mp->b_datap->db_type == M_PROTO) {
		mblk_t *nmp;
		struct T_unitdata_ind *data;
		struct T_uderror_ind *ud;
		int d;
		data = (struct T_unitdata_ind *)mp->b_rptr;
		if (data->PRIM_type == T_UNITDATA_IND) {
			d = sizeof (*ud) - sizeof (*data);
			nmp = allocb(mp->b_wptr - mp->b_rptr + d, BPRI_HI);
			if (nmp) {
				ud = (struct T_uderror_ind *)nmp->b_rptr;
				ud->PRIM_type = T_UDERROR_IND;
				ud->DEST_length = data->SRC_length;
				ud->DEST_offset = data->SRC_offset + d;
				ud->OPT_length = data->OPT_length;
				ud->OPT_offset = data->OPT_offset + d;
				ud->ERROR_type = ENETDOWN;
				if (data->SRC_length) {
					bcopy(mp->b_rptr +
					    data->SRC_offset,
					    nmp->b_rptr +
					    ud->DEST_offset,
					    data->SRC_length);
				}
				if (data->OPT_length) {
					bcopy(mp->b_rptr +
					    data->OPT_offset,
					    nmp->b_rptr +
					    ud->OPT_offset,
					    data->OPT_length);
				}
				nmp->b_wptr += d;
				nmp->b_wptr += (mp->b_wptr - mp->b_rptr);
				nmp->b_datap->db_type = M_PROTO;
				putnext(q, nmp);
				rpcmod_send_uderr--;
			}
		}
	}
#endif
	switch (mp->b_datap->db_type) {
	default:
		putnext(q, mp);
		break;

	case M_PROTO:
	case M_PCPROTO:
		/*
		 * NOTE(review): the minimum length is only checked by this
		 * ASSERT, i.e. not at all in non-DEBUG kernels.
		 */
		ASSERT((mp->b_wptr - mp->b_rptr) >= sizeof (int32_t));
		pptr = (union T_primitives *)mp->b_rptr;

		/*
		 * Forward this message to krpc if it is data.
		 */
		if (pptr->type == T_UNITDATA_IND) {
			/* nmp is only assigned and used under DEBUG. */
			mblk_t *nmp;

			/*
			 * Check if the module is being popped.
			 */
			mutex_enter(&rmp->rm_lock);
			if (rmp->rm_state & RM_CLOSING) {
				mutex_exit(&rmp->rm_lock);
				putnext(q, mp);
				break;
			}

			switch (rmp->rm_type) {
			case RPC_CLIENT:
				mutex_exit(&rmp->rm_lock);
				hdrsz = mp->b_wptr - mp->b_rptr;

				/*
				 * Make sure the header is sane.
				 */
				if (hdrsz < TUNITDATAINDSZ ||
				    hdrsz < (pptr->unitdata_ind.OPT_length +
				    pptr->unitdata_ind.OPT_offset) ||
				    hdrsz < (pptr->unitdata_ind.SRC_length +
				    pptr->unitdata_ind.SRC_offset)) {
					freemsg(mp);
					return;
				}

				/*
				 * Call clnt_clts_dispatch_notify, so that it
				 * can pass the message to the proper caller.
				 * Don't discard the header just yet since the
				 * client may need the sender's address.
				 */
				clnt_clts_dispatch_notify(mp, hdrsz,
				    rmp->rm_zoneid);
				return;
			case RPC_SERVER:
				/*
				 * rm_krpc_cell is exclusively used by the kRPC
				 * CLTS server
				 */
				if (rmp->rm_krpc_cell) {
#ifdef DEBUG
					/*
					 * Test duplicate request cache and
					 * rm_ref count handling by sending a
					 * duplicate every so often, if
					 * desired.
					 */
					if (rpcmod_send_dup &&
					    rpcmod_send_dup_cnt++ %
					    rpcmod_send_dup)
						nmp = copymsg(mp);
					else
						nmp = NULL;
#endif
					/*
					 * Raise the reference count on this
					 * module to prevent it from being
					 * popped before krpc generates the
					 * reply.
					 */
					rmp->rm_ref++;
					mutex_exit(&rmp->rm_lock);

					/*
					 * Submit the message to krpc.
					 */
					svc_queuereq(q, mp);
#ifdef DEBUG
					/*
					 * Send duplicate if we created one.
					 */
					if (nmp) {
						mutex_enter(&rmp->rm_lock);
						rmp->rm_ref++;
						mutex_exit(&rmp->rm_lock);
						svc_queuereq(q, nmp);
					}
#endif
				} else {
					/* No kRPC cell attached: drop it. */
					mutex_exit(&rmp->rm_lock);
					freemsg(mp);
				}
				return;
			default:
				mutex_exit(&rmp->rm_lock);
				freemsg(mp);
				return;
			} /* end switch(rmp->rm_type) */
		} else if (pptr->type == T_UDERROR_IND) {
			mutex_enter(&rmp->rm_lock);
			hdrsz = mp->b_wptr - mp->b_rptr;

			/*
			 * Make sure the header is sane
			 */
			if (hdrsz < TUDERRORINDSZ ||
			    hdrsz < (pptr->uderror_ind.OPT_length +
			    pptr->uderror_ind.OPT_offset) ||
			    hdrsz < (pptr->uderror_ind.DEST_length +
			    pptr->uderror_ind.DEST_offset)) {
				mutex_exit(&rmp->rm_lock);
				freemsg(mp);
				return;
			}

			/*
			 * In the case where a unit data error has been
			 * received, all we need to do is clear the message from
			 * the queue.
			 */
			mutex_exit(&rmp->rm_lock);
			freemsg(mp);
			RPCLOG(32, "rpcmodrput: unitdata error received at "
			    "%ld\n", gethrestime_sec());
			return;
		} /* end else if (pptr->type == T_UDERROR_IND) */

		/* Any other TPI primitive is not ours: pass it upstream. */
		putnext(q, mp);
		break;
	} /* end switch (mp->b_datap->db_type) */

	TRACE_0(TR_FAC_KRPC, TR_RPCMODRPUT_END,
	    "rpcmodrput_end:");
	/*
	 * Return codes are not looked at by the STREAMS framework.
	 */
}
902 
903 /*
904  * write put procedure
905  */
906 void
907 rpcmodwput(queue_t *q, mblk_t *mp)
908 {
909 	struct rpcm	*rmp;
910 
911 	ASSERT(q != NULL);
912 
913 	switch (mp->b_datap->db_type) {
914 		case M_PROTO:
915 		case M_PCPROTO:
916 			break;
917 		default:
918 			rpcmodwput_other(q, mp);
919 			return;
920 	}
921 
922 	/*
923 	 * Check to see if we can send the message downstream.
924 	 */
925 	if (canputnext(q)) {
926 		putnext(q, mp);
927 		return;
928 	}
929 
930 	rmp = (struct rpcm *)q->q_ptr;
931 	ASSERT(rmp != NULL);
932 
933 	/*
934 	 * The first canputnext failed.  Try again except this time with the
935 	 * lock held, so that we can check the state of the stream to see if
936 	 * it is closing.  If either of these conditions evaluate to true
937 	 * then send the meesage.
938 	 */
939 	mutex_enter(&rmp->rm_lock);
940 	if (canputnext(q) || (rmp->rm_state & RM_CLOSING)) {
941 		mutex_exit(&rmp->rm_lock);
942 		putnext(q, mp);
943 	} else {
944 		/*
945 		 * canputnext failed again and the stream is not closing.
946 		 * Place the message on the queue and let the service
947 		 * procedure handle the message.
948 		 */
949 		mutex_exit(&rmp->rm_lock);
950 		(void) putq(q, mp);
951 	}
952 }
953 
954 static void
955 rpcmodwput_other(queue_t *q, mblk_t *mp)
956 {
957 	struct rpcm	*rmp;
958 	struct iocblk	*iocp;
959 
960 	rmp = (struct rpcm *)q->q_ptr;
961 	ASSERT(rmp != NULL);
962 
963 	switch (mp->b_datap->db_type) {
964 		case M_IOCTL:
965 			iocp = (struct iocblk *)mp->b_rptr;
966 			ASSERT(iocp != NULL);
967 			switch (iocp->ioc_cmd) {
968 				case RPC_CLIENT:
969 				case RPC_SERVER:
970 					mutex_enter(&rmp->rm_lock);
971 					rmp->rm_type = iocp->ioc_cmd;
972 					mutex_exit(&rmp->rm_lock);
973 					mp->b_datap->db_type = M_IOCACK;
974 					qreply(q, mp);
975 					return;
976 				default:
977 				/*
978 				 * pass the ioctl downstream and hope someone
979 				 * down there knows how to handle it.
980 				 */
981 					putnext(q, mp);
982 					return;
983 			}
984 		default:
985 			break;
986 	}
987 	/*
988 	 * This is something we definitely do not know how to handle, just
989 	 * pass the message downstream
990 	 */
991 	putnext(q, mp);
992 }
993 
994 /*
995  * Module write service procedure. This is called by downstream modules
996  * for back enabling during flow control.
997  */
998 void
999 rpcmodwsrv(queue_t *q)
1000 {
1001 	struct rpcm	*rmp;
1002 	mblk_t		*mp = NULL;
1003 
1004 	rmp = (struct rpcm *)q->q_ptr;
1005 	ASSERT(rmp != NULL);
1006 
1007 	/*
1008 	 * Get messages that may be queued and send them down stream
1009 	 */
1010 	while ((mp = getq(q)) != NULL) {
1011 		/*
1012 		 * Optimize the service procedure for the server-side, by
1013 		 * avoiding a call to canputnext().
1014 		 */
1015 		if (rmp->rm_type == RPC_SERVER || canputnext(q)) {
1016 			putnext(q, mp);
1017 			continue;
1018 		}
1019 		(void) putbq(q, mp);
1020 		return;
1021 	}
1022 }
1023 
1024 static void
1025 rpcmod_release(queue_t *q, mblk_t *bp)
1026 {
1027 	struct rpcm *rmp;
1028 
1029 	/*
1030 	 * For now, just free the message.
1031 	 */
1032 	if (bp)
1033 		freemsg(bp);
1034 	rmp = (struct rpcm *)q->q_ptr;
1035 
1036 	mutex_enter(&rmp->rm_lock);
1037 	rmp->rm_ref--;
1038 
1039 	if (rmp->rm_ref == 0 && (rmp->rm_state & RM_CLOSING)) {
1040 		cv_broadcast(&rmp->rm_cwait);
1041 	}
1042 
1043 	mutex_exit(&rmp->rm_lock);
1044 }
1045 
1046 /*
1047  * This part of rpcmod is pushed on a connection-oriented transport for use
1048  * by RPC.  It serves to bypass the Stream head, implements
1049  * the record marking protocol, and dispatches incoming RPC messages.
1050  */
1051 
/* Default idle timer values (milliseconds) */
#define	MIR_CLNT_IDLE_TIMEOUT	(5 * (60 * 1000L))	/* 5 minutes */
#define	MIR_SVC_IDLE_TIMEOUT	(6 * (60 * 1000L))	/* 6 minutes */
#define	MIR_SVC_ORDREL_TIMEOUT	(10 * (60 * 1000L))	/* 10 minutes */
#define	MIR_LASTFRAG	0x80000000	/* Record marker */

/*
 * Number of data bytes in message mp: msgdsize() for a chained message,
 * otherwise the length of the single block.  The argument is evaluated
 * more than once, so it must be free of side effects.
 */
#define	DLEN(mp)	\
	((mp)->b_cont ? msgdsize(mp) : ((mp)->b_wptr - (mp)->b_rptr))

/*
 * True when no kRPC thread holds a reference on the slot and nothing is
 * pending in the read service path.
 */
#define	MIR_SVC_QUIESCED(mir)	\
	((mir)->mir_ref_cnt == 0 && (mir)->mir_inrservice == 0)

/*
 * Clear mir_inrservice and, on a closing server stream, wake up the
 * waiter on mir_condvar.  This expands to a brace block rather than a
 * single statement, so take care when using it as the body of an
 * if/else.
 */
#define	MIR_CLEAR_INRSRV(mir_ptr)	{	\
	(mir_ptr)->mir_inrservice = 0;	\
	if ((mir_ptr)->mir_type == RPC_SERVER &&	\
		(mir_ptr)->mir_closing)	\
		cv_signal(&(mir_ptr)->mir_condvar);	\
}

/*
 * Don't block service procedure (and mir_close) if
 * we are in the process of closing.
 */
#define	MIR_WCANPUTNEXT(mir_ptr, write_q)	\
	(canputnext(write_q) || ((mir_ptr)->mir_svc_no_more_msgs == 1))
1076 
/*
 * Forward declarations for the connection-oriented (mir_*) half of the
 * module.  mir_wput and mir_wsrv repeat declarations made earlier in the
 * file.
 */
static int	mir_clnt_dup_request(queue_t *q, mblk_t *mp);
static void	mir_rput_proto(queue_t *q, mblk_t *mp);
static int	mir_svc_policy_notify(queue_t *q, int event);
static void	mir_svc_release(queue_t *wq, mblk_t *mp);
static void	mir_svc_start(queue_t *wq);
static void	mir_svc_idle_start(queue_t *, mir_t *);
static void	mir_svc_idle_stop(queue_t *, mir_t *);
static void	mir_svc_start_close(queue_t *, mir_t *);
static void	mir_clnt_idle_do_stop(queue_t *);
static void	mir_clnt_idle_stop(queue_t *, mir_t *);
static void	mir_clnt_idle_start(queue_t *, mir_t *);
static void	mir_wput(queue_t *q, mblk_t *mp);
static void	mir_wput_other(queue_t *q, mblk_t *mp);
static void	mir_wsrv(queue_t *q);
static	void	mir_disconnect(queue_t *, mir_t *ir);
static	int	mir_check_len(queue_t *, int32_t, mblk_t *);
static	void	mir_timer(void *);

/* Hooks defined elsewhere in kRPC. */
extern void	(*mir_rele)(queue_t *, mblk_t *);
extern void	(*mir_start)(queue_t *);
extern void	(*clnt_stop_idle)(queue_t *);

/*
 * Idle timeouts, in milliseconds (see the MIR_*_IDLE_TIMEOUT defaults).
 * Per the mir_t comments these are copied into mir_idle_timeout.
 */
clock_t	clnt_idle_timeout = MIR_CLNT_IDLE_TIMEOUT;
clock_t	svc_idle_timeout = MIR_SVC_IDLE_TIMEOUT;

/*
 * Timeout for subsequent notifications of idle connection.  This is
 * typically used to clean up after a wedged orderly release.
 */
clock_t	svc_ordrel_timeout = MIR_SVC_ORDREL_TIMEOUT; /* milliseconds */

/*
 * Message size limits.  Per the mir_t comments, mir_max_msg_sizep is
 * pointed at one of the two *_max_msg_size variables.
 */
extern	uint_t	*clnt_max_msg_sizep;
extern	uint_t	*svc_max_msg_sizep;
uint_t	clnt_max_msg_size = RPC_MAXDATASIZE;
uint_t	svc_max_msg_size = RPC_MAXDATASIZE;
uint_t	mir_krpc_cell_null;
1113 
1114 static void
1115 mir_timer_stop(mir_t *mir)
1116 {
1117 	timeout_id_t tid;
1118 
1119 	ASSERT(MUTEX_HELD(&mir->mir_mutex));
1120 
1121 	/*
1122 	 * Since the mir_mutex lock needs to be released to call
1123 	 * untimeout(), we need to make sure that no other thread
1124 	 * can start/stop the timer (changing mir_timer_id) during
1125 	 * that time.  The mir_timer_call bit and the mir_timer_cv
1126 	 * condition variable are used to synchronize this.  Setting
1127 	 * mir_timer_call also tells mir_timer() (refer to the comments
1128 	 * in mir_timer()) that it does not need to do anything.
1129 	 */
1130 	while (mir->mir_timer_call)
1131 		cv_wait(&mir->mir_timer_cv, &mir->mir_mutex);
1132 	mir->mir_timer_call = B_TRUE;
1133 
1134 	if ((tid = mir->mir_timer_id) != 0) {
1135 		mir->mir_timer_id = 0;
1136 		mutex_exit(&mir->mir_mutex);
1137 		(void) untimeout(tid);
1138 		mutex_enter(&mir->mir_mutex);
1139 	}
1140 	mir->mir_timer_call = B_FALSE;
1141 	cv_broadcast(&mir->mir_timer_cv);
1142 }
1143 
1144 static void
1145 mir_timer_start(queue_t *q, mir_t *mir, clock_t intrvl)
1146 {
1147 	timeout_id_t tid;
1148 
1149 	ASSERT(MUTEX_HELD(&mir->mir_mutex));
1150 
1151 	while (mir->mir_timer_call)
1152 		cv_wait(&mir->mir_timer_cv, &mir->mir_mutex);
1153 	mir->mir_timer_call = B_TRUE;
1154 
1155 	if ((tid = mir->mir_timer_id) != 0) {
1156 		mutex_exit(&mir->mir_mutex);
1157 		(void) untimeout(tid);
1158 		mutex_enter(&mir->mir_mutex);
1159 	}
1160 	/* Only start the timer when it is not closing. */
1161 	if (!mir->mir_closing) {
1162 		mir->mir_timer_id = timeout(mir_timer, q,
1163 		    MSEC_TO_TICK(intrvl));
1164 	}
1165 	mir->mir_timer_call = B_FALSE;
1166 	cv_broadcast(&mir->mir_timer_cv);
1167 }
1168 
1169 static int
1170 mir_clnt_dup_request(queue_t *q, mblk_t *mp)
1171 {
1172 	mblk_t  *mp1;
1173 	uint32_t  new_xid;
1174 	uint32_t  old_xid;
1175 
1176 	ASSERT(MUTEX_HELD(&((mir_t *)q->q_ptr)->mir_mutex));
1177 	new_xid = BE32_TO_U32(&mp->b_rptr[4]);
1178 	/*
1179 	 * This loop is a bit tacky -- it walks the STREAMS list of
1180 	 * flow-controlled messages.
1181 	 */
1182 	if ((mp1 = q->q_first) != NULL) {
1183 		do {
1184 			old_xid = BE32_TO_U32(&mp1->b_rptr[4]);
1185 			if (new_xid == old_xid)
1186 				return (1);
1187 		} while ((mp1 = mp1->b_next) != NULL);
1188 	}
1189 	return (0);
1190 }
1191 
1192 static int
1193 mir_close(queue_t *q)
1194 {
1195 	mir_t	*mir;
1196 	mblk_t	*mp;
1197 	bool_t queue_cleaned = FALSE;
1198 
1199 	RPCLOG(32, "rpcmod: mir_close of q 0x%p\n", (void *)q);
1200 	mir = (mir_t *)q->q_ptr;
1201 	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
1202 	mutex_enter(&mir->mir_mutex);
1203 	if ((mp = mir->mir_head_mp) != NULL) {
1204 		mir->mir_head_mp = (mblk_t *)0;
1205 		freemsg(mp);
1206 	}
1207 	/*
1208 	 * Set mir_closing so we get notified when MIR_SVC_QUIESCED()
1209 	 * is TRUE.  And mir_timer_start() won't start the timer again.
1210 	 */
1211 	mir->mir_closing = B_TRUE;
1212 	mir_timer_stop(mir);
1213 
1214 	if (mir->mir_type == RPC_SERVER) {
1215 		flushq(q, FLUSHDATA);	/* Ditch anything waiting on read q */
1216 
1217 		/*
1218 		 * This will prevent more requests from arriving and
1219 		 * will force rpcmod to ignore flow control.
1220 		 */
1221 		mir_svc_start_close(WR(q), mir);
1222 
1223 		while ((!MIR_SVC_QUIESCED(mir)) || mir->mir_inwservice == 1) {
1224 
1225 			if (mir->mir_ref_cnt && !mir->mir_inrservice &&
1226 			    (queue_cleaned == FALSE)) {
1227 				/*
1228 				 * call into SVC to clean the queue
1229 				 */
1230 				mutex_exit(&mir->mir_mutex);
1231 				svc_queueclean(q);
1232 				queue_cleaned = TRUE;
1233 				mutex_enter(&mir->mir_mutex);
1234 				continue;
1235 			}
1236 
1237 			/*
1238 			 * Bugid 1253810 - Force the write service
1239 			 * procedure to send its messages, regardless
1240 			 * whether the downstream  module is ready
1241 			 * to accept data.
1242 			 */
1243 			if (mir->mir_inwservice == 1)
1244 				qenable(WR(q));
1245 
1246 			cv_wait(&mir->mir_condvar, &mir->mir_mutex);
1247 		}
1248 
1249 		mutex_exit(&mir->mir_mutex);
1250 		qprocsoff(q);
1251 
1252 		/* Notify KRPC that this stream is going away. */
1253 		svc_queueclose(q);
1254 	} else {
1255 		mutex_exit(&mir->mir_mutex);
1256 		qprocsoff(q);
1257 	}
1258 
1259 	mutex_destroy(&mir->mir_mutex);
1260 	cv_destroy(&mir->mir_condvar);
1261 	cv_destroy(&mir->mir_timer_cv);
1262 	kmem_free(mir, sizeof (mir_t));
1263 	return (0);
1264 }
1265 
1266 /*
1267  * This is server side only (RPC_SERVER).
1268  *
1269  * Exit idle mode.
1270  */
1271 static void
1272 mir_svc_idle_stop(queue_t *q, mir_t *mir)
1273 {
1274 	ASSERT(MUTEX_HELD(&mir->mir_mutex));
1275 	ASSERT((q->q_flag & QREADR) == 0);
1276 	ASSERT(mir->mir_type == RPC_SERVER);
1277 	RPCLOG(16, "rpcmod: mir_svc_idle_stop of q 0x%p\n", (void *)q);
1278 
1279 	mir_timer_stop(mir);
1280 }
1281 
1282 /*
1283  * This is server side only (RPC_SERVER).
1284  *
1285  * Start idle processing, which will include setting idle timer if the
1286  * stream is not being closed.
1287  */
1288 static void
1289 mir_svc_idle_start(queue_t *q, mir_t *mir)
1290 {
1291 	ASSERT(MUTEX_HELD(&mir->mir_mutex));
1292 	ASSERT((q->q_flag & QREADR) == 0);
1293 	ASSERT(mir->mir_type == RPC_SERVER);
1294 	RPCLOG(16, "rpcmod: mir_svc_idle_start q 0x%p\n", (void *)q);
1295 
1296 	/*
1297 	 * Don't re-start idle timer if we are closing queues.
1298 	 */
1299 	if (mir->mir_closing) {
1300 		RPCLOG(16, "mir_svc_idle_start - closing: 0x%p\n",
1301 		    (void *)q);
1302 
1303 		/*
1304 		 * We will call mir_svc_idle_start() whenever MIR_SVC_QUIESCED()
1305 		 * is true.  When it is true, and we are in the process of
1306 		 * closing the stream, signal any thread waiting in
1307 		 * mir_close().
1308 		 */
1309 		if (mir->mir_inwservice == 0)
1310 			cv_signal(&mir->mir_condvar);
1311 
1312 	} else {
1313 		RPCLOG(16, "mir_svc_idle_start - reset %s timer\n",
1314 		    mir->mir_ordrel_pending ? "ordrel" : "normal");
1315 		/*
1316 		 * Normal condition, start the idle timer.  If an orderly
1317 		 * release has been sent, set the timeout to wait for the
1318 		 * client to close its side of the connection.  Otherwise,
1319 		 * use the normal idle timeout.
1320 		 */
1321 		mir_timer_start(q, mir, mir->mir_ordrel_pending ?
1322 		    svc_ordrel_timeout : mir->mir_idle_timeout);
1323 	}
1324 }
1325 
1326 /* ARGSUSED */
1327 static int
1328 mir_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1329 {
1330 	mir_t	*mir;
1331 
1332 	RPCLOG(32, "rpcmod: mir_open of q 0x%p\n", (void *)q);
1333 	/* Set variables used directly by KRPC. */
1334 	if (!mir_rele)
1335 		mir_rele = mir_svc_release;
1336 	if (!mir_start)
1337 		mir_start = mir_svc_start;
1338 	if (!clnt_stop_idle)
1339 		clnt_stop_idle = mir_clnt_idle_do_stop;
1340 	if (!clnt_max_msg_sizep)
1341 		clnt_max_msg_sizep = &clnt_max_msg_size;
1342 	if (!svc_max_msg_sizep)
1343 		svc_max_msg_sizep = &svc_max_msg_size;
1344 
1345 	/* Allocate a zero'ed out mir structure for this stream. */
1346 	mir = kmem_zalloc(sizeof (mir_t), KM_SLEEP);
1347 
1348 	/*
1349 	 * We set hold inbound here so that incoming messages will
1350 	 * be held on the read-side queue until the stream is completely
1351 	 * initialized with a RPC_CLIENT or RPC_SERVER ioctl.  During
1352 	 * the ioctl processing, the flag is cleared and any messages that
1353 	 * arrived between the open and the ioctl are delivered to KRPC.
1354 	 *
1355 	 * Early data should never arrive on a client stream since
1356 	 * servers only respond to our requests and we do not send any.
1357 	 * until after the stream is initialized.  Early data is
1358 	 * very common on a server stream where the client will start
1359 	 * sending data as soon as the connection is made (and this
1360 	 * is especially true with TCP where the protocol accepts the
1361 	 * connection before nfsd or KRPC is notified about it).
1362 	 */
1363 
1364 	mir->mir_hold_inbound = 1;
1365 
1366 	/*
1367 	 * Start the record marker looking for a 4-byte header.  When
1368 	 * this length is negative, it indicates that rpcmod is looking
1369 	 * for bytes to consume for the record marker header.  When it
1370 	 * is positive, it holds the number of bytes that have arrived
1371 	 * for the current fragment and are being held in mir_header_mp.
1372 	 */
1373 
1374 	mir->mir_frag_len = -(int32_t)sizeof (uint32_t);
1375 
1376 	mir->mir_zoneid = rpc_zoneid();
1377 	mutex_init(&mir->mir_mutex, NULL, MUTEX_DEFAULT, NULL);
1378 	cv_init(&mir->mir_condvar, NULL, CV_DRIVER, NULL);
1379 	cv_init(&mir->mir_timer_cv, NULL, CV_DRIVER, NULL);
1380 
1381 	q->q_ptr = (char *)mir;
1382 	WR(q)->q_ptr = (char *)mir;
1383 
1384 	/*
1385 	 * We noenable the read-side queue because we don't want it
1386 	 * automatically enabled by putq.  We enable it explicitly
1387 	 * in mir_wsrv when appropriate. (See additional comments on
1388 	 * flow control at the beginning of mir_rsrv.)
1389 	 */
1390 	noenable(q);
1391 
1392 	qprocson(q);
1393 	return (0);
1394 }
1395 
1396 /*
1397  * Read-side put routine for both the client and server side.  Does the
1398  * record marking for incoming RPC messages, and when complete, dispatches
1399  * the message to either the client or server.
1400  */
1401 static void
1402 mir_do_rput(queue_t *q, mblk_t *mp, int srv)
1403 {
1404 	mblk_t	*cont_mp;
1405 	int	excess;
1406 	int32_t	frag_len;
1407 	int32_t	frag_header;
1408 	mblk_t	*head_mp;
1409 	int	len;
1410 	mir_t	*mir;
1411 	mblk_t	*mp1;
1412 	unsigned char	*rptr;
1413 	mblk_t	*tail_mp;
1414 	unsigned char	*wptr;
1415 	boolean_t	stop_timer = B_FALSE;
1416 
1417 	mir = (mir_t *)q->q_ptr;
1418 	ASSERT(mir != NULL);
1419 
1420 	/*
1421 	 * If the stream has not been set up as a RPC_CLIENT or RPC_SERVER
1422 	 * with the corresponding ioctl, then don't accept
1423 	 * any inbound data.  This should never happen for streams
1424 	 * created by nfsd or client-side KRPC because they are careful
1425 	 * to set the mode of the stream before doing anything else.
1426 	 */
1427 	if (mir->mir_type == 0) {
1428 		freemsg(mp);
1429 		return;
1430 	}
1431 
1432 	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
1433 
1434 	switch (mp->b_datap->db_type) {
1435 	case M_DATA:
1436 		break;
1437 	case M_PROTO:
1438 	case M_PCPROTO:
1439 		rptr = mp->b_rptr;
1440 		if (mp->b_wptr - rptr < sizeof (uint32_t)) {
1441 			RPCLOG(1, "mir_rput: runt TPI message (%d bytes)\n",
1442 			    (int)(mp->b_wptr - rptr));
1443 			freemsg(mp);
1444 			return;
1445 		}
1446 		if (((union T_primitives *)rptr)->type != T_DATA_IND) {
1447 			mir_rput_proto(q, mp);
1448 			return;
1449 		}
1450 
1451 		/* Throw away the T_DATA_IND block and continue with data. */
1452 		mp1 = mp;
1453 		mp = mp->b_cont;
1454 		freeb(mp1);
1455 		break;
1456 	case M_SETOPTS:
1457 		/*
1458 		 * If a module on the stream is trying set the Stream head's
1459 		 * high water mark, then set our hiwater to the requested
1460 		 * value.  We are the "stream head" for all inbound
1461 		 * data messages since messages are passed directly to KRPC.
1462 		 */
1463 		if ((mp->b_wptr - mp->b_rptr) >= sizeof (struct stroptions)) {
1464 			struct stroptions	*stropts;
1465 
1466 			stropts = (struct stroptions *)mp->b_rptr;
1467 			if ((stropts->so_flags & SO_HIWAT) &&
1468 			    !(stropts->so_flags & SO_BAND)) {
1469 				(void) strqset(q, QHIWAT, 0, stropts->so_hiwat);
1470 			}
1471 		}
1472 		putnext(q, mp);
1473 		return;
1474 	case M_FLUSH:
1475 		RPCLOG(32, "mir_do_rput: ignoring M_FLUSH on q 0x%p. ",
1476 		    (void *)q);
1477 		RPCLOG(32, "M_FLUSH is %x\n", (uint_t)*mp->b_rptr);
1478 
1479 		putnext(q, mp);
1480 		return;
1481 	default:
1482 		putnext(q, mp);
1483 		return;
1484 	}
1485 
1486 	mutex_enter(&mir->mir_mutex);
1487 
1488 	/*
1489 	 * If this connection is closing, don't accept any new messages.
1490 	 */
1491 	if (mir->mir_svc_no_more_msgs) {
1492 		ASSERT(mir->mir_type == RPC_SERVER);
1493 		mutex_exit(&mir->mir_mutex);
1494 		freemsg(mp);
1495 		return;
1496 	}
1497 
1498 	/* Get local copies for quicker access. */
1499 	frag_len = mir->mir_frag_len;
1500 	frag_header = mir->mir_frag_header;
1501 	head_mp = mir->mir_head_mp;
1502 	tail_mp = mir->mir_tail_mp;
1503 
1504 	/* Loop, processing each message block in the mp chain separately. */
1505 	do {
1506 		/*
1507 		 * cont_mp is used in the do/while condition below to
1508 		 * walk to the next block in the STREAMS message.
1509 		 * mp->b_cont may be nil'ed during processing so we
1510 		 * can't rely on it to find the next block.
1511 		 */
1512 		cont_mp = mp->b_cont;
1513 
1514 		/*
1515 		 * Get local copies of rptr and wptr for our processing.
1516 		 * These always point into "mp" (the current block being
1517 		 * processed), but rptr is updated as we consume any
1518 		 * record header in this message, and wptr is updated to
1519 		 * point to the end of the data for the current fragment,
1520 		 * if it ends in this block.  The main point is that
1521 		 * they are not always the same as b_rptr and b_wptr.
1522 		 * b_rptr and b_wptr will be updated when appropriate.
1523 		 */
1524 		rptr = mp->b_rptr;
1525 		wptr = mp->b_wptr;
1526 same_mblk:;
1527 		len = (int)(wptr - rptr);
1528 		if (len <= 0) {
1529 			/*
1530 			 * If we have processed all of the data in the message
1531 			 * or the block is empty to begin with, then we're
1532 			 * done with this block and can go on to cont_mp,
1533 			 * if there is one.
1534 			 *
1535 			 * First, we check to see if the current block is
1536 			 * now zero-length and, if so, we free it.
1537 			 * This happens when either the block was empty
1538 			 * to begin with or we consumed all of the data
1539 			 * for the record marking header.
1540 			 */
1541 			if (rptr <= mp->b_rptr) {
1542 				/*
1543 				 * If head_mp is non-NULL, add cont_mp to the
1544 				 * mblk list. XXX But there is a possibility
1545 				 * that tail_mp = mp or even head_mp = mp XXX
1546 				 */
1547 				if (head_mp) {
1548 					if (head_mp == mp)
1549 						head_mp = NULL;
1550 					else if (tail_mp != mp) {
1551 		ASSERT((tail_mp->b_cont == NULL) || (tail_mp->b_cont == mp));
1552 						tail_mp->b_cont = cont_mp;
1553 						/*
1554 						 * It's possible that, because
1555 						 * of a very short mblk (0-3
1556 						 * bytes), we've ended up here
1557 						 * and that cont_mp could be
1558 						 * NULL (if we're at the end
1559 						 * of an mblk chain). If so,
1560 						 * don't set tail_mp to
1561 						 * cont_mp, because the next
1562 						 * time we access it, we'll
1563 						 * dereference a NULL pointer
1564 						 * and crash. Just leave
1565 						 * tail_mp pointing at the
1566 						 * current end of chain.
1567 						 */
1568 						if (cont_mp)
1569 							tail_mp = cont_mp;
1570 					} else {
1571 						mblk_t *smp = head_mp;
1572 
1573 						while ((smp->b_cont != NULL) &&
1574 						    (smp->b_cont != mp))
1575 							smp = smp->b_cont;
1576 						smp->b_cont = cont_mp;
1577 						/*
1578 						 * Don't set tail_mp to cont_mp
1579 						 * if it's NULL. Instead, set
1580 						 * tail_mp to smp, which is the
1581 						 * end of the chain starting
1582 						 * at head_mp.
1583 						 */
1584 						if (cont_mp)
1585 							tail_mp = cont_mp;
1586 						else
1587 							tail_mp = smp;
1588 					}
1589 				}
1590 				freeb(mp);
1591 			}
1592 			continue;
1593 		}
1594 
1595 		/*
1596 		 * frag_len starts at -4 and is incremented past the record
1597 		 * marking header to 0, and then becomes positive as real data
1598 		 * bytes are received for the message.  While frag_len is less
1599 		 * than zero, we need more bytes for the record marking
1600 		 * header.
1601 		 */
1602 		if (frag_len < 0) {
1603 			uchar_t	*up = rptr;
1604 			/*
1605 			 * Collect as many bytes as we need for the record
1606 			 * marking header and that are available in this block.
1607 			 */
1608 			do {
1609 				--len;
1610 				frag_len++;
1611 				frag_header <<= 8;
1612 				frag_header += (*up++ & 0xFF);
1613 			} while (len > 0 && frag_len < 0);
1614 
1615 			if (rptr == mp->b_rptr) {
1616 				/*
1617 				 * The record header is located at the
1618 				 * beginning of the block, so just walk
1619 				 * b_rptr past it.
1620 				 */
1621 				mp->b_rptr = rptr = up;
1622 			} else {
1623 				/*
1624 				 * The record header is located in the middle
1625 				 * of a block, so copy any remaining data up.
1626 				 * This happens when an RPC message is
1627 				 * fragmented into multiple pieces and
1628 				 * a middle (or end) fragment immediately
1629 				 * follows a previous fragment in the same
1630 				 * message block.
1631 				 */
1632 				wptr = &rptr[len];
1633 				mp->b_wptr = wptr;
1634 				if (len) {
1635 					RPCLOG(32, "mir_do_rput: copying %d "
1636 					    "bytes of data up", len);
1637 					RPCLOG(32, " db_ref %d\n",
1638 					    (uint_t)mp->b_datap->db_ref);
1639 					bcopy(up, rptr, len);
1640 				}
1641 			}
1642 
1643 			/*
1644 			 * If we haven't received the complete record header
1645 			 * yet, then loop around to get the next block in the
1646 			 * STREAMS message. The logic at same_mblk label will
1647 			 * free the current block if it has become empty.
1648 			 */
1649 			if (frag_len < 0) {
1650 				RPCLOG(32, "mir_do_rput: frag_len is still < 0 "
1651 				"(%d)", len);
1652 				goto same_mblk;
1653 			}
1654 
1655 #ifdef	RPCDEBUG
1656 			if ((frag_header & MIR_LASTFRAG) == 0) {
1657 				RPCLOG0(32, "mir_do_rput: multi-fragment "
1658 				    "record\n");
1659 			}
1660 			{
1661 				uint_t l = frag_header & ~MIR_LASTFRAG;
1662 
1663 				if (l != 0 && mir->mir_max_msg_sizep &&
1664 				    l >= *mir->mir_max_msg_sizep) {
1665 					RPCLOG(32, "mir_do_rput: fragment size"
1666 					    " (%d) > maximum", l);
1667 					RPCLOG(32, " (%u)\n",
1668 					    *mir->mir_max_msg_sizep);
1669 				}
1670 			}
1671 #endif
1672 			/*
1673 			 * At this point we have retrieved the complete record
1674 			 * header for this fragment.  If the current block is
1675 			 * empty, then we need to free it and walk to the next
1676 			 * block.
1677 			 */
1678 			if (mp->b_rptr >= wptr) {
1679 				/*
1680 				 * If this is not the last fragment or if we
1681 				 * have not received all the data for this
1682 				 * RPC message, then loop around to the next
1683 				 * block.
1684 				 */
1685 				if (!(frag_header & MIR_LASTFRAG) ||
1686 				    (frag_len -
1687 				    (frag_header & ~MIR_LASTFRAG)) ||
1688 				    !head_mp)
1689 					goto same_mblk;
1690 
1691 				/*
1692 				 * Quick walk to next block in the
1693 				 * STREAMS message.
1694 				 */
1695 				freeb(mp);
1696 				continue;
1697 			}
1698 		}
1699 
1700 		/*
1701 		 * We've collected the complete record header.  The data
1702 		 * in the current block is added to the end of the RPC
1703 		 * message.  Note that tail_mp is the same as mp after
1704 		 * this linkage.
1705 		 */
1706 		if (!head_mp)
1707 			head_mp = mp;
1708 		else if (tail_mp != mp) {
1709 			ASSERT((tail_mp->b_cont == NULL) ||
1710 			    (tail_mp->b_cont == mp));
1711 			tail_mp->b_cont = mp;
1712 		}
1713 		tail_mp = mp;
1714 
1715 		/*
1716 		 * Add the length of this block to the accumulated
1717 		 * fragment length.
1718 		 */
1719 		frag_len += len;
1720 		excess = frag_len - (frag_header & ~MIR_LASTFRAG);
1721 		/*
1722 		 * If we have not received all the data for this fragment,
1723 		 * then walk to the next block.
1724 		 */
1725 		if (excess < 0)
1726 			continue;
1727 
1728 		/*
1729 		 * We've received a complete fragment, so reset frag_len
1730 		 * for the next one.
1731 		 */
1732 		frag_len = -(int32_t)sizeof (uint32_t);
1733 
1734 		/*
1735 		 * Update rptr to point to the beginning of the next
1736 		 * fragment in this block.  If there are no more bytes
1737 		 * in the block (excess is 0), then rptr will be equal
1738 		 * to wptr.
1739 		 */
1740 		rptr = wptr - excess;
1741 
1742 		/*
1743 		 * Now we check to see if this fragment is the last one in
1744 		 * the RPC message.
1745 		 */
1746 		if (!(frag_header & MIR_LASTFRAG)) {
1747 			/*
1748 			 * This isn't the last one, so start processing the
1749 			 * next fragment.
1750 			 */
1751 			frag_header = 0;
1752 
1753 			/*
1754 			 * If excess is 0, the next fragment
1755 			 * starts at the beginning of the next block --
1756 			 * we "continue" to the end of the while loop and
1757 			 * walk to cont_mp.
1758 			 */
1759 			if (excess == 0)
1760 				continue;
1761 			RPCLOG0(32, "mir_do_rput: multi-fragment message with "
1762 			    "two or more fragments in one mblk\n");
1763 
1764 			/*
1765 			 * If excess is non-0, then the next fragment starts
1766 			 * in this block.  rptr points to the beginning
1767 			 * of the next fragment and we "goto same_mblk"
1768 			 * to continue processing.
1769 			 */
1770 			goto same_mblk;
1771 		}
1772 
1773 		/*
1774 		 * We've got a complete RPC message.  Before passing it
1775 		 * upstream, check to see if there is extra data in this
1776 		 * message block. If so, then we separate the excess
1777 		 * from the complete message. The excess data is processed
1778 		 * after the current message goes upstream.
1779 		 */
1780 		if (excess > 0) {
1781 			RPCLOG(32, "mir_do_rput: end of record, but excess "
1782 			    "data (%d bytes) in this mblk. dupb/copyb "
1783 			    "needed\n", excess);
1784 
1785 			/* Duplicate only the overlapping block. */
1786 			mp1 = dupb(tail_mp);
1787 
1788 			/*
1789 			 * dupb() might have failed due to ref count wrap around
1790 			 * so try a copyb().
1791 			 */
1792 			if (mp1 == NULL)
1793 				mp1 = copyb(tail_mp);
1794 
1795 			/*
1796 			 * Do not use bufcall() to schedule a "buffer
1797 			 * availability event."  The reason is that
1798 			 * bufcall() has problems.  For example, if memory
1799 			 * runs out, bufcall() itself will fail since it
1800 			 * needs to allocate memory.  The most appropriate
1801 			 * action right now is to disconnect this connection
1802 			 * as the system is under stress.  We should try to
1803 			 * free up resources.
1804 			 */
1805 			if (mp1 == NULL) {
1806 				freemsg(head_mp);
1807 				RPCLOG0(1, "mir_do_rput: dupb/copyb failed\n");
1808 				mir->mir_frag_header = 0;
1809 				mir->mir_frag_len = -(int)sizeof (uint32_t);
1810 				mir->mir_head_mp = NULL;
1811 				mir->mir_tail_mp = NULL;
1812 
1813 				mir_disconnect(q, mir);
1814 				return;
1815 			}
1816 
1817 			/*
1818 			 * The new message block is linked with the
1819 			 * continuation block in cont_mp.  We then point
1820 			 * cont_mp to the new block so that we will
1821 			 * process it next.
1822 			 */
1823 			mp1->b_cont = cont_mp;
1824 			cont_mp = mp1;
1825 			/*
1826 			 * Data in the new block begins at the
1827 			 * next fragment (rptr).
1828 			 */
1829 			cont_mp->b_rptr += (rptr - tail_mp->b_rptr);
1830 			ASSERT(cont_mp->b_rptr >= cont_mp->b_datap->db_base);
1831 			ASSERT(cont_mp->b_rptr <= cont_mp->b_wptr);
1832 
1833 			/* Data in the current fragment ends at rptr. */
1834 			tail_mp->b_wptr = rptr;
1835 			ASSERT(tail_mp->b_wptr <= tail_mp->b_datap->db_lim);
1836 			ASSERT(tail_mp->b_wptr >= tail_mp->b_rptr);
1837 
1838 		}
1839 
1840 		/* tail_mp is the last block with data for this RPC message. */
1841 		tail_mp->b_cont = NULL;
1842 
1843 		/* Pass the RPC message to the current consumer. */
1844 		switch (mir->mir_type) {
1845 		case RPC_CLIENT:
1846 			if (clnt_dispatch_notify(head_mp, mir->mir_zoneid)) {
1847 				/*
1848 				 * Mark this stream as active.  This marker
1849 				 * is used in mir_timer().
1850 				 */
1851 
1852 				mir->mir_clntreq = 1;
1853 				mir->mir_use_timestamp = lbolt;
1854 			} else
1855 				freemsg(head_mp);
1856 			break;
1857 
1858 		case RPC_SERVER:
1859 			/*
1860 			 * Check for flow control before passing the
1861 			 * message to KRPC.
1862 			 */
1863 
1864 			if (!mir->mir_hold_inbound) {
1865 				if (mir->mir_krpc_cell) {
1866 					/*
1867 					 * If the reference count is 0
1868 					 * (not including this request),
1869 					 * then the stream is transitioning
1870 					 * from idle to non-idle.  In this case,
1871 					 * we cancel the idle timer.
1872 					 */
1873 					if (mir->mir_ref_cnt++ == 0)
1874 						stop_timer = B_TRUE;
1875 					if (mir_check_len(q,
1876 					    (int32_t)msgdsize(mp), mp))
1877 						return;
1878 					svc_queuereq(q, head_mp); /* to KRPC */
1879 				} else {
1880 					/*
1881 					 * Count # of times this happens. Should
1882 					 * be never, but experience shows
1883 					 * otherwise.
1884 					 */
1885 					mir_krpc_cell_null++;
1886 					freemsg(head_mp);
1887 				}
1888 
1889 			} else {
1890 				/*
1891 				 * If the outbound side of the stream is
1892 				 * flow controlled, then hold this message
1893 				 * until client catches up. mir_hold_inbound
1894 				 * is set in mir_wput and cleared in mir_wsrv.
1895 				 */
1896 				if (srv)
1897 					(void) putbq(q, head_mp);
1898 				else
1899 					(void) putq(q, head_mp);
1900 				mir->mir_inrservice = B_TRUE;
1901 			}
1902 			break;
1903 		default:
1904 			RPCLOG(1, "mir_rput: unknown mir_type %d\n",
1905 			    mir->mir_type);
1906 			freemsg(head_mp);
1907 			break;
1908 		}
1909 
1910 		/*
1911 		 * Reset head_mp and frag_header since we're starting on a
1912 		 * new RPC fragment and message.
1913 		 */
1914 		head_mp = NULL;
1915 		tail_mp = NULL;
1916 		frag_header = 0;
1917 	} while ((mp = cont_mp) != NULL);
1918 
1919 	/*
1920 	 * Do a sanity check on the message length.  If this message is
1921 	 * getting excessively large, shut down the connection.
1922 	 */
1923 	if (head_mp != NULL && mir->mir_setup_complete &&
1924 	    mir_check_len(q, frag_len, head_mp))
1925 		return;
1926 
1927 	/* Save our local copies back in the mir structure. */
1928 	mir->mir_frag_header = frag_header;
1929 	mir->mir_frag_len = frag_len;
1930 	mir->mir_head_mp = head_mp;
1931 	mir->mir_tail_mp = tail_mp;
1932 
1933 	/*
1934 	 * The timer is stopped after the whole message chain is processed.
1935 	 * The reason is that stopping the timer releases the mir_mutex
1936 	 * lock temporarily.  This means that the request can be serviced
1937 	 * while we are still processing the message chain.  This is not
1938 	 * good.  So we stop the timer here instead.
1939 	 *
1940 	 * Note that if the timer fires before we stop it, it will not
1941 	 * do any harm as MIR_SVC_QUIESCED() is false and mir_timer()
1942 	 * will just return;
1943 	 */
1944 	if (stop_timer) {
1945 		RPCLOG(16, "mir_do_rput stopping idle timer on 0x%p because "
1946 		    "ref cnt going to non zero\n", (void *) WR(q));
1947 		mir_svc_idle_stop(WR(q), mir);
1948 	}
1949 	mutex_exit(&mir->mir_mutex);
1950 }
1951 
1952 static void
1953 mir_rput(queue_t *q, mblk_t *mp)
1954 {
1955 	mir_do_rput(q, mp, 0);
1956 }
1957 
1958 static void
1959 mir_rput_proto(queue_t *q, mblk_t *mp)
1960 {
1961 	mir_t	*mir = (mir_t *)q->q_ptr;
1962 	uint32_t	type;
1963 	uint32_t reason = 0;
1964 
1965 	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
1966 
1967 	type = ((union T_primitives *)mp->b_rptr)->type;
1968 	switch (mir->mir_type) {
1969 	case RPC_CLIENT:
1970 		switch (type) {
1971 		case T_DISCON_IND:
1972 			reason = ((struct T_discon_ind *)
1973 			    (mp->b_rptr))->DISCON_reason;
1974 		    /*FALLTHROUGH*/
1975 		case T_ORDREL_IND:
1976 			mutex_enter(&mir->mir_mutex);
1977 			if (mir->mir_head_mp) {
1978 				freemsg(mir->mir_head_mp);
1979 				mir->mir_head_mp = (mblk_t *)0;
1980 				mir->mir_tail_mp = (mblk_t *)0;
1981 			}
1982 			/*
1983 			 * We are disconnecting, but not necessarily
1984 			 * closing. By not closing, we will fail to
1985 			 * pick up a possibly changed global timeout value,
1986 			 * unless we store it now.
1987 			 */
1988 			mir->mir_idle_timeout = clnt_idle_timeout;
1989 			mir_clnt_idle_stop(WR(q), mir);
1990 
1991 			/*
1992 			 * Even though we are unconnected, we still
1993 			 * leave the idle timer going on the client. The
1994 			 * reason for is that if we've disconnected due
1995 			 * to a server-side disconnect, reset, or connection
1996 			 * timeout, there is a possibility the client may
1997 			 * retry the RPC request. This retry needs to done on
1998 			 * the same bound address for the server to interpret
1999 			 * it as such. However, we don't want
2000 			 * to wait forever for that possibility. If the
2001 			 * end-point stays unconnected for mir_idle_timeout
2002 			 * units of time, then that is a signal to the
2003 			 * connection manager to give up waiting for the
2004 			 * application (eg. NFS) to send a retry.
2005 			 */
2006 			mir_clnt_idle_start(WR(q), mir);
2007 			mutex_exit(&mir->mir_mutex);
2008 			clnt_dispatch_notifyall(WR(q), type, reason);
2009 			freemsg(mp);
2010 			return;
2011 		case T_ERROR_ACK:
2012 		{
2013 			struct T_error_ack	*terror;
2014 
2015 			terror = (struct T_error_ack *)mp->b_rptr;
2016 			RPCLOG(1, "mir_rput_proto T_ERROR_ACK for queue 0x%p",
2017 			    (void *)q);
2018 			RPCLOG(1, " ERROR_prim: %s,",
2019 			    rpc_tpiprim2name(terror->ERROR_prim));
2020 			RPCLOG(1, " TLI_error: %s,",
2021 			    rpc_tpierr2name(terror->TLI_error));
2022 			RPCLOG(1, " UNIX_error: %d\n", terror->UNIX_error);
2023 			if (terror->ERROR_prim == T_DISCON_REQ)  {
2024 				clnt_dispatch_notifyall(WR(q), type, reason);
2025 				freemsg(mp);
2026 				return;
2027 			} else {
2028 				if (clnt_dispatch_notifyconn(WR(q), mp))
2029 					return;
2030 			}
2031 			break;
2032 		}
2033 		case T_OK_ACK:
2034 		{
2035 			struct T_ok_ack	*tok = (struct T_ok_ack *)mp->b_rptr;
2036 
2037 			if (tok->CORRECT_prim == T_DISCON_REQ) {
2038 				clnt_dispatch_notifyall(WR(q), type, reason);
2039 				freemsg(mp);
2040 				return;
2041 			} else {
2042 				if (clnt_dispatch_notifyconn(WR(q), mp))
2043 					return;
2044 			}
2045 			break;
2046 		}
2047 		case T_CONN_CON:
2048 		case T_INFO_ACK:
2049 		case T_OPTMGMT_ACK:
2050 			if (clnt_dispatch_notifyconn(WR(q), mp))
2051 				return;
2052 			break;
2053 		case T_BIND_ACK:
2054 			break;
2055 		default:
2056 			RPCLOG(1, "mir_rput: unexpected message %d "
2057 			    "for KRPC client\n",
2058 			    ((union T_primitives *)mp->b_rptr)->type);
2059 			break;
2060 		}
2061 		break;
2062 
2063 	case RPC_SERVER:
2064 		switch (type) {
2065 		case T_BIND_ACK:
2066 		{
2067 			struct T_bind_ack	*tbind;
2068 
2069 			/*
2070 			 * If this is a listening stream, then shut
2071 			 * off the idle timer.
2072 			 */
2073 			tbind = (struct T_bind_ack *)mp->b_rptr;
2074 			if (tbind->CONIND_number > 0) {
2075 				mutex_enter(&mir->mir_mutex);
2076 				mir_svc_idle_stop(WR(q), mir);
2077 
2078 				/*
2079 				 * mark this as a listen endpoint
2080 				 * for special handling.
2081 				 */
2082 
2083 				mir->mir_listen_stream = 1;
2084 				mutex_exit(&mir->mir_mutex);
2085 			}
2086 			break;
2087 		}
2088 		case T_DISCON_IND:
2089 		case T_ORDREL_IND:
2090 			RPCLOG(16, "mir_rput_proto: got %s indication\n",
2091 			    type == T_DISCON_IND ? "disconnect"
2092 			    : "orderly release");
2093 
2094 			/*
2095 			 * For listen endpoint just pass
2096 			 * on the message.
2097 			 */
2098 
2099 			if (mir->mir_listen_stream)
2100 				break;
2101 
2102 			mutex_enter(&mir->mir_mutex);
2103 
2104 			/*
2105 			 * If client wants to break off connection, record
2106 			 * that fact.
2107 			 */
2108 			mir_svc_start_close(WR(q), mir);
2109 
2110 			/*
2111 			 * If we are idle, then send the orderly release
2112 			 * or disconnect indication to nfsd.
2113 			 */
2114 			if (MIR_SVC_QUIESCED(mir)) {
2115 				mutex_exit(&mir->mir_mutex);
2116 				break;
2117 			}
2118 
2119 			RPCLOG(16, "mir_rput_proto: not idle, so "
2120 			    "disconnect/ord rel indication not passed "
2121 			    "upstream on 0x%p\n", (void *)q);
2122 
2123 			/*
2124 			 * Hold the indication until we get idle
2125 			 * If there already is an indication stored,
2126 			 * replace it if the new one is a disconnect. The
2127 			 * reasoning is that disconnection takes less time
2128 			 * to process, and once a client decides to
2129 			 * disconnect, we should do that.
2130 			 */
2131 			if (mir->mir_svc_pend_mp) {
2132 				if (type == T_DISCON_IND) {
2133 					RPCLOG(16, "mir_rput_proto: replacing"
2134 					    " held disconnect/ord rel"
2135 					    " indication with disconnect on"
2136 					    " 0x%p\n", (void *)q);
2137 
2138 					freemsg(mir->mir_svc_pend_mp);
2139 					mir->mir_svc_pend_mp = mp;
2140 				} else {
2141 					RPCLOG(16, "mir_rput_proto: already "
2142 					    "held a disconnect/ord rel "
2143 					    "indication. freeing ord rel "
2144 					    "ind on 0x%p\n", (void *)q);
2145 					freemsg(mp);
2146 				}
2147 			} else
2148 				mir->mir_svc_pend_mp = mp;
2149 
2150 			mutex_exit(&mir->mir_mutex);
2151 			return;
2152 
2153 		default:
2154 			/* nfsd handles server-side non-data messages. */
2155 			break;
2156 		}
2157 		break;
2158 
2159 	default:
2160 		break;
2161 	}
2162 
2163 	putnext(q, mp);
2164 }
2165 
/*
 * The server-side read queues are used to hold inbound messages while
 * outbound flow control is exerted.  When outbound flow control is
 * relieved, mir_wsrv qenables the read-side queue.  Read-side queues
 * are not enabled by STREAMS and are explicitly noenable'ed in mir_open.
 *
 * For the server side, we have two types of messages queued. The first type
 * are messages that are ready to be XDR decoded and then sent to the
 * RPC program's dispatch routine. The second type are "raw" messages that
 * haven't been processed, i.e. assembled from RPC record fragments into
 * full requests. The only time we will see the second type of message
 * queued is if we have a memory allocation failure while processing a
 * raw message. The field mir_first_non_processed_mblk will mark the
 * first such raw message. So the flow for the server side is:
 *
 *	- send processed queued messages to kRPC until we run out or find
 *	  one that needs additional processing because we were short on memory
 *	  earlier
 *	- process a message that was deferred because of lack of
 *	  memory
 *	- continue processing messages until the queue empties or we
 *	  have to stop because of lack of memory
 *	- during each of the above phases, if the queue is empty and
 *	  there are no pending messages that were passed to the RPC
 *	  layer, send upstream the pending disconnect/ordrel indication if
 *	  there is one
 *
 * The read-side queue is also enabled by a bufcall callback if dupmsg
 * fails in mir_rput.
 */
2196 static void
2197 mir_rsrv(queue_t *q)
2198 {
2199 	mir_t	*mir;
2200 	mblk_t	*mp;
2201 	mblk_t	*cmp = NULL;
2202 	boolean_t stop_timer = B_FALSE;
2203 
2204 	mir = (mir_t *)q->q_ptr;
2205 	mutex_enter(&mir->mir_mutex);
2206 
2207 	mp = NULL;
2208 	switch (mir->mir_type) {
2209 	case RPC_SERVER:
2210 		if (mir->mir_ref_cnt == 0)
2211 			mir->mir_hold_inbound = 0;
2212 		if (mir->mir_hold_inbound) {
2213 
2214 			ASSERT(cmp == NULL);
2215 			if (q->q_first == NULL) {
2216 
2217 				MIR_CLEAR_INRSRV(mir);
2218 
2219 				if (MIR_SVC_QUIESCED(mir)) {
2220 					cmp = mir->mir_svc_pend_mp;
2221 					mir->mir_svc_pend_mp = NULL;
2222 				}
2223 			}
2224 
2225 			mutex_exit(&mir->mir_mutex);
2226 
2227 			if (cmp != NULL) {
2228 				RPCLOG(16, "mir_rsrv: line %d: sending a held "
2229 				    "disconnect/ord rel indication upstream\n",
2230 				    __LINE__);
2231 				putnext(q, cmp);
2232 			}
2233 
2234 			return;
2235 		}
2236 		while (mp = getq(q)) {
2237 			if (mir->mir_krpc_cell &&
2238 			    (mir->mir_svc_no_more_msgs == 0)) {
2239 				/*
2240 				 * If we were idle, turn off idle timer since
2241 				 * we aren't idle any more.
2242 				 */
2243 				if (mir->mir_ref_cnt++ == 0)
2244 					stop_timer = B_TRUE;
2245 				if (mir_check_len(q,
2246 				    (int32_t)msgdsize(mp), mp))
2247 					return;
2248 				svc_queuereq(q, mp);
2249 			} else {
2250 				/*
2251 				 * Count # of times this happens. Should be
2252 				 * never, but experience shows otherwise.
2253 				 */
2254 				if (mir->mir_krpc_cell == NULL)
2255 					mir_krpc_cell_null++;
2256 				freemsg(mp);
2257 			}
2258 		}
2259 		break;
2260 	case RPC_CLIENT:
2261 		break;
2262 	default:
2263 		RPCLOG(1, "mir_rsrv: unexpected mir_type %d\n", mir->mir_type);
2264 
2265 		if (q->q_first == NULL)
2266 			MIR_CLEAR_INRSRV(mir);
2267 
2268 		mutex_exit(&mir->mir_mutex);
2269 
2270 		return;
2271 	}
2272 
2273 	/*
2274 	 * The timer is stopped after all the messages are processed.
2275 	 * The reason is that stopping the timer releases the mir_mutex
2276 	 * lock temporarily.  This means that the request can be serviced
2277 	 * while we are still processing the message queue.  This is not
2278 	 * good.  So we stop the timer here instead.
2279 	 */
2280 	if (stop_timer)  {
2281 		RPCLOG(16, "mir_rsrv stopping idle timer on 0x%p because ref "
2282 		    "cnt going to non zero\n", (void *)WR(q));
2283 		mir_svc_idle_stop(WR(q), mir);
2284 	}
2285 
2286 	if (q->q_first == NULL) {
2287 
2288 		MIR_CLEAR_INRSRV(mir);
2289 
2290 		ASSERT(cmp == NULL);
2291 		if (mir->mir_type == RPC_SERVER && MIR_SVC_QUIESCED(mir)) {
2292 			cmp = mir->mir_svc_pend_mp;
2293 			mir->mir_svc_pend_mp = NULL;
2294 		}
2295 
2296 		mutex_exit(&mir->mir_mutex);
2297 
2298 		if (cmp != NULL) {
2299 			RPCLOG(16, "mir_rsrv: line %d: sending a held "
2300 			    "disconnect/ord rel indication upstream\n",
2301 			    __LINE__);
2302 			putnext(q, cmp);
2303 		}
2304 
2305 		return;
2306 	}
2307 	mutex_exit(&mir->mir_mutex);
2308 }
2309 
2310 static int mir_svc_policy_fails;
2311 
2312 /*
2313  * Called to send an event code to nfsd/lockd so that it initiates
2314  * connection close.
2315  */
2316 static int
2317 mir_svc_policy_notify(queue_t *q, int event)
2318 {
2319 	mblk_t	*mp;
2320 #ifdef DEBUG
2321 	mir_t *mir = (mir_t *)q->q_ptr;
2322 	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
2323 #endif
2324 	ASSERT(q->q_flag & QREADR);
2325 
2326 	/*
2327 	 * Create an M_DATA message with the event code and pass it to the
2328 	 * Stream head (nfsd or whoever created the stream will consume it).
2329 	 */
2330 	mp = allocb(sizeof (int), BPRI_HI);
2331 
2332 	if (!mp) {
2333 
2334 		mir_svc_policy_fails++;
2335 		RPCLOG(16, "mir_svc_policy_notify: could not allocate event "
2336 		    "%d\n", event);
2337 		return (ENOMEM);
2338 	}
2339 
2340 	U32_TO_BE32(event, mp->b_rptr);
2341 	mp->b_wptr = mp->b_rptr + sizeof (int);
2342 	putnext(q, mp);
2343 	return (0);
2344 }
2345 
2346 /*
2347  * Server side: start the close phase. We want to get this rpcmod slot in an
2348  * idle state before mir_close() is called.
2349  */
2350 static void
2351 mir_svc_start_close(queue_t *wq, mir_t *mir)
2352 {
2353 	ASSERT(MUTEX_HELD(&mir->mir_mutex));
2354 	ASSERT((wq->q_flag & QREADR) == 0);
2355 	ASSERT(mir->mir_type == RPC_SERVER);
2356 
2357 
2358 	/*
2359 	 * Do not accept any more messages.
2360 	 */
2361 	mir->mir_svc_no_more_msgs = 1;
2362 
2363 	/*
2364 	 * Next two statements will make the read service procedure invoke
2365 	 * svc_queuereq() on everything stuck in the streams read queue.
2366 	 * It's not necessary because enabling the write queue will
2367 	 * have the same effect, but why not speed the process along?
2368 	 */
2369 	mir->mir_hold_inbound = 0;
2370 	qenable(RD(wq));
2371 
2372 	/*
2373 	 * Meanwhile force the write service procedure to send the
2374 	 * responses downstream, regardless of flow control.
2375 	 */
2376 	qenable(wq);
2377 }
2378 
2379 /*
2380  * This routine is called directly by KRPC after a request is completed,
2381  * whether a reply was sent or the request was dropped.
2382  */
2383 static void
2384 mir_svc_release(queue_t *wq, mblk_t *mp)
2385 {
2386 	mir_t   *mir = (mir_t *)wq->q_ptr;
2387 	mblk_t	*cmp = NULL;
2388 
2389 	ASSERT((wq->q_flag & QREADR) == 0);
2390 	if (mp)
2391 		freemsg(mp);
2392 
2393 	mutex_enter(&mir->mir_mutex);
2394 
2395 	/*
2396 	 * Start idle processing if this is the last reference.
2397 	 */
2398 	if ((mir->mir_ref_cnt == 1) && (mir->mir_inrservice == 0)) {
2399 
2400 		RPCLOG(16, "mir_svc_release starting idle timer on 0x%p "
2401 		    "because ref cnt is zero\n", (void *) wq);
2402 
2403 		cmp = mir->mir_svc_pend_mp;
2404 		mir->mir_svc_pend_mp = NULL;
2405 		mir_svc_idle_start(wq, mir);
2406 	}
2407 
2408 	mir->mir_ref_cnt--;
2409 	ASSERT(mir->mir_ref_cnt >= 0);
2410 
2411 	/*
2412 	 * Wake up the thread waiting to close.
2413 	 */
2414 
2415 	if ((mir->mir_ref_cnt == 0) && mir->mir_closing)
2416 		cv_signal(&mir->mir_condvar);
2417 
2418 	mutex_exit(&mir->mir_mutex);
2419 
2420 	if (cmp) {
2421 		RPCLOG(16, "mir_svc_release: sending a held "
2422 		    "disconnect/ord rel indication upstream on queue 0x%p\n",
2423 		    (void *)RD(wq));
2424 
2425 		putnext(RD(wq), cmp);
2426 	}
2427 }
2428 
2429 /*
2430  * This routine is called by server-side KRPC when it is ready to
2431  * handle inbound messages on the stream.
2432  */
2433 static void
2434 mir_svc_start(queue_t *wq)
2435 {
2436 	mir_t   *mir = (mir_t *)wq->q_ptr;
2437 
2438 	/*
2439 	 * no longer need to take the mir_mutex because the
2440 	 * mir_setup_complete field has been moved out of
2441 	 * the binary field protected by the mir_mutex.
2442 	 */
2443 
2444 	mir->mir_setup_complete = 1;
2445 	qenable(RD(wq));
2446 }
2447 
2448 /*
2449  * client side wrapper for stopping timer with normal idle timeout.
2450  */
2451 static void
2452 mir_clnt_idle_stop(queue_t *wq, mir_t *mir)
2453 {
2454 	ASSERT(MUTEX_HELD(&mir->mir_mutex));
2455 	ASSERT((wq->q_flag & QREADR) == 0);
2456 	ASSERT(mir->mir_type == RPC_CLIENT);
2457 
2458 	mir_timer_stop(mir);
2459 }
2460 
2461 /*
2462  * client side wrapper for stopping timer with normal idle timeout.
2463  */
2464 static void
2465 mir_clnt_idle_start(queue_t *wq, mir_t *mir)
2466 {
2467 	ASSERT(MUTEX_HELD(&mir->mir_mutex));
2468 	ASSERT((wq->q_flag & QREADR) == 0);
2469 	ASSERT(mir->mir_type == RPC_CLIENT);
2470 
2471 	mir_timer_start(wq, mir, mir->mir_idle_timeout);
2472 }
2473 
2474 /*
2475  * client side only. Forces rpcmod to stop sending T_ORDREL_REQs on
2476  * end-points that aren't connected.
2477  */
2478 static void
2479 mir_clnt_idle_do_stop(queue_t *wq)
2480 {
2481 	mir_t   *mir = (mir_t *)wq->q_ptr;
2482 
2483 	RPCLOG(1, "mir_clnt_idle_do_stop: wq 0x%p\n", (void *)wq);
2484 	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
2485 	mutex_enter(&mir->mir_mutex);
2486 	mir_clnt_idle_stop(wq, mir);
2487 	mutex_exit(&mir->mir_mutex);
2488 }
2489 
2490 /*
2491  * Timer handler.  It handles idle timeout and memory shortage problem.
2492  */
2493 static void
2494 mir_timer(void *arg)
2495 {
2496 	queue_t *wq = (queue_t *)arg;
2497 	mir_t *mir = (mir_t *)wq->q_ptr;
2498 	boolean_t notify;
2499 
2500 	mutex_enter(&mir->mir_mutex);
2501 
2502 	/*
2503 	 * mir_timer_call is set only when either mir_timer_[start|stop]
2504 	 * is progressing.  And mir_timer() can only be run while they
2505 	 * are progressing if the timer is being stopped.  So just
2506 	 * return.
2507 	 */
2508 	if (mir->mir_timer_call) {
2509 		mutex_exit(&mir->mir_mutex);
2510 		return;
2511 	}
2512 	mir->mir_timer_id = 0;
2513 
2514 	switch (mir->mir_type) {
2515 	case RPC_CLIENT:
2516 
2517 		/*
2518 		 * For clients, the timer fires at clnt_idle_timeout
2519 		 * intervals.  If the activity marker (mir_clntreq) is
2520 		 * zero, then the stream has been idle since the last
2521 		 * timer event and we notify KRPC.  If mir_clntreq is
2522 		 * non-zero, then the stream is active and we just
2523 		 * restart the timer for another interval.  mir_clntreq
2524 		 * is set to 1 in mir_wput for every request passed
2525 		 * downstream.
2526 		 *
2527 		 * If this was a memory shortage timer reset the idle
2528 		 * timeout regardless; the mir_clntreq will not be a
2529 		 * valid indicator.
2530 		 *
2531 		 * The timer is initially started in mir_wput during
2532 		 * RPC_CLIENT ioctl processing.
2533 		 *
2534 		 * The timer interval can be changed for individual
2535 		 * streams with the ND variable "mir_idle_timeout".
2536 		 */
2537 		if (mir->mir_clntreq > 0 && mir->mir_use_timestamp +
2538 		    MSEC_TO_TICK(mir->mir_idle_timeout) - lbolt >= 0) {
2539 			clock_t tout;
2540 
2541 			tout = mir->mir_idle_timeout -
2542 			    TICK_TO_MSEC(lbolt - mir->mir_use_timestamp);
2543 			if (tout < 0)
2544 				tout = 1000;
2545 #if 0
2546 			printf("mir_timer[%d < %d + %d]: reset client timer "
2547 			    "to %d (ms)\n", TICK_TO_MSEC(lbolt),
2548 			    TICK_TO_MSEC(mir->mir_use_timestamp),
2549 			    mir->mir_idle_timeout, tout);
2550 #endif
2551 			mir->mir_clntreq = 0;
2552 			mir_timer_start(wq, mir, tout);
2553 			mutex_exit(&mir->mir_mutex);
2554 			return;
2555 		}
2556 #if 0
2557 printf("mir_timer[%d]: doing client timeout\n", lbolt / hz);
2558 #endif
2559 		/*
2560 		 * We are disconnecting, but not necessarily
2561 		 * closing. By not closing, we will fail to
2562 		 * pick up a possibly changed global timeout value,
2563 		 * unless we store it now.
2564 		 */
2565 		mir->mir_idle_timeout = clnt_idle_timeout;
2566 		mir_clnt_idle_start(wq, mir);
2567 
2568 		mutex_exit(&mir->mir_mutex);
2569 		/*
2570 		 * We pass T_ORDREL_REQ as an integer value
2571 		 * to KRPC as the indication that the stream
2572 		 * is idle.  This is not a T_ORDREL_REQ message,
2573 		 * it is just a convenient value since we call
2574 		 * the same KRPC routine for T_ORDREL_INDs and
2575 		 * T_DISCON_INDs.
2576 		 */
2577 		clnt_dispatch_notifyall(wq, T_ORDREL_REQ, 0);
2578 		return;
2579 
2580 	case RPC_SERVER:
2581 
2582 		/*
2583 		 * For servers, the timer is only running when the stream
2584 		 * is really idle or memory is short.  The timer is started
2585 		 * by mir_wput when mir_type is set to RPC_SERVER and
2586 		 * by mir_svc_idle_start whenever the stream goes idle
2587 		 * (mir_ref_cnt == 0).  The timer is cancelled in
2588 		 * mir_rput whenever a new inbound request is passed to KRPC
2589 		 * and the stream was previously idle.
2590 		 *
2591 		 * The timer interval can be changed for individual
2592 		 * streams with the ND variable "mir_idle_timeout".
2593 		 *
2594 		 * If the stream is not idle do nothing.
2595 		 */
2596 		if (!MIR_SVC_QUIESCED(mir)) {
2597 			mutex_exit(&mir->mir_mutex);
2598 			return;
2599 		}
2600 
2601 		notify = !mir->mir_inrservice;
2602 		mutex_exit(&mir->mir_mutex);
2603 
2604 		/*
2605 		 * If there is no packet queued up in read queue, the stream
2606 		 * is really idle so notify nfsd to close it.
2607 		 */
2608 		if (notify) {
2609 			RPCLOG(16, "mir_timer: telling stream head listener "
2610 			    "to close stream (0x%p)\n", (void *) RD(wq));
2611 			(void) mir_svc_policy_notify(RD(wq), 1);
2612 		}
2613 		return;
2614 	default:
2615 		RPCLOG(1, "mir_timer: unexpected mir_type %d\n",
2616 		    mir->mir_type);
2617 		mutex_exit(&mir->mir_mutex);
2618 		return;
2619 	}
2620 }
2621 
2622 /*
2623  * Called by the RPC package to send either a call or a return, or a
2624  * transport connection request.  Adds the record marking header.
2625  */
2626 static void
2627 mir_wput(queue_t *q, mblk_t *mp)
2628 {
2629 	uint_t	frag_header;
2630 	mir_t	*mir = (mir_t *)q->q_ptr;
2631 	uchar_t	*rptr = mp->b_rptr;
2632 
2633 	if (!mir) {
2634 		freemsg(mp);
2635 		return;
2636 	}
2637 
2638 	if (mp->b_datap->db_type != M_DATA) {
2639 		mir_wput_other(q, mp);
2640 		return;
2641 	}
2642 
2643 	if (mir->mir_ordrel_pending == 1) {
2644 		freemsg(mp);
2645 		RPCLOG(16, "mir_wput wq 0x%p: got data after T_ORDREL_REQ\n",
2646 		    (void *)q);
2647 		return;
2648 	}
2649 
2650 	frag_header = (uint_t)DLEN(mp);
2651 	frag_header |= MIR_LASTFRAG;
2652 
2653 	/* Stick in the 4 byte record marking header. */
2654 	if ((rptr - mp->b_datap->db_base) < sizeof (uint32_t) ||
2655 	    !IS_P2ALIGNED(mp->b_rptr, sizeof (uint32_t))) {
2656 		/*
2657 		 * Since we know that M_DATA messages are created exclusively
2658 		 * by KRPC, we expect that KRPC will leave room for our header
2659 		 * and 4 byte align which is normal for XDR.
2660 		 * If KRPC (or someone else) does not cooperate, then we
2661 		 * just throw away the message.
2662 		 */
2663 		RPCLOG(1, "mir_wput: KRPC did not leave space for record "
2664 		    "fragment header (%d bytes left)\n",
2665 		    (int)(rptr - mp->b_datap->db_base));
2666 		freemsg(mp);
2667 		return;
2668 	}
2669 	rptr -= sizeof (uint32_t);
2670 	*(uint32_t *)rptr = htonl(frag_header);
2671 	mp->b_rptr = rptr;
2672 
2673 	mutex_enter(&mir->mir_mutex);
2674 	if (mir->mir_type == RPC_CLIENT) {
2675 		/*
2676 		 * For the client, set mir_clntreq to indicate that the
2677 		 * connection is active.
2678 		 */
2679 		mir->mir_clntreq = 1;
2680 		mir->mir_use_timestamp = lbolt;
2681 	}
2682 
2683 	/*
2684 	 * If we haven't already queued some data and the downstream module
2685 	 * can accept more data, send it on, otherwise we queue the message
2686 	 * and take other actions depending on mir_type.
2687 	 */
2688 	if (!mir->mir_inwservice && MIR_WCANPUTNEXT(mir, q)) {
2689 		mutex_exit(&mir->mir_mutex);
2690 
2691 		/*
2692 		 * Now we pass the RPC message downstream.
2693 		 */
2694 		putnext(q, mp);
2695 		return;
2696 	}
2697 
2698 	switch (mir->mir_type) {
2699 	case RPC_CLIENT:
2700 		/*
2701 		 * Check for a previous duplicate request on the
2702 		 * queue.  If there is one, then we throw away
2703 		 * the current message and let the previous one
2704 		 * go through.  If we can't find a duplicate, then
2705 		 * send this one.  This tap dance is an effort
2706 		 * to reduce traffic and processing requirements
2707 		 * under load conditions.
2708 		 */
2709 		if (mir_clnt_dup_request(q, mp)) {
2710 			mutex_exit(&mir->mir_mutex);
2711 			freemsg(mp);
2712 			return;
2713 		}
2714 		break;
2715 	case RPC_SERVER:
2716 		/*
2717 		 * Set mir_hold_inbound so that new inbound RPC
2718 		 * messages will be held until the client catches
2719 		 * up on the earlier replies.  This flag is cleared
2720 		 * in mir_wsrv after flow control is relieved;
2721 		 * the read-side queue is also enabled at that time.
2722 		 */
2723 		mir->mir_hold_inbound = 1;
2724 		break;
2725 	default:
2726 		RPCLOG(1, "mir_wput: unexpected mir_type %d\n", mir->mir_type);
2727 		break;
2728 	}
2729 	mir->mir_inwservice = 1;
2730 	(void) putq(q, mp);
2731 	mutex_exit(&mir->mir_mutex);
2732 }
2733 
/*
 * Write-side handling for every message that is not M_DATA (called from
 * mir_wput(), and asserted to run without mir_mutex held).
 *
 *  - M_IOCTL RPC_CLIENT / RPC_SERVER: fix the stream's mir_type (EPERM
 *    if it was already set to the other type), select the matching
 *    maximum-message-size variable and idle timeout, start the idle
 *    timer and acknowledge the ioctl.  Other ioctls are passed
 *    downstream untouched.
 *  - M_PROTO: T_DATA_REQ is dropped; T_ORDREL_REQ and T_CONN_REQ get the
 *    special treatment described inline; all TPI primitives that are not
 *    consumed then fall through to the generic handling below.
 *  - Anything else: high-priority messages (>= QPCTL) are passed straight
 *    downstream (after a client-side write flush for M_FLUSH/FLUSHW);
 *    ordinary messages are passed downstream only when nothing is waiting
 *    for mir_wsrv and MIR_WCANPUTNEXT allows it, otherwise they are
 *    queued for the write service procedure.
 */
2734 static void
2735 mir_wput_other(queue_t *q, mblk_t *mp)
2736 {
2737 	mir_t	*mir = (mir_t *)q->q_ptr;
2738 	struct iocblk	*iocp;
2739 	uchar_t	*rptr = mp->b_rptr;
	/* Copied into mir_inwflushdata if the message ends up queued. */
2740 	bool_t	flush_in_svc = FALSE;
2741 
2742 	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
2743 	switch (mp->b_datap->db_type) {
2744 	case M_IOCTL:
2745 		iocp = (struct iocblk *)rptr;
2746 		switch (iocp->ioc_cmd) {
2747 		case RPC_CLIENT:
2748 			mutex_enter(&mir->mir_mutex);
2749 			if (mir->mir_type != 0 &&
2750 			    mir->mir_type != iocp->ioc_cmd) {
				/*
				 * Shared error exit: the RPC_SERVER case
				 * jumps here too.  mir_mutex is held on
				 * arrival and released first thing.
				 */
2751 ioc_eperm:
2752 				mutex_exit(&mir->mir_mutex);
2753 				iocp->ioc_error = EPERM;
2754 				iocp->ioc_count = 0;
2755 				mp->b_datap->db_type = M_IOCACK;
2756 				qreply(q, mp);
2757 				return;
2758 			}
2759 
2760 			mir->mir_type = iocp->ioc_cmd;
2761 
2762 			/*
2763 			 * Clear mir_hold_inbound which was set to 1 by
2764 			 * mir_open.  This flag is not used on client
2765 			 * streams.
2766 			 */
2767 			mir->mir_hold_inbound = 0;
2768 			mir->mir_max_msg_sizep = &clnt_max_msg_size;
2769 
2770 			/*
2771 			 * Start the idle timer.  See mir_timer() for more
2772 			 * information on how client timers work.
2773 			 */
2774 			mir->mir_idle_timeout = clnt_idle_timeout;
2775 			mir_clnt_idle_start(q, mir);
2776 			mutex_exit(&mir->mir_mutex);
2777 
2778 			mp->b_datap->db_type = M_IOCACK;
2779 			qreply(q, mp);
2780 			return;
2781 		case RPC_SERVER:
2782 			mutex_enter(&mir->mir_mutex);
2783 			if (mir->mir_type != 0 &&
2784 			    mir->mir_type != iocp->ioc_cmd)
2785 				goto ioc_eperm;
2786 
2787 			/*
2788 			 * We don't clear mir_hold_inbound here because
2789 			 * mir_hold_inbound is used in the flow control
2790 			 * model. If we cleared it here, then we'd commit
2791 			 * a small violation to the model where the transport
2792 			 * might immediately block downstream flow.
2793 			 */
2794 
2795 			mir->mir_type = iocp->ioc_cmd;
2796 			mir->mir_max_msg_sizep = &svc_max_msg_size;
2797 
2798 			/*
2799 			 * Start the idle timer.  See mir_timer() for more
2800 			 * information on how server timers work.
2801 			 *
2802 			 * Note that it is important to start the idle timer
2803 			 * here so that connections time out even if we
2804 			 * never receive any data on them.
2805 			 */
2806 			mir->mir_idle_timeout = svc_idle_timeout;
2807 			RPCLOG(16, "mir_wput_other starting idle timer on 0x%p "
2808 			    "because we got RPC_SERVER ioctl\n", (void *)q);
2809 			mir_svc_idle_start(q, mir);
2810 			mutex_exit(&mir->mir_mutex);
2811 
2812 			mp->b_datap->db_type = M_IOCACK;
2813 			qreply(q, mp);
2814 			return;
2815 		default:
			/* Not ours: passed downstream by the final putnext. */
2816 			break;
2817 		}
2818 		break;
2819 
2820 	case M_PROTO:
2821 		if (mir->mir_type == RPC_CLIENT) {
2822 			/*
2823 			 * We are likely being called from the context of a
2824 			 * service procedure. So we need to enqueue. However
2825 			 * enqueuing may put our message behind data messages.
2826 			 * So flush the data first.
2827 			 */
2828 			flush_in_svc = TRUE;
2829 		}
		/*
		 * Too short or misaligned to hold a TPI primitive type:
		 * leave the outer switch and pass it downstream as is.
		 */
2830 		if ((mp->b_wptr - rptr) < sizeof (uint32_t) ||
2831 		    !IS_P2ALIGNED(rptr, sizeof (uint32_t)))
2832 			break;
2833 
2834 		switch (((union T_primitives *)rptr)->type) {
2835 		case T_DATA_REQ:
2836 			/* Don't pass T_DATA_REQ messages downstream. */
2837 			freemsg(mp);
2838 			return;
2839 		case T_ORDREL_REQ:
2840 			RPCLOG(8, "mir_wput_other wq 0x%p: got T_ORDREL_REQ\n",
2841 			    (void *)q);
2842 			mutex_enter(&mir->mir_mutex);
2843 			if (mir->mir_type != RPC_SERVER) {
2844 				/*
2845 				 * We are likely being called from
2846 				 * clnt_dispatch_notifyall(). Sending
2847 				 * a T_ORDREL_REQ will result in
2848 				 * some kind of _IND message being sent,
2849 				 * which will be another call to
2850 				 * clnt_dispatch_notifyall(). To keep the stack
2851 				 * lean, queue this message.
2852 				 */
2853 				mir->mir_inwservice = 1;
2854 				(void) putq(q, mp);
2855 				mutex_exit(&mir->mir_mutex);
2856 				return;
2857 			}
2858 
2859 			/*
2860 			 * Mark the structure such that we don't accept any
2861 			 * more requests from client. We could defer this
2862 			 * until we actually send the orderly release
2863 			 * request downstream, but all that does is delay
2864 			 * the closing of this stream.
2865 			 */
2866 			RPCLOG(16, "mir_wput_other wq 0x%p: got T_ORDREL_REQ "
2867 			    " so calling mir_svc_start_close\n", (void *)q);
2868 
2869 			mir_svc_start_close(q, mir);
2870 
2871 			/*
2872 			 * If we have sent down a T_ORDREL_REQ, don't send
2873 			 * any more.
2874 			 */
2875 			if (mir->mir_ordrel_pending) {
2876 				freemsg(mp);
2877 				mutex_exit(&mir->mir_mutex);
2878 				return;
2879 			}
2880 
2881 			/*
2882 			 * If the stream is not idle, then we hold the
2883 			 * orderly release until it becomes idle.  This
2884 			 * ensures that KRPC will be able to reply to
2885 			 * all requests that we have passed to it.
2886 			 *
2887 			 * We also queue the request if there is data already
2888 			 * queued, because we cannot allow the T_ORDREL_REQ
2889 			 * to go before data. When we had a separate reply
2890 			 * count, this was not a problem, because the
2891 			 * reply count was reconciled when mir_wsrv()
2892 			 * completed.
2893 			 */
2894 			if (!MIR_SVC_QUIESCED(mir) ||
2895 			    mir->mir_inwservice == 1) {
2896 				mir->mir_inwservice = 1;
2897 				(void) putq(q, mp);
2898 
2899 				RPCLOG(16, "mir_wput_other: queuing "
2900 				    "T_ORDREL_REQ on 0x%p\n", (void *)q);
2901 
2902 				mutex_exit(&mir->mir_mutex);
2903 				return;
2904 			}
2905 
2906 			/*
2907 			 * Mark the structure so that we know we sent
2908 			 * an orderly release request, and reset the idle timer.
2909 			 */
2910 			mir->mir_ordrel_pending = 1;
2911 
2912 			RPCLOG(16, "mir_wput_other: calling mir_svc_idle_start"
2913 			    " on 0x%p because we got T_ORDREL_REQ\n",
2914 			    (void *)q);
2915 
2916 			mir_svc_idle_start(q, mir);
2917 			mutex_exit(&mir->mir_mutex);
2918 
2919 			/*
2920 			 * When we break, we will putnext the T_ORDREL_REQ.
2921 			 */
2922 			break;
2923 
2924 		case T_CONN_REQ:
2925 			mutex_enter(&mir->mir_mutex);
			/* Discard any partially assembled inbound record. */
2926 			if (mir->mir_head_mp != NULL) {
2927 				freemsg(mir->mir_head_mp);
2928 				mir->mir_head_mp = NULL;
2929 				mir->mir_tail_mp = NULL;
2930 			}
2931 			mir->mir_frag_len = -(int32_t)sizeof (uint32_t);
2932 			/*
2933 			 * Restart timer in case mir_clnt_idle_do_stop() was
2934 			 * called.
2935 			 */
2936 			mir->mir_idle_timeout = clnt_idle_timeout;
2937 			mir_clnt_idle_stop(q, mir);
2938 			mir_clnt_idle_start(q, mir);
2939 			mutex_exit(&mir->mir_mutex);
2940 			break;
2941 
2942 		default:
2943 			/*
2944 			 * T_DISCON_REQ is one of the interesting default
2945 			 * cases here. Ideally, an M_FLUSH is done before
2946 			 * T_DISCON_REQ is done. However, that is somewhat
2947 			 * cumbersome for clnt_cots.c to do. So we queue
2948 			 * T_DISCON_REQ, and let the service procedure
2949 			 * flush all M_DATA.
2950 			 */
2951 			break;
2952 		}
		/*
		 * M_PROTO messages that were not consumed above continue
		 * into the generic handling in the default case.
		 */
2953 		/* fallthru */;
2954 	default:
		/*
		 * High-priority messages are never queued here.  For a
		 * write-side M_FLUSH on a client stream the data queued on
		 * this write queue is flushed first; the message itself is
		 * then passed downstream by the final putnext.
		 */
2955 		if (mp->b_datap->db_type >= QPCTL) {
2956 			if (mp->b_datap->db_type == M_FLUSH) {
2957 				if (mir->mir_type == RPC_CLIENT &&
2958 				    *mp->b_rptr & FLUSHW) {
2959 					RPCLOG(32, "mir_wput_other: flushing "
2960 					    "wq 0x%p\n", (void *)q);
2961 					if (*mp->b_rptr & FLUSHBAND) {
2962 						flushband(q, *(mp->b_rptr + 1),
2963 						    FLUSHDATA);
2964 					} else {
2965 						flushq(q, FLUSHDATA);
2966 					}
2967 				} else {
2968 					RPCLOG(32, "mir_wput_other: ignoring "
2969 					    "M_FLUSH on wq 0x%p\n", (void *)q);
2970 				}
2971 			}
2972 			break;
2973 		}
2974 
		/*
		 * Ordinary message: send it on directly only if nothing is
		 * already waiting for the write service procedure and the
		 * flow-control check passes; otherwise queue it behind what
		 * is already there and schedule mir_wsrv.
		 */
2975 		mutex_enter(&mir->mir_mutex);
2976 		if (mir->mir_inwservice == 0 && MIR_WCANPUTNEXT(mir, q)) {
2977 			mutex_exit(&mir->mir_mutex);
2978 			break;
2979 		}
2980 		mir->mir_inwservice = 1;
		/* Tells mir_wsrv whether to discard queued M_DATA. */
2981 		mir->mir_inwflushdata = flush_in_svc;
2982 		(void) putq(q, mp);
2983 		mutex_exit(&mir->mir_mutex);
2984 		qenable(q);
2985 
2986 		return;
2987 	}
	/* Every path that breaks out of the switch passes mp downstream. */
2988 	putnext(q, mp);
2989 }
2990 
/*
 * Write-side service procedure: drains the messages that mir_wput() and
 * mir_wput_other() queued.
 *
 *  - M_DATA is discarded if mir_inwflushdata was set when this routine
 *    started or if an orderly release has already been sent; otherwise
 *    it is passed downstream when MIR_WCANPUTNEXT allows, or put back
 *    on the queue (ending this run) when it does not.
 *  - Other messages are passed downstream without a flow-control check,
 *    except that a server-side T_ORDREL_REQ is sent at most once.
 *
 * mir_mutex is held while the mir state is examined and is dropped
 * around every putnext().  After the loop, server streams may start the
 * idle timer and re-enable the read side that mir_wput() put on hold.
 */
2991 static void
2992 mir_wsrv(queue_t *q)
2993 {
2994 	mblk_t	*mp;
2995 	mir_t	*mir;
2996 	bool_t flushdata;
2997 
2998 	mir = (mir_t *)q->q_ptr;
2999 	mutex_enter(&mir->mir_mutex);
3000 
	/*
	 * Take a private copy of the flush request and clear it, so the
	 * request applies to this run of the service procedure only.
	 */
3001 	flushdata = mir->mir_inwflushdata;
3002 	mir->mir_inwflushdata = 0;
3003 
3004 	while (mp = getq(q)) {
3005 		if (mp->b_datap->db_type == M_DATA) {
3006 			/*
3007 			 * Do not send any more data if we have sent
3008 			 * a T_ORDREL_REQ.
3009 			 */
3010 			if (flushdata || mir->mir_ordrel_pending == 1) {
3011 				freemsg(mp);
3012 				continue;
3013 			}
3014 
3015 			/*
3016 			 * Make sure that the stream can really handle more
3017 			 * data.
3018 			 */
3019 			if (!MIR_WCANPUTNEXT(mir, q)) {
				/*
				 * Put the message back at the head of the
				 * queue; mir_inwservice is left set.
				 */
3020 				(void) putbq(q, mp);
3021 				mutex_exit(&mir->mir_mutex);
3022 				return;
3023 			}
3024 
3025 			/*
3026 			 * Now we pass the RPC message downstream.
3027 			 */
3028 			mutex_exit(&mir->mir_mutex);
3029 			putnext(q, mp);
3030 			mutex_enter(&mir->mir_mutex);
3031 			continue;
3032 		}
3033 
3034 		/*
3035 		 * This is not an RPC message, pass it downstream
3036 		 * (ignoring flow control) if the server side is not sending a
3037 		 * T_ORDREL_REQ downstream.
3038 		 */
		/*
		 * NOTE(review): b_rptr is read as a TPI primitive without
		 * checking the message type or length here; presumably only
		 * messages vetted by mir_wput_other() reach this queue --
		 * confirm.
		 */
3039 		if (mir->mir_type != RPC_SERVER ||
3040 		    ((union T_primitives *)mp->b_rptr)->type !=
3041 		    T_ORDREL_REQ) {
3042 			mutex_exit(&mir->mir_mutex);
3043 			putnext(q, mp);
3044 			mutex_enter(&mir->mir_mutex);
3045 			continue;
3046 		}
3047 
3048 		if (mir->mir_ordrel_pending == 1) {
3049 			/*
3050 			 * Don't send two T_ORDRELs
3051 			 */
3052 			freemsg(mp);
3053 			continue;
3054 		}
3055 
3056 		/*
3057 		 * Mark the structure so that we know we sent an orderly
3058 		 * release request.  We will check to see slot is idle at the
3059 		 * end of this routine, and if so, reset the idle timer to
3060 		 * handle orderly release timeouts.
3061 		 */
3062 		mir->mir_ordrel_pending = 1;
3063 		RPCLOG(16, "mir_wsrv: sending ordrel req on q 0x%p\n",
3064 		    (void *)q);
3065 		/*
3066 		 * Send the orderly release downstream. If there are other
3067 		 * pending replies we won't be able to send them.  However,
3068 		 * the only reason we should send the orderly release is if
3069 		 * we were idle, or if an unusual event occurred.
3070 		 */
3071 		mutex_exit(&mir->mir_mutex);
3072 		putnext(q, mp);
3073 		mutex_enter(&mir->mir_mutex);
3074 	}
3075 
	/*
	 * The mutex was dropped around putnext(), so new messages may have
	 * been queued meanwhile; only clear the flag if the queue is empty.
	 */
3076 	if (q->q_first == NULL)
3077 		/*
3078 		 * If we call mir_svc_idle_start() below, then
3079 		 * clearing mir_inwservice here will also result in
3080 		 * any thread waiting in mir_close() to be signaled.
3081 		 */
3082 		mir->mir_inwservice = 0;
3083 
	/* The remaining steps only apply to server streams. */
3084 	if (mir->mir_type != RPC_SERVER) {
3085 		mutex_exit(&mir->mir_mutex);
3086 		return;
3087 	}
3088 
3089 	/*
3090 	 * If idle we call mir_svc_idle_start to start the timer (or wakeup
3091 	 * a close). Also make sure not to start the idle timer on the
3092 	 * listener stream. This can cause nfsd to send an orderly release
3093 	 * command on the listener stream.
3094 	 */
3095 	if (MIR_SVC_QUIESCED(mir) && !(mir->mir_listen_stream)) {
3096 		RPCLOG(16, "mir_wsrv: calling mir_svc_idle_start on 0x%p "
3097 		    "because mir slot is idle\n", (void *)q);
3098 		mir_svc_idle_start(q, mir);
3099 	}
3100 
3101 	/*
3102 	 * If outbound flow control has been relieved, then allow new
3103 	 * inbound requests to be processed.
3104 	 */
3105 	if (mir->mir_hold_inbound) {
3106 		mir->mir_hold_inbound = 0;
3107 		qenable(RD(q));
3108 	}
3109 	mutex_exit(&mir->mir_mutex);
3110 }
3111 
3112 static void
3113 mir_disconnect(queue_t *q, mir_t *mir)
3114 {
3115 	ASSERT(MUTEX_HELD(&mir->mir_mutex));
3116 
3117 	switch (mir->mir_type) {
3118 	case RPC_CLIENT:
3119 		/*
3120 		 * We are disconnecting, but not necessarily
3121 		 * closing. By not closing, we will fail to
3122 		 * pick up a possibly changed global timeout value,
3123 		 * unless we store it now.
3124 		 */
3125 		mir->mir_idle_timeout = clnt_idle_timeout;
3126 		mir_clnt_idle_start(WR(q), mir);
3127 		mutex_exit(&mir->mir_mutex);
3128 
3129 		/*
3130 		 * T_DISCON_REQ is passed to KRPC as an integer value
3131 		 * (this is not a TPI message).  It is used as a
3132 		 * convenient value to indicate a sanity check
3133 		 * failure -- the same KRPC routine is also called
3134 		 * for T_DISCON_INDs and T_ORDREL_INDs.
3135 		 */
3136 		clnt_dispatch_notifyall(WR(q), T_DISCON_REQ, 0);
3137 		break;
3138 
3139 	case RPC_SERVER:
3140 		mir->mir_svc_no_more_msgs = 1;
3141 		mir_svc_idle_stop(WR(q), mir);
3142 		mutex_exit(&mir->mir_mutex);
3143 		RPCLOG(16, "mir_disconnect: telling "
3144 		    "stream head listener to disconnect stream "
3145 		    "(0x%p)\n", (void *) q);
3146 		(void) mir_svc_policy_notify(q, 2);
3147 		break;
3148 
3149 	default:
3150 		mutex_exit(&mir->mir_mutex);
3151 		break;
3152 	}
3153 }
3154 
3155 /*
3156  * do a sanity check on the length of the fragment.
3157  * returns 1 if bad else 0.
3158  */
3159 static int
3160 mir_check_len(queue_t *q, int32_t frag_len,
3161     mblk_t *head_mp)
3162 {
3163 	mir_t   *mir;
3164 
3165 	mir = (mir_t *)q->q_ptr;
3166 
3167 	/*
3168 	 * Do a sanity check on the message length.  If this message is
3169 	 * getting excessively large, shut down the connection.
3170 	 */
3171 
3172 	if ((frag_len <= 0) || (mir->mir_max_msg_sizep == NULL) ||
3173 	    (frag_len <= *mir->mir_max_msg_sizep)) {
3174 		return (0);
3175 	}
3176 
3177 	freemsg(head_mp);
3178 	mir->mir_head_mp = (mblk_t *)0;
3179 	mir->mir_frag_len = -(int)sizeof (uint32_t);
3180 	if (mir->mir_type != RPC_SERVER || mir->mir_setup_complete) {
3181 		cmn_err(CE_NOTE,
3182 		    "KRPC: record fragment from %s of size(%d) exceeds "
3183 		    "maximum (%u). Disconnecting",
3184 		    (mir->mir_type == RPC_CLIENT) ? "server" :
3185 		    (mir->mir_type == RPC_SERVER) ? "client" :
3186 		    "test tool",
3187 		    frag_len, *mir->mir_max_msg_sizep);
3188 	}
3189 
3190 	mir_disconnect(q, mir);
3191 	return (1);
3192 }
3193