xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rds/rdsddi.c (revision 9b9d39d2a32ff806d2431dbcc50968ef1e6d46b2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2018, Joyent, Inc.
28  */
29 
30 #include <sys/types.h>
31 #include <sys/conf.h>
32 #include <sys/modctl.h>
33 #include <sys/stat.h>
34 #include <sys/stream.h>
35 #include <sys/strsun.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/priv_names.h>
39 #include <inet/common.h>
40 
41 #define	_SUN_TPI_VERSION 2
42 #include <sys/tihdr.h>
43 #include <sys/timod.h>
44 #include <sys/tiuser.h>
45 #include <sys/suntpi.h>
46 #include <inet/common.h>
47 #include <inet/ip.h>
48 #include <inet/mi.h>
49 #include <inet/proto_set.h>
50 #include <sys/ib/clients/rds/rds.h>
51 #include <sys/policy.h>
52 #include <inet/ipclassifier.h>
53 #include <sys/ib/clients/rds/rds_kstat.h>
54 #include "sys/random.h"
55 #include <sys/ib/clients/rds/rds_transport.h>
56 #include <sys/ib/ibtl/ibti.h>
57 
58 
/* Driver identity used for the minor node and the module description. */
#define	RDS_NAME	"rds"
#define	RDS_STRTAB	rdsinfo
#define	RDS_DEVDESC	"RDS STREAMS driver"
#define	RDS_DEVMINOR	0
/*
 * Parenthesized so the flag set remains a single operand wherever the
 * macro is expanded (for example next to '&', '==' or '~').
 */
#define	RDS_DEVMTFLAGS	(D_MP | D_SYNCSTR)
#define	RDS_DEFAULT_PRIV_MODE	0666

/* Inclusive range of port numbers that rds_bind() may hand out. */
#define	rds_smallest_port	1
#define	rds_largest_port	65535

/* Default queue watermarks, in bytes, applied in rds_open(). */
#define	RDS_RECV_HIWATER	(56 * 1024)
#define	RDS_RECV_LOWATER	128
#define	RDS_XMIT_HIWATER	(56 * 1024)
#define	RDS_XMIT_LOWATER	1024

/* Prefix that short-circuits whatever expression follows it ("0 && ..."). */
#define	RDS_DPRINTF2	0 &&
/* Tag prepended to this file's debug messages. */
#define	LABEL	"RDS"
76 
/*
 * Pair of port numbers (source, destination).
 * NOTE(review): not referenced anywhere else in this file; presumably it
 * mirrors an on-the-wire port header - confirm against the transport.
 */
typedef struct rdsahdr_s {
	in_port_t	uha_src_port;	/* Source port */
	in_port_t	uha_dst_port;	/* Destination port */
} rdsha_t;

/* Equals the combined size of the two in_port_t fields above. */
#define	RDSH_SIZE	4
83 
/*
 * Watermarks copied into the read/write queues by rds_open().  They are
 * plain globals initialized from the RDS_*_HIWATER/LOWATER defaults.
 */
int rds_recv_hiwat = RDS_RECV_HIWATER;
int rds_recv_lowat = RDS_RECV_LOWATER;
int rds_xmit_hiwat = RDS_XMIT_HIWATER;
int rds_xmit_lowat = RDS_XMIT_LOWATER;

/* Not read in this file; presumably the debug level used by dprint(). */
int rdsdebug;

/* dev_info node recorded by rds_attach() and reported by rds_info(). */
static dev_info_t *rds_dev_info;

/* Hint not protected by any lock */
static	in_port_t	rds_next_port_to_try;

/* LDI identity obtained in _init(); NULL when that lookup failed. */
ldi_ident_t rds_li;
/* Upper bound on the number of ports rds_bind() probes before giving up. */
static int loopmax = rds_largest_port - rds_smallest_port + 1;
98 
99 /* global configuration variables */
100 uint_t  UserBufferSize;
101 uint_t  rds_rx_pkts_pending_hwm;
102 
103 extern void rds_ioctl(queue_t *, mblk_t *);
104 extern void rds_ioctl_copyin_done(queue_t *q, mblk_t *mp);
105 
106 int rds_open_transport_driver();
107 int rds_close_transport_driver();
108 
109 #define	RDS_CURRENT_PORT_QUOTA()					\
110 	(rds_rx_pkts_pending_hwm/RDS_GET_NPORT())
111 
112 krwlock_t	rds_transport_lock;
113 ldi_handle_t	rds_transport_handle = NULL;
114 rds_transport_ops_t *rds_transport_ops = NULL;
115 
116 static int
117 rds_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
118 {
119 	int	ret;
120 
121 	if (cmd != DDI_ATTACH)
122 		return (DDI_FAILURE);
123 
124 	rds_dev_info = devi;
125 
126 	ret = ddi_create_minor_node(devi, RDS_NAME, S_IFCHR,
127 	    RDS_DEVMINOR, DDI_PSEUDO, 0);
128 	if (ret != DDI_SUCCESS) {
129 		return (ret);
130 	}
131 
132 	return (DDI_SUCCESS);
133 }
134 
135 static int
136 rds_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
137 {
138 	if (cmd != DDI_DETACH)
139 		return (DDI_FAILURE);
140 
141 	ASSERT(devi == rds_dev_info);
142 
143 	ddi_remove_minor_node(devi, NULL);
144 
145 	return (DDI_SUCCESS);
146 }
147 
148 /* ARGSUSED */
149 static int
150 rds_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
151 {
152 	int error = DDI_FAILURE;
153 
154 	switch (cmd) {
155 	case DDI_INFO_DEVT2DEVINFO:
156 		if (rds_dev_info != NULL) {
157 			*result = (void *)rds_dev_info;
158 			error = DDI_SUCCESS;
159 		}
160 		break;
161 
162 	case DDI_INFO_DEVT2INSTANCE:
163 		*result = NULL;
164 		error = DDI_SUCCESS;
165 		break;
166 
167 	default:
168 		break;
169 	}
170 
171 	return (error);
172 }
173 
174 
175 /*ARGSUSED*/
176 static int
177 rds_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
178 {
179 	rds_t	*rds;
180 	int	ret;
181 
182 	if (is_system_labeled()) {
183 		/*
184 		 * RDS socket is not supported on labeled systems
185 		 */
186 		return (ESOCKTNOSUPPORT);
187 	}
188 
189 	/* Open the transport driver if IB HW is present */
190 	rw_enter(&rds_transport_lock, RW_READER);
191 	if (rds_transport_handle == NULL) {
192 		rw_exit(&rds_transport_lock);
193 		ret = rds_open_transport_driver();
194 		rw_enter(&rds_transport_lock, RW_READER);
195 
196 		if (ret != 0) {
197 			/* Transport driver failed to load */
198 			rw_exit(&rds_transport_lock);
199 			return (ret);
200 		}
201 	}
202 	rw_exit(&rds_transport_lock);
203 
204 	if (sflag == MODOPEN) {
205 		return (EINVAL);
206 	}
207 
208 	/* Reopen not supported */
209 	if (q->q_ptr != NULL) {
210 		dprint(2, ("%s: Reopen is not supported: %p", LABEL, q->q_ptr));
211 		return (0);
212 	}
213 
214 	rds = rds_create(q, credp);
215 	if (rds == NULL) {
216 		dprint(2, ("%s: rds_create failed", LABEL));
217 		return (0);
218 	}
219 
220 	q->q_ptr = WR(q)->q_ptr = rds;
221 	rds->rds_state = TS_UNBND;
222 	rds->rds_family = AF_INET_OFFLOAD;
223 
224 	q->q_hiwat = rds_recv_hiwat;
225 	q->q_lowat = rds_recv_lowat;
226 
227 	qprocson(q);
228 
229 	WR(q)->q_hiwat = rds_xmit_hiwat;
230 	WR(q)->q_lowat = rds_xmit_lowat;
231 
232 	/* Set the Stream head watermarks */
233 	(void) proto_set_rx_hiwat(q, NULL, rds_recv_hiwat);
234 	(void) proto_set_rx_lowat(q, NULL, rds_recv_lowat);
235 
236 	return (0);
237 }
238 
239 /* ARGSUSED */
240 static int
241 rds_close(queue_t *q, int flags __unused, cred_t *credp __unused)
242 {
243 	rds_t *rdsp = (rds_t *)q->q_ptr;
244 
245 	qprocsoff(q);
246 
247 	/*
248 	 * NPORT should be decremented only if this socket was previously
249 	 * bound to an RDS port.
250 	 */
251 	if (rdsp->rds_state >= TS_IDLE) {
252 		RDS_DECR_NPORT();
253 		RDS_SET_PORT_QUOTA(RDS_CURRENT_PORT_QUOTA());
254 		rds_transport_ops->
255 		    rds_transport_resume_port(ntohs(rdsp->rds_port));
256 	}
257 
258 	/* close the transport driver if this is the last socket */
259 	if (RDS_GET_NPORT() == 1) {
260 		(void) rds_close_transport_driver();
261 	}
262 
263 	/*
264 	 * We set the flags without holding a lock as this is
265 	 * just a hint for the fanout lookup to skip this rds.
266 	 * We dont free the struct until it's out of the hash and
267 	 * the ref count goes down.
268 	 */
269 	rdsp->rds_flags |= RDS_CLOSING;
270 	rds_bind_hash_remove(rdsp, B_FALSE);
271 	mutex_enter(&rdsp->rds_lock);
272 	ASSERT(rdsp->rds_refcnt > 0);
273 	if (rdsp->rds_refcnt != 1) {
274 		cv_wait(&rdsp->rds_refcv, &rdsp->rds_lock);
275 	}
276 	mutex_exit(&rdsp->rds_lock);
277 	RDS_DEC_REF_CNT(rdsp);
278 	RD(q)->q_ptr = NULL;
279 	WR(q)->q_ptr = NULL;
280 	return (0);
281 }
282 
283 /*
284  * Add a new message to the socket
285  */
286 int
287 rds_deliver_new_msg(mblk_t *mp, ipaddr_t local_addr, ipaddr_t rem_addr,
288     in_port_t local_port, in_port_t rem_port, zoneid_t zoneid)
289 {
290 	rds_t *rds;
291 	struct  T_unitdata_ind  *tudi;
292 	int	udi_size;	/* Size of T_unitdata_ind */
293 	mblk_t *mp1;
294 	sin_t	*sin;
295 	int error = 0;
296 
297 	local_port = htons(local_port);
298 	rem_port = htons(rem_port);
299 
300 	ASSERT(mp->b_datap->db_type == M_DATA);
301 	rds = rds_fanout(local_addr, rem_addr, local_port, rem_port, zoneid);
302 	if (rds == NULL) {
303 		dprint(2, ("%s: rds_fanout failed: (0x%x 0x%x %d %d)", LABEL,
304 		    local_addr, rem_addr, ntohs(local_port), ntohs(rem_port)));
305 		freemsg(mp);
306 		return (error);
307 	}
308 
309 	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
310 
311 	/* Allocate a message block for the T_UNITDATA_IND structure. */
312 	mp1 = allocb(udi_size, BPRI_MED);
313 	if (mp1 == NULL) {
314 		dprint(2, ("%s: allocb failed", LABEL));
315 		freemsg(mp);
316 		return (ENOMEM);
317 	}
318 
319 	mp1->b_cont = mp;
320 	mp = mp1;
321 	mp->b_datap->db_type = M_PROTO;
322 	tudi = (struct T_unitdata_ind *)(uintptr_t)mp->b_rptr;
323 	mp->b_wptr = (uchar_t *)tudi + udi_size;
324 	tudi->PRIM_type = T_UNITDATA_IND;
325 	tudi->SRC_length = sizeof (sin_t);
326 	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
327 	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
328 	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
329 	tudi->OPT_length = udi_size;
330 	sin = (sin_t *)&tudi[1];
331 	sin->sin_addr.s_addr = rem_addr;
332 	sin->sin_port = ntohs(rem_port);
333 	sin->sin_family = rds->rds_family;
334 	*(uint32_t *)(uintptr_t)&sin->sin_zero[0] = 0;
335 	*(uint32_t *)(uintptr_t)&sin->sin_zero[4] = 0;
336 
337 	putnext(rds->rds_ulpd, mp);
338 
339 	/* check port quota */
340 	if (RDS_GET_RXPKTS_PEND() > rds_rx_pkts_pending_hwm) {
341 		ulong_t current_port_quota = RDS_GET_PORT_QUOTA();
342 		if (rds->rds_port_quota > current_port_quota) {
343 			/* this may result in stalling the port */
344 			rds->rds_port_quota = current_port_quota;
345 			(void) proto_set_rx_hiwat(rds->rds_ulpd, NULL,
346 			    rds->rds_port_quota * UserBufferSize);
347 			RDS_INCR_PORT_QUOTA_ADJUSTED();
348 		}
349 	}
350 
351 	/*
352 	 * canputnext() check is done after putnext as the protocol does
353 	 * not allow dropping any received packet.
354 	 */
355 	if (!canputnext(rds->rds_ulpd)) {
356 		error = ENOSPC;
357 	}
358 
359 	RDS_DEC_REF_CNT(rds);
360 	return (error);
361 }
362 
363 
/*
 * Default structure copied into T_INFO_ACK messages.  rds_info_req()
 * overwrites CURRENT_state and OPT_size in its copy; rds_capability_req()
 * copies the template unchanged.
 */
static struct T_info_ack rds_g_t_info_ack_ipv4 = {
	T_INFO_ACK,
	65535,	/* TSDU_size. Excl. headers */
	T_INVALID,	/* ETSDU_size.  rds does not support expedited data. */
	T_INVALID,	/* CDATA_size. rds does not support connect data. */
	T_INVALID,	/* DDATA_size. rds does not support disconnect data. */
	sizeof (sin_t),	/* ADDR_size. */
	0,		/* OPT_size - not initialized here */
	65535,		/* TIDU_size.  Excl. headers */
	T_CLTS,		/* SERV_type.  rds supports connection-less. */
	TS_UNBND,	/* CURRENT_state.  This is set from rds_state. */
	(XPG4_1|SENDZERO) /* PROVIDER_flag */
};
378 
379 static in_port_t
380 rds_update_next_port(in_port_t port)
381 {
382 	(void) random_get_pseudo_bytes((uint8_t *)&port, sizeof (in_port_t));
383 	if (port < rds_smallest_port)
384 		port = rds_smallest_port;
385 	return (port);
386 }
387 
388 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
389 static void
390 rds_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
391 {
392 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
393 		qreply(q, mp);
394 }
395 
396 static void
397 rds_capability_req(queue_t *q, mblk_t *mp)
398 {
399 	t_uscalar_t	cap_bits1;
400 	struct T_capability_ack *tcap;
401 
402 	cap_bits1 =
403 	    ((struct T_capability_req *)(uintptr_t)mp->b_rptr)->CAP_bits1;
404 
405 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
406 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
407 	if (mp == NULL)
408 		return;
409 	tcap = (struct T_capability_ack *)(uintptr_t)mp->b_rptr;
410 	tcap->CAP_bits1 = 0;
411 
412 	if (cap_bits1 & TC1_INFO) {
413 		tcap->CAP_bits1 |= TC1_INFO;
414 		*(&tcap->INFO_ack) = rds_g_t_info_ack_ipv4;
415 	}
416 
417 	qreply(q, mp);
418 }
419 
420 static void
421 rds_info_req(queue_t *q, mblk_t *omp)
422 {
423 	rds_t *rds = (rds_t *)q->q_ptr;
424 	struct T_info_ack *tap;
425 	mblk_t *mp;
426 
427 	/* Create a T_INFO_ACK message. */
428 	mp = tpi_ack_alloc(omp, sizeof (struct T_info_ack), M_PCPROTO,
429 	    T_INFO_ACK);
430 	if (mp == NULL)
431 		return;
432 	tap = (struct T_info_ack *)(uintptr_t)mp->b_rptr;
433 	*tap = rds_g_t_info_ack_ipv4;
434 	tap->CURRENT_state = rds->rds_state;
435 	tap->OPT_size = 128;
436 	qreply(q, mp);
437 }
438 
439 /*
440  * NO locking protection here as sockfs will only send down
441  * one bind operation at a time.
442  */
443 static void
444 rds_bind(queue_t *q, mblk_t *mp)
445 {
446 	sin_t		*sin;
447 	rds_t *rds;
448 	struct T_bind_req *tbr;
449 	in_port_t	port;	/* Host byte order */
450 	in_port_t	requested_port; /* Host byte order */
451 	struct T_bind_ack *tba;
452 	int		count;
453 	rds_bf_t	*rdsbf;
454 	in_port_t	lport;	/* Network byte order */
455 
456 	rds = (rds_t *)q->q_ptr;
457 	if (((uintptr_t)mp->b_wptr - (uintptr_t)mp->b_rptr) < sizeof (*tbr)) {
458 		rds_err_ack(q, mp, TPROTO, 0);
459 		return;
460 	}
461 
462 	/*
463 	 * We don't allow multiple binds
464 	 */
465 	if (rds->rds_state != TS_UNBND) {
466 		rds_err_ack(q, mp, TOUTSTATE, 0);
467 		return;
468 	}
469 
470 	tbr = (struct T_bind_req *)(uintptr_t)mp->b_rptr;
471 	switch (tbr->ADDR_length) {
472 	case sizeof (sin_t):    /* Complete IPv4 address */
473 		sin = (sin_t *)(uintptr_t)mi_offset_param(mp, tbr->ADDR_offset,
474 		    sizeof (sin_t));
475 		if (sin == NULL || !OK_32PTR((char *)sin)) {
476 			rds_err_ack(q, mp, TSYSERR, EINVAL);
477 			return;
478 		}
479 		if (rds->rds_family != AF_INET_OFFLOAD ||
480 		    sin->sin_family != AF_INET_OFFLOAD) {
481 			rds_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
482 			return;
483 		}
484 		if (sin->sin_addr.s_addr == INADDR_ANY) {
485 			rds_err_ack(q, mp, TBADADDR, 0);
486 			return;
487 		}
488 
489 		/*
490 		 * verify that the address is hosted on IB
491 		 * only exception is the loopback address.
492 		 */
493 		if ((sin->sin_addr.s_addr != INADDR_LOOPBACK) &&
494 		    !rds_verify_bind_address(sin->sin_addr.s_addr)) {
495 			rds_err_ack(q, mp, TBADADDR, 0);
496 			return;
497 		}
498 
499 		port = ntohs(sin->sin_port);
500 		break;
501 	default:	/* Invalid request */
502 		rds_err_ack(q, mp, TBADADDR, 0);
503 		return;
504 	}
505 
506 	requested_port = port;
507 
508 	/*
509 	 * TPI only sends down T_BIND_REQ for AF_INET and AF_INET6
510 	 * since RDS socket is of type AF_INET_OFFLOAD a O_T_BIND_REQ
511 	 * will be sent down. Treat O_T_BIND_REQ as T_BIND_REQ
512 	 */
513 
514 	if (requested_port == 0) {
515 		/*
516 		 * If the application passed in zero for the port number, it
517 		 * doesn't care which port number we bind to. Get one in the
518 		 * valid range.
519 		 */
520 		port = rds_update_next_port(rds_next_port_to_try);
521 	}
522 
523 	ASSERT(port != 0);
524 	count = 0;
525 	for (;;) {
526 		rds_t		*rds1;
527 		ASSERT(sin->sin_addr.s_addr != INADDR_ANY);
528 		/*
529 		 * Walk through the list of rds streams bound to
530 		 * requested port with the same IP address.
531 		 */
532 		lport = htons(port);
533 		rdsbf = &rds_bind_fanout[RDS_BIND_HASH(lport)];
534 		mutex_enter(&rdsbf->rds_bf_lock);
535 		for (rds1 = rdsbf->rds_bf_rds; rds1 != NULL;
536 		    rds1 = rds1->rds_bind_hash) {
537 			if (lport != rds1->rds_port ||
538 			    rds1->rds_src != sin->sin_addr.s_addr ||
539 			    rds1->rds_zoneid != rds->rds_zoneid)
540 
541 				continue;
542 			break;
543 		}
544 
545 		if (rds1 == NULL) {
546 			/*
547 			 * No other stream has this IP address
548 			 * and port number. We can use it.
549 			 */
550 			break;
551 		}
552 		mutex_exit(&rdsbf->rds_bf_lock);
553 		if (requested_port != 0) {
554 			/*
555 			 * We get here only when requested port
556 			 * is bound (and only first  of the for()
557 			 * loop iteration).
558 			 *
559 			 * The semantics of this bind request
560 			 * require it to fail so we return from
561 			 * the routine (and exit the loop).
562 			 *
563 			 */
564 			rds_err_ack(q, mp, TADDRBUSY, 0);
565 			return;
566 		}
567 
568 		port = rds_update_next_port(port + 1);
569 
570 		if (++count >= loopmax) {
571 			/*
572 			 * We've tried every possible port number and
573 			 * there are none available, so send an error
574 			 * to the user.
575 			 */
576 			rds_err_ack(q, mp, TNOADDR, 0);
577 			return;
578 		}
579 	}
580 
581 	/*
582 	 * Copy the source address into our rds structure.
583 	 */
584 	rds->rds_src = sin->sin_addr.s_addr;
585 	rds->rds_port = lport;
586 
587 	/*
588 	 * reset the next port if we choose the port
589 	 */
590 	if (requested_port == 0) {
591 		rds_next_port_to_try = port + 1;
592 	}
593 
594 	rds->rds_state = TS_IDLE;
595 	rds_bind_hash_insert(rdsbf, rds);
596 	mutex_exit(&rdsbf->rds_bf_lock);
597 
598 	/* Reset the message type in preparation for shipping it back. */
599 	mp->b_datap->db_type = M_PCPROTO;
600 	tba = (struct T_bind_ack *)(uintptr_t)mp->b_rptr;
601 	tba->PRIM_type = T_BIND_ACK;
602 
603 	/* Increment the number of ports and set the port quota */
604 	RDS_INCR_NPORT();
605 	rds->rds_port_quota = RDS_CURRENT_PORT_QUOTA();
606 	RDS_SET_PORT_QUOTA(rds->rds_port_quota);
607 	(void) proto_set_rx_hiwat(RD(q), NULL,
608 	    rds->rds_port_quota * UserBufferSize);
609 
610 	qreply(q, mp);
611 }
612 
613 static void
614 rds_wput_other(queue_t *q, mblk_t *mp)
615 {
616 	uchar_t *rptr = mp->b_rptr;
617 	struct datab *db;
618 	cred_t *cr;
619 
620 	db = mp->b_datap;
621 	switch (db->db_type) {
622 	case M_DATA:
623 		/* Not connected */
624 		freemsg(mp);
625 		return;
626 	case M_PROTO:
627 	case M_PCPROTO:
628 		if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr <
629 		    sizeof (t_scalar_t)) {
630 			freemsg(mp);
631 			return;
632 		}
633 		switch (((union T_primitives *)(uintptr_t)rptr)->type) {
634 		case T_CAPABILITY_REQ:
635 			rds_capability_req(q, mp);
636 			return;
637 
638 		case T_INFO_REQ:
639 			rds_info_req(q, mp);
640 			return;
641 		case O_T_BIND_REQ:
642 		case T_BIND_REQ:
643 			rds_bind(q, mp);
644 			return;
645 		case T_SVR4_OPTMGMT_REQ:
646 		case T_OPTMGMT_REQ:
647 			/*
648 			 * All Solaris components should pass a db_credp
649 			 * for this TPI message, hence we ASSERT.
650 			 * But in case there is some other M_PROTO that looks
651 			 * like a TPI message sent by some other kernel
652 			 * component, we check and return an error.
653 			 */
654 			cr = msg_getcred(mp, NULL);
655 			ASSERT(cr != NULL);
656 			if (cr == NULL) {
657 				rds_err_ack(q, mp, TSYSERR, EINVAL);
658 				return;
659 			}
660 			if (((union T_primitives *)(uintptr_t)rptr)->type ==
661 			    T_SVR4_OPTMGMT_REQ) {
662 				svr4_optcom_req(q, mp, cr, &rds_opt_obj);
663 			} else {
664 				tpi_optcom_req(q, mp, cr, &rds_opt_obj);
665 			}
666 			return;
667 		case T_CONN_REQ:
668 			/*
669 			 * We should not receive T_CONN_REQ as sockfs only
670 			 * sends down T_CONN_REQ if family == AF_INET/AF_INET6
671 			 * and type == SOCK_DGRAM/SOCK_RAW. For all others
672 			 * it simply calls soisconnected. see sotpi_connect()
673 			 * for details.
674 			 */
675 		/* FALLTHRU */
676 		default:
677 			cmn_err(CE_PANIC, "type %d \n",
678 			    ((union T_primitives *)(uintptr_t)rptr)->type);
679 		}
680 		break;
681 	case M_FLUSH:
682 		if (*rptr & FLUSHW)
683 			flushq(q, FLUSHDATA);
684 		break;
685 	case M_IOCTL:
686 		rds_ioctl(q, mp);
687 		break;
688 	case M_IOCDATA:
689 		/* IOCTL continuation following copyin or copyout. */
690 		if (mi_copy_state(q, mp, NULL) == -1) {
691 			/*
692 			 * The copy operation failed.  mi_copy_state already
693 			 * cleaned up, so we're out of here.
694 			 */
695 			return;
696 		}
697 		/*
698 		 * If we just completed a copy in, continue processing
699 		 * in rds_ioctl_copyin_done. If it was a copy out, we call
700 		 * mi_copyout again.  If there is nothing more to copy out,
701 		 * it will complete the IOCTL.
702 		 */
703 
704 		if (MI_COPY_DIRECTION(mp) == MI_COPY_IN)
705 			rds_ioctl_copyin_done(q, mp);
706 		else
707 			mi_copyout(q, mp);
708 		return;
709 
710 	default:
711 		cmn_err(CE_PANIC, "types %d \n", db->db_type);
712 	}
713 }
714 
715 static int
716 rds_wput(queue_t *q, mblk_t *mp)
717 {
718 	struct	datab	*db;
719 	uchar_t	*rptr = mp->b_rptr;
720 
721 	db = mp->b_datap;
722 	switch (db->db_type) {
723 	case M_PROTO:
724 	case M_PCPROTO:
725 		ASSERT(((uintptr_t)mp->b_wptr - (uintptr_t)rptr) <=
726 		    (uintptr_t)INT_MAX);
727 		if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr >=
728 		    sizeof (struct T_unitdata_req)) {
729 			if (((union T_primitives *)(uintptr_t)rptr)->type
730 			    == T_UNITDATA_REQ) {
731 				/*
732 				 *  We should never come here for T_UNITDATA_REQ
733 				 */
734 				cmn_err(CE_PANIC, "rds_wput T_UNITDATA_REQ \n");
735 			}
736 		}
737 		/* FALLTHRU */
738 	default:
739 		rds_wput_other(q, mp);
740 		return (0);
741 	}
742 }
743 
744 static int
745 rds_wput_data(queue_t *q, mblk_t *mp, uio_t *uiop)
746 {
747 	uchar_t	*rptr = mp->b_rptr;
748 	rds_t	*rds;
749 	mblk_t	*mp1;
750 	sin_t	*sin;
751 	ipaddr_t dst;
752 	uint16_t port;
753 	int ret = 0;
754 
755 #define	tudr	((struct T_unitdata_req *)(uintptr_t)rptr)
756 
757 	rds = (rds_t *)q->q_ptr;
758 	/* Handle UNITDATA_REQ messages here */
759 	if (rds->rds_state == TS_UNBND) {
760 		/* If a port has not been bound to the stream, fail. */
761 		dprint(2, ("%s: socket is not bound to a port", LABEL));
762 		freemsg(mp);
763 		return (EPROTO);
764 	}
765 
766 	mp1 = mp->b_cont;
767 	mp->b_cont = NULL;
768 	if (mp1 == NULL) {
769 		dprint(2, ("%s: No message to send", LABEL));
770 		freemsg(mp);
771 		return (EPROTO);
772 	}
773 
774 	/*
775 	 * No options allowed
776 	 */
777 	if (tudr->OPT_length != 0) {
778 		ret = EINVAL;
779 		goto done;
780 	}
781 
782 	ASSERT(mp1->b_datap->db_ref == 1);
783 
784 	if ((rptr + tudr->DEST_offset + tudr->DEST_length) >
785 	    mp->b_wptr) {
786 		ret = EDESTADDRREQ;
787 		goto done;
788 	}
789 
790 	sin = (sin_t *)(uintptr_t)&rptr[tudr->DEST_offset];
791 	if (!OK_32PTR((char *)sin) || tudr->DEST_length !=
792 	    sizeof (sin_t) || sin->sin_family != AF_INET_OFFLOAD) {
793 		ret = EDESTADDRREQ;
794 		goto done;
795 	}
796 	/* Extract port and ipaddr */
797 	port = sin->sin_port;
798 	dst = sin->sin_addr.s_addr;
799 
800 	if (port == 0 || dst == INADDR_ANY) {
801 		ret = EDESTADDRREQ;
802 		goto done;
803 	}
804 
805 	ASSERT(rds_transport_ops != NULL);
806 	ret = rds_transport_ops->rds_transport_sendmsg(uiop, rds->rds_src, dst,
807 	    ntohs(rds->rds_port), ntohs(port), rds->rds_zoneid);
808 	if (ret != 0) {
809 		if ((ret != ENOBUFS) && (ret != ENOMEM)) {
810 			/* ENOMEM is actually EWOULDBLOCK */
811 			dprint(2, ("%s: rds_sendmsg returned %d", LABEL, ret));
812 			goto done;
813 		}
814 	}
815 done:
816 	freemsg(mp1);
817 	freemsg(mp);
818 	return (ret);
819 }
820 
821 /*
822  * Make sure we dont return EINVAL and EWOULDBLOCK as it has
823  * special meanings for the synchronous streams (rwnext()).
824  * We should return ENOMEM which is changed to EWOULDBLOCK by kstrputmsg()
825  */
826 static int
827 rds_wrw(queue_t *q, struiod_t *dp)
828 {
829 	mblk_t  *mp = dp->d_mp;
830 	int error = 0;
831 	struct  datab   *db;
832 	uchar_t *rptr;
833 
834 	db = mp->b_datap;
835 	rptr = mp->b_rptr;
836 	switch (db->db_type) {
837 	case M_PROTO:
838 	case M_PCPROTO:
839 		ASSERT(((uintptr_t)mp->b_wptr - (uintptr_t)rptr) <=
840 		    (uintptr_t)INT_MAX);
841 		if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr >=
842 		    sizeof (struct T_unitdata_req)) {
843 			/* Detect valid T_UNITDATA_REQ here */
844 			if (((union T_primitives *)(uintptr_t)rptr)->type
845 			    == T_UNITDATA_REQ)
846 				break;
847 		}
848 		/* FALLTHRU */
849 	default:
850 
851 		if (isuioq(q) && (error = struioget(q, mp, dp, 0))) {
852 		/*
853 		 * Uio error of some sort, so just return the error.
854 		 */
855 			goto done;
856 		}
857 		dp->d_mp = 0;
858 		rds_wput_other(q, mp);
859 		return (0);
860 	}
861 
862 	dp->d_mp = 0;
863 	error = rds_wput_data(q, mp, &dp->d_uio);
864 done:
865 	if (error == EWOULDBLOCK || error == EINVAL)
866 		error = EIO;
867 
868 	return (error);
869 }
870 
871 static int
872 rds_rsrv(queue_t *q)
873 {
874 	rds_t	*rds = (rds_t *)q->q_ptr;
875 	ulong_t current_port_quota;
876 
877 	/* update the port quota to the current level */
878 	current_port_quota = RDS_GET_PORT_QUOTA();
879 	if (rds->rds_port_quota != current_port_quota) {
880 		rds->rds_port_quota = current_port_quota;
881 		(void) proto_set_rx_hiwat(q, NULL,
882 		    rds->rds_port_quota * UserBufferSize);
883 	}
884 
885 	/* No more messages in the q, unstall the socket */
886 	rds_transport_ops->rds_transport_resume_port(ntohs(rds->rds_port));
887 	return (0);
888 }
889 
890 int
891 rds_close_transport_driver()
892 {
893 	ASSERT(rds_transport_ops != NULL);
894 
895 	rw_enter(&rds_transport_lock, RW_WRITER);
896 	if (rds_transport_handle != NULL) {
897 		rds_transport_ops->rds_transport_close_ib();
898 		(void) ldi_close(rds_transport_handle, FNDELAY, kcred);
899 		rds_transport_handle = NULL;
900 	}
901 	rw_exit(&rds_transport_lock);
902 
903 	return (0);
904 }
905 
906 
907 int
908 rds_open_transport_driver()
909 {
910 	int ret = 0;
911 
912 	rw_enter(&rds_transport_lock, RW_WRITER);
913 	if (rds_transport_handle != NULL) {
914 		/*
915 		 * Someone beat us to it.
916 		 */
917 		goto done;
918 	}
919 
920 	if (ibt_hw_is_present() == 0) {
921 		ret = ENODEV;
922 		goto done;
923 	}
924 
925 	if (rds_li == NULL) {
926 		ret = EPROTONOSUPPORT;
927 		goto done;
928 	}
929 
930 	ret = ldi_open_by_name("/devices/ib/rdsib@0:rdsib",
931 	    FREAD | FWRITE, kcred, &rds_transport_handle, rds_li);
932 	if (ret != 0) {
933 		ret = EPROTONOSUPPORT;
934 		rds_transport_handle = NULL;
935 		goto done;
936 	}
937 
938 	ret = rds_transport_ops->rds_transport_open_ib();
939 	if (ret != 0) {
940 		(void) ldi_close(rds_transport_handle, FNDELAY, kcred);
941 		rds_transport_handle = NULL;
942 	}
943 done:
944 	rw_exit(&rds_transport_lock);
945 	return (ret);
946 }
947 
/*
 * STREAMS module information shared by both queues: module id 0, name
 * "rds", packet sizes from 1 to INFPSZ, and default high/low watermarks
 * of 65536 and 1024 bytes.
 */
static struct module_info info = {
	0, "rds", 1, INFPSZ, 65536, 1024
};

/* Read side: no put procedure, rds_rsrv() as the service procedure. */
static struct qinit rinit = {
	NULL, rds_rsrv, rds_open, rds_close, NULL, &info
};

/*
 * Write side: rds_wput() as the put procedure, no service procedure, and
 * rds_wrw() as the synchronous read/write procedure (STRUIOT_STANDARD).
 */
static struct qinit winit = {
	rds_wput, NULL, rds_open, rds_close, NULL, &info,
	NULL, rds_wrw, NULL, STRUIOT_STANDARD
};

/* Stream table tying the two queue initializers together. */
struct streamtab rdsinfo = {
	&rinit, &winit, NULL, NULL
};

/* Device operations: attach/detach/getinfo above, no quiesce support. */
DDI_DEFINE_STREAM_OPS(rds_devops, nulldev, nulldev, rds_attach, rds_detach,
    nulldev, rds_info, RDS_DEVMTFLAGS, &RDS_STRTAB, ddi_quiesce_not_supported);

/*
 * Module linkage information for the kernel.
 */
static struct modldrv modldrv = {
	&mod_driverops,
	RDS_DEVDESC,
	&rds_devops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};
982 
983 int
984 _init(void)
985 {
986 	int	ret;
987 
988 	rds_init();
989 
990 	ret = mod_install(&modlinkage);
991 	if (ret != 0)
992 		goto done;
993 	ret = ldi_ident_from_mod(&modlinkage, &rds_li);
994 	if (ret != 0)
995 		rds_li = NULL;
996 done:
997 	return (ret);
998 }
999 
1000 int
1001 _fini(void)
1002 {
1003 	int	ret;
1004 
1005 	ret = mod_remove(&modlinkage);
1006 	if (ret != 0) {
1007 		return (ret);
1008 	}
1009 
1010 	rds_fini();
1011 
1012 	ldi_ident_release(rds_li);
1013 	return (0);
1014 }
1015 
1016 int
1017 _info(struct modinfo *modinfop)
1018 {
1019 	return (mod_info(&modlinkage, modinfop));
1020 }
1021