xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rds/rdsddi.c (revision 46b592853d0f4f11781b6b0a7533f267c6aee132)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/conf.h>
28 #include <sys/modctl.h>
29 #include <sys/stat.h>
30 #include <sys/stream.h>
31 #include <sys/strsun.h>
32 #include <sys/ddi.h>
33 #include <sys/sunddi.h>
34 #include <sys/priv_names.h>
35 #include <inet/common.h>
36 
37 #define	_SUN_TPI_VERSION 2
38 #include <sys/tihdr.h>
39 #include <sys/timod.h>
40 #include <sys/tiuser.h>
41 #include <sys/suntpi.h>
42 #include <inet/common.h>
43 #include <inet/ip.h>
44 #include <inet/mi.h>
45 #include <inet/proto_set.h>
46 #include <sys/ib/clients/rds/rds.h>
47 #include <sys/policy.h>
48 #include <inet/ipclassifier.h>
49 #include <sys/ib/clients/rds/rds_kstat.h>
50 #include "sys/random.h"
51 #include <sys/ib/clients/rds/rds_transport.h>
52 #include <sys/ib/ibtl/ibti.h>
53 
54 
/* Driver identity and the values handed to the DDI/STREAMS linkage below. */
#define	RDS_NAME	"rds"
#define	RDS_STRTAB	rdsinfo
#define	RDS_DEVDESC	"RDS STREAMS driver"
#define	RDS_DEVMINOR	0
/* Parenthesized so the flag set stays a single operand wherever it expands. */
#define	RDS_DEVMTFLAGS	(D_MP | D_SYNCSTR)
#define	RDS_DEFAULT_PRIV_MODE	0666

/* Inclusive range of port numbers considered by rds_bind(). */
#define	rds_smallest_port	1
#define	rds_largest_port	65535

/* Default queue watermarks; they seed the rds_*_hiwat/rds_*_lowat tunables. */
#define	RDS_RECV_HIWATER	(56 * 1024)
#define	RDS_RECV_LOWATER	128
#define	RDS_XMIT_HIWATER	(56 * 1024)
#define	RDS_XMIT_LOWATER	1024

#define	RDS_DPRINTF2	0 &&
#define	LABEL	"RDS"
72 
/*
 * Source/destination port pair.  Not referenced elsewhere in this file.
 */
typedef struct rdsahdr_s {
	in_port_t	uha_src_port;	/* Source port */
	in_port_t	uha_dst_port;	/* Destination port */
} rdsha_t;

/* NOTE(review): presumably the byte size of rdsha_t - confirm against users */
#define	RDSH_SIZE	4
79 
80 int rds_recv_hiwat = RDS_RECV_HIWATER;
81 int rds_recv_lowat = RDS_RECV_LOWATER;
82 int rds_xmit_hiwat = RDS_XMIT_HIWATER;
83 int rds_xmit_lowat = RDS_XMIT_LOWATER;
84 
85 int rdsdebug;
86 
87 static dev_info_t *rds_dev_info;
88 
89 /* Hint not protected by any lock */
90 static	in_port_t	rds_next_port_to_try;
91 
92 ldi_ident_t rds_li;
93 static int loopmax = rds_largest_port - rds_smallest_port + 1;
94 
95 /* global configuration variables */
96 uint_t  UserBufferSize;
97 uint_t  rds_rx_pkts_pending_hwm;
98 
99 extern void rds_ioctl(queue_t *, mblk_t *);
100 extern void rds_ioctl_copyin_done(queue_t *q, mblk_t *mp);
101 
102 int rds_open_transport_driver();
103 int rds_close_transport_driver();
104 
105 #define	RDS_CURRENT_PORT_QUOTA()					\
106 	(rds_rx_pkts_pending_hwm/RDS_GET_NPORT())
107 
108 krwlock_t	rds_transport_lock;
109 ldi_handle_t	rds_transport_handle = NULL;
110 rds_transport_ops_t *rds_transport_ops = NULL;
111 
112 static int
113 rds_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
114 {
115 	int	ret;
116 
117 	if (cmd != DDI_ATTACH)
118 		return (DDI_FAILURE);
119 
120 	rds_dev_info = devi;
121 
122 	ret = ddi_create_minor_node(devi, RDS_NAME, S_IFCHR,
123 	    RDS_DEVMINOR, DDI_PSEUDO, 0);
124 	if (ret != DDI_SUCCESS) {
125 		return (ret);
126 	}
127 
128 	return (DDI_SUCCESS);
129 }
130 
131 static int
132 rds_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
133 {
134 	if (cmd != DDI_DETACH)
135 		return (DDI_FAILURE);
136 
137 	ASSERT(devi == rds_dev_info);
138 
139 	ddi_remove_minor_node(devi, NULL);
140 
141 	return (DDI_SUCCESS);
142 }
143 
144 /* ARGSUSED */
145 static int
146 rds_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
147 {
148 	int error = DDI_FAILURE;
149 
150 	switch (cmd) {
151 	case DDI_INFO_DEVT2DEVINFO:
152 		if (rds_dev_info != NULL) {
153 			*result = (void *)rds_dev_info;
154 			error = DDI_SUCCESS;
155 		}
156 		break;
157 
158 	case DDI_INFO_DEVT2INSTANCE:
159 		*result = NULL;
160 		error = DDI_SUCCESS;
161 		break;
162 
163 	default:
164 		break;
165 	}
166 
167 	return (error);
168 }
169 
170 
171 /*ARGSUSED*/
172 static int
173 rds_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
174 {
175 	rds_t	*rds;
176 	int	ret;
177 
178 	if (is_system_labeled()) {
179 		/*
180 		 * RDS socket is not supported on labeled systems
181 		 */
182 		return (ESOCKTNOSUPPORT);
183 	}
184 
185 	/* Open the transport driver if IB HW is present */
186 	rw_enter(&rds_transport_lock, RW_READER);
187 	if (rds_transport_handle == NULL) {
188 		rw_exit(&rds_transport_lock);
189 		ret = rds_open_transport_driver();
190 		rw_enter(&rds_transport_lock, RW_READER);
191 
192 		if (ret != 0) {
193 			/* Transport driver failed to load */
194 			rw_exit(&rds_transport_lock);
195 			return (ret);
196 		}
197 	}
198 	rw_exit(&rds_transport_lock);
199 
200 	if (sflag == MODOPEN) {
201 		return (EINVAL);
202 	}
203 
204 	/* Reopen not supported */
205 	if (q->q_ptr != NULL) {
206 		dprint(2, ("%s: Reopen is not supported: %p", LABEL, q->q_ptr));
207 		return (0);
208 	}
209 
210 	rds = rds_create(q, credp);
211 	if (rds == NULL) {
212 		dprint(2, ("%s: rds_create failed", LABEL));
213 		return (0);
214 	}
215 
216 	q->q_ptr = WR(q)->q_ptr = rds;
217 	rds->rds_state = TS_UNBND;
218 	rds->rds_family = AF_INET_OFFLOAD;
219 
220 	q->q_hiwat = rds_recv_hiwat;
221 	q->q_lowat = rds_recv_lowat;
222 
223 	qprocson(q);
224 
225 	WR(q)->q_hiwat = rds_xmit_hiwat;
226 	WR(q)->q_lowat = rds_xmit_lowat;
227 
228 	/* Set the Stream head watermarks */
229 	(void) proto_set_rx_hiwat(q, NULL, rds_recv_hiwat);
230 	(void) proto_set_rx_lowat(q, NULL, rds_recv_lowat);
231 
232 	return (0);
233 }
234 
235 static int
236 rds_close(queue_t *q)
237 {
238 	rds_t *rdsp = (rds_t *)q->q_ptr;
239 
240 	qprocsoff(q);
241 
242 	/*
243 	 * NPORT should be decremented only if this socket was previously
244 	 * bound to an RDS port.
245 	 */
246 	if (rdsp->rds_state >= TS_IDLE) {
247 		RDS_DECR_NPORT();
248 		RDS_SET_PORT_QUOTA(RDS_CURRENT_PORT_QUOTA());
249 		rds_transport_ops->
250 		    rds_transport_resume_port(ntohs(rdsp->rds_port));
251 	}
252 
253 	/* close the transport driver if this is the last socket */
254 	if (RDS_GET_NPORT() == 1) {
255 		(void) rds_close_transport_driver();
256 	}
257 
258 	/*
259 	 * We set the flags without holding a lock as this is
260 	 * just a hint for the fanout lookup to skip this rds.
261 	 * We dont free the struct until it's out of the hash and
262 	 * the ref count goes down.
263 	 */
264 	rdsp->rds_flags |= RDS_CLOSING;
265 	rds_bind_hash_remove(rdsp, B_FALSE);
266 	mutex_enter(&rdsp->rds_lock);
267 	ASSERT(rdsp->rds_refcnt > 0);
268 	if (rdsp->rds_refcnt != 1) {
269 		cv_wait(&rdsp->rds_refcv, &rdsp->rds_lock);
270 	}
271 	mutex_exit(&rdsp->rds_lock);
272 	RDS_DEC_REF_CNT(rdsp);
273 	RD(q)->q_ptr = NULL;
274 	WR(q)->q_ptr = NULL;
275 	return (0);
276 }
277 
278 /*
279  * Add a new message to the socket
280  */
281 int
282 rds_deliver_new_msg(mblk_t *mp, ipaddr_t local_addr, ipaddr_t rem_addr,
283     in_port_t local_port, in_port_t rem_port, zoneid_t zoneid)
284 {
285 	rds_t *rds;
286 	struct  T_unitdata_ind  *tudi;
287 	int	udi_size;	/* Size of T_unitdata_ind */
288 	mblk_t *mp1;
289 	sin_t	*sin;
290 	int error = 0;
291 
292 	local_port = htons(local_port);
293 	rem_port = htons(rem_port);
294 
295 	ASSERT(mp->b_datap->db_type == M_DATA);
296 	rds = rds_fanout(local_addr, rem_addr, local_port, rem_port, zoneid);
297 	if (rds == NULL) {
298 		dprint(2, ("%s: rds_fanout failed: (0x%x 0x%x %d %d)", LABEL,
299 		    local_addr, rem_addr, ntohs(local_port), ntohs(rem_port)));
300 		freemsg(mp);
301 		return (error);
302 	}
303 
304 	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
305 
306 	/* Allocate a message block for the T_UNITDATA_IND structure. */
307 	mp1 = allocb(udi_size, BPRI_MED);
308 	if (mp1 == NULL) {
309 		dprint(2, ("%s: allocb failed", LABEL));
310 		freemsg(mp);
311 		return (ENOMEM);
312 	}
313 
314 	mp1->b_cont = mp;
315 	mp = mp1;
316 	mp->b_datap->db_type = M_PROTO;
317 	tudi = (struct T_unitdata_ind *)(uintptr_t)mp->b_rptr;
318 	mp->b_wptr = (uchar_t *)tudi + udi_size;
319 	tudi->PRIM_type = T_UNITDATA_IND;
320 	tudi->SRC_length = sizeof (sin_t);
321 	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
322 	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
323 	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
324 	tudi->OPT_length = udi_size;
325 	sin = (sin_t *)&tudi[1];
326 	sin->sin_addr.s_addr = rem_addr;
327 	sin->sin_port = ntohs(rem_port);
328 	sin->sin_family = rds->rds_family;
329 	*(uint32_t *)(uintptr_t)&sin->sin_zero[0] = 0;
330 	*(uint32_t *)(uintptr_t)&sin->sin_zero[4] = 0;
331 
332 	putnext(rds->rds_ulpd, mp);
333 
334 	/* check port quota */
335 	if (RDS_GET_RXPKTS_PEND() > rds_rx_pkts_pending_hwm) {
336 		ulong_t current_port_quota = RDS_GET_PORT_QUOTA();
337 		if (rds->rds_port_quota > current_port_quota) {
338 			/* this may result in stalling the port */
339 			rds->rds_port_quota = current_port_quota;
340 			(void) proto_set_rx_hiwat(rds->rds_ulpd, NULL,
341 			    rds->rds_port_quota * UserBufferSize);
342 			RDS_INCR_PORT_QUOTA_ADJUSTED();
343 		}
344 	}
345 
346 	/*
347 	 * canputnext() check is done after putnext as the protocol does
348 	 * not allow dropping any received packet.
349 	 */
350 	if (!canputnext(rds->rds_ulpd)) {
351 		error = ENOSPC;
352 	}
353 
354 	RDS_DEC_REF_CNT(rds);
355 	return (error);
356 }
357 
358 
359 /* Default structure copied into T_INFO_ACK messages */
360 static struct T_info_ack rds_g_t_info_ack_ipv4 = {
361 	T_INFO_ACK,
362 	65535,	/* TSDU_size. Excl. headers */
363 	T_INVALID,	/* ETSU_size.  rds does not support expedited data. */
364 	T_INVALID,	/* CDATA_size. rds does not support connect data. */
365 	T_INVALID,	/* DDATA_size. rds does not support disconnect data. */
366 	sizeof (sin_t),	/* ADDR_size. */
367 	0,		/* OPT_size - not initialized here */
368 	65535,		/* TIDU_size.  Excl. headers */
369 	T_CLTS,		/* SERV_type.  rds supports connection-less. */
370 	TS_UNBND,	/* CURRENT_state.  This is set from rds_state. */
371 	(XPG4_1|SENDZERO) /* PROVIDER_flag */
372 };
373 
374 static in_port_t
375 rds_update_next_port(in_port_t port)
376 {
377 	(void) random_get_pseudo_bytes((uint8_t *)&port, sizeof (in_port_t));
378 	if (port < rds_smallest_port)
379 		port = rds_smallest_port;
380 	return (port);
381 }
382 
383 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
384 static void
385 rds_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
386 {
387 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
388 		qreply(q, mp);
389 }
390 
391 static void
392 rds_capability_req(queue_t *q, mblk_t *mp)
393 {
394 	t_uscalar_t	cap_bits1;
395 	struct T_capability_ack *tcap;
396 
397 	cap_bits1 =
398 	    ((struct T_capability_req *)(uintptr_t)mp->b_rptr)->CAP_bits1;
399 
400 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
401 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
402 	if (mp == NULL)
403 		return;
404 	tcap = (struct T_capability_ack *)(uintptr_t)mp->b_rptr;
405 	tcap->CAP_bits1 = 0;
406 
407 	if (cap_bits1 & TC1_INFO) {
408 		tcap->CAP_bits1 |= TC1_INFO;
409 		*(&tcap->INFO_ack) = rds_g_t_info_ack_ipv4;
410 	}
411 
412 	qreply(q, mp);
413 }
414 
415 static void
416 rds_info_req(queue_t *q, mblk_t *omp)
417 {
418 	rds_t *rds = (rds_t *)q->q_ptr;
419 	struct T_info_ack *tap;
420 	mblk_t *mp;
421 
422 	/* Create a T_INFO_ACK message. */
423 	mp = tpi_ack_alloc(omp, sizeof (struct T_info_ack), M_PCPROTO,
424 	    T_INFO_ACK);
425 	if (mp == NULL)
426 		return;
427 	tap = (struct T_info_ack *)(uintptr_t)mp->b_rptr;
428 	*tap = rds_g_t_info_ack_ipv4;
429 	tap->CURRENT_state = rds->rds_state;
430 	tap->OPT_size = 128;
431 	qreply(q, mp);
432 }
433 
434 /*
435  * NO locking protection here as sockfs will only send down
436  * one bind operation at a time.
437  */
438 static void
439 rds_bind(queue_t *q, mblk_t *mp)
440 {
441 	sin_t		*sin;
442 	rds_t *rds;
443 	struct T_bind_req *tbr;
444 	in_port_t	port;	/* Host byte order */
445 	in_port_t	requested_port; /* Host byte order */
446 	struct T_bind_ack *tba;
447 	int		count;
448 	rds_bf_t	*rdsbf;
449 	in_port_t	lport;	/* Network byte order */
450 
451 	rds = (rds_t *)q->q_ptr;
452 	if (((uintptr_t)mp->b_wptr - (uintptr_t)mp->b_rptr) < sizeof (*tbr)) {
453 		rds_err_ack(q, mp, TPROTO, 0);
454 		return;
455 	}
456 
457 	/*
458 	 * We don't allow multiple binds
459 	 */
460 	if (rds->rds_state != TS_UNBND) {
461 		rds_err_ack(q, mp, TOUTSTATE, 0);
462 		return;
463 	}
464 
465 	tbr = (struct T_bind_req *)(uintptr_t)mp->b_rptr;
466 	switch (tbr->ADDR_length) {
467 	case sizeof (sin_t):    /* Complete IPv4 address */
468 		sin = (sin_t *)(uintptr_t)mi_offset_param(mp, tbr->ADDR_offset,
469 		    sizeof (sin_t));
470 		if (sin == NULL || !OK_32PTR((char *)sin)) {
471 			rds_err_ack(q, mp, TSYSERR, EINVAL);
472 			return;
473 		}
474 		if (rds->rds_family != AF_INET_OFFLOAD ||
475 		    sin->sin_family != AF_INET_OFFLOAD) {
476 			rds_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
477 			return;
478 		}
479 		if (sin->sin_addr.s_addr == INADDR_ANY) {
480 			rds_err_ack(q, mp, TBADADDR, 0);
481 			return;
482 		}
483 
484 		/*
485 		 * verify that the address is hosted on IB
486 		 * only exception is the loopback address.
487 		 */
488 		if ((sin->sin_addr.s_addr != INADDR_LOOPBACK) &&
489 		    !rds_verify_bind_address(sin->sin_addr.s_addr)) {
490 			rds_err_ack(q, mp, TBADADDR, 0);
491 			return;
492 		}
493 
494 		port = ntohs(sin->sin_port);
495 		break;
496 	default:	/* Invalid request */
497 		rds_err_ack(q, mp, TBADADDR, 0);
498 		return;
499 	}
500 
501 	requested_port = port;
502 
503 	/*
504 	 * TPI only sends down T_BIND_REQ for AF_INET and AF_INET6
505 	 * since RDS socket is of type AF_INET_OFFLOAD a O_T_BIND_REQ
506 	 * will be sent down. Treat O_T_BIND_REQ as T_BIND_REQ
507 	 */
508 
509 	if (requested_port == 0) {
510 		/*
511 		 * If the application passed in zero for the port number, it
512 		 * doesn't care which port number we bind to. Get one in the
513 		 * valid range.
514 		 */
515 		port = rds_update_next_port(rds_next_port_to_try);
516 	}
517 
518 	ASSERT(port != 0);
519 	count = 0;
520 	for (;;) {
521 		rds_t		*rds1;
522 		ASSERT(sin->sin_addr.s_addr != INADDR_ANY);
523 		/*
524 		 * Walk through the list of rds streams bound to
525 		 * requested port with the same IP address.
526 		 */
527 		lport = htons(port);
528 		rdsbf = &rds_bind_fanout[RDS_BIND_HASH(lport)];
529 		mutex_enter(&rdsbf->rds_bf_lock);
530 		for (rds1 = rdsbf->rds_bf_rds; rds1 != NULL;
531 		    rds1 = rds1->rds_bind_hash) {
532 			if (lport != rds1->rds_port ||
533 			    rds1->rds_src != sin->sin_addr.s_addr ||
534 			    rds1->rds_zoneid != rds->rds_zoneid)
535 
536 				continue;
537 			break;
538 		}
539 
540 		if (rds1 == NULL) {
541 			/*
542 			 * No other stream has this IP address
543 			 * and port number. We can use it.
544 			 */
545 			break;
546 		}
547 		mutex_exit(&rdsbf->rds_bf_lock);
548 		if (requested_port != 0) {
549 			/*
550 			 * We get here only when requested port
551 			 * is bound (and only first  of the for()
552 			 * loop iteration).
553 			 *
554 			 * The semantics of this bind request
555 			 * require it to fail so we return from
556 			 * the routine (and exit the loop).
557 			 *
558 			 */
559 			rds_err_ack(q, mp, TADDRBUSY, 0);
560 			return;
561 		}
562 
563 		port = rds_update_next_port(port + 1);
564 
565 		if (++count >= loopmax) {
566 			/*
567 			 * We've tried every possible port number and
568 			 * there are none available, so send an error
569 			 * to the user.
570 			 */
571 			rds_err_ack(q, mp, TNOADDR, 0);
572 			return;
573 		}
574 	}
575 
576 	/*
577 	 * Copy the source address into our rds structure.
578 	 */
579 	rds->rds_src = sin->sin_addr.s_addr;
580 	rds->rds_port = lport;
581 
582 	/*
583 	 * reset the next port if we choose the port
584 	 */
585 	if (requested_port == 0) {
586 		rds_next_port_to_try = port + 1;
587 	}
588 
589 	rds->rds_state = TS_IDLE;
590 	rds_bind_hash_insert(rdsbf, rds);
591 	mutex_exit(&rdsbf->rds_bf_lock);
592 
593 	/* Reset the message type in preparation for shipping it back. */
594 	mp->b_datap->db_type = M_PCPROTO;
595 	tba = (struct T_bind_ack *)(uintptr_t)mp->b_rptr;
596 	tba->PRIM_type = T_BIND_ACK;
597 
598 	/* Increment the number of ports and set the port quota */
599 	RDS_INCR_NPORT();
600 	rds->rds_port_quota = RDS_CURRENT_PORT_QUOTA();
601 	RDS_SET_PORT_QUOTA(rds->rds_port_quota);
602 	(void) proto_set_rx_hiwat(RD(q), NULL,
603 	    rds->rds_port_quota * UserBufferSize);
604 
605 	qreply(q, mp);
606 }
607 
608 static void
609 rds_wput_other(queue_t *q, mblk_t *mp)
610 {
611 	uchar_t *rptr = mp->b_rptr;
612 	struct datab *db;
613 	cred_t *cr;
614 
615 	db = mp->b_datap;
616 	switch (db->db_type) {
617 	case M_DATA:
618 		/* Not connected */
619 		freemsg(mp);
620 		return;
621 	case M_PROTO:
622 	case M_PCPROTO:
623 		if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr <
624 		    sizeof (t_scalar_t)) {
625 			freemsg(mp);
626 			return;
627 		}
628 		switch (((union T_primitives *)(uintptr_t)rptr)->type) {
629 		case T_CAPABILITY_REQ:
630 			rds_capability_req(q, mp);
631 			return;
632 
633 		case T_INFO_REQ:
634 			rds_info_req(q, mp);
635 			return;
636 		case O_T_BIND_REQ:
637 		case T_BIND_REQ:
638 			rds_bind(q, mp);
639 			return;
640 		case T_SVR4_OPTMGMT_REQ:
641 		case T_OPTMGMT_REQ:
642 			/*
643 			 * All Solaris components should pass a db_credp
644 			 * for this TPI message, hence we ASSERT.
645 			 * But in case there is some other M_PROTO that looks
646 			 * like a TPI message sent by some other kernel
647 			 * component, we check and return an error.
648 			 */
649 			cr = msg_getcred(mp, NULL);
650 			ASSERT(cr != NULL);
651 			if (cr == NULL) {
652 				rds_err_ack(q, mp, TSYSERR, EINVAL);
653 				return;
654 			}
655 			if (((union T_primitives *)(uintptr_t)rptr)->type ==
656 			    T_SVR4_OPTMGMT_REQ) {
657 				(void) svr4_optcom_req(q, mp, cr, &rds_opt_obj,
658 				    B_FALSE);
659 			} else {
660 				(void) tpi_optcom_req(q, mp, cr, &rds_opt_obj,
661 				    B_FALSE);
662 			}
663 			return;
664 		case T_CONN_REQ:
665 			/*
666 			 * We should not receive T_CONN_REQ as sockfs only
667 			 * sends down T_CONN_REQ if family == AF_INET/AF_INET6
668 			 * and type == SOCK_DGRAM/SOCK_RAW. For all others
669 			 * it simply calls soisconnected. see sotpi_connect()
670 			 * for details.
671 			 */
672 		/* FALLTHRU */
673 		default:
674 			cmn_err(CE_PANIC, "type %d \n",
675 			    ((union T_primitives *)(uintptr_t)rptr)->type);
676 		}
677 		break;
678 	case M_FLUSH:
679 		if (*rptr & FLUSHW)
680 			flushq(q, FLUSHDATA);
681 		break;
682 	case M_IOCTL:
683 		rds_ioctl(q, mp);
684 		break;
685 	case M_IOCDATA:
686 		/* IOCTL continuation following copyin or copyout. */
687 		if (mi_copy_state(q, mp, NULL) == -1) {
688 			/*
689 			 * The copy operation failed.  mi_copy_state already
690 			 * cleaned up, so we're out of here.
691 			 */
692 			return;
693 		}
694 		/*
695 		 * If we just completed a copy in, continue processing
696 		 * in rds_ioctl_copyin_done. If it was a copy out, we call
697 		 * mi_copyout again.  If there is nothing more to copy out,
698 		 * it will complete the IOCTL.
699 		 */
700 
701 		if (MI_COPY_DIRECTION(mp) == MI_COPY_IN)
702 			rds_ioctl_copyin_done(q, mp);
703 		else
704 			mi_copyout(q, mp);
705 		return;
706 
707 	default:
708 		cmn_err(CE_PANIC, "types %d \n", db->db_type);
709 	}
710 }
711 
712 static int
713 rds_wput(queue_t *q, mblk_t *mp)
714 {
715 	struct	datab	*db;
716 	uchar_t	*rptr = mp->b_rptr;
717 
718 	db = mp->b_datap;
719 	switch (db->db_type) {
720 	case M_PROTO:
721 	case M_PCPROTO:
722 		ASSERT(((uintptr_t)mp->b_wptr - (uintptr_t)rptr) <=
723 		    (uintptr_t)INT_MAX);
724 		if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr >=
725 		    sizeof (struct T_unitdata_req)) {
726 			if (((union T_primitives *)(uintptr_t)rptr)->type
727 			    == T_UNITDATA_REQ) {
728 				/*
729 				 *  We should never come here for T_UNITDATA_REQ
730 				 */
731 				cmn_err(CE_PANIC, "rds_wput T_UNITDATA_REQ \n");
732 			}
733 		}
734 		/* FALLTHRU */
735 	default:
736 		rds_wput_other(q, mp);
737 		return (0);
738 	}
739 }
740 
741 static int
742 rds_wput_data(queue_t *q, mblk_t *mp, uio_t *uiop)
743 {
744 	uchar_t	*rptr = mp->b_rptr;
745 	rds_t	*rds;
746 	mblk_t	*mp1;
747 	sin_t	*sin;
748 	ipaddr_t dst;
749 	uint16_t port;
750 	int ret = 0;
751 
752 #define	tudr	((struct T_unitdata_req *)(uintptr_t)rptr)
753 
754 	rds = (rds_t *)q->q_ptr;
755 	/* Handle UNITDATA_REQ messages here */
756 	if (rds->rds_state == TS_UNBND) {
757 		/* If a port has not been bound to the stream, fail. */
758 		dprint(2, ("%s: socket is not bound to a port", LABEL));
759 		freemsg(mp);
760 		return (EPROTO);
761 	}
762 
763 	mp1 = mp->b_cont;
764 	mp->b_cont = NULL;
765 	if (mp1 == NULL) {
766 		dprint(2, ("%s: No message to send", LABEL));
767 		freemsg(mp);
768 		return (EPROTO);
769 	}
770 
771 	/*
772 	 * No options allowed
773 	 */
774 	if (tudr->OPT_length != 0) {
775 		ret = EINVAL;
776 		goto done;
777 	}
778 
779 	ASSERT(mp1->b_datap->db_ref == 1);
780 
781 	if ((rptr + tudr->DEST_offset + tudr->DEST_length) >
782 	    mp->b_wptr) {
783 		ret = EDESTADDRREQ;
784 		goto done;
785 	}
786 
787 	sin = (sin_t *)(uintptr_t)&rptr[tudr->DEST_offset];
788 	if (!OK_32PTR((char *)sin) || tudr->DEST_length !=
789 	    sizeof (sin_t) || sin->sin_family != AF_INET_OFFLOAD) {
790 		ret = EDESTADDRREQ;
791 		goto done;
792 	}
793 	/* Extract port and ipaddr */
794 	port = sin->sin_port;
795 	dst = sin->sin_addr.s_addr;
796 
797 	if (port == 0 || dst == INADDR_ANY) {
798 		ret = EDESTADDRREQ;
799 		goto done;
800 	}
801 
802 	ASSERT(rds_transport_ops != NULL);
803 	ret = rds_transport_ops->rds_transport_sendmsg(uiop, rds->rds_src, dst,
804 	    ntohs(rds->rds_port), ntohs(port), rds->rds_zoneid);
805 	if (ret != 0) {
806 		if ((ret != ENOBUFS) && (ret != ENOMEM)) {
807 			/* ENOMEM is actually EWOULDBLOCK */
808 			dprint(2, ("%s: rds_sendmsg returned %d", LABEL, ret));
809 			goto done;
810 		}
811 	}
812 done:
813 	freemsg(mp1);
814 	freemsg(mp);
815 	return (ret);
816 }
817 
818 /*
819  * Make sure we dont return EINVAL and EWOULDBLOCK as it has
820  * special meanings for the synchronous streams (rwnext()).
821  * We should return ENOMEM which is changed to EWOULDBLOCK by kstrputmsg()
822  */
823 static int
824 rds_wrw(queue_t *q, struiod_t *dp)
825 {
826 	mblk_t  *mp = dp->d_mp;
827 	int error = 0;
828 	struct  datab   *db;
829 	uchar_t *rptr;
830 
831 	db = mp->b_datap;
832 	rptr = mp->b_rptr;
833 	switch (db->db_type) {
834 	case M_PROTO:
835 	case M_PCPROTO:
836 		ASSERT(((uintptr_t)mp->b_wptr - (uintptr_t)rptr) <=
837 		    (uintptr_t)INT_MAX);
838 		if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr >=
839 		    sizeof (struct T_unitdata_req)) {
840 			/* Detect valid T_UNITDATA_REQ here */
841 			if (((union T_primitives *)(uintptr_t)rptr)->type
842 			    == T_UNITDATA_REQ)
843 			break;
844 		}
845 		/* FALLTHRU */
846 	default:
847 
848 		if (isuioq(q) && (error = struioget(q, mp, dp, 0))) {
849 		/*
850 		 * Uio error of some sort, so just return the error.
851 		 */
852 			goto done;
853 		}
854 		dp->d_mp = 0;
855 		rds_wput_other(q, mp);
856 		return (0);
857 	}
858 
859 	dp->d_mp = 0;
860 	error = rds_wput_data(q, mp, &dp->d_uio);
861 done:
862 	if (error == EWOULDBLOCK || error == EINVAL)
863 		error = EIO;
864 
865 	return (error);
866 }
867 
868 static void
869 rds_rsrv(queue_t *q)
870 {
871 	rds_t	*rds = (rds_t *)q->q_ptr;
872 	ulong_t current_port_quota;
873 
874 	/* update the port quota to the current level */
875 	current_port_quota = RDS_GET_PORT_QUOTA();
876 	if (rds->rds_port_quota != current_port_quota) {
877 		rds->rds_port_quota = current_port_quota;
878 		(void) proto_set_rx_hiwat(q, NULL,
879 		    rds->rds_port_quota * UserBufferSize);
880 	}
881 
882 	/* No more messages in the q, unstall the socket */
883 	rds_transport_ops->rds_transport_resume_port(ntohs(rds->rds_port));
884 }
885 
886 int
887 rds_close_transport_driver()
888 {
889 	ASSERT(rds_transport_ops != NULL);
890 
891 	rw_enter(&rds_transport_lock, RW_WRITER);
892 	if (rds_transport_handle != NULL) {
893 		rds_transport_ops->rds_transport_close_ib();
894 		(void) ldi_close(rds_transport_handle, FNDELAY, kcred);
895 		rds_transport_handle = NULL;
896 	}
897 	rw_exit(&rds_transport_lock);
898 
899 	return (0);
900 }
901 
902 
903 int
904 rds_open_transport_driver()
905 {
906 	int ret = 0;
907 
908 	rw_enter(&rds_transport_lock, RW_WRITER);
909 	if (rds_transport_handle != NULL) {
910 		/*
911 		 * Someone beat us to it.
912 		 */
913 		goto done;
914 	}
915 
916 	if (ibt_hw_is_present() == 0) {
917 		ret = ENODEV;
918 		goto done;
919 	}
920 
921 	if (rds_li == NULL) {
922 		ret = EPROTONOSUPPORT;
923 		goto done;
924 	}
925 
926 	ret = ldi_open_by_name("/devices/ib/rdsib@0:rdsib",
927 	    FREAD | FWRITE, kcred, &rds_transport_handle, rds_li);
928 	if (ret != 0) {
929 		ret = EPROTONOSUPPORT;
930 		rds_transport_handle = NULL;
931 		goto done;
932 	}
933 
934 	ret = rds_transport_ops->rds_transport_open_ib();
935 	if (ret != 0) {
936 		(void) ldi_close(rds_transport_handle, FNDELAY, kcred);
937 		rds_transport_handle = NULL;
938 	}
939 done:
940 	rw_exit(&rds_transport_lock);
941 	return (ret);
942 }
943 
944 static struct module_info info = {
945 	0, "rds", 1, INFPSZ, 65536, 1024
946 };
947 
948 static struct qinit rinit = {
949 	NULL, (pfi_t)rds_rsrv, rds_open, rds_close, NULL, &info
950 };
951 
952 static struct qinit winit = {
953 	(pfi_t)rds_wput, NULL, rds_open, rds_close, NULL, &info,
954 	NULL, rds_wrw, NULL, STRUIOT_STANDARD
955 };
956 
957 struct streamtab rdsinfo = {
958 	&rinit, &winit, NULL, NULL
959 };
960 
961 DDI_DEFINE_STREAM_OPS(rds_devops, nulldev, nulldev, rds_attach, rds_detach,
962     nulldev, rds_info, RDS_DEVMTFLAGS, &RDS_STRTAB, ddi_quiesce_not_supported);
963 
964 /*
965  * Module linkage information for the kernel.
966  */
967 static struct modldrv modldrv = {
968 	&mod_driverops,
969 	RDS_DEVDESC,
970 	&rds_devops
971 };
972 
973 static struct modlinkage modlinkage = {
974 	MODREV_1,
975 	&modldrv,
976 	NULL
977 };
978 
979 int
980 _init(void)
981 {
982 	int	ret;
983 
984 	rds_init();
985 
986 	ret = mod_install(&modlinkage);
987 	if (ret != 0)
988 		goto done;
989 	ret = ldi_ident_from_mod(&modlinkage, &rds_li);
990 	if (ret != 0)
991 		rds_li = NULL;
992 done:
993 	return (ret);
994 }
995 
996 int
997 _fini(void)
998 {
999 	int	ret;
1000 
1001 	ret = mod_remove(&modlinkage);
1002 	if (ret != 0) {
1003 		return (ret);
1004 	}
1005 
1006 	rds_fini();
1007 
1008 	ldi_ident_release(rds_li);
1009 	return (0);
1010 }
1011 
1012 int
1013 _info(struct modinfo *modinfop)
1014 {
1015 	return (mod_info(&modlinkage, modinfop));
1016 }
1017