1 /*
2 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
3 */
4
5 /*
6 * This file contains code imported from the OFED rds source file af_rds.c
7 * Oracle elects to have and use the contents of af_rds.c under and governed
8 * by the OpenIB.org BSD license (see below for full license text). However,
9 * the following notice accompanied the original version of this file:
10 */
11
12 /*
13 * Copyright (c) 2006 Oracle. All rights reserved.
14 *
15 * This software is available to you under a choice of one of two
16 * licenses. You may choose to be licensed under the terms of the GNU
17 * General Public License (GPL) Version 2, available from the file
18 * COPYING in the main directory of this source tree, or the
19 * OpenIB.org BSD license below:
20 *
21 * Redistribution and use in source and binary forms, with or
22 * without modification, are permitted provided that the following
23 * conditions are met:
24 *
25 * - Redistributions of source code must retain the above
26 * copyright notice, this list of conditions and the following
27 * disclaimer.
28 *
29 * - Redistributions in binary form must reproduce the above
30 * copyright notice, this list of conditions and the following
31 * disclaimer in the documentation and/or other materials
32 * provided with the distribution.
33 *
34 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
35 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
36 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
37 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
38 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
39 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
40 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
41 * SOFTWARE.
42 *
43 */
44 #include <sys/types.h>
45 #include <sys/stat.h>
46 #include <sys/conf.h>
47 #include <sys/ddi.h>
48 #include <sys/sunddi.h>
49 #include <sys/modctl.h>
50 #include <sys/rds.h>
51 #include <sys/stropts.h>
52 #include <sys/socket.h>
53 #include <sys/socketvar.h>
54 #include <sys/sockio.h>
55 #include <sys/sysmacros.h>
56
57 #include <inet/ip.h>
58 #include <net/if_types.h>
59
60 #include <sys/ib/clients/rdsv3/rdsv3.h>
61 #include <sys/ib/clients/rdsv3/rdma.h>
62 #include <sys/ib/clients/rdsv3/rdma_transport.h>
63 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
64
/* Helpers declared here and defined outside this file. */
extern void rdsv3_remove_bound(struct rdsv3_sock *rds);
extern int rdsv3_verify_bind_address(ipaddr_t addr);

/*
 * Defined outside this file.  rdsv3_exit() dispatches rdsv3_rdma_exit() on
 * rdsv3_taskq and then polls rdsv3_rdma_listen_id until it becomes NULL.
 */
extern ddi_taskq_t *rdsv3_taskq;
extern struct rdma_cm_id *rdsv3_rdma_listen_id;

/*
 * Registry of all open RDS sockets; this is just used for stats gathering.
 * rdsv3_sock_lock is held around every update of rdsv3_sock_list and
 * rdsv3_sock_count in this file (create, release and the info callbacks).
 */
kmutex_t rdsv3_sock_lock;
static unsigned long rdsv3_sock_count;
list_t rdsv3_sock_list;
75
76 /*
77 * This is called as the final descriptor referencing this socket is closed.
78 * We have to unbind the socket so that another socket can be bound to the
79 * address it was using.
80 *
81 * We have to be careful about racing with the incoming path. sock_orphan()
82 * sets SOCK_DEAD and we use that as an indicator to the rx path that new
83 * messages shouldn't be queued.
84 */
85 /* ARGSUSED */
86 static int
rdsv3_release(sock_lower_handle_t proto_handle,int flgs,cred_t * cr)87 rdsv3_release(sock_lower_handle_t proto_handle, int flgs, cred_t *cr)
88 {
89 struct rsock *sk = (struct rsock *)proto_handle;
90 struct rdsv3_sock *rs;
91
92 if (!sk)
93 goto out;
94
95 rs = rdsv3_sk_to_rs(sk);
96 RDSV3_DPRINTF4("rdsv3_release", "Enter(rs: %p, sk: %p)", rs, sk);
97
98 rdsv3_sk_sock_orphan(sk);
99 rdsv3_cong_remove_socket(rs);
100 rdsv3_remove_bound(rs);
101
102 /*
103 * Note - rdsv3_clear_recv_queue grabs rs_recv_lock, so
104 * that ensures the recv path has completed messing
105 * with the socket.
106 *
107 * Note2 - rdsv3_clear_recv_queue(rs) should be called first
108 * to prevent some race conditions, which is different from
109 * the Linux code.
110 */
111 rdsv3_clear_recv_queue(rs);
112 rdsv3_send_drop_to(rs, NULL);
113 rdsv3_rdma_drop_keys(rs);
114 (void) rdsv3_notify_queue_get(rs, NULL);
115
116 mutex_enter(&rdsv3_sock_lock);
117 list_remove_node(&rs->rs_item);
118 rdsv3_sock_count--;
119 mutex_exit(&rdsv3_sock_lock);
120
121 while (sk->sk_refcount > 1) {
122 /* wait for 1 sec and try again */
123 delay(drv_usectohz(1000000));
124 }
125
126 /* this will free the rs and sk */
127 rdsv3_sk_sock_put(sk);
128
129 RDSV3_DPRINTF4("rdsv3_release", "Return (rds: %p)", rs);
130 out:
131 return (0);
132 }
133
134 void
__rdsv3_wake_sk_sleep(struct rsock * sk)135 __rdsv3_wake_sk_sleep(struct rsock *sk)
136 {
137 /* wakup anyone waiting in recvmsg */
138 if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD) && sk->sk_sleep)
139 rdsv3_wake_up(sk->sk_sleep);
140 }
141
142 /*
143 * Careful not to race with rdsv3_release -> sock_orphan which clears sk_sleep.
144 * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK
145 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
146 * this seems more conservative.
147 * NB - normally, one would use sk_callback_lock for this, but we can
148 * get here from interrupts, whereas the network code grabs sk_callback_lock
149 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
150 */
151 void
rdsv3_wake_sk_sleep(struct rdsv3_sock * rs)152 rdsv3_wake_sk_sleep(struct rdsv3_sock *rs)
153 {
154 RDSV3_DPRINTF4("rdsv3_wake_sk_sleep", "Enter(rs: %p)", rs);
155
156 rw_enter(&rs->rs_recv_lock, RW_READER);
157 __rdsv3_wake_sk_sleep(rdsv3_rs_to_sk(rs));
158 rw_exit(&rs->rs_recv_lock);
159 }
160
161 /*ARGSUSED*/
162 static int
rdsv3_getname(sock_lower_handle_t proto_handle,struct sockaddr * addr,socklen_t * addr_len,cred_t * cr)163 rdsv3_getname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
164 socklen_t *addr_len, cred_t *cr)
165 {
166 struct rsock *sk = (struct rsock *)proto_handle;
167 struct sockaddr_in *sin = (struct sockaddr_in *)addr;
168 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
169
170 RDSV3_DPRINTF4("rdsv3_getname", "Enter(rs: %p, port: %d)", rs,
171 rs->rs_bound_port);
172
173 sin->sin_port = rs->rs_bound_port;
174 sin->sin_addr.s_addr = rs->rs_bound_addr;
175
176 sin->sin_family = AF_INET_OFFLOAD;
177
178 *addr_len = sizeof (*sin);
179 return (0);
180 }
181
182 /*
183 * RDS' poll is without a doubt the least intuitive part of the interface,
184 * as POLLIN and POLLOUT do not behave entirely as you would expect from
185 * a network protocol.
186 *
187 * POLLIN is asserted if
188 * - there is data on the receive queue.
189 * - to signal that a previously congested destination may have become
190 * uncongested
191 * - A notification has been queued to the socket (this can be a congestion
192 * update, or a RDMA completion).
193 *
194 * POLLOUT is asserted if there is room on the send queue. This does not mean
195 * however, that the next sendmsg() call will succeed. If the application tries
196 * to send to a congested destination, the system call may still fail (and
197 * return ENOBUFS).
198 */
199 /* ARGSUSED */
200 static short
rdsv3_poll(sock_lower_handle_t proto_handle,short events,int anyyet,cred_t * cr)201 rdsv3_poll(sock_lower_handle_t proto_handle, short events, int anyyet,
202 cred_t *cr)
203 {
204 struct rsock *sk = (struct rsock *)proto_handle;
205 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
206 unsigned short mask = 0;
207
208 #if 0
209 RDSV3_DPRINTF4("rdsv3_poll", "enter(%p %x %d)", rs, events, anyyet);
210 #endif
211
212 /*
213 * If rs_seen_congestion is on, wait until it's off.
214 * This is implemented for the following OFED code.
215 * if (rs->rs_seen_congestion)
216 * poll_wait(file, &rds_poll_waitq, wait);
217 */
218 mutex_enter(&rs->rs_congested_lock);
219 while (rs->rs_seen_congestion) {
220 cv_wait(&rs->rs_congested_cv,
221 &rs->rs_congested_lock);
222 }
223 mutex_exit(&rs->rs_congested_lock);
224
225 rw_enter(&rs->rs_recv_lock, RW_READER);
226 if (!rs->rs_cong_monitor) {
227 /*
228 * When a congestion map was updated, we signal POLLIN for
229 * "historical" reasons. Applications can also poll for
230 * WRBAND instead.
231 */
232 if (rdsv3_cong_updated_since(&rs->rs_cong_track))
233 mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
234 } else {
235 mutex_enter(&rs->rs_lock);
236 if (rs->rs_cong_notify)
237 mask |= (POLLIN | POLLRDNORM);
238 mutex_exit(&rs->rs_lock);
239 }
240 if (!list_is_empty(&rs->rs_recv_queue) ||
241 !list_is_empty(&rs->rs_notify_queue))
242 mask |= (POLLIN | POLLRDNORM);
243 if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs))
244 mask |= (POLLOUT | POLLWRNORM);
245
246 /* clear state any time we wake a seen-congested socket */
247 if (mask) {
248 mutex_enter(&rs->rs_congested_lock);
249 rs->rs_seen_congestion = 0;
250 mutex_exit(&rs->rs_congested_lock);
251 }
252
253 rw_exit(&rs->rs_recv_lock);
254
255 #if 0
256 RDSV3_DPRINTF4("rdsv3_poll", "return(%p %x)", rs, mask);
257 #endif
258
259 return (mask);
260 }
261
262 /* ARGSUSED */
263 static int
rdsv3_ioctl(sock_lower_handle_t proto_handle,int cmd,intptr_t arg,int mode,int32_t * rvalp,cred_t * cr)264 rdsv3_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
265 int mode, int32_t *rvalp, cred_t *cr)
266 {
267 ksocket_t so4;
268 struct lifconf lifc;
269 struct lifreq lifr, *lifrp;
270 struct ifconf ifc;
271 struct ifreq ifr;
272 int rval = 0, rc, len;
273 int numifs;
274 int bufsize;
275 void *buf;
276
277 RDSV3_DPRINTF4("rdsv3_ioctl", "enter: cmd: %d", cmd);
278
279 /* Only ipv4 for now */
280 rval = ksocket_socket(&so4, PF_INET, SOCK_DGRAM, 0, KSOCKET_NOSLEEP,
281 CRED());
282 if (rval != 0) {
283 RDSV3_DPRINTF2("rdsv3_ioctl", "ksocket_socket returned %d",
284 rval);
285 return (rval);
286 }
287
288 switch (cmd) {
289 case SIOCGLIFNUM :
290 case SIOCGIFNUM :
291 rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs);
292 if (rval != 0) break;
293 if (cmd == SIOCGLIFNUM) {
294 struct lifnum lifn;
295 lifn.lifn_family = AF_INET_OFFLOAD;
296 lifn.lifn_flags = 0;
297 lifn.lifn_count = numifs;
298 (void) ddi_copyout(&lifn, (void *)arg,
299 sizeof (struct lifnum), 0);
300 } else {
301 len = 0;
302 for (lifrp = (struct lifreq *)buf, rc = 0; rc < numifs;
303 rc++, lifrp++) {
304 if (strlen(lifrp->lifr_name) <= IFNAMSIZ) {
305 len++;
306 }
307 }
308 (void) ddi_copyout(&len, (void *)arg,
309 sizeof (int), 0);
310 }
311 kmem_free(buf, bufsize);
312 break;
313
314 case SIOCGLIFCONF :
315 if (ddi_copyin((void *)arg, &lifc, sizeof (struct lifconf), 0)
316 != 0) {
317 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifc");
318 rval = EFAULT;
319 break;
320 }
321
322 rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs);
323 if (rval != 0) {
324 RDSV3_DPRINTF2("rdsv3_ioctl",
325 "rdsv3_do_ip_ioctl failed: %d", rval);
326 break;
327 }
328
329 if ((lifc.lifc_len > 0) && (numifs > 0)) {
330 if (ddi_copyout(buf, (void *)lifc.lifc_req,
331 (lifc.lifc_len < bufsize) ? lifc.lifc_len :
332 bufsize, 0) != 0) {
333 RDSV3_DPRINTF2("rdsv3_ioctl",
334 "copyout of records failed");
335 rval = EFAULT;
336 }
337
338 }
339
340 lifc.lifc_len = bufsize;
341 if (ddi_copyout(&lifc, (void *)arg, sizeof (struct lifconf),
342 0) != 0) {
343 RDSV3_DPRINTF2("rdsv3_ioctl",
344 "copyout of lifconf failed");
345 rval = EFAULT;
346 }
347
348 kmem_free(buf, bufsize);
349 break;
350
351 case SIOCGIFCONF :
352 case O_SIOCGIFCONF :
353 if (ddi_copyin((void *)arg, &ifc, sizeof (struct ifconf), 0)
354 != 0) {
355 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifc");
356 rval = EFAULT;
357 break;
358 }
359
360 RDSV3_DPRINTF2("rdsv3_ioctl",
361 "O_SIOCGIFCONF: ifc_len: %d, req: %p",
362 ifc.ifc_len, ifc.ifc_req);
363
364 rval = rdsv3_do_ip_ioctl_old(so4, &buf, &bufsize, &numifs);
365 if (rval != 0) {
366 RDSV3_DPRINTF2("rdsv3_ioctl",
367 "rdsv3_do_ip_ioctl_old failed: %d", rval);
368 break;
369 }
370
371 if ((ifc.ifc_len > 0) && (numifs > 0)) {
372 if (ddi_copyout(buf, (void *)ifc.ifc_req,
373 (ifc.ifc_len < bufsize) ? ifc.ifc_len :
374 bufsize, 0) != 0) {
375 RDSV3_DPRINTF2("rdsv3_ioctl",
376 "copyout of records failed");
377 rval = EFAULT;
378 }
379
380 }
381
382 ifc.ifc_len = bufsize;
383 if (ddi_copyout(&ifc, (void *)arg, sizeof (struct ifconf),
384 0) != 0) {
385 RDSV3_DPRINTF2("rdsv3_ioctl",
386 "copyout of ifconf failed");
387 rval = EFAULT;
388 }
389
390 kmem_free(buf, bufsize);
391 break;
392
393 case SIOCGLIFFLAGS :
394 case SIOCSLIFFLAGS :
395 case SIOCGLIFMTU :
396 case SIOCGLIFNETMASK :
397 case SIOCGLIFINDEX :
398 if (ddi_copyin((void *)arg, &lifr, sizeof (struct lifreq), 0)
399 != 0) {
400 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifr");
401 rval = EFAULT;
402 break;
403 }
404
405 rc = ksocket_ioctl(so4, cmd, (intptr_t)&lifr, &rval, CRED());
406 if (rc != 0) {
407 RDSV3_DPRINTF2("rdsv3_ioctl",
408 "ksocket_ioctl failed: %d, name: %s cmd: 0x%x",
409 rc, lifr.lifr_name, cmd);
410 break;
411 }
412
413 (void) ddi_copyout(&lifr, (void *)arg,
414 sizeof (struct lifreq), 0);
415 break;
416
417 case SIOCGIFFLAGS :
418 case SIOCSIFFLAGS :
419 case SIOCGIFMTU :
420 case SIOCGIFNETMASK :
421 case SIOCGIFINDEX :
422 if (ddi_copyin((void *)arg, &ifr, sizeof (struct ifreq), 0)
423 != 0) {
424 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifr");
425 rval = EFAULT;
426 break;
427 }
428
429 RDSV3_DPRINTF2("rdsv3_ioctl", "1. name: %s", ifr.ifr_name);
430
431 rc = ksocket_ioctl(so4, cmd, (intptr_t)&ifr, &rval, CRED());
432 if (rc != 0) {
433 RDSV3_DPRINTF2("rdsv3_ioctl",
434 "ksocket_ioctl failed: %d, name: %s cmd: 0x%x",
435 rc, ifr.ifr_name, cmd);
436
437 break;
438 }
439
440 RDSV3_DPRINTF2("rdsv3_ioctl", "2. name: %s", ifr.ifr_name);
441
442 (void) ddi_copyout(&ifr, (void *)arg,
443 sizeof (struct ifreq), 0);
444 break;
445
446 default:
447 if ((cmd >= RDS_INFO_FIRST) &&
448 (cmd <= RDS_INFO_LAST)) {
449 return (rdsv3_info_ioctl((struct rsock *)proto_handle,
450 cmd, (char *)arg, rvalp));
451 }
452 RDSV3_DPRINTF2("rdsv3_ioctl", "Unknown ioctl cmd: %d", cmd);
453 cmn_err(CE_CONT, "unsupported IOCTL cmd: %d \n", cmd);
454 rval = EOPNOTSUPP;
455 }
456
457 (void) ksocket_close(so4, CRED());
458
459 RDSV3_DPRINTF4("rdsv3_ioctl", "return: %d cmd: %d", rval, cmd);
460
461 *rvalp = rval;
462 return (rval);
463 }
464
465 static int
rdsv3_cancel_sent_to(struct rdsv3_sock * rs,char * optval,int len)466 rdsv3_cancel_sent_to(struct rdsv3_sock *rs, char *optval, int len)
467 {
468 struct sockaddr_in sin;
469
470 /* racing with another thread binding seems ok here */
471 if (rs->rs_bound_addr == 0)
472 return (-ENOTCONN); /* XXX not a great errno */
473
474 if (len < sizeof (struct sockaddr_in))
475 return (-EINVAL);
476
477 if (ddi_copyin((void *)optval, &sin, sizeof (struct sockaddr_in),
478 0) != 0) {
479 RDSV3_DPRINTF2("rdsv3_cancel_sent_to", "ddi_copyin failed sin");
480 return (-EFAULT);
481 }
482
483 rdsv3_send_drop_to(rs, &sin);
484
485 return (0);
486 }
487
/*
 * Store a boolean socket option.
 *
 * optval must point at an int-sized value of at least sizeof (int) bytes;
 * *optvar is set to 1 when that int is non-zero and to 0 otherwise.
 *
 * Returns 0 on success, or -EINVAL (leaving *optvar untouched) when optval
 * is NULL or optlen is shorter than an int.
 */
static int
rdsv3_set_bool_option(unsigned char *optvar, char *optval, int optlen)
{
	unsigned char seen = 0;
	unsigned int i;

	/*
	 * Validate before touching optval.  The comparison is signed so that
	 * a negative optlen is rejected rather than promoted to a huge
	 * unsigned value.
	 */
	if (optval == NULL || optlen < (int)sizeof (int))
		return (-EINVAL);

	/*
	 * An int is non-zero exactly when any of its bytes is non-zero.
	 * Looking at every byte (not just the first) gives the right answer
	 * on both byte orders and needs no aligned access.
	 */
	for (i = 0; i < sizeof (int); i++)
		seen |= (unsigned char)optval[i];

	*optvar = (seen != 0);
	return (0);
}
498
499 static int
rdsv3_cong_monitor(struct rdsv3_sock * rs,char * optval,int optlen)500 rdsv3_cong_monitor(struct rdsv3_sock *rs, char *optval, int optlen)
501 {
502 int ret;
503
504 ret = rdsv3_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
505 if (ret == 0) {
506 if (rs->rs_cong_monitor) {
507 rdsv3_cong_add_socket(rs);
508 } else {
509 rdsv3_cong_remove_socket(rs);
510 rs->rs_cong_mask = 0;
511 rs->rs_cong_notify = 0;
512 }
513 }
514 return (ret);
515 }
516
517 /*ARGSUSED*/
518 static int
rdsv3_setsockopt(sock_lower_handle_t proto_handle,int level,int optname,const void * optval,socklen_t optlen,cred_t * cr)519 rdsv3_setsockopt(sock_lower_handle_t proto_handle, int level,
520 int optname, const void *optval, socklen_t optlen, cred_t *cr)
521 {
522 struct rsock *sk = (struct rsock *)proto_handle;
523 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
524 int ret = 0;
525
526 RDSV3_DPRINTF4("rdsv3_setsockopt", "enter(%p %d %d)",
527 rs, level, optname);
528
529 switch (optname) {
530 case RDS_CANCEL_SENT_TO:
531 ret = rdsv3_cancel_sent_to(rs, (char *)optval, optlen);
532 break;
533 case RDS_GET_MR:
534 ret = rdsv3_get_mr(rs, optval, optlen);
535 break;
536 case RDS_GET_MR_FOR_DEST:
537 ret = rdsv3_get_mr_for_dest(rs, optval, optlen);
538 break;
539 case RDS_FREE_MR:
540 ret = rdsv3_free_mr(rs, optval, optlen);
541 break;
542 case RDS_RECVERR:
543 ret = rdsv3_set_bool_option(&rs->rs_recverr,
544 (char *)optval, optlen);
545 break;
546 case RDS_CONG_MONITOR:
547 ret = rdsv3_cong_monitor(rs, (char *)optval, optlen);
548 break;
549 case SO_SNDBUF:
550 sk->sk_sndbuf = *(uint_t *)optval;
551 return (ret);
552 case SO_RCVBUF:
553 sk->sk_rcvbuf = *(uint_t *)optval;
554 return (ret);
555 default:
556 #if 1
557 break;
558 #else
559 ret = -ENOPROTOOPT;
560 #endif
561 }
562 return (ret);
563 }
564
565 /* XXX */
566 /*ARGSUSED*/
567 static int
rdsv3_getsockopt(sock_lower_handle_t proto_handle,int level,int optname,void * optval,socklen_t * optlen,cred_t * cr)568 rdsv3_getsockopt(sock_lower_handle_t proto_handle, int level,
569 int optname, void *optval, socklen_t *optlen, cred_t *cr)
570 {
571 struct rsock *sk = (struct rsock *)proto_handle;
572 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
573 int ret = 0;
574
575 RDSV3_DPRINTF4("rdsv3_getsockopt", "enter(%p %d %d)",
576 rs, optname, *optlen);
577
578 switch (optname) {
579 case SO_SNDBUF:
580 RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_SNDBUF(%d)",
581 sk->sk_sndbuf);
582 if (*optlen != 0) {
583 *((int *)optval) = sk->sk_sndbuf;
584 *optlen = sizeof (uint_t);
585 }
586 return (ret);
587 case SO_RCVBUF:
588 RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_RCVBUF(%d)",
589 sk->sk_rcvbuf);
590 if (*optlen != 0) {
591 *((int *)optval) = sk->sk_rcvbuf;
592 *optlen = sizeof (uint_t);
593 }
594 return (ret);
595 case RDS_RECVERR:
596 RDSV3_DPRINTF4("rdsv3_getsockopt", "RDSV3_RECVERR(%d)",
597 rs->rs_recverr);
598 if (*optlen < sizeof (int))
599 return (-EINVAL);
600 else {
601 *(int *)optval = rs->rs_recverr;
602 *optlen = sizeof (int);
603 }
604 return (0);
605 default:
606 RDSV3_DPRINTF2("rdsv3_getsockopt",
607 "Unknown: level: %d optname: %d", level, optname);
608 ret = -ENOPROTOOPT;
609 }
610
611 RDSV3_DPRINTF4("rdsv3_getsockopt", "return(%p %d %d)",
612 rs, optname, ret);
613 return (ret);
614 }
615
616 /*ARGSUSED*/
rdsv3_connect(sock_lower_handle_t proto_handle,const struct sockaddr * addr,socklen_t addr_len,sock_connid_t * conn,cred_t * cr)617 static int rdsv3_connect(sock_lower_handle_t proto_handle,
618 const struct sockaddr *addr, socklen_t addr_len, sock_connid_t *conn,
619 cred_t *cr)
620 {
621 struct rsock *sk = (struct rsock *)proto_handle;
622 struct sockaddr_in *sin = (struct sockaddr_in *)addr;
623 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
624 int ret = 0;
625
626 RDSV3_DPRINTF4("rdsv3_connect", "Enter(rs: %p)", rs);
627
628 mutex_enter(&sk->sk_lock);
629
630 if (addr_len != sizeof (struct sockaddr_in)) {
631 ret = -EINVAL;
632 goto out;
633 }
634
635 if (sin->sin_family != AF_INET_OFFLOAD) {
636 ret = -EAFNOSUPPORT;
637 goto out;
638 }
639
640 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
641 ret = -EDESTADDRREQ;
642 goto out;
643 }
644
645 rs->rs_conn_addr = sin->sin_addr.s_addr;
646 rs->rs_conn_port = sin->sin_port;
647
648 sk->sk_upcalls->su_connected(sk->sk_upper_handle, 0, NULL, -1);
649
650 RDSV3_DPRINTF4("rdsv3_connect", "Return(rs: %p)", rs);
651
652 out:
653 mutex_exit(&sk->sk_lock);
654 return (ret);
655 }
656
657 /*ARGSUSED*/
658 static int
rdsv3_shutdown(sock_lower_handle_t proto_handle,int how,cred_t * cr)659 rdsv3_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
660 {
661 struct rsock *sk = (struct rsock *)proto_handle;
662 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
663
664 RDSV3_DPRINTF4("rdsv3_shutdown", "Enter(rs: %p)", rs);
665
666 return (0);
667 }
668
669 /*ARGSUSED*/
670 void
rdsv3_activate(sock_lower_handle_t proto_handle,sock_upper_handle_t sock_handle,sock_upcalls_t * sock_upcalls,int flags,cred_t * cr)671 rdsv3_activate(sock_lower_handle_t proto_handle,
672 sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls,
673 int flags, cred_t *cr)
674 {
675 struct rsock *sk = (struct rsock *)proto_handle;
676 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
677
678 RDSV3_DPRINTF4("rdsv3_activate", "Enter(rs: %p)", rs);
679
680 sk->sk_upcalls = sock_upcalls;
681 sk->sk_upper_handle = sock_handle;
682
683 RDSV3_DPRINTF4("rdsv3_activate", "Return (rs: %p)", rs);
684 }
685
686
687 /* ARGSUSED */
688 int
rdsv3_send_uio(sock_lower_handle_t proto_handle,uio_t * uio,struct nmsghdr * msg,cred_t * cr)689 rdsv3_send_uio(sock_lower_handle_t proto_handle, uio_t *uio,
690 struct nmsghdr *msg, cred_t *cr)
691 {
692 struct rsock *sk = (struct rsock *)proto_handle;
693 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
694 int ret;
695
696 RDSV3_DPRINTF4("rdsv3_send_uio", "Enter(rs: %p)", rs);
697 ret = rdsv3_sendmsg(rs, uio, msg, uio->uio_resid);
698
699 RDSV3_DPRINTF4("rdsv3_send_uio", "Return(rs: %p ret %d)", rs, ret);
700 if (ret < 0) {
701 return (-ret);
702 }
703
704 return (0);
705 }
706
707 /* ARGSUSED */
708 int
rdsv3_recv_uio(sock_lower_handle_t proto_handle,uio_t * uio,struct nmsghdr * msg,cred_t * cr)709 rdsv3_recv_uio(sock_lower_handle_t proto_handle, uio_t *uio,
710 struct nmsghdr *msg, cred_t *cr)
711 {
712 struct rsock *sk = (struct rsock *)proto_handle;
713 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
714 int ret;
715
716 RDSV3_DPRINTF4("rdsv3_recv_uio", "Enter (rs: %p)", rs);
717 ret = rdsv3_recvmsg(rs, uio, msg, uio->uio_resid, msg->msg_flags);
718
719 RDSV3_DPRINTF4("rdsv3_recv_uio", "Return(rs: %p ret %d)", rs, ret);
720
721 if (ret < 0) {
722 return (-ret);
723 }
724
725 return (0);
726 }
727
728 /*ARGSUSED*/
729 int
rdsv3_getpeername(sock_lower_handle_t proto_handle,struct sockaddr * addr,socklen_t * addr_len,cred_t * cr)730 rdsv3_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
731 socklen_t *addr_len, cred_t *cr)
732 {
733 struct sockaddr_in *sin = (struct sockaddr_in *)addr;
734 struct rsock *sk = (struct rsock *)proto_handle;
735 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
736
737 RDSV3_DPRINTF2("rdsv3_getpeername", "enter(rs: %p)", rs);
738
739 (void) memset(sin->sin_zero, 0, sizeof (sin->sin_zero));
740
741 /* racey, don't care */
742 if (!rs->rs_conn_addr)
743 return (-ENOTCONN);
744
745 sin->sin_port = rs->rs_conn_port;
746 sin->sin_addr.s_addr = rs->rs_conn_addr;
747
748 sin->sin_family = AF_INET_OFFLOAD;
749
750 *addr_len = sizeof (*sin);
751 return (0);
752 }
753
754 void
rdsv3_clrflowctrl(sock_lower_handle_t proto_handle)755 rdsv3_clrflowctrl(sock_lower_handle_t proto_handle)
756 {
757 struct rsock *sk = (struct rsock *)proto_handle;
758 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
759
760 RDSV3_DPRINTF2("rdsv3_clrflowctrl", "enter(rs: %p)", rs);
761 }
762
763 #ifndef __lock_lint
764 static struct sock_downcalls_s rdsv3_sock_downcalls = {
765 .sd_close = rdsv3_release,
766 .sd_bind = rdsv3_bind,
767 .sd_connect = rdsv3_connect,
768 .sd_accept = NULL,
769 .sd_getsockname = rdsv3_getname,
770 .sd_poll = rdsv3_poll,
771 .sd_ioctl = rdsv3_ioctl,
772 .sd_listen = NULL,
773 .sd_shutdown = rdsv3_shutdown,
774 .sd_setsockopt = rdsv3_setsockopt,
775 .sd_getsockopt = rdsv3_getsockopt,
776 .sd_send_uio = rdsv3_send_uio,
777 .sd_recv_uio = rdsv3_recv_uio,
778 .sd_activate = rdsv3_activate,
779 .sd_getpeername = rdsv3_getpeername,
780 .sd_send = NULL,
781 .sd_clr_flowctrl = NULL
782 };
783 #else
784 static struct sock_downcalls_s rdsv3_sock_downcalls = {
785 rdsv3_activate,
786 NULL,
787 rdsv3_bind,
788 NULL,
789 rdsv3_connect,
790 rdsv3_getpeername,
791 rdsv3_getname,
792 rdsv3_getsockopt,
793 rdsv3_setsockopt,
794 NULL,
795 rdsv3_send_uio,
796 rdsv3_recv_uio,
797 rdsv3_poll,
798 rdsv3_shutdown,
799 NULL,
800 rdsv3_ioctl,
801 rdsv3_release
802 };
803 #endif
804
805 sock_lower_handle_t
rdsv3_create(int family,int type,int proto,sock_downcalls_t ** sock_downcalls,uint_t * smodep,int * errorp,int flags,cred_t * credp)806 rdsv3_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
807 uint_t *smodep, int *errorp, int flags, cred_t *credp)
808 {
809 struct rdsv3_sock *rs;
810 struct rsock *sk;
811
812 RDSV3_DPRINTF4("rdsv3_create", "Enter (family: %d type: %d, proto: %d "
813 "flags: %d", family, type, proto, flags);
814
815 sk = rdsv3_sk_alloc();
816 if (sk == NULL)
817 return (NULL);
818 rdsv3_sock_init_data(sk);
819
820 rs = rdsv3_sk_to_rs(sk);
821 rs->rs_sk = sk;
822 mutex_init(&rs->rs_lock, NULL, MUTEX_DRIVER, NULL);
823 rw_init(&rs->rs_recv_lock, NULL, RW_DRIVER, NULL);
824 list_create(&rs->rs_send_queue, sizeof (struct rdsv3_message),
825 offsetof(struct rdsv3_message, m_sock_item));
826 list_create(&rs->rs_recv_queue, sizeof (struct rdsv3_incoming),
827 offsetof(struct rdsv3_incoming, i_item));
828 list_create(&rs->rs_notify_queue, sizeof (struct rdsv3_notifier),
829 offsetof(struct rdsv3_notifier, n_list));
830 mutex_init(&rs->rs_rdma_lock, NULL, MUTEX_DRIVER, NULL);
831 avl_create(&rs->rs_rdma_keys, rdsv3_mr_compare,
832 sizeof (struct rdsv3_mr), offsetof(struct rdsv3_mr, r_rb_node));
833 mutex_init(&rs->rs_conn_lock, NULL, MUTEX_DRIVER, NULL);
834 mutex_init(&rs->rs_congested_lock, NULL, MUTEX_DRIVER, NULL);
835 cv_init(&rs->rs_congested_cv, NULL, CV_DRIVER, NULL);
836 rs->rs_cred = credp;
837 rs->rs_zoneid = getzoneid();
838 crhold(credp);
839
840 mutex_enter(&rdsv3_sock_lock);
841 list_insert_tail(&rdsv3_sock_list, rs);
842 rdsv3_sock_count++;
843 /* Initialize RDMA/IB on the 1st socket if not done at attach */
844 if (rdsv3_sock_count == 1) {
845 rdsv3_rdma_init();
846 }
847 mutex_exit(&rdsv3_sock_lock);
848
849 *errorp = 0;
850 *smodep = SM_ATOMIC;
851 *sock_downcalls = &rdsv3_sock_downcalls;
852
853 RDSV3_DPRINTF4("rdsv3_create", "Return: %p", rs);
854
855 return ((sock_lower_handle_t)rdsv3_rs_to_sk(rs));
856 }
857
/* Take a reference on the rsock that backs rs. */
void
rdsv3_sock_addref(struct rdsv3_sock *rs)
{
	struct rsock *sk;

	RDSV3_DPRINTF4("rdsv3_sock_addref", "Enter(rs: %p)", rs);
	sk = rdsv3_rs_to_sk(rs);
	rdsv3_sk_sock_hold(sk);
}
864
/* Drop a reference on the rsock that backs rs. */
void
rdsv3_sock_put(struct rdsv3_sock *rs)
{
	struct rsock *sk;

	RDSV3_DPRINTF4("rdsv3_sock_put", "Enter(rs: %p)", rs);
	sk = rdsv3_rs_to_sk(rs);
	rdsv3_sk_sock_put(sk);
}
871
872 static void
rdsv3_sock_inc_info(struct rsock * sock,unsigned int len,struct rdsv3_info_iterator * iter,struct rdsv3_info_lengths * lens)873 rdsv3_sock_inc_info(struct rsock *sock, unsigned int len,
874 struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens)
875 {
876 struct rdsv3_sock *rs;
877 struct rdsv3_incoming *inc;
878 unsigned int total = 0;
879
880 RDSV3_DPRINTF4("rdsv3_sock_inc_info", "Enter(rs: %p)",
881 rdsv3_sk_to_rs(sock));
882
883 len /= sizeof (struct rds_info_message);
884
885 mutex_enter(&rdsv3_sock_lock);
886
887 RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) {
888 rw_enter(&rs->rs_recv_lock, RW_READER);
889
890 /* XXX too lazy to maintain counts.. */
891 RDSV3_FOR_EACH_LIST_NODE(inc, &rs->rs_recv_queue, i_item) {
892 total++;
893 if (total <= len)
894 rdsv3_inc_info_copy(inc, iter, inc->i_saddr,
895 rs->rs_bound_addr, 1);
896 }
897
898 rw_exit(&rs->rs_recv_lock);
899 }
900
901 mutex_exit(&rdsv3_sock_lock);
902
903 lens->nr = total;
904 lens->each = sizeof (struct rds_info_message);
905
906 RDSV3_DPRINTF4("rdsv3_sock_inc_info", "return(rs: %p)",
907 rdsv3_sk_to_rs(sock));
908 }
909
910 static void
rdsv3_sock_info(struct rsock * sock,unsigned int len,struct rdsv3_info_iterator * iter,struct rdsv3_info_lengths * lens)911 rdsv3_sock_info(struct rsock *sock, unsigned int len,
912 struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens)
913 {
914 struct rds_info_socket sinfo;
915 struct rdsv3_sock *rs;
916 unsigned long bytes;
917
918 RDSV3_DPRINTF4("rdsv3_sock_info", "Enter(rs: %p)",
919 rdsv3_sk_to_rs(sock));
920
921 len /= sizeof (struct rds_info_socket);
922
923 mutex_enter(&rdsv3_sock_lock);
924
925 if ((len < rdsv3_sock_count) || (iter->addr == NULL))
926 goto out;
927
928 bytes = sizeof (struct rds_info_socket);
929 RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) {
930 sinfo.sndbuf = rdsv3_sk_sndbuf(rs);
931 sinfo.rcvbuf = rdsv3_sk_rcvbuf(rs);
932 sinfo.bound_addr = rs->rs_bound_addr;
933 sinfo.connected_addr = rs->rs_conn_addr;
934 sinfo.bound_port = rs->rs_bound_port;
935 sinfo.connected_port = rs->rs_conn_port;
936
937 rdsv3_info_copy(iter, &sinfo, bytes);
938 }
939
940 RDSV3_DPRINTF4("rdsv3_sock_info", "Return(rs: %p)",
941 rdsv3_sk_to_rs(sock));
942
943 out:
944 lens->nr = rdsv3_sock_count;
945 lens->each = sizeof (struct rds_info_socket);
946
947 mutex_exit(&rdsv3_sock_lock);
948 }
949
/*
 * Delayed-work item that runs rdsv3_rdma_init_worker().  It is allocated and
 * queued by rdsv3_init(), and cancelled and freed by rdsv3_exit().
 */
rdsv3_delayed_work_t *rdsv3_rdma_dwp = NULL;
/* Delay passed to rdsv3_queue_delayed_work() when queueing that item. */
uint_t rdsv3_rdma_init_delay = 5; /* secs */
extern void rdsv3_rdma_init_worker(struct rdsv3_work_s *work);
953
954 void
rdsv3_exit(void)955 rdsv3_exit(void)
956 {
957 RDSV3_DPRINTF4("rdsv3_exit", "Enter");
958
959 if (rdsv3_rdma_dwp) {
960 rdsv3_cancel_delayed_work(rdsv3_rdma_dwp);
961 }
962
963 (void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_rdma_exit,
964 NULL, DDI_SLEEP);
965 while (rdsv3_rdma_listen_id != NULL) {
966 #ifndef __lock_lint
967 RDSV3_DPRINTF5("rdsv3", "%s-%d Waiting for rdsv3_rdma_exit",
968 __func__, __LINE__);
969 #endif
970 delay(drv_usectohz(1000));
971 }
972
973 rdsv3_conn_exit();
974 rdsv3_cong_exit();
975 rdsv3_sysctl_exit();
976 rdsv3_threads_exit();
977 rdsv3_stats_exit();
978 rdsv3_info_deregister_func(RDS_INFO_SOCKETS, rdsv3_sock_info);
979 rdsv3_info_deregister_func(RDS_INFO_RECV_MESSAGES,
980 rdsv3_sock_inc_info);
981
982 if (rdsv3_rdma_dwp) {
983 kmem_free(rdsv3_rdma_dwp, sizeof (rdsv3_delayed_work_t));
984 rdsv3_rdma_dwp = NULL;
985 }
986
987 RDSV3_DPRINTF4("rdsv3_exit", "Return");
988 }
989
990 /*ARGSUSED*/
991 int
rdsv3_init()992 rdsv3_init()
993 {
994 int ret;
995
996 RDSV3_DPRINTF4("rdsv3_init", "Enter");
997
998 rdsv3_cong_init();
999
1000 ret = rdsv3_conn_init();
1001 if (ret)
1002 goto out;
1003 ret = rdsv3_threads_init();
1004 if (ret)
1005 goto out_conn;
1006 ret = rdsv3_sysctl_init();
1007 if (ret)
1008 goto out_threads;
1009 ret = rdsv3_stats_init();
1010 if (ret)
1011 goto out_sysctl;
1012
1013 rdsv3_info_register_func(RDS_INFO_SOCKETS, rdsv3_sock_info);
1014 rdsv3_info_register_func(RDS_INFO_RECV_MESSAGES, rdsv3_sock_inc_info);
1015
1016 /* rdsv3_rdma_init need to be called with a little delay */
1017 rdsv3_rdma_dwp = kmem_zalloc(sizeof (rdsv3_delayed_work_t), KM_SLEEP);
1018 RDSV3_INIT_DELAYED_WORK(rdsv3_rdma_dwp, rdsv3_rdma_init_worker);
1019 rdsv3_queue_delayed_work(rdsv3_wq, rdsv3_rdma_dwp,
1020 rdsv3_rdma_init_delay);
1021
1022 RDSV3_DPRINTF4("rdsv3_init", "Return");
1023
1024 goto out;
1025
1026 out_sysctl:
1027 rdsv3_sysctl_exit();
1028 out_threads:
1029 rdsv3_threads_exit();
1030 out_conn:
1031 rdsv3_conn_exit();
1032 rdsv3_cong_exit();
1033 out:
1034 return (ret);
1035 }
1036