1 /* 2 * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. 3 * 4 * This software is available to you under a choice of one of two 5 * licenses. You may choose to be licensed under the terms of the GNU 6 * General Public License (GPL) Version 2, available from the file 7 * COPYING in the main directory of this source tree, or the 8 * OpenIB.org BSD license below: 9 * 10 * Redistribution and use in source and binary forms, with or 11 * without modification, are permitted provided that the following 12 * conditions are met: 13 * 14 * - Redistributions of source code must retain the above 15 * copyright notice, this list of conditions and the following 16 * disclaimer. 17 * 18 * - Redistributions in binary form must reproduce the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer in the documentation and/or other materials 21 * provided with the distribution. 22 * 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 * SOFTWARE. 31 * 32 */ 33 #include <linux/module.h> 34 #include <linux/errno.h> 35 #include <linux/kernel.h> 36 #include <linux/gfp.h> 37 #include <linux/in.h> 38 #include <linux/ipv6.h> 39 #include <linux/poll.h> 40 #include <net/sock.h> 41 42 #include "rds.h" 43 44 /* this is just used for stats gathering :/ */ 45 static DEFINE_SPINLOCK(rds_sock_lock); 46 static LIST_HEAD(rds_sock_list); 47 DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq); 48 49 /* 50 * This is called as the final descriptor referencing this socket is closed. 51 * We have to unbind the socket so that another socket can be bound to the 52 * address it was using. 53 * 54 * We have to be careful about racing with the incoming path. sock_orphan() 55 * sets SOCK_DEAD and we use that as an indicator to the rx path that new 56 * messages shouldn't be queued. 57 */ 58 static int rds_release(struct socket *sock) 59 { 60 struct sock *sk = sock->sk; 61 struct rds_sock *rs; 62 63 if (!sk) 64 goto out; 65 66 rs = rds_sk_to_rs(sk); 67 68 sock_orphan(sk); 69 /* Note - rds_clear_recv_queue grabs rs_recv_lock, so 70 * that ensures the recv path has completed messing 71 * with the socket. */ 72 rds_clear_recv_queue(rs); 73 rds_cong_remove_socket(rs); 74 75 rds_remove_bound(rs); 76 77 rds_send_drop_to(rs, NULL); 78 rds_rdma_drop_keys(rs); 79 rds_notify_queue_get(rs, NULL); 80 rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue); 81 82 spin_lock_bh(&rds_sock_lock); 83 list_del_init(&rs->rs_item); 84 spin_unlock_bh(&rds_sock_lock); 85 86 rds_trans_put(rs->rs_transport); 87 88 sock->sk = NULL; 89 sock_put(sk); 90 out: 91 return 0; 92 } 93 94 /* 95 * Careful not to race with rds_release -> sock_orphan which clears sk_sleep. 96 * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK 97 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but 98 * this seems more conservative. 99 * NB - normally, one would use sk_callback_lock for this, but we can 100 * get here from interrupts, whereas the network code grabs sk_callback_lock 101 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. 102 */ 103 void rds_wake_sk_sleep(struct rds_sock *rs) 104 { 105 unsigned long flags; 106 107 read_lock_irqsave(&rs->rs_recv_lock, flags); 108 __rds_wake_sk_sleep(rds_rs_to_sk(rs)); 109 read_unlock_irqrestore(&rs->rs_recv_lock, flags); 110 } 111 112 static int rds_getname(struct socket *sock, struct sockaddr *uaddr, 113 int peer) 114 { 115 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 116 struct sockaddr_in6 *sin6; 117 struct sockaddr_in *sin; 118 int uaddr_len; 119 120 /* racey, don't care */ 121 if (peer) { 122 if (ipv6_addr_any(&rs->rs_conn_addr)) 123 return -ENOTCONN; 124 125 if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) { 126 sin = (struct sockaddr_in *)uaddr; 127 memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 128 sin->sin_family = AF_INET; 129 sin->sin_port = rs->rs_conn_port; 130 sin->sin_addr.s_addr = rs->rs_conn_addr_v4; 131 uaddr_len = sizeof(*sin); 132 } else { 133 sin6 = (struct sockaddr_in6 *)uaddr; 134 sin6->sin6_family = AF_INET6; 135 sin6->sin6_port = rs->rs_conn_port; 136 sin6->sin6_addr = rs->rs_conn_addr; 137 sin6->sin6_flowinfo = 0; 138 /* scope_id is the same as in the bound address. */ 139 sin6->sin6_scope_id = rs->rs_bound_scope_id; 140 uaddr_len = sizeof(*sin6); 141 } 142 } else { 143 /* If socket is not yet bound and the socket is connected, 144 * set the return address family to be the same as the 145 * connected address, but with 0 address value. If it is not 146 * connected, set the family to be AF_UNSPEC (value 0) and 147 * the address size to be that of an IPv4 address. 148 */ 149 if (ipv6_addr_any(&rs->rs_bound_addr)) { 150 if (ipv6_addr_any(&rs->rs_conn_addr)) { 151 sin = (struct sockaddr_in *)uaddr; 152 memset(sin, 0, sizeof(*sin)); 153 sin->sin_family = AF_UNSPEC; 154 return sizeof(*sin); 155 } 156 157 #if IS_ENABLED(CONFIG_IPV6) 158 if (!(ipv6_addr_type(&rs->rs_conn_addr) & 159 IPV6_ADDR_MAPPED)) { 160 sin6 = (struct sockaddr_in6 *)uaddr; 161 memset(sin6, 0, sizeof(*sin6)); 162 sin6->sin6_family = AF_INET6; 163 return sizeof(*sin6); 164 } 165 #endif 166 167 sin = (struct sockaddr_in *)uaddr; 168 memset(sin, 0, sizeof(*sin)); 169 sin->sin_family = AF_INET; 170 return sizeof(*sin); 171 } 172 if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) { 173 sin = (struct sockaddr_in *)uaddr; 174 memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 175 sin->sin_family = AF_INET; 176 sin->sin_port = rs->rs_bound_port; 177 sin->sin_addr.s_addr = rs->rs_bound_addr_v4; 178 uaddr_len = sizeof(*sin); 179 } else { 180 sin6 = (struct sockaddr_in6 *)uaddr; 181 sin6->sin6_family = AF_INET6; 182 sin6->sin6_port = rs->rs_bound_port; 183 sin6->sin6_addr = rs->rs_bound_addr; 184 sin6->sin6_flowinfo = 0; 185 sin6->sin6_scope_id = rs->rs_bound_scope_id; 186 uaddr_len = sizeof(*sin6); 187 } 188 } 189 190 return uaddr_len; 191 } 192 193 /* 194 * RDS' poll is without a doubt the least intuitive part of the interface, 195 * as EPOLLIN and EPOLLOUT do not behave entirely as you would expect from 196 * a network protocol. 197 * 198 * EPOLLIN is asserted if 199 * - there is data on the receive queue. 200 * - to signal that a previously congested destination may have become 201 * uncongested 202 * - A notification has been queued to the socket (this can be a congestion 203 * update, or a RDMA completion, or a MSG_ZEROCOPY completion). 204 * 205 * EPOLLOUT is asserted if there is room on the send queue. This does not mean 206 * however, that the next sendmsg() call will succeed. If the application tries 207 * to send to a congested destination, the system call may still fail (and 208 * return ENOBUFS). 209 */ 210 static __poll_t rds_poll(struct file *file, struct socket *sock, 211 poll_table *wait) 212 { 213 struct sock *sk = sock->sk; 214 struct rds_sock *rs = rds_sk_to_rs(sk); 215 __poll_t mask = 0; 216 unsigned long flags; 217 218 poll_wait(file, sk_sleep(sk), wait); 219 220 if (READ_ONCE(rs->rs_seen_congestion)) 221 poll_wait(file, &rds_poll_waitq, wait); 222 223 read_lock_irqsave(&rs->rs_recv_lock, flags); 224 if (!rs->rs_cong_monitor) { 225 /* When a congestion map was updated, we signal EPOLLIN for 226 * "historical" reasons. Applications can also poll for 227 * WRBAND instead. */ 228 if (rds_cong_updated_since(&rs->rs_cong_track)) 229 mask |= (EPOLLIN | EPOLLRDNORM | EPOLLWRBAND); 230 } else { 231 spin_lock(&rs->rs_lock); 232 if (rs->rs_cong_notify) 233 mask |= (EPOLLIN | EPOLLRDNORM); 234 spin_unlock(&rs->rs_lock); 235 } 236 if (!list_empty(&rs->rs_recv_queue) || 237 !list_empty(&rs->rs_notify_queue) || 238 !list_empty(&rs->rs_zcookie_queue.zcookie_head)) 239 mask |= (EPOLLIN | EPOLLRDNORM); 240 if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) 241 mask |= (EPOLLOUT | EPOLLWRNORM); 242 if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) 243 mask |= EPOLLERR; 244 read_unlock_irqrestore(&rs->rs_recv_lock, flags); 245 246 /* clear state any time we wake a seen-congested socket */ 247 if (mask) 248 WRITE_ONCE(rs->rs_seen_congestion, 0); 249 250 return mask; 251 } 252 253 static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 254 { 255 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 256 rds_tos_t utos, tos = 0; 257 258 switch (cmd) { 259 case SIOCRDSSETTOS: 260 if (get_user(utos, (rds_tos_t __user *)arg)) 261 return -EFAULT; 262 263 if (rs->rs_transport && 264 rs->rs_transport->get_tos_map) 265 tos = rs->rs_transport->get_tos_map(utos); 266 else 267 return -ENOIOCTLCMD; 268 269 spin_lock_bh(&rds_sock_lock); 270 if (rs->rs_tos || rs->rs_conn) { 271 spin_unlock_bh(&rds_sock_lock); 272 return -EINVAL; 273 } 274 rs->rs_tos = tos; 275 spin_unlock_bh(&rds_sock_lock); 276 break; 277 case SIOCRDSGETTOS: 278 spin_lock_bh(&rds_sock_lock); 279 tos = rs->rs_tos; 280 spin_unlock_bh(&rds_sock_lock); 281 if (put_user(tos, (rds_tos_t __user *)arg)) 282 return -EFAULT; 283 break; 284 default: 285 return -ENOIOCTLCMD; 286 } 287 288 return 0; 289 } 290 291 static int rds_cancel_sent_to(struct rds_sock *rs, sockptr_t optval, int len) 292 { 293 struct sockaddr_in6 sin6; 294 struct sockaddr_in sin; 295 int ret = 0; 296 297 /* racing with another thread binding seems ok here */ 298 if (ipv6_addr_any(&rs->rs_bound_addr)) { 299 ret = -ENOTCONN; /* XXX not a great errno */ 300 goto out; 301 } 302 303 if (len < sizeof(struct sockaddr_in)) { 304 ret = -EINVAL; 305 goto out; 306 } else if (len < sizeof(struct sockaddr_in6)) { 307 /* Assume IPv4 */ 308 if (copy_from_sockptr(&sin, optval, 309 sizeof(struct sockaddr_in))) { 310 ret = -EFAULT; 311 goto out; 312 } 313 ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr); 314 sin6.sin6_port = sin.sin_port; 315 } else { 316 if (copy_from_sockptr(&sin6, optval, 317 sizeof(struct sockaddr_in6))) { 318 ret = -EFAULT; 319 goto out; 320 } 321 } 322 323 rds_send_drop_to(rs, &sin6); 324 out: 325 return ret; 326 } 327 328 static int rds_set_bool_option(unsigned char *optvar, sockptr_t optval, 329 int optlen) 330 { 331 int value; 332 333 if (optlen < sizeof(int)) 334 return -EINVAL; 335 if (copy_from_sockptr(&value, optval, sizeof(int))) 336 return -EFAULT; 337 *optvar = !!value; 338 return 0; 339 } 340 341 static int rds_cong_monitor(struct rds_sock *rs, sockptr_t optval, int optlen) 342 { 343 int ret; 344 345 ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen); 346 if (ret == 0) { 347 if (rs->rs_cong_monitor) { 348 rds_cong_add_socket(rs); 349 } else { 350 rds_cong_remove_socket(rs); 351 rs->rs_cong_mask = 0; 352 rs->rs_cong_notify = 0; 353 } 354 } 355 return ret; 356 } 357 358 static int rds_set_transport(struct net *net, struct rds_sock *rs, 359 sockptr_t optval, int optlen) 360 { 361 int t_type; 362 363 if (rs->rs_transport) 364 return -EOPNOTSUPP; /* previously attached to transport */ 365 366 if (optlen != sizeof(int)) 367 return -EINVAL; 368 369 if (copy_from_sockptr(&t_type, optval, sizeof(t_type))) 370 return -EFAULT; 371 372 if (t_type < 0 || t_type >= RDS_TRANS_COUNT) 373 return -EINVAL; 374 375 /* RDS/IB is restricted to the initial network namespace */ 376 if (t_type != RDS_TRANS_TCP && !net_eq(net, &init_net)) 377 return -EPROTOTYPE; 378 379 rs->rs_transport = rds_trans_get(t_type); 380 381 return rs->rs_transport ? 0 : -ENOPROTOOPT; 382 } 383 384 static int rds_enable_recvtstamp(struct sock *sk, sockptr_t optval, 385 int optlen, int optname) 386 { 387 int val, valbool; 388 389 if (optlen != sizeof(int)) 390 return -EFAULT; 391 392 if (copy_from_sockptr(&val, optval, sizeof(int))) 393 return -EFAULT; 394 395 valbool = val ? 1 : 0; 396 397 if (optname == SO_TIMESTAMP_NEW) 398 sock_set_flag(sk, SOCK_TSTAMP_NEW); 399 400 if (valbool) 401 sock_set_flag(sk, SOCK_RCVTSTAMP); 402 else 403 sock_reset_flag(sk, SOCK_RCVTSTAMP); 404 405 return 0; 406 } 407 408 static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval, 409 int optlen) 410 { 411 struct rds_rx_trace_so trace; 412 int i; 413 414 if (optlen != sizeof(struct rds_rx_trace_so)) 415 return -EFAULT; 416 417 if (copy_from_sockptr(&trace, optval, sizeof(trace))) 418 return -EFAULT; 419 420 if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX) 421 return -EFAULT; 422 423 rs->rs_rx_traces = trace.rx_traces; 424 for (i = 0; i < rs->rs_rx_traces; i++) { 425 if (trace.rx_trace_pos[i] >= RDS_MSG_RX_DGRAM_TRACE_MAX) { 426 rs->rs_rx_traces = 0; 427 return -EFAULT; 428 } 429 rs->rs_rx_trace[i] = trace.rx_trace_pos[i]; 430 } 431 432 return 0; 433 } 434 435 static int rds_setsockopt(struct socket *sock, int level, int optname, 436 sockptr_t optval, unsigned int optlen) 437 { 438 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 439 struct net *net = sock_net(sock->sk); 440 int ret; 441 442 if (level != SOL_RDS) { 443 ret = -ENOPROTOOPT; 444 goto out; 445 } 446 447 switch (optname) { 448 case RDS_CANCEL_SENT_TO: 449 ret = rds_cancel_sent_to(rs, optval, optlen); 450 break; 451 case RDS_GET_MR: 452 ret = rds_get_mr(rs, optval, optlen); 453 break; 454 case RDS_GET_MR_FOR_DEST: 455 ret = rds_get_mr_for_dest(rs, optval, optlen); 456 break; 457 case RDS_FREE_MR: 458 ret = rds_free_mr(rs, optval, optlen); 459 break; 460 case RDS_RECVERR: 461 ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen); 462 break; 463 case RDS_CONG_MONITOR: 464 ret = rds_cong_monitor(rs, optval, optlen); 465 break; 466 case SO_RDS_TRANSPORT: 467 lock_sock(sock->sk); 468 ret = rds_set_transport(net, rs, optval, optlen); 469 release_sock(sock->sk); 470 break; 471 case SO_TIMESTAMP_OLD: 472 case SO_TIMESTAMP_NEW: 473 lock_sock(sock->sk); 474 ret = rds_enable_recvtstamp(sock->sk, optval, optlen, optname); 475 release_sock(sock->sk); 476 break; 477 case SO_RDS_MSG_RXPATH_LATENCY: 478 ret = rds_recv_track_latency(rs, optval, optlen); 479 break; 480 default: 481 ret = -ENOPROTOOPT; 482 } 483 out: 484 return ret; 485 } 486 487 static int rds_getsockopt(struct socket *sock, int level, int optname, 488 char __user *optval, int __user *optlen) 489 { 490 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 491 int ret = -ENOPROTOOPT, len; 492 int trans; 493 494 if (level != SOL_RDS) 495 goto out; 496 497 if (get_user(len, optlen)) { 498 ret = -EFAULT; 499 goto out; 500 } 501 502 switch (optname) { 503 case RDS_INFO_FIRST ... RDS_INFO_LAST: 504 ret = rds_info_getsockopt(sock, optname, optval, 505 optlen); 506 break; 507 508 case RDS_RECVERR: 509 if (len < sizeof(int)) 510 ret = -EINVAL; 511 else 512 if (put_user(rs->rs_recverr, (int __user *) optval) || 513 put_user(sizeof(int), optlen)) 514 ret = -EFAULT; 515 else 516 ret = 0; 517 break; 518 case SO_RDS_TRANSPORT: 519 if (len < sizeof(int)) { 520 ret = -EINVAL; 521 break; 522 } 523 trans = (rs->rs_transport ? rs->rs_transport->t_type : 524 RDS_TRANS_NONE); /* unbound */ 525 if (put_user(trans, (int __user *)optval) || 526 put_user(sizeof(int), optlen)) 527 ret = -EFAULT; 528 else 529 ret = 0; 530 break; 531 default: 532 break; 533 } 534 535 out: 536 return ret; 537 538 } 539 540 static int rds_connect(struct socket *sock, struct sockaddr_unsized *uaddr, 541 int addr_len, int flags) 542 { 543 struct sock *sk = sock->sk; 544 struct sockaddr_in *sin; 545 struct rds_sock *rs = rds_sk_to_rs(sk); 546 int ret = 0; 547 548 if (addr_len < offsetofend(struct sockaddr, sa_family)) 549 return -EINVAL; 550 551 lock_sock(sk); 552 553 switch (uaddr->sa_family) { 554 case AF_INET: 555 sin = (struct sockaddr_in *)uaddr; 556 if (addr_len < sizeof(struct sockaddr_in)) { 557 ret = -EINVAL; 558 break; 559 } 560 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { 561 ret = -EDESTADDRREQ; 562 break; 563 } 564 if (ipv4_is_multicast(sin->sin_addr.s_addr) || 565 sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) { 566 ret = -EINVAL; 567 break; 568 } 569 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr); 570 rs->rs_conn_port = sin->sin_port; 571 break; 572 573 #if IS_ENABLED(CONFIG_IPV6) 574 case AF_INET6: { 575 struct sockaddr_in6 *sin6; 576 int addr_type; 577 578 sin6 = (struct sockaddr_in6 *)uaddr; 579 if (addr_len < sizeof(struct sockaddr_in6)) { 580 ret = -EINVAL; 581 break; 582 } 583 addr_type = ipv6_addr_type(&sin6->sin6_addr); 584 if (!(addr_type & IPV6_ADDR_UNICAST)) { 585 __be32 addr4; 586 587 if (!(addr_type & IPV6_ADDR_MAPPED)) { 588 ret = -EPROTOTYPE; 589 break; 590 } 591 592 /* It is a mapped address. Need to do some sanity 593 * checks. 594 */ 595 addr4 = sin6->sin6_addr.s6_addr32[3]; 596 if (addr4 == htonl(INADDR_ANY) || 597 addr4 == htonl(INADDR_BROADCAST) || 598 ipv4_is_multicast(addr4)) { 599 ret = -EPROTOTYPE; 600 break; 601 } 602 } 603 604 if (addr_type & IPV6_ADDR_LINKLOCAL) { 605 /* If socket is already bound to a link local address, 606 * the peer address must be on the same link. 607 */ 608 if (sin6->sin6_scope_id == 0 || 609 (!ipv6_addr_any(&rs->rs_bound_addr) && 610 rs->rs_bound_scope_id && 611 sin6->sin6_scope_id != rs->rs_bound_scope_id)) { 612 ret = -EINVAL; 613 break; 614 } 615 /* Remember the connected address scope ID. It will 616 * be checked against the binding local address when 617 * the socket is bound. 618 */ 619 rs->rs_bound_scope_id = sin6->sin6_scope_id; 620 } 621 rs->rs_conn_addr = sin6->sin6_addr; 622 rs->rs_conn_port = sin6->sin6_port; 623 break; 624 } 625 #endif 626 627 default: 628 ret = -EAFNOSUPPORT; 629 break; 630 } 631 632 release_sock(sk); 633 return ret; 634 } 635 636 static struct proto rds_proto = { 637 .name = "RDS", 638 .owner = THIS_MODULE, 639 .obj_size = sizeof(struct rds_sock), 640 }; 641 642 static const struct proto_ops rds_proto_ops = { 643 .family = AF_RDS, 644 .owner = THIS_MODULE, 645 .release = rds_release, 646 .bind = rds_bind, 647 .connect = rds_connect, 648 .socketpair = sock_no_socketpair, 649 .accept = sock_no_accept, 650 .getname = rds_getname, 651 .poll = rds_poll, 652 .ioctl = rds_ioctl, 653 .listen = sock_no_listen, 654 .shutdown = sock_no_shutdown, 655 .setsockopt = rds_setsockopt, 656 .getsockopt = rds_getsockopt, 657 .sendmsg = rds_sendmsg, 658 .recvmsg = rds_recvmsg, 659 .mmap = sock_no_mmap, 660 }; 661 662 static void rds_sock_destruct(struct sock *sk) 663 { 664 struct rds_sock *rs = rds_sk_to_rs(sk); 665 666 WARN_ON((&rs->rs_item != rs->rs_item.next || 667 &rs->rs_item != rs->rs_item.prev)); 668 } 669 670 static int __rds_create(struct socket *sock, struct sock *sk, int protocol) 671 { 672 struct rds_sock *rs; 673 674 sock_init_data(sock, sk); 675 sock->ops = &rds_proto_ops; 676 sk->sk_protocol = protocol; 677 sk->sk_destruct = rds_sock_destruct; 678 679 rs = rds_sk_to_rs(sk); 680 spin_lock_init(&rs->rs_lock); 681 rwlock_init(&rs->rs_recv_lock); 682 INIT_LIST_HEAD(&rs->rs_send_queue); 683 INIT_LIST_HEAD(&rs->rs_recv_queue); 684 INIT_LIST_HEAD(&rs->rs_notify_queue); 685 INIT_LIST_HEAD(&rs->rs_cong_list); 686 rds_message_zcopy_queue_init(&rs->rs_zcookie_queue); 687 spin_lock_init(&rs->rs_rdma_lock); 688 rs->rs_rdma_keys = RB_ROOT; 689 rs->rs_rx_traces = 0; 690 rs->rs_tos = 0; 691 rs->rs_conn = NULL; 692 693 spin_lock_bh(&rds_sock_lock); 694 list_add_tail(&rs->rs_item, &rds_sock_list); 695 spin_unlock_bh(&rds_sock_lock); 696 697 return 0; 698 } 699 700 static int rds_create(struct net *net, struct socket *sock, int protocol, 701 int kern) 702 { 703 struct sock *sk; 704 705 if (sock->type != SOCK_SEQPACKET || protocol) 706 return -ESOCKTNOSUPPORT; 707 708 sk = sk_alloc(net, AF_RDS, GFP_KERNEL, &rds_proto, kern); 709 if (!sk) 710 return -ENOMEM; 711 712 return __rds_create(sock, sk, protocol); 713 } 714 715 void rds_sock_addref(struct rds_sock *rs) 716 { 717 sock_hold(rds_rs_to_sk(rs)); 718 } 719 720 void rds_sock_put(struct rds_sock *rs) 721 { 722 sock_put(rds_rs_to_sk(rs)); 723 } 724 725 static const struct net_proto_family rds_family_ops = { 726 .family = AF_RDS, 727 .create = rds_create, 728 .owner = THIS_MODULE, 729 }; 730 731 static void rds_sock_inc_info(struct socket *sock, unsigned int len, 732 struct rds_info_iterator *iter, 733 struct rds_info_lengths *lens) 734 { 735 struct net *net = sock_net(sock->sk); 736 struct rds_sock *rs; 737 struct rds_incoming *inc; 738 unsigned int total = 0; 739 740 len /= sizeof(struct rds_info_message); 741 742 spin_lock_bh(&rds_sock_lock); 743 744 list_for_each_entry(rs, &rds_sock_list, rs_item) { 745 /* Only show sockets in the caller's netns. */ 746 if (!net_eq(sock_net(rds_rs_to_sk(rs)), net)) 747 continue; 748 /* This option only supports IPv4 sockets. */ 749 if (!ipv6_addr_v4mapped(&rs->rs_bound_addr)) 750 continue; 751 752 read_lock(&rs->rs_recv_lock); 753 754 /* XXX too lazy to maintain counts.. */ 755 list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { 756 total++; 757 if (total <= len) 758 rds_inc_info_copy(inc, iter, 759 inc->i_saddr.s6_addr32[3], 760 rs->rs_bound_addr_v4, 761 1); 762 } 763 764 read_unlock(&rs->rs_recv_lock); 765 } 766 767 spin_unlock_bh(&rds_sock_lock); 768 769 lens->nr = total; 770 lens->each = sizeof(struct rds_info_message); 771 } 772 773 #if IS_ENABLED(CONFIG_IPV6) 774 static void rds6_sock_inc_info(struct socket *sock, unsigned int len, 775 struct rds_info_iterator *iter, 776 struct rds_info_lengths *lens) 777 { 778 struct net *net = sock_net(sock->sk); 779 struct rds_incoming *inc; 780 unsigned int total = 0; 781 struct rds_sock *rs; 782 783 len /= sizeof(struct rds6_info_message); 784 785 spin_lock_bh(&rds_sock_lock); 786 787 list_for_each_entry(rs, &rds_sock_list, rs_item) { 788 /* Only show sockets in the caller's netns. */ 789 if (!net_eq(sock_net(rds_rs_to_sk(rs)), net)) 790 continue; 791 read_lock(&rs->rs_recv_lock); 792 793 list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { 794 total++; 795 if (total <= len) 796 rds6_inc_info_copy(inc, iter, &inc->i_saddr, 797 &rs->rs_bound_addr, 1); 798 } 799 800 read_unlock(&rs->rs_recv_lock); 801 } 802 803 spin_unlock_bh(&rds_sock_lock); 804 805 lens->nr = total; 806 lens->each = sizeof(struct rds6_info_message); 807 } 808 #endif 809 810 static void rds_sock_info(struct socket *sock, unsigned int len, 811 struct rds_info_iterator *iter, 812 struct rds_info_lengths *lens) 813 { 814 struct net *net = sock_net(sock->sk); 815 struct rds_info_socket sinfo; 816 unsigned int copied = 0; 817 unsigned int cnt = 0; 818 struct rds_sock *rs; 819 820 len /= sizeof(struct rds_info_socket); 821 822 spin_lock_bh(&rds_sock_lock); 823 824 /* First pass: count entries visible in the caller's netns. */ 825 list_for_each_entry(rs, &rds_sock_list, rs_item) { 826 if (!net_eq(sock_net(rds_rs_to_sk(rs)), net)) 827 continue; 828 if (!ipv6_addr_v4mapped(&rs->rs_bound_addr)) 829 continue; 830 cnt++; 831 } 832 833 if (len < cnt) 834 goto out; 835 836 list_for_each_entry(rs, &rds_sock_list, rs_item) { 837 if (copied >= cnt) 838 break; 839 /* Only show sockets in the caller's netns. */ 840 if (!net_eq(sock_net(rds_rs_to_sk(rs)), net)) 841 continue; 842 /* This option only supports IPv4 sockets. */ 843 if (!ipv6_addr_v4mapped(&rs->rs_bound_addr)) 844 continue; 845 sinfo.sndbuf = rds_sk_sndbuf(rs); 846 sinfo.rcvbuf = rds_sk_rcvbuf(rs); 847 sinfo.bound_addr = rs->rs_bound_addr_v4; 848 sinfo.connected_addr = rs->rs_conn_addr_v4; 849 sinfo.bound_port = rs->rs_bound_port; 850 sinfo.connected_port = rs->rs_conn_port; 851 sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); 852 853 rds_info_copy(iter, &sinfo, sizeof(sinfo)); 854 copied++; 855 } 856 /* A concurrent rds_bind() can change rs_bound_addr between the 857 * two passes without holding rds_sock_lock, so copied may be 858 * less than cnt. Report what was actually copied. 859 */ 860 cnt = copied; 861 862 out: 863 lens->nr = cnt; 864 lens->each = sizeof(struct rds_info_socket); 865 866 spin_unlock_bh(&rds_sock_lock); 867 } 868 869 #if IS_ENABLED(CONFIG_IPV6) 870 static void rds6_sock_info(struct socket *sock, unsigned int len, 871 struct rds_info_iterator *iter, 872 struct rds_info_lengths *lens) 873 { 874 struct net *net = sock_net(sock->sk); 875 struct rds6_info_socket sinfo6; 876 unsigned int copied = 0; 877 unsigned int cnt = 0; 878 struct rds_sock *rs; 879 880 len /= sizeof(struct rds6_info_socket); 881 882 spin_lock_bh(&rds_sock_lock); 883 884 /* First pass: count entries visible in the caller's netns. */ 885 list_for_each_entry(rs, &rds_sock_list, rs_item) { 886 if (!net_eq(sock_net(rds_rs_to_sk(rs)), net)) 887 continue; 888 cnt++; 889 } 890 891 if (len < cnt) 892 goto out; 893 894 list_for_each_entry(rs, &rds_sock_list, rs_item) { 895 if (copied >= cnt) 896 break; 897 /* Only show sockets in the caller's netns. */ 898 if (!net_eq(sock_net(rds_rs_to_sk(rs)), net)) 899 continue; 900 sinfo6.sndbuf = rds_sk_sndbuf(rs); 901 sinfo6.rcvbuf = rds_sk_rcvbuf(rs); 902 sinfo6.bound_addr = rs->rs_bound_addr; 903 sinfo6.connected_addr = rs->rs_conn_addr; 904 sinfo6.bound_port = rs->rs_bound_port; 905 sinfo6.connected_port = rs->rs_conn_port; 906 sinfo6.inum = sock_i_ino(rds_rs_to_sk(rs)); 907 908 rds_info_copy(iter, &sinfo6, sizeof(sinfo6)); 909 copied++; 910 } 911 cnt = copied; 912 913 out: 914 lens->nr = cnt; 915 lens->each = sizeof(struct rds6_info_socket); 916 917 spin_unlock_bh(&rds_sock_lock); 918 } 919 #endif 920 921 static void rds_exit(void) 922 { 923 sock_unregister(rds_family_ops.family); 924 proto_unregister(&rds_proto); 925 rds_conn_exit(); 926 rds_cong_exit(); 927 rds_sysctl_exit(); 928 rds_threads_exit(); 929 rds_stats_exit(); 930 rds_page_exit(); 931 rds_bind_lock_destroy(); 932 rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info); 933 rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); 934 #if IS_ENABLED(CONFIG_IPV6) 935 rds_info_deregister_func(RDS6_INFO_SOCKETS, rds6_sock_info); 936 rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info); 937 #endif 938 } 939 module_exit(rds_exit); 940 941 u32 rds_gen_num; 942 943 static int __init rds_init(void) 944 { 945 int ret; 946 947 net_get_random_once(&rds_gen_num, sizeof(rds_gen_num)); 948 949 ret = rds_bind_lock_init(); 950 if (ret) 951 goto out; 952 953 ret = rds_conn_init(); 954 if (ret) 955 goto out_bind; 956 957 ret = rds_threads_init(); 958 if (ret) 959 goto out_conn; 960 ret = rds_sysctl_init(); 961 if (ret) 962 goto out_threads; 963 ret = rds_stats_init(); 964 if (ret) 965 goto out_sysctl; 966 ret = proto_register(&rds_proto, 1); 967 if (ret) 968 goto out_stats; 969 ret = sock_register(&rds_family_ops); 970 if (ret) 971 goto out_proto; 972 973 rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info); 974 rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); 975 #if IS_ENABLED(CONFIG_IPV6) 976 rds_info_register_func(RDS6_INFO_SOCKETS, rds6_sock_info); 977 rds_info_register_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info); 978 #endif 979 980 goto out; 981 982 out_proto: 983 proto_unregister(&rds_proto); 984 out_stats: 985 rds_stats_exit(); 986 out_sysctl: 987 rds_sysctl_exit(); 988 out_threads: 989 rds_threads_exit(); 990 out_conn: 991 rds_conn_exit(); 992 rds_cong_exit(); 993 rds_page_exit(); 994 out_bind: 995 rds_bind_lock_destroy(); 996 out: 997 return ret; 998 } 999 module_init(rds_init); 1000 1001 #define DRV_VERSION "4.0" 1002 #define DRV_RELDATE "Feb 12, 2009" 1003 1004 MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); 1005 MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets" 1006 " v" DRV_VERSION " (" DRV_RELDATE ")"); 1007 MODULE_VERSION(DRV_VERSION); 1008 MODULE_LICENSE("Dual BSD/GPL"); 1009 MODULE_ALIAS_NETPROTO(PF_RDS); 1010