1 /* 2 * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. 3 * 4 * This software is available to you under a choice of one of two 5 * licenses. You may choose to be licensed under the terms of the GNU 6 * General Public License (GPL) Version 2, available from the file 7 * COPYING in the main directory of this source tree, or the 8 * OpenIB.org BSD license below: 9 * 10 * Redistribution and use in source and binary forms, with or 11 * without modification, are permitted provided that the following 12 * conditions are met: 13 * 14 * - Redistributions of source code must retain the above 15 * copyright notice, this list of conditions and the following 16 * disclaimer. 17 * 18 * - Redistributions in binary form must reproduce the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer in the documentation and/or other materials 21 * provided with the distribution. 22 * 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 * SOFTWARE. 31 * 32 */ 33 #include <linux/module.h> 34 #include <linux/errno.h> 35 #include <linux/kernel.h> 36 #include <linux/gfp.h> 37 #include <linux/in.h> 38 #include <linux/ipv6.h> 39 #include <linux/poll.h> 40 #include <linux/uio.h> 41 #include <net/sock.h> 42 43 #include "rds.h" 44 45 /* this is just used for stats gathering :/ */ 46 static DEFINE_SPINLOCK(rds_sock_lock); 47 static LIST_HEAD(rds_sock_list); 48 DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq); 49 50 /* 51 * This is called as the final descriptor referencing this socket is closed. 52 * We have to unbind the socket so that another socket can be bound to the 53 * address it was using. 54 * 55 * We have to be careful about racing with the incoming path. sock_orphan() 56 * sets SOCK_DEAD and we use that as an indicator to the rx path that new 57 * messages shouldn't be queued. 58 */ 59 static int rds_release(struct socket *sock) 60 { 61 struct sock *sk = sock->sk; 62 struct rds_sock *rs; 63 64 if (!sk) 65 goto out; 66 67 rs = rds_sk_to_rs(sk); 68 69 sock_orphan(sk); 70 /* Note - rds_clear_recv_queue grabs rs_recv_lock, so 71 * that ensures the recv path has completed messing 72 * with the socket. */ 73 rds_clear_recv_queue(rs); 74 rds_cong_remove_socket(rs); 75 76 rds_remove_bound(rs); 77 78 rds_send_drop_to(rs, NULL); 79 rds_rdma_drop_keys(rs); 80 rds_notify_queue_get(rs, NULL); 81 rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue); 82 83 spin_lock_bh(&rds_sock_lock); 84 list_del_init(&rs->rs_item); 85 spin_unlock_bh(&rds_sock_lock); 86 87 rds_trans_put(rs->rs_transport); 88 89 sock->sk = NULL; 90 sock_put(sk); 91 out: 92 return 0; 93 } 94 95 /* 96 * Careful not to race with rds_release -> sock_orphan which clears sk_sleep. 97 * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK 98 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but 99 * this seems more conservative. 100 * NB - normally, one would use sk_callback_lock for this, but we can 101 * get here from interrupts, whereas the network code grabs sk_callback_lock 102 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. 103 */ 104 void rds_wake_sk_sleep(struct rds_sock *rs) 105 { 106 unsigned long flags; 107 108 read_lock_irqsave(&rs->rs_recv_lock, flags); 109 __rds_wake_sk_sleep(rds_rs_to_sk(rs)); 110 read_unlock_irqrestore(&rs->rs_recv_lock, flags); 111 } 112 113 static int rds_getname(struct socket *sock, struct sockaddr *uaddr, 114 int peer) 115 { 116 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 117 struct sockaddr_in6 *sin6; 118 struct sockaddr_in *sin; 119 int uaddr_len; 120 121 /* racey, don't care */ 122 if (peer) { 123 if (ipv6_addr_any(&rs->rs_conn_addr)) 124 return -ENOTCONN; 125 126 if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) { 127 sin = (struct sockaddr_in *)uaddr; 128 memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 129 sin->sin_family = AF_INET; 130 sin->sin_port = rs->rs_conn_port; 131 sin->sin_addr.s_addr = rs->rs_conn_addr_v4; 132 uaddr_len = sizeof(*sin); 133 } else { 134 sin6 = (struct sockaddr_in6 *)uaddr; 135 sin6->sin6_family = AF_INET6; 136 sin6->sin6_port = rs->rs_conn_port; 137 sin6->sin6_addr = rs->rs_conn_addr; 138 sin6->sin6_flowinfo = 0; 139 /* scope_id is the same as in the bound address. */ 140 sin6->sin6_scope_id = rs->rs_bound_scope_id; 141 uaddr_len = sizeof(*sin6); 142 } 143 } else { 144 /* If socket is not yet bound and the socket is connected, 145 * set the return address family to be the same as the 146 * connected address, but with 0 address value. If it is not 147 * connected, set the family to be AF_UNSPEC (value 0) and 148 * the address size to be that of an IPv4 address. 149 */ 150 if (ipv6_addr_any(&rs->rs_bound_addr)) { 151 if (ipv6_addr_any(&rs->rs_conn_addr)) { 152 sin = (struct sockaddr_in *)uaddr; 153 memset(sin, 0, sizeof(*sin)); 154 sin->sin_family = AF_UNSPEC; 155 return sizeof(*sin); 156 } 157 158 #if IS_ENABLED(CONFIG_IPV6) 159 if (!(ipv6_addr_type(&rs->rs_conn_addr) & 160 IPV6_ADDR_MAPPED)) { 161 sin6 = (struct sockaddr_in6 *)uaddr; 162 memset(sin6, 0, sizeof(*sin6)); 163 sin6->sin6_family = AF_INET6; 164 return sizeof(*sin6); 165 } 166 #endif 167 168 sin = (struct sockaddr_in *)uaddr; 169 memset(sin, 0, sizeof(*sin)); 170 sin->sin_family = AF_INET; 171 return sizeof(*sin); 172 } 173 if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) { 174 sin = (struct sockaddr_in *)uaddr; 175 memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 176 sin->sin_family = AF_INET; 177 sin->sin_port = rs->rs_bound_port; 178 sin->sin_addr.s_addr = rs->rs_bound_addr_v4; 179 uaddr_len = sizeof(*sin); 180 } else { 181 sin6 = (struct sockaddr_in6 *)uaddr; 182 sin6->sin6_family = AF_INET6; 183 sin6->sin6_port = rs->rs_bound_port; 184 sin6->sin6_addr = rs->rs_bound_addr; 185 sin6->sin6_flowinfo = 0; 186 sin6->sin6_scope_id = rs->rs_bound_scope_id; 187 uaddr_len = sizeof(*sin6); 188 } 189 } 190 191 return uaddr_len; 192 } 193 194 /* 195 * RDS' poll is without a doubt the least intuitive part of the interface, 196 * as EPOLLIN and EPOLLOUT do not behave entirely as you would expect from 197 * a network protocol. 198 * 199 * EPOLLIN is asserted if 200 * - there is data on the receive queue. 201 * - to signal that a previously congested destination may have become 202 * uncongested 203 * - A notification has been queued to the socket (this can be a congestion 204 * update, or a RDMA completion, or a MSG_ZEROCOPY completion). 205 * 206 * EPOLLOUT is asserted if there is room on the send queue. This does not mean 207 * however, that the next sendmsg() call will succeed. If the application tries 208 * to send to a congested destination, the system call may still fail (and 209 * return ENOBUFS). 210 */ 211 static __poll_t rds_poll(struct file *file, struct socket *sock, 212 poll_table *wait) 213 { 214 struct sock *sk = sock->sk; 215 struct rds_sock *rs = rds_sk_to_rs(sk); 216 __poll_t mask = 0; 217 unsigned long flags; 218 219 poll_wait(file, sk_sleep(sk), wait); 220 221 if (READ_ONCE(rs->rs_seen_congestion)) 222 poll_wait(file, &rds_poll_waitq, wait); 223 224 read_lock_irqsave(&rs->rs_recv_lock, flags); 225 if (!rs->rs_cong_monitor) { 226 /* When a congestion map was updated, we signal EPOLLIN for 227 * "historical" reasons. Applications can also poll for 228 * WRBAND instead. */ 229 if (rds_cong_updated_since(&rs->rs_cong_track)) 230 mask |= (EPOLLIN | EPOLLRDNORM | EPOLLWRBAND); 231 } else { 232 spin_lock(&rs->rs_lock); 233 if (rs->rs_cong_notify) 234 mask |= (EPOLLIN | EPOLLRDNORM); 235 spin_unlock(&rs->rs_lock); 236 } 237 if (!list_empty(&rs->rs_recv_queue) || 238 !list_empty(&rs->rs_notify_queue) || 239 !list_empty(&rs->rs_zcookie_queue.zcookie_head)) 240 mask |= (EPOLLIN | EPOLLRDNORM); 241 if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) 242 mask |= (EPOLLOUT | EPOLLWRNORM); 243 if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) 244 mask |= EPOLLERR; 245 read_unlock_irqrestore(&rs->rs_recv_lock, flags); 246 247 /* clear state any time we wake a seen-congested socket */ 248 if (mask) 249 WRITE_ONCE(rs->rs_seen_congestion, 0); 250 251 return mask; 252 } 253 254 static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 255 { 256 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 257 rds_tos_t utos, tos = 0; 258 259 switch (cmd) { 260 case SIOCRDSSETTOS: 261 if (get_user(utos, (rds_tos_t __user *)arg)) 262 return -EFAULT; 263 264 if (rs->rs_transport && 265 rs->rs_transport->get_tos_map) 266 tos = rs->rs_transport->get_tos_map(utos); 267 else 268 return -ENOIOCTLCMD; 269 270 spin_lock_bh(&rds_sock_lock); 271 if (rs->rs_tos || rs->rs_conn) { 272 spin_unlock_bh(&rds_sock_lock); 273 return -EINVAL; 274 } 275 rs->rs_tos = tos; 276 spin_unlock_bh(&rds_sock_lock); 277 break; 278 case SIOCRDSGETTOS: 279 spin_lock_bh(&rds_sock_lock); 280 tos = rs->rs_tos; 281 spin_unlock_bh(&rds_sock_lock); 282 if (put_user(tos, (rds_tos_t __user *)arg)) 283 return -EFAULT; 284 break; 285 default: 286 return -ENOIOCTLCMD; 287 } 288 289 return 0; 290 } 291 292 static int rds_cancel_sent_to(struct rds_sock *rs, sockptr_t optval, int len) 293 { 294 struct sockaddr_in6 sin6; 295 struct sockaddr_in sin; 296 int ret = 0; 297 298 /* racing with another thread binding seems ok here */ 299 if (ipv6_addr_any(&rs->rs_bound_addr)) { 300 ret = -ENOTCONN; /* XXX not a great errno */ 301 goto out; 302 } 303 304 if (len < sizeof(struct sockaddr_in)) { 305 ret = -EINVAL; 306 goto out; 307 } else if (len < sizeof(struct sockaddr_in6)) { 308 /* Assume IPv4 */ 309 if (copy_from_sockptr(&sin, optval, 310 sizeof(struct sockaddr_in))) { 311 ret = -EFAULT; 312 goto out; 313 } 314 ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr); 315 sin6.sin6_port = sin.sin_port; 316 } else { 317 if (copy_from_sockptr(&sin6, optval, 318 sizeof(struct sockaddr_in6))) { 319 ret = -EFAULT; 320 goto out; 321 } 322 } 323 324 rds_send_drop_to(rs, &sin6); 325 out: 326 return ret; 327 } 328 329 static int rds_set_bool_option(unsigned char *optvar, sockptr_t optval, 330 int optlen) 331 { 332 int value; 333 334 if (optlen < sizeof(int)) 335 return -EINVAL; 336 if (copy_from_sockptr(&value, optval, sizeof(int))) 337 return -EFAULT; 338 *optvar = !!value; 339 return 0; 340 } 341 342 static int rds_cong_monitor(struct rds_sock *rs, sockptr_t optval, int optlen) 343 { 344 int ret; 345 346 ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen); 347 if (ret == 0) { 348 if (rs->rs_cong_monitor) { 349 rds_cong_add_socket(rs); 350 } else { 351 rds_cong_remove_socket(rs); 352 rs->rs_cong_mask = 0; 353 rs->rs_cong_notify = 0; 354 } 355 } 356 return ret; 357 } 358 359 static int rds_set_transport(struct net *net, struct rds_sock *rs, 360 sockptr_t optval, int optlen) 361 { 362 int t_type; 363 364 if (rs->rs_transport) 365 return -EOPNOTSUPP; /* previously attached to transport */ 366 367 if (optlen != sizeof(int)) 368 return -EINVAL; 369 370 if (copy_from_sockptr(&t_type, optval, sizeof(t_type))) 371 return -EFAULT; 372 373 if (t_type < 0 || t_type >= RDS_TRANS_COUNT) 374 return -EINVAL; 375 376 /* RDS/IB is restricted to the initial network namespace */ 377 if (t_type != RDS_TRANS_TCP && !net_eq(net, &init_net)) 378 return -EPROTOTYPE; 379 380 rs->rs_transport = rds_trans_get(t_type); 381 382 return rs->rs_transport ? 0 : -ENOPROTOOPT; 383 } 384 385 static int rds_enable_recvtstamp(struct sock *sk, sockptr_t optval, 386 int optlen, int optname) 387 { 388 int val, valbool; 389 390 if (optlen != sizeof(int)) 391 return -EFAULT; 392 393 if (copy_from_sockptr(&val, optval, sizeof(int))) 394 return -EFAULT; 395 396 valbool = val ? 1 : 0; 397 398 if (optname == SO_TIMESTAMP_NEW) 399 sock_set_flag(sk, SOCK_TSTAMP_NEW); 400 401 if (valbool) 402 sock_set_flag(sk, SOCK_RCVTSTAMP); 403 else 404 sock_reset_flag(sk, SOCK_RCVTSTAMP); 405 406 return 0; 407 } 408 409 static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval, 410 int optlen) 411 { 412 struct rds_rx_trace_so trace; 413 int i; 414 415 if (optlen != sizeof(struct rds_rx_trace_so)) 416 return -EFAULT; 417 418 if (copy_from_sockptr(&trace, optval, sizeof(trace))) 419 return -EFAULT; 420 421 if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX) 422 return -EFAULT; 423 424 rs->rs_rx_traces = trace.rx_traces; 425 for (i = 0; i < rs->rs_rx_traces; i++) { 426 if (trace.rx_trace_pos[i] >= RDS_MSG_RX_DGRAM_TRACE_MAX) { 427 rs->rs_rx_traces = 0; 428 return -EFAULT; 429 } 430 rs->rs_rx_trace[i] = trace.rx_trace_pos[i]; 431 } 432 433 return 0; 434 } 435 436 static int rds_setsockopt(struct socket *sock, int level, int optname, 437 sockptr_t optval, unsigned int optlen) 438 { 439 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 440 struct net *net = sock_net(sock->sk); 441 int ret; 442 443 if (level != SOL_RDS) { 444 ret = -ENOPROTOOPT; 445 goto out; 446 } 447 448 switch (optname) { 449 case RDS_CANCEL_SENT_TO: 450 ret = rds_cancel_sent_to(rs, optval, optlen); 451 break; 452 case RDS_GET_MR: 453 ret = rds_get_mr(rs, optval, optlen); 454 break; 455 case RDS_GET_MR_FOR_DEST: 456 ret = rds_get_mr_for_dest(rs, optval, optlen); 457 break; 458 case RDS_FREE_MR: 459 ret = rds_free_mr(rs, optval, optlen); 460 break; 461 case RDS_RECVERR: 462 ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen); 463 break; 464 case RDS_CONG_MONITOR: 465 ret = rds_cong_monitor(rs, optval, optlen); 466 break; 467 case SO_RDS_TRANSPORT: 468 lock_sock(sock->sk); 469 ret = rds_set_transport(net, rs, optval, optlen); 470 release_sock(sock->sk); 471 break; 472 case SO_TIMESTAMP_OLD: 473 case SO_TIMESTAMP_NEW: 474 lock_sock(sock->sk); 475 ret = rds_enable_recvtstamp(sock->sk, optval, optlen, optname); 476 release_sock(sock->sk); 477 break; 478 case SO_RDS_MSG_RXPATH_LATENCY: 479 ret = rds_recv_track_latency(rs, optval, optlen); 480 break; 481 default: 482 ret = -ENOPROTOOPT; 483 } 484 out: 485 return ret; 486 } 487 488 static int rds_getsockopt(struct socket *sock, int level, int optname, 489 sockopt_t *opt) 490 { 491 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 492 int ret = -ENOPROTOOPT, len; 493 int trans; 494 int val; 495 496 if (level != SOL_RDS) 497 goto out; 498 499 len = opt->optlen; 500 501 switch (optname) { 502 case RDS_INFO_FIRST ... RDS_INFO_LAST: 503 ret = rds_info_getsockopt(sock, optname, opt); 504 break; 505 506 case RDS_RECVERR: 507 if (len < sizeof(int)) { 508 ret = -EINVAL; 509 break; 510 } 511 val = rs->rs_recverr; 512 if (copy_to_iter(&val, sizeof(int), &opt->iter_out) != 513 sizeof(int)) { 514 ret = -EFAULT; 515 } else { 516 opt->optlen = sizeof(int); 517 ret = 0; 518 } 519 break; 520 case SO_RDS_TRANSPORT: 521 if (len < sizeof(int)) { 522 ret = -EINVAL; 523 break; 524 } 525 trans = (rs->rs_transport ? rs->rs_transport->t_type : 526 RDS_TRANS_NONE); /* unbound */ 527 if (copy_to_iter(&trans, sizeof(int), &opt->iter_out) != 528 sizeof(int)) { 529 ret = -EFAULT; 530 } else { 531 opt->optlen = sizeof(int); 532 ret = 0; 533 } 534 break; 535 default: 536 break; 537 } 538 539 out: 540 return ret; 541 542 } 543 544 static int rds_connect(struct socket *sock, struct sockaddr_unsized *uaddr, 545 int addr_len, int flags) 546 { 547 struct sock *sk = sock->sk; 548 struct sockaddr_in *sin; 549 struct rds_sock *rs = rds_sk_to_rs(sk); 550 int ret = 0; 551 552 if (addr_len < offsetofend(struct sockaddr, sa_family)) 553 return -EINVAL; 554 555 lock_sock(sk); 556 557 switch (uaddr->sa_family) { 558 case AF_INET: 559 sin = (struct sockaddr_in *)uaddr; 560 if (addr_len < sizeof(struct sockaddr_in)) { 561 ret = -EINVAL; 562 break; 563 } 564 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { 565 ret = -EDESTADDRREQ; 566 break; 567 } 568 if (ipv4_is_multicast(sin->sin_addr.s_addr) || 569 sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) { 570 ret = -EINVAL; 571 break; 572 } 573 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr); 574 rs->rs_conn_port = sin->sin_port; 575 break; 576 577 #if IS_ENABLED(CONFIG_IPV6) 578 case AF_INET6: { 579 struct sockaddr_in6 *sin6; 580 int addr_type; 581 582 sin6 = (struct sockaddr_in6 *)uaddr; 583 if (addr_len < sizeof(struct sockaddr_in6)) { 584 ret = -EINVAL; 585 break; 586 } 587 addr_type = ipv6_addr_type(&sin6->sin6_addr); 588 if (!(addr_type & IPV6_ADDR_UNICAST)) { 589 __be32 addr4; 590 591 if (!(addr_type & IPV6_ADDR_MAPPED)) { 592 ret = -EPROTOTYPE; 593 break; 594 } 595 596 /* It is a mapped address. Need to do some sanity 597 * checks. 598 */ 599 addr4 = sin6->sin6_addr.s6_addr32[3]; 600 if (addr4 == htonl(INADDR_ANY) || 601 addr4 == htonl(INADDR_BROADCAST) || 602 ipv4_is_multicast(addr4)) { 603 ret = -EPROTOTYPE; 604 break; 605 } 606 } 607 608 if (addr_type & IPV6_ADDR_LINKLOCAL) { 609 /* If socket is already bound to a link local address, 610 * the peer address must be on the same link. 611 */ 612 if (sin6->sin6_scope_id == 0 || 613 (!ipv6_addr_any(&rs->rs_bound_addr) && 614 rs->rs_bound_scope_id && 615 sin6->sin6_scope_id != rs->rs_bound_scope_id)) { 616 ret = -EINVAL; 617 break; 618 } 619 /* Remember the connected address scope ID. It will 620 * be checked against the binding local address when 621 * the socket is bound. 622 */ 623 rs->rs_bound_scope_id = sin6->sin6_scope_id; 624 } 625 rs->rs_conn_addr = sin6->sin6_addr; 626 rs->rs_conn_port = sin6->sin6_port; 627 break; 628 } 629 #endif 630 631 default: 632 ret = -EAFNOSUPPORT; 633 break; 634 } 635 636 release_sock(sk); 637 return ret; 638 } 639 640 static struct proto rds_proto = { 641 .name = "RDS", 642 .owner = THIS_MODULE, 643 .obj_size = sizeof(struct rds_sock), 644 }; 645 646 static const struct proto_ops rds_proto_ops = { 647 .family = AF_RDS, 648 .owner = THIS_MODULE, 649 .release = rds_release, 650 .bind = rds_bind, 651 .connect = rds_connect, 652 .socketpair = sock_no_socketpair, 653 .accept = sock_no_accept, 654 .getname = rds_getname, 655 .poll = rds_poll, 656 .ioctl = rds_ioctl, 657 .listen = sock_no_listen, 658 .shutdown = sock_no_shutdown, 659 .setsockopt = rds_setsockopt, 660 .getsockopt_iter = rds_getsockopt, 661 .sendmsg = rds_sendmsg, 662 .recvmsg = rds_recvmsg, 663 .mmap = sock_no_mmap, 664 }; 665 666 static void rds_sock_destruct(struct sock *sk) 667 { 668 struct rds_sock *rs = rds_sk_to_rs(sk); 669 670 WARN_ON((&rs->rs_item != rs->rs_item.next || 671 &rs->rs_item != rs->rs_item.prev)); 672 } 673 674 static int __rds_create(struct socket *sock, struct sock *sk, int protocol) 675 { 676 struct rds_sock *rs; 677 678 sock_init_data(sock, sk); 679 sock->ops = &rds_proto_ops; 680 sk->sk_protocol = protocol; 681 sk->sk_destruct = rds_sock_destruct; 682 683 rs = rds_sk_to_rs(sk); 684 spin_lock_init(&rs->rs_lock); 685 rwlock_init(&rs->rs_recv_lock); 686 INIT_LIST_HEAD(&rs->rs_send_queue); 687 INIT_LIST_HEAD(&rs->rs_recv_queue); 688 INIT_LIST_HEAD(&rs->rs_notify_queue); 689 INIT_LIST_HEAD(&rs->rs_cong_list); 690 rds_message_zcopy_queue_init(&rs->rs_zcookie_queue); 691 spin_lock_init(&rs->rs_rdma_lock); 692 rs->rs_rdma_keys = RB_ROOT; 693 rs->rs_rx_traces = 0; 694 rs->rs_tos = 0; 695 rs->rs_conn = NULL; 696 697 spin_lock_bh(&rds_sock_lock); 698 list_add_tail(&rs->rs_item, &rds_sock_list); 699 spin_unlock_bh(&rds_sock_lock); 700 701 return 0; 702 } 703 704 static int rds_create(struct net *net, struct socket *sock, int protocol, 705 int kern) 706 { 707 struct sock *sk; 708 709 if (sock->type != SOCK_SEQPACKET || protocol) 710 return -ESOCKTNOSUPPORT; 711 712 sk = sk_alloc(net, AF_RDS, GFP_KERNEL, &rds_proto, kern); 713 if (!sk) 714 return -ENOMEM; 715 716 return __rds_create(sock, sk, protocol); 717 } 718 719 void rds_sock_addref(struct rds_sock *rs) 720 { 721 sock_hold(rds_rs_to_sk(rs)); 722 } 723 724 void rds_sock_put(struct rds_sock *rs) 725 { 726 sock_put(rds_rs_to_sk(rs)); 727 } 728 729 static const struct net_proto_family rds_family_ops = { 730 .family = AF_RDS, 731 .create = rds_create, 732 .owner = THIS_MODULE, 733 }; 734 735 static void rds_sock_inc_info(struct socket *sock, unsigned int len, 736 struct rds_info_iterator *iter, 737 struct rds_info_lengths *lens) 738 { 739 struct net *net = sock_net(sock->sk); 740 struct rds_sock *rs; 741 struct rds_incoming *inc; 742 unsigned int total = 0; 743 744 len /= sizeof(struct rds_info_message); 745 746 spin_lock_bh(&rds_sock_lock); 747 748 list_for_each_entry(rs, &rds_sock_list, rs_item) { 749 /* Only show sockets in the caller's netns. */ 750 if (!net_eq(sock_net(rds_rs_to_sk(rs)), net)) 751 continue; 752 /* This option only supports IPv4 sockets. */ 753 if (!ipv6_addr_v4mapped(&rs->rs_bound_addr)) 754 continue; 755 756 read_lock(&rs->rs_recv_lock); 757 758 /* XXX too lazy to maintain counts.. */ 759 list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { 760 total++; 761 if (total <= len) 762 rds_inc_info_copy(inc, iter, 763 inc->i_saddr.s6_addr32[3], 764 rs->rs_bound_addr_v4, 765 1); 766 } 767 768 read_unlock(&rs->rs_recv_lock); 769 } 770 771 spin_unlock_bh(&rds_sock_lock); 772 773 lens->nr = total; 774 lens->each = sizeof(struct rds_info_message); 775 } 776 777 #if IS_ENABLED(CONFIG_IPV6) 778 static void rds6_sock_inc_info(struct socket *sock, unsigned int len, 779 struct rds_info_iterator *iter, 780 struct rds_info_lengths *lens) 781 { 782 struct net *net = sock_net(sock->sk); 783 struct rds_incoming *inc; 784 unsigned int total = 0; 785 struct rds_sock *rs; 786 787 len /= sizeof(struct rds6_info_message); 788 789 spin_lock_bh(&rds_sock_lock); 790 791 list_for_each_entry(rs, &rds_sock_list, rs_item) { 792 /* Only show sockets in the caller's netns. */ 793 if (!net_eq(sock_net(rds_rs_to_sk(rs)), net)) 794 continue; 795 read_lock(&rs->rs_recv_lock); 796 797 list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { 798 total++; 799 if (total <= len) 800 rds6_inc_info_copy(inc, iter, &inc->i_saddr, 801 &rs->rs_bound_addr, 1); 802 } 803 804 read_unlock(&rs->rs_recv_lock); 805 } 806 807 spin_unlock_bh(&rds_sock_lock); 808 809 lens->nr = total; 810 lens->each = sizeof(struct rds6_info_message); 811 } 812 #endif 813 814 static void rds_sock_info(struct socket *sock, unsigned int len, 815 struct rds_info_iterator *iter, 816 struct rds_info_lengths *lens) 817 { 818 struct net *net = sock_net(sock->sk); 819 struct rds_info_socket sinfo; 820 unsigned int copied = 0; 821 unsigned int cnt = 0; 822 struct rds_sock *rs; 823 824 len /= sizeof(struct rds_info_socket); 825 826 spin_lock_bh(&rds_sock_lock); 827 828 /* First pass: count entries visible in the caller's netns. */ 829 list_for_each_entry(rs, &rds_sock_list, rs_item) { 830 if (!net_eq(sock_net(rds_rs_to_sk(rs)), net)) 831 continue; 832 if (!ipv6_addr_v4mapped(&rs->rs_bound_addr)) 833 continue; 834 cnt++; 835 } 836 837 if (len < cnt) 838 goto out; 839 840 list_for_each_entry(rs, &rds_sock_list, rs_item) { 841 if (copied >= cnt) 842 break; 843 /* Only show sockets in the caller's netns. */ 844 if (!net_eq(sock_net(rds_rs_to_sk(rs)), net)) 845 continue; 846 /* This option only supports IPv4 sockets. */ 847 if (!ipv6_addr_v4mapped(&rs->rs_bound_addr)) 848 continue; 849 sinfo.sndbuf = rds_sk_sndbuf(rs); 850 sinfo.rcvbuf = rds_sk_rcvbuf(rs); 851 sinfo.bound_addr = rs->rs_bound_addr_v4; 852 sinfo.connected_addr = rs->rs_conn_addr_v4; 853 sinfo.bound_port = rs->rs_bound_port; 854 sinfo.connected_port = rs->rs_conn_port; 855 sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); 856 857 rds_info_copy(iter, &sinfo, sizeof(sinfo)); 858 copied++; 859 } 860 /* A concurrent rds_bind() can change rs_bound_addr between the 861 * two passes without holding rds_sock_lock, so copied may be 862 * less than cnt. Report what was actually copied. 863 */ 864 cnt = copied; 865 866 out: 867 lens->nr = cnt; 868 lens->each = sizeof(struct rds_info_socket); 869 870 spin_unlock_bh(&rds_sock_lock); 871 } 872 873 #if IS_ENABLED(CONFIG_IPV6) 874 static void rds6_sock_info(struct socket *sock, unsigned int len, 875 struct rds_info_iterator *iter, 876 struct rds_info_lengths *lens) 877 { 878 struct net *net = sock_net(sock->sk); 879 struct rds6_info_socket sinfo6; 880 unsigned int copied = 0; 881 unsigned int cnt = 0; 882 struct rds_sock *rs; 883 884 len /= sizeof(struct rds6_info_socket); 885 886 spin_lock_bh(&rds_sock_lock); 887 888 /* First pass: count entries visible in the caller's netns. */ 889 list_for_each_entry(rs, &rds_sock_list, rs_item) { 890 if (!net_eq(sock_net(rds_rs_to_sk(rs)), net)) 891 continue; 892 cnt++; 893 } 894 895 if (len < cnt) 896 goto out; 897 898 list_for_each_entry(rs, &rds_sock_list, rs_item) { 899 if (copied >= cnt) 900 break; 901 /* Only show sockets in the caller's netns. */ 902 if (!net_eq(sock_net(rds_rs_to_sk(rs)), net)) 903 continue; 904 sinfo6.sndbuf = rds_sk_sndbuf(rs); 905 sinfo6.rcvbuf = rds_sk_rcvbuf(rs); 906 sinfo6.bound_addr = rs->rs_bound_addr; 907 sinfo6.connected_addr = rs->rs_conn_addr; 908 sinfo6.bound_port = rs->rs_bound_port; 909 sinfo6.connected_port = rs->rs_conn_port; 910 sinfo6.inum = sock_i_ino(rds_rs_to_sk(rs)); 911 912 rds_info_copy(iter, &sinfo6, sizeof(sinfo6)); 913 copied++; 914 } 915 cnt = copied; 916 917 out: 918 lens->nr = cnt; 919 lens->each = sizeof(struct rds6_info_socket); 920 921 spin_unlock_bh(&rds_sock_lock); 922 } 923 #endif 924 925 static void rds_exit(void) 926 { 927 sock_unregister(rds_family_ops.family); 928 proto_unregister(&rds_proto); 929 rds_conn_exit(); 930 rds_cong_exit(); 931 rds_sysctl_exit(); 932 rds_threads_exit(); 933 rds_stats_exit(); 934 rds_page_exit(); 935 rds_bind_lock_destroy(); 936 rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info); 937 rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); 938 #if IS_ENABLED(CONFIG_IPV6) 939 rds_info_deregister_func(RDS6_INFO_SOCKETS, rds6_sock_info); 940 rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info); 941 #endif 942 } 943 module_exit(rds_exit); 944 945 u32 rds_gen_num; 946 947 static int __init rds_init(void) 948 { 949 int ret; 950 951 net_get_random_once(&rds_gen_num, sizeof(rds_gen_num)); 952 953 ret = rds_bind_lock_init(); 954 if (ret) 955 goto out; 956 957 ret = rds_conn_init(); 958 if (ret) 959 goto out_bind; 960 961 ret = rds_threads_init(); 962 if (ret) 963 goto out_conn; 964 ret = rds_sysctl_init(); 965 if (ret) 966 goto out_threads; 967 ret = rds_stats_init(); 968 if (ret) 969 goto out_sysctl; 970 ret = proto_register(&rds_proto, 1); 971 if (ret) 972 goto out_stats; 973 ret = sock_register(&rds_family_ops); 974 if (ret) 975 goto out_proto; 976 977 rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info); 978 rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); 979 #if IS_ENABLED(CONFIG_IPV6) 980 rds_info_register_func(RDS6_INFO_SOCKETS, rds6_sock_info); 981 rds_info_register_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info); 982 #endif 983 984 goto out; 985 986 out_proto: 987 proto_unregister(&rds_proto); 988 out_stats: 989 rds_stats_exit(); 990 out_sysctl: 991 rds_sysctl_exit(); 992 out_threads: 993 rds_threads_exit(); 994 out_conn: 995 rds_conn_exit(); 996 rds_cong_exit(); 997 rds_page_exit(); 998 out_bind: 999 rds_bind_lock_destroy(); 1000 out: 1001 return ret; 1002 } 1003 module_init(rds_init); 1004 1005 #define DRV_VERSION "4.0" 1006 #define DRV_RELDATE "Feb 12, 2009" 1007 1008 MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); 1009 MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets" 1010 " v" DRV_VERSION " (" DRV_RELDATE ")"); 1011 MODULE_VERSION(DRV_VERSION); 1012 MODULE_LICENSE("Dual BSD/GPL"); 1013 MODULE_ALIAS_NETPROTO(PF_RDS); 1014