xref: /linux/net/rds/af_rds.c (revision 056a5087d87ead77dedbe9cf5bde53b7cd4b4651)
1 /*
2  * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  *
32  */
33 #include <linux/module.h>
34 #include <linux/errno.h>
35 #include <linux/kernel.h>
36 #include <linux/gfp.h>
37 #include <linux/in.h>
38 #include <linux/ipv6.h>
39 #include <linux/poll.h>
40 #include <linux/uio.h>
41 #include <net/sock.h>
42 
43 #include "rds.h"
44 
/*
 * Every RDS socket is linked on rds_sock_list through rs_item.  The list is
 * just used for stats gathering (the socket info queries further down).
 * rds_sock_lock protects the list and is always taken with spin_lock_bh();
 * rds_ioctl() also uses it to serialise updates of rs_tos.
 */
static LIST_HEAD(rds_sock_list);
static DEFINE_SPINLOCK(rds_sock_lock);

/* Additional wait queue rds_poll() registers on for seen-congested sockets. */
DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);
49 
50 /*
51  * This is called as the final descriptor referencing this socket is closed.
52  * We have to unbind the socket so that another socket can be bound to the
53  * address it was using.
54  *
55  * We have to be careful about racing with the incoming path.  sock_orphan()
56  * sets SOCK_DEAD and we use that as an indicator to the rx path that new
57  * messages shouldn't be queued.
58  */
59 static int rds_release(struct socket *sock)
60 {
61 	struct sock *sk = sock->sk;
62 	struct rds_sock *rs;
63 
64 	if (!sk)
65 		goto out;
66 
67 	rs = rds_sk_to_rs(sk);
68 
69 	sock_orphan(sk);
70 	/* Note - rds_clear_recv_queue grabs rs_recv_lock, so
71 	 * that ensures the recv path has completed messing
72 	 * with the socket. */
73 	rds_clear_recv_queue(rs);
74 	rds_cong_remove_socket(rs);
75 
76 	rds_remove_bound(rs);
77 
78 	rds_send_drop_to(rs, NULL);
79 	rds_rdma_drop_keys(rs);
80 	rds_notify_queue_get(rs, NULL);
81 	rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue);
82 
83 	spin_lock_bh(&rds_sock_lock);
84 	list_del_init(&rs->rs_item);
85 	spin_unlock_bh(&rds_sock_lock);
86 
87 	rds_trans_put(rs->rs_transport);
88 
89 	sock->sk = NULL;
90 	sock_put(sk);
91 out:
92 	return 0;
93 }
94 
95 /*
96  * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
97  * _bh() isn't OK here, we're called from interrupt handlers.  It's probably OK
98  * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
99  * this seems more conservative.
100  * NB - normally, one would use sk_callback_lock for this, but we can
101  * get here from interrupts, whereas the network code grabs sk_callback_lock
102  * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
103  */
104 void rds_wake_sk_sleep(struct rds_sock *rs)
105 {
106 	unsigned long flags;
107 
108 	read_lock_irqsave(&rs->rs_recv_lock, flags);
109 	__rds_wake_sk_sleep(rds_rs_to_sk(rs));
110 	read_unlock_irqrestore(&rs->rs_recv_lock, flags);
111 }
112 
113 static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
114 		       int peer)
115 {
116 	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
117 	struct sockaddr_in6 *sin6;
118 	struct sockaddr_in *sin;
119 	int uaddr_len;
120 
121 	/* racey, don't care */
122 	if (peer) {
123 		if (ipv6_addr_any(&rs->rs_conn_addr))
124 			return -ENOTCONN;
125 
126 		if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) {
127 			sin = (struct sockaddr_in *)uaddr;
128 			memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
129 			sin->sin_family = AF_INET;
130 			sin->sin_port = rs->rs_conn_port;
131 			sin->sin_addr.s_addr = rs->rs_conn_addr_v4;
132 			uaddr_len = sizeof(*sin);
133 		} else {
134 			sin6 = (struct sockaddr_in6 *)uaddr;
135 			sin6->sin6_family = AF_INET6;
136 			sin6->sin6_port = rs->rs_conn_port;
137 			sin6->sin6_addr = rs->rs_conn_addr;
138 			sin6->sin6_flowinfo = 0;
139 			/* scope_id is the same as in the bound address. */
140 			sin6->sin6_scope_id = rs->rs_bound_scope_id;
141 			uaddr_len = sizeof(*sin6);
142 		}
143 	} else {
144 		/* If socket is not yet bound and the socket is connected,
145 		 * set the return address family to be the same as the
146 		 * connected address, but with 0 address value.  If it is not
147 		 * connected, set the family to be AF_UNSPEC (value 0) and
148 		 * the address size to be that of an IPv4 address.
149 		 */
150 		if (ipv6_addr_any(&rs->rs_bound_addr)) {
151 			if (ipv6_addr_any(&rs->rs_conn_addr)) {
152 				sin = (struct sockaddr_in *)uaddr;
153 				memset(sin, 0, sizeof(*sin));
154 				sin->sin_family = AF_UNSPEC;
155 				return sizeof(*sin);
156 			}
157 
158 #if IS_ENABLED(CONFIG_IPV6)
159 			if (!(ipv6_addr_type(&rs->rs_conn_addr) &
160 			      IPV6_ADDR_MAPPED)) {
161 				sin6 = (struct sockaddr_in6 *)uaddr;
162 				memset(sin6, 0, sizeof(*sin6));
163 				sin6->sin6_family = AF_INET6;
164 				return sizeof(*sin6);
165 			}
166 #endif
167 
168 			sin = (struct sockaddr_in *)uaddr;
169 			memset(sin, 0, sizeof(*sin));
170 			sin->sin_family = AF_INET;
171 			return sizeof(*sin);
172 		}
173 		if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
174 			sin = (struct sockaddr_in *)uaddr;
175 			memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
176 			sin->sin_family = AF_INET;
177 			sin->sin_port = rs->rs_bound_port;
178 			sin->sin_addr.s_addr = rs->rs_bound_addr_v4;
179 			uaddr_len = sizeof(*sin);
180 		} else {
181 			sin6 = (struct sockaddr_in6 *)uaddr;
182 			sin6->sin6_family = AF_INET6;
183 			sin6->sin6_port = rs->rs_bound_port;
184 			sin6->sin6_addr = rs->rs_bound_addr;
185 			sin6->sin6_flowinfo = 0;
186 			sin6->sin6_scope_id = rs->rs_bound_scope_id;
187 			uaddr_len = sizeof(*sin6);
188 		}
189 	}
190 
191 	return uaddr_len;
192 }
193 
194 /*
195  * RDS' poll is without a doubt the least intuitive part of the interface,
196  * as EPOLLIN and EPOLLOUT do not behave entirely as you would expect from
197  * a network protocol.
198  *
199  * EPOLLIN is asserted if
200  *  -	there is data on the receive queue.
201  *  -	to signal that a previously congested destination may have become
202  *	uncongested
203  *  -	A notification has been queued to the socket (this can be a congestion
204  *	update, or a RDMA completion, or a MSG_ZEROCOPY completion).
205  *
206  * EPOLLOUT is asserted if there is room on the send queue. This does not mean
207  * however, that the next sendmsg() call will succeed. If the application tries
208  * to send to a congested destination, the system call may still fail (and
209  * return ENOBUFS).
210  */
211 static __poll_t rds_poll(struct file *file, struct socket *sock,
212 			     poll_table *wait)
213 {
214 	struct sock *sk = sock->sk;
215 	struct rds_sock *rs = rds_sk_to_rs(sk);
216 	__poll_t mask = 0;
217 	unsigned long flags;
218 
219 	poll_wait(file, sk_sleep(sk), wait);
220 
221 	if (READ_ONCE(rs->rs_seen_congestion))
222 		poll_wait(file, &rds_poll_waitq, wait);
223 
224 	read_lock_irqsave(&rs->rs_recv_lock, flags);
225 	if (!rs->rs_cong_monitor) {
226 		/* When a congestion map was updated, we signal EPOLLIN for
227 		 * "historical" reasons. Applications can also poll for
228 		 * WRBAND instead. */
229 		if (rds_cong_updated_since(&rs->rs_cong_track))
230 			mask |= (EPOLLIN | EPOLLRDNORM | EPOLLWRBAND);
231 	} else {
232 		spin_lock(&rs->rs_lock);
233 		if (rs->rs_cong_notify)
234 			mask |= (EPOLLIN | EPOLLRDNORM);
235 		spin_unlock(&rs->rs_lock);
236 	}
237 	if (!list_empty(&rs->rs_recv_queue) ||
238 	    !list_empty(&rs->rs_notify_queue) ||
239 	    !list_empty(&rs->rs_zcookie_queue.zcookie_head))
240 		mask |= (EPOLLIN | EPOLLRDNORM);
241 	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
242 		mask |= (EPOLLOUT | EPOLLWRNORM);
243 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
244 		mask |= EPOLLERR;
245 	read_unlock_irqrestore(&rs->rs_recv_lock, flags);
246 
247 	/* clear state any time we wake a seen-congested socket */
248 	if (mask)
249 		WRITE_ONCE(rs->rs_seen_congestion, 0);
250 
251 	return mask;
252 }
253 
254 static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
255 {
256 	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
257 	rds_tos_t utos, tos = 0;
258 
259 	switch (cmd) {
260 	case SIOCRDSSETTOS:
261 		if (get_user(utos, (rds_tos_t __user *)arg))
262 			return -EFAULT;
263 
264 		if (rs->rs_transport &&
265 		    rs->rs_transport->get_tos_map)
266 			tos = rs->rs_transport->get_tos_map(utos);
267 		else
268 			return -ENOIOCTLCMD;
269 
270 		spin_lock_bh(&rds_sock_lock);
271 		if (rs->rs_tos || rs->rs_conn) {
272 			spin_unlock_bh(&rds_sock_lock);
273 			return -EINVAL;
274 		}
275 		rs->rs_tos = tos;
276 		spin_unlock_bh(&rds_sock_lock);
277 		break;
278 	case SIOCRDSGETTOS:
279 		spin_lock_bh(&rds_sock_lock);
280 		tos = rs->rs_tos;
281 		spin_unlock_bh(&rds_sock_lock);
282 		if (put_user(tos, (rds_tos_t __user *)arg))
283 			return -EFAULT;
284 		break;
285 	default:
286 		return -ENOIOCTLCMD;
287 	}
288 
289 	return 0;
290 }
291 
292 static int rds_cancel_sent_to(struct rds_sock *rs, sockptr_t optval, int len)
293 {
294 	struct sockaddr_in6 sin6;
295 	struct sockaddr_in sin;
296 	int ret = 0;
297 
298 	/* racing with another thread binding seems ok here */
299 	if (ipv6_addr_any(&rs->rs_bound_addr)) {
300 		ret = -ENOTCONN; /* XXX not a great errno */
301 		goto out;
302 	}
303 
304 	if (len < sizeof(struct sockaddr_in)) {
305 		ret = -EINVAL;
306 		goto out;
307 	} else if (len < sizeof(struct sockaddr_in6)) {
308 		/* Assume IPv4 */
309 		if (copy_from_sockptr(&sin, optval,
310 				sizeof(struct sockaddr_in))) {
311 			ret = -EFAULT;
312 			goto out;
313 		}
314 		ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr);
315 		sin6.sin6_port = sin.sin_port;
316 	} else {
317 		if (copy_from_sockptr(&sin6, optval,
318 				   sizeof(struct sockaddr_in6))) {
319 			ret = -EFAULT;
320 			goto out;
321 		}
322 	}
323 
324 	rds_send_drop_to(rs, &sin6);
325 out:
326 	return ret;
327 }
328 
329 static int rds_set_bool_option(unsigned char *optvar, sockptr_t optval,
330 			       int optlen)
331 {
332 	int value;
333 
334 	if (optlen < sizeof(int))
335 		return -EINVAL;
336 	if (copy_from_sockptr(&value, optval, sizeof(int)))
337 		return -EFAULT;
338 	*optvar = !!value;
339 	return 0;
340 }
341 
342 static int rds_cong_monitor(struct rds_sock *rs, sockptr_t optval, int optlen)
343 {
344 	int ret;
345 
346 	ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
347 	if (ret == 0) {
348 		if (rs->rs_cong_monitor) {
349 			rds_cong_add_socket(rs);
350 		} else {
351 			rds_cong_remove_socket(rs);
352 			rs->rs_cong_mask = 0;
353 			rs->rs_cong_notify = 0;
354 		}
355 	}
356 	return ret;
357 }
358 
359 static int rds_set_transport(struct net *net, struct rds_sock *rs,
360 			     sockptr_t optval, int optlen)
361 {
362 	int t_type;
363 
364 	if (rs->rs_transport)
365 		return -EOPNOTSUPP; /* previously attached to transport */
366 
367 	if (optlen != sizeof(int))
368 		return -EINVAL;
369 
370 	if (copy_from_sockptr(&t_type, optval, sizeof(t_type)))
371 		return -EFAULT;
372 
373 	if (t_type < 0 || t_type >= RDS_TRANS_COUNT)
374 		return -EINVAL;
375 
376 	/* RDS/IB is restricted to the initial network namespace */
377 	if (t_type != RDS_TRANS_TCP && !net_eq(net, &init_net))
378 		return -EPROTOTYPE;
379 
380 	rs->rs_transport = rds_trans_get(t_type);
381 
382 	return rs->rs_transport ? 0 : -ENOPROTOOPT;
383 }
384 
385 static int rds_enable_recvtstamp(struct sock *sk, sockptr_t optval,
386 				 int optlen, int optname)
387 {
388 	int val, valbool;
389 
390 	if (optlen != sizeof(int))
391 		return -EFAULT;
392 
393 	if (copy_from_sockptr(&val, optval, sizeof(int)))
394 		return -EFAULT;
395 
396 	valbool = val ? 1 : 0;
397 
398 	if (optname == SO_TIMESTAMP_NEW)
399 		sock_set_flag(sk, SOCK_TSTAMP_NEW);
400 
401 	if (valbool)
402 		sock_set_flag(sk, SOCK_RCVTSTAMP);
403 	else
404 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
405 
406 	return 0;
407 }
408 
409 static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval,
410 				  int optlen)
411 {
412 	struct rds_rx_trace_so trace;
413 	int i;
414 
415 	if (optlen != sizeof(struct rds_rx_trace_so))
416 		return -EFAULT;
417 
418 	if (copy_from_sockptr(&trace, optval, sizeof(trace)))
419 		return -EFAULT;
420 
421 	if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX)
422 		return -EFAULT;
423 
424 	rs->rs_rx_traces = trace.rx_traces;
425 	for (i = 0; i < rs->rs_rx_traces; i++) {
426 		if (trace.rx_trace_pos[i] >= RDS_MSG_RX_DGRAM_TRACE_MAX) {
427 			rs->rs_rx_traces = 0;
428 			return -EFAULT;
429 		}
430 		rs->rs_rx_trace[i] = trace.rx_trace_pos[i];
431 	}
432 
433 	return 0;
434 }
435 
436 static int rds_setsockopt(struct socket *sock, int level, int optname,
437 			  sockptr_t optval, unsigned int optlen)
438 {
439 	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
440 	struct net *net = sock_net(sock->sk);
441 	int ret;
442 
443 	if (level != SOL_RDS) {
444 		ret = -ENOPROTOOPT;
445 		goto out;
446 	}
447 
448 	switch (optname) {
449 	case RDS_CANCEL_SENT_TO:
450 		ret = rds_cancel_sent_to(rs, optval, optlen);
451 		break;
452 	case RDS_GET_MR:
453 		ret = rds_get_mr(rs, optval, optlen);
454 		break;
455 	case RDS_GET_MR_FOR_DEST:
456 		ret = rds_get_mr_for_dest(rs, optval, optlen);
457 		break;
458 	case RDS_FREE_MR:
459 		ret = rds_free_mr(rs, optval, optlen);
460 		break;
461 	case RDS_RECVERR:
462 		ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen);
463 		break;
464 	case RDS_CONG_MONITOR:
465 		ret = rds_cong_monitor(rs, optval, optlen);
466 		break;
467 	case SO_RDS_TRANSPORT:
468 		lock_sock(sock->sk);
469 		ret = rds_set_transport(net, rs, optval, optlen);
470 		release_sock(sock->sk);
471 		break;
472 	case SO_TIMESTAMP_OLD:
473 	case SO_TIMESTAMP_NEW:
474 		lock_sock(sock->sk);
475 		ret = rds_enable_recvtstamp(sock->sk, optval, optlen, optname);
476 		release_sock(sock->sk);
477 		break;
478 	case SO_RDS_MSG_RXPATH_LATENCY:
479 		ret = rds_recv_track_latency(rs, optval, optlen);
480 		break;
481 	default:
482 		ret = -ENOPROTOOPT;
483 	}
484 out:
485 	return ret;
486 }
487 
488 static int rds_getsockopt(struct socket *sock, int level, int optname,
489 			  sockopt_t *opt)
490 {
491 	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
492 	int ret = -ENOPROTOOPT, len;
493 	int trans;
494 	int val;
495 
496 	if (level != SOL_RDS)
497 		goto out;
498 
499 	len = opt->optlen;
500 
501 	switch (optname) {
502 	case RDS_INFO_FIRST ... RDS_INFO_LAST:
503 		ret = rds_info_getsockopt(sock, optname, opt);
504 		break;
505 
506 	case RDS_RECVERR:
507 		if (len < sizeof(int)) {
508 			ret = -EINVAL;
509 			break;
510 		}
511 		val = rs->rs_recverr;
512 		if (copy_to_iter(&val, sizeof(int), &opt->iter_out) !=
513 		    sizeof(int)) {
514 			ret = -EFAULT;
515 		} else {
516 			opt->optlen = sizeof(int);
517 			ret = 0;
518 		}
519 		break;
520 	case SO_RDS_TRANSPORT:
521 		if (len < sizeof(int)) {
522 			ret = -EINVAL;
523 			break;
524 		}
525 		trans = (rs->rs_transport ? rs->rs_transport->t_type :
526 			 RDS_TRANS_NONE); /* unbound */
527 		if (copy_to_iter(&trans, sizeof(int), &opt->iter_out) !=
528 		    sizeof(int)) {
529 			ret = -EFAULT;
530 		} else {
531 			opt->optlen = sizeof(int);
532 			ret = 0;
533 		}
534 		break;
535 	default:
536 		break;
537 	}
538 
539 out:
540 	return ret;
541 
542 }
543 
544 static int rds_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
545 		       int addr_len, int flags)
546 {
547 	struct sock *sk = sock->sk;
548 	struct sockaddr_in *sin;
549 	struct rds_sock *rs = rds_sk_to_rs(sk);
550 	int ret = 0;
551 
552 	if (addr_len < offsetofend(struct sockaddr, sa_family))
553 		return -EINVAL;
554 
555 	lock_sock(sk);
556 
557 	switch (uaddr->sa_family) {
558 	case AF_INET:
559 		sin = (struct sockaddr_in *)uaddr;
560 		if (addr_len < sizeof(struct sockaddr_in)) {
561 			ret = -EINVAL;
562 			break;
563 		}
564 		if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
565 			ret = -EDESTADDRREQ;
566 			break;
567 		}
568 		if (ipv4_is_multicast(sin->sin_addr.s_addr) ||
569 		    sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) {
570 			ret = -EINVAL;
571 			break;
572 		}
573 		ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr);
574 		rs->rs_conn_port = sin->sin_port;
575 		break;
576 
577 #if IS_ENABLED(CONFIG_IPV6)
578 	case AF_INET6: {
579 		struct sockaddr_in6 *sin6;
580 		int addr_type;
581 
582 		sin6 = (struct sockaddr_in6 *)uaddr;
583 		if (addr_len < sizeof(struct sockaddr_in6)) {
584 			ret = -EINVAL;
585 			break;
586 		}
587 		addr_type = ipv6_addr_type(&sin6->sin6_addr);
588 		if (!(addr_type & IPV6_ADDR_UNICAST)) {
589 			__be32 addr4;
590 
591 			if (!(addr_type & IPV6_ADDR_MAPPED)) {
592 				ret = -EPROTOTYPE;
593 				break;
594 			}
595 
596 			/* It is a mapped address.  Need to do some sanity
597 			 * checks.
598 			 */
599 			addr4 = sin6->sin6_addr.s6_addr32[3];
600 			if (addr4 == htonl(INADDR_ANY) ||
601 			    addr4 == htonl(INADDR_BROADCAST) ||
602 			    ipv4_is_multicast(addr4)) {
603 				ret = -EPROTOTYPE;
604 				break;
605 			}
606 		}
607 
608 		if (addr_type & IPV6_ADDR_LINKLOCAL) {
609 			/* If socket is already bound to a link local address,
610 			 * the peer address must be on the same link.
611 			 */
612 			if (sin6->sin6_scope_id == 0 ||
613 			    (!ipv6_addr_any(&rs->rs_bound_addr) &&
614 			     rs->rs_bound_scope_id &&
615 			     sin6->sin6_scope_id != rs->rs_bound_scope_id)) {
616 				ret = -EINVAL;
617 				break;
618 			}
619 			/* Remember the connected address scope ID.  It will
620 			 * be checked against the binding local address when
621 			 * the socket is bound.
622 			 */
623 			rs->rs_bound_scope_id = sin6->sin6_scope_id;
624 		}
625 		rs->rs_conn_addr = sin6->sin6_addr;
626 		rs->rs_conn_port = sin6->sin6_port;
627 		break;
628 	}
629 #endif
630 
631 	default:
632 		ret = -EAFNOSUPPORT;
633 		break;
634 	}
635 
636 	release_sock(sk);
637 	return ret;
638 }
639 
640 static struct proto rds_proto = {
641 	.name	  = "RDS",
642 	.owner	  = THIS_MODULE,
643 	.obj_size = sizeof(struct rds_sock),
644 };
645 
646 static const struct proto_ops rds_proto_ops = {
647 	.family =	AF_RDS,
648 	.owner =	THIS_MODULE,
649 	.release =	rds_release,
650 	.bind =		rds_bind,
651 	.connect =	rds_connect,
652 	.socketpair =	sock_no_socketpair,
653 	.accept =	sock_no_accept,
654 	.getname =	rds_getname,
655 	.poll =		rds_poll,
656 	.ioctl =	rds_ioctl,
657 	.listen =	sock_no_listen,
658 	.shutdown =	sock_no_shutdown,
659 	.setsockopt =	rds_setsockopt,
660 	.getsockopt_iter =	rds_getsockopt,
661 	.sendmsg =	rds_sendmsg,
662 	.recvmsg =	rds_recvmsg,
663 	.mmap =		sock_no_mmap,
664 };
665 
666 static void rds_sock_destruct(struct sock *sk)
667 {
668 	struct rds_sock *rs = rds_sk_to_rs(sk);
669 
670 	WARN_ON((&rs->rs_item != rs->rs_item.next ||
671 		 &rs->rs_item != rs->rs_item.prev));
672 }
673 
674 static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
675 {
676 	struct rds_sock *rs;
677 
678 	sock_init_data(sock, sk);
679 	sock->ops		= &rds_proto_ops;
680 	sk->sk_protocol		= protocol;
681 	sk->sk_destruct		= rds_sock_destruct;
682 
683 	rs = rds_sk_to_rs(sk);
684 	spin_lock_init(&rs->rs_lock);
685 	rwlock_init(&rs->rs_recv_lock);
686 	INIT_LIST_HEAD(&rs->rs_send_queue);
687 	INIT_LIST_HEAD(&rs->rs_recv_queue);
688 	INIT_LIST_HEAD(&rs->rs_notify_queue);
689 	INIT_LIST_HEAD(&rs->rs_cong_list);
690 	rds_message_zcopy_queue_init(&rs->rs_zcookie_queue);
691 	spin_lock_init(&rs->rs_rdma_lock);
692 	rs->rs_rdma_keys = RB_ROOT;
693 	rs->rs_rx_traces = 0;
694 	rs->rs_tos = 0;
695 	rs->rs_conn = NULL;
696 
697 	spin_lock_bh(&rds_sock_lock);
698 	list_add_tail(&rs->rs_item, &rds_sock_list);
699 	spin_unlock_bh(&rds_sock_lock);
700 
701 	return 0;
702 }
703 
704 static int rds_create(struct net *net, struct socket *sock, int protocol,
705 		      int kern)
706 {
707 	struct sock *sk;
708 
709 	if (sock->type != SOCK_SEQPACKET || protocol)
710 		return -ESOCKTNOSUPPORT;
711 
712 	sk = sk_alloc(net, AF_RDS, GFP_KERNEL, &rds_proto, kern);
713 	if (!sk)
714 		return -ENOMEM;
715 
716 	return __rds_create(sock, sk, protocol);
717 }
718 
/* Take a reference on the sock embedded in @rs. */
void rds_sock_addref(struct rds_sock *rs)
{
	struct sock *sk = rds_rs_to_sk(rs);

	sock_hold(sk);
}
723 
/* Drop a reference on the sock embedded in @rs. */
void rds_sock_put(struct rds_sock *rs)
{
	struct sock *sk = rds_rs_to_sk(rs);

	sock_put(sk);
}
728 
729 static const struct net_proto_family rds_family_ops = {
730 	.family =	AF_RDS,
731 	.create =	rds_create,
732 	.owner	=	THIS_MODULE,
733 };
734 
735 static void rds_sock_inc_info(struct socket *sock, unsigned int len,
736 			      struct rds_info_iterator *iter,
737 			      struct rds_info_lengths *lens)
738 {
739 	struct net *net = sock_net(sock->sk);
740 	struct rds_sock *rs;
741 	struct rds_incoming *inc;
742 	unsigned int total = 0;
743 
744 	len /= sizeof(struct rds_info_message);
745 
746 	spin_lock_bh(&rds_sock_lock);
747 
748 	list_for_each_entry(rs, &rds_sock_list, rs_item) {
749 		/* Only show sockets in the caller's netns. */
750 		if (!net_eq(sock_net(rds_rs_to_sk(rs)), net))
751 			continue;
752 		/* This option only supports IPv4 sockets. */
753 		if (!ipv6_addr_v4mapped(&rs->rs_bound_addr))
754 			continue;
755 
756 		read_lock(&rs->rs_recv_lock);
757 
758 		/* XXX too lazy to maintain counts.. */
759 		list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
760 			total++;
761 			if (total <= len)
762 				rds_inc_info_copy(inc, iter,
763 						  inc->i_saddr.s6_addr32[3],
764 						  rs->rs_bound_addr_v4,
765 						  1);
766 		}
767 
768 		read_unlock(&rs->rs_recv_lock);
769 	}
770 
771 	spin_unlock_bh(&rds_sock_lock);
772 
773 	lens->nr = total;
774 	lens->each = sizeof(struct rds_info_message);
775 }
776 
777 #if IS_ENABLED(CONFIG_IPV6)
778 static void rds6_sock_inc_info(struct socket *sock, unsigned int len,
779 			       struct rds_info_iterator *iter,
780 			       struct rds_info_lengths *lens)
781 {
782 	struct net *net = sock_net(sock->sk);
783 	struct rds_incoming *inc;
784 	unsigned int total = 0;
785 	struct rds_sock *rs;
786 
787 	len /= sizeof(struct rds6_info_message);
788 
789 	spin_lock_bh(&rds_sock_lock);
790 
791 	list_for_each_entry(rs, &rds_sock_list, rs_item) {
792 		/* Only show sockets in the caller's netns. */
793 		if (!net_eq(sock_net(rds_rs_to_sk(rs)), net))
794 			continue;
795 		read_lock(&rs->rs_recv_lock);
796 
797 		list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
798 			total++;
799 			if (total <= len)
800 				rds6_inc_info_copy(inc, iter, &inc->i_saddr,
801 						   &rs->rs_bound_addr, 1);
802 		}
803 
804 		read_unlock(&rs->rs_recv_lock);
805 	}
806 
807 	spin_unlock_bh(&rds_sock_lock);
808 
809 	lens->nr = total;
810 	lens->each = sizeof(struct rds6_info_message);
811 }
812 #endif
813 
814 static void rds_sock_info(struct socket *sock, unsigned int len,
815 			  struct rds_info_iterator *iter,
816 			  struct rds_info_lengths *lens)
817 {
818 	struct net *net = sock_net(sock->sk);
819 	struct rds_info_socket sinfo;
820 	unsigned int copied = 0;
821 	unsigned int cnt = 0;
822 	struct rds_sock *rs;
823 
824 	len /= sizeof(struct rds_info_socket);
825 
826 	spin_lock_bh(&rds_sock_lock);
827 
828 	/* First pass: count entries visible in the caller's netns. */
829 	list_for_each_entry(rs, &rds_sock_list, rs_item) {
830 		if (!net_eq(sock_net(rds_rs_to_sk(rs)), net))
831 			continue;
832 		if (!ipv6_addr_v4mapped(&rs->rs_bound_addr))
833 			continue;
834 		cnt++;
835 	}
836 
837 	if (len < cnt)
838 		goto out;
839 
840 	list_for_each_entry(rs, &rds_sock_list, rs_item) {
841 		if (copied >= cnt)
842 			break;
843 		/* Only show sockets in the caller's netns. */
844 		if (!net_eq(sock_net(rds_rs_to_sk(rs)), net))
845 			continue;
846 		/* This option only supports IPv4 sockets. */
847 		if (!ipv6_addr_v4mapped(&rs->rs_bound_addr))
848 			continue;
849 		sinfo.sndbuf = rds_sk_sndbuf(rs);
850 		sinfo.rcvbuf = rds_sk_rcvbuf(rs);
851 		sinfo.bound_addr = rs->rs_bound_addr_v4;
852 		sinfo.connected_addr = rs->rs_conn_addr_v4;
853 		sinfo.bound_port = rs->rs_bound_port;
854 		sinfo.connected_port = rs->rs_conn_port;
855 		sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
856 
857 		rds_info_copy(iter, &sinfo, sizeof(sinfo));
858 		copied++;
859 	}
860 	/* A concurrent rds_bind() can change rs_bound_addr between the
861 	 * two passes without holding rds_sock_lock, so copied may be
862 	 * less than cnt. Report what was actually copied.
863 	 */
864 	cnt = copied;
865 
866 out:
867 	lens->nr = cnt;
868 	lens->each = sizeof(struct rds_info_socket);
869 
870 	spin_unlock_bh(&rds_sock_lock);
871 }
872 
873 #if IS_ENABLED(CONFIG_IPV6)
874 static void rds6_sock_info(struct socket *sock, unsigned int len,
875 			   struct rds_info_iterator *iter,
876 			   struct rds_info_lengths *lens)
877 {
878 	struct net *net = sock_net(sock->sk);
879 	struct rds6_info_socket sinfo6;
880 	unsigned int copied = 0;
881 	unsigned int cnt = 0;
882 	struct rds_sock *rs;
883 
884 	len /= sizeof(struct rds6_info_socket);
885 
886 	spin_lock_bh(&rds_sock_lock);
887 
888 	/* First pass: count entries visible in the caller's netns. */
889 	list_for_each_entry(rs, &rds_sock_list, rs_item) {
890 		if (!net_eq(sock_net(rds_rs_to_sk(rs)), net))
891 			continue;
892 		cnt++;
893 	}
894 
895 	if (len < cnt)
896 		goto out;
897 
898 	list_for_each_entry(rs, &rds_sock_list, rs_item) {
899 		if (copied >= cnt)
900 			break;
901 		/* Only show sockets in the caller's netns. */
902 		if (!net_eq(sock_net(rds_rs_to_sk(rs)), net))
903 			continue;
904 		sinfo6.sndbuf = rds_sk_sndbuf(rs);
905 		sinfo6.rcvbuf = rds_sk_rcvbuf(rs);
906 		sinfo6.bound_addr = rs->rs_bound_addr;
907 		sinfo6.connected_addr = rs->rs_conn_addr;
908 		sinfo6.bound_port = rs->rs_bound_port;
909 		sinfo6.connected_port = rs->rs_conn_port;
910 		sinfo6.inum = sock_i_ino(rds_rs_to_sk(rs));
911 
912 		rds_info_copy(iter, &sinfo6, sizeof(sinfo6));
913 		copied++;
914 	}
915 	cnt = copied;
916 
917  out:
918 	lens->nr = cnt;
919 	lens->each = sizeof(struct rds6_info_socket);
920 
921 	spin_unlock_bh(&rds_sock_lock);
922 }
923 #endif
924 
925 static void rds_exit(void)
926 {
927 	sock_unregister(rds_family_ops.family);
928 	proto_unregister(&rds_proto);
929 	rds_conn_exit();
930 	rds_cong_exit();
931 	rds_sysctl_exit();
932 	rds_threads_exit();
933 	rds_stats_exit();
934 	rds_page_exit();
935 	rds_bind_lock_destroy();
936 	rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
937 	rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
938 #if IS_ENABLED(CONFIG_IPV6)
939 	rds_info_deregister_func(RDS6_INFO_SOCKETS, rds6_sock_info);
940 	rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
941 #endif
942 }
943 module_exit(rds_exit);
944 
945 u32 rds_gen_num;
946 
947 static int __init rds_init(void)
948 {
949 	int ret;
950 
951 	net_get_random_once(&rds_gen_num, sizeof(rds_gen_num));
952 
953 	ret = rds_bind_lock_init();
954 	if (ret)
955 		goto out;
956 
957 	ret = rds_conn_init();
958 	if (ret)
959 		goto out_bind;
960 
961 	ret = rds_threads_init();
962 	if (ret)
963 		goto out_conn;
964 	ret = rds_sysctl_init();
965 	if (ret)
966 		goto out_threads;
967 	ret = rds_stats_init();
968 	if (ret)
969 		goto out_sysctl;
970 	ret = proto_register(&rds_proto, 1);
971 	if (ret)
972 		goto out_stats;
973 	ret = sock_register(&rds_family_ops);
974 	if (ret)
975 		goto out_proto;
976 
977 	rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
978 	rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
979 #if IS_ENABLED(CONFIG_IPV6)
980 	rds_info_register_func(RDS6_INFO_SOCKETS, rds6_sock_info);
981 	rds_info_register_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
982 #endif
983 
984 	goto out;
985 
986 out_proto:
987 	proto_unregister(&rds_proto);
988 out_stats:
989 	rds_stats_exit();
990 out_sysctl:
991 	rds_sysctl_exit();
992 out_threads:
993 	rds_threads_exit();
994 out_conn:
995 	rds_conn_exit();
996 	rds_cong_exit();
997 	rds_page_exit();
998 out_bind:
999 	rds_bind_lock_destroy();
1000 out:
1001 	return ret;
1002 }
1003 module_init(rds_init);
1004 
1005 #define DRV_VERSION     "4.0"
1006 #define DRV_RELDATE     "Feb 12, 2009"
1007 
1008 MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
1009 MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
1010 		   " v" DRV_VERSION " (" DRV_RELDATE ")");
1011 MODULE_VERSION(DRV_VERSION);
1012 MODULE_LICENSE("Dual BSD/GPL");
1013 MODULE_ALIAS_NETPROTO(PF_RDS);
1014