xref: /linux/net/rds/recv.c (revision 79790b6818e96c58fe2bffee1b418c16e64e7b80)
/*
 * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <linux/in.h>
#include <linux/export.h>
#include <linux/sched/clock.h>
#include <linux/time.h>
#include <linux/rds.h>

#include "rds.h"

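/*
 * Transports initialise the struct rds_incoming embedded in their own
 * receive state with rds_inc_init() (or rds_inc_path_init() when the
 * transport is multipath capable) before handing the message to
 * rds_recv_incoming().  The refcount starts at 1; it is dropped with
 * rds_inc_put(), which hands the memory back to the transport via
 * ->inc_free() once the last reference is gone.
 */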
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
		 struct in6_addr *saddr)
{
	refcount_set(&inc->i_refcount, 1);
	INIT_LIST_HEAD(&inc->i_item);
	inc->i_conn = conn;
	inc->i_saddr = *saddr;
	inc->i_usercopy.rdma_cookie = 0;
	inc->i_usercopy.rx_tstamp = ktime_set(0, 0);

	memset(inc->i_rx_lat_trace, 0, sizeof(inc->i_rx_lat_trace));
}
EXPORT_SYMBOL_GPL(rds_inc_init);

void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp,
		       struct in6_addr  *saddr)
{
	refcount_set(&inc->i_refcount, 1);
	INIT_LIST_HEAD(&inc->i_item);
	inc->i_conn = cp->cp_conn;
	inc->i_conn_path = cp;
	inc->i_saddr = *saddr;
	inc->i_usercopy.rdma_cookie = 0;
	inc->i_usercopy.rx_tstamp = ktime_set(0, 0);
}
EXPORT_SYMBOL_GPL(rds_inc_path_init);

static void rds_inc_addref(struct rds_incoming *inc)
{
	rdsdebug("addref inc %p ref %d\n", inc, refcount_read(&inc->i_refcount));
	refcount_inc(&inc->i_refcount);
}

void rds_inc_put(struct rds_incoming *inc)
{
	rdsdebug("put inc %p ref %d\n", inc, refcount_read(&inc->i_refcount));
	if (refcount_dec_and_test(&inc->i_refcount)) {
		BUG_ON(!list_empty(&inc->i_item));

		inc->i_conn->c_trans->inc_free(inc);
	}
}
EXPORT_SYMBOL_GPL(rds_inc_put);

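/*
 * Account the bytes sitting on a socket's receive queue and update the
 * local congestion map when the level crosses the socket's rcvbuf limit.
 * Uncongestion is only signalled once the queue drains below half of
 * rcvbuf, which keeps a reader hovering near the limit from generating a
 * storm of congestion map updates.  For example, with a 64KB rcvbuf the
 * bound port is marked congested once more than 64KB is queued, and only
 * cleared again after the queue drops below 32KB.
 */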
static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
				  struct rds_cong_map *map,
				  int delta, __be16 port)
{
	int now_congested;

	if (delta == 0)
		return;

	rs->rs_rcv_bytes += delta;
	if (delta > 0)
		rds_stats_add(s_recv_bytes_added_to_socket, delta);
	else
		rds_stats_add(s_recv_bytes_removed_from_socket, -delta);

	/* loop transport doesn't send/recv congestion updates */
	if (rs->rs_transport->t_type == RDS_TRANS_LOOP)
		return;

	now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);

	rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d "
	  "now_cong %d delta %d\n",
	  rs, &rs->rs_bound_addr,
	  ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
	  rds_sk_rcvbuf(rs), now_congested, delta);

	/* wasn't -> am congested */
	if (!rs->rs_congested && now_congested) {
		rs->rs_congested = 1;
		rds_cong_set_bit(map, port);
		rds_cong_queue_updates(map);
	}
	/* was -> aren't congested */
	/* Require more free space before reporting uncongested to prevent
	   bouncing cong/uncong state too often */
	else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) {
		rs->rs_congested = 0;
		rds_cong_clear_bit(map, port);
		rds_cong_queue_updates(map);
	}

	/* do nothing if no change in cong state */
}

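/*
 * The peer advertises a generation number in its handshake probes.  A
 * change in that number means the peer has restarted and lost its
 * connection state, so our sequence numbers and retransmit queues are
 * stale: reset the per-path tx/rx sequence counters and mark everything
 * still on the retransmit lists with RDS_MSG_FLUSH so it is dropped
 * instead of being replayed to the new incarnation.  Only the TCP
 * transport carries generation numbers, hence the WARN_ON.
 */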
static void rds_conn_peer_gen_update(struct rds_connection *conn,
				     u32 peer_gen_num)
{
	int i;
	struct rds_message *rm, *tmp;
	unsigned long flags;

	WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP);
	if (peer_gen_num != 0) {
		if (conn->c_peer_gen_num != 0 &&
		    peer_gen_num != conn->c_peer_gen_num) {
			for (i = 0; i < RDS_MPATH_WORKERS; i++) {
				struct rds_conn_path *cp;

				cp = &conn->c_path[i];
				spin_lock_irqsave(&cp->cp_lock, flags);
				cp->cp_next_tx_seq = 1;
				cp->cp_next_rx_seq = 0;
				list_for_each_entry_safe(rm, tmp,
							 &cp->cp_retrans,
							 m_conn_item) {
					set_bit(RDS_MSG_FLUSH, &rm->m_flags);
				}
				spin_unlock_irqrestore(&cp->cp_lock, flags);
			}
		}
		conn->c_peer_gen_num = peer_gen_num;
	}
}

/*
 * Process all extension headers that come with this message.
 */
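/*
 * Two extension headers matter on the data path: RDS_EXTHDR_RDMA tells us
 * the peer has completed an RDMA against one of our keys, so the MR can be
 * released via rds_rdma_unuse(); RDS_EXTHDR_RDMA_DEST carries the
 * rkey/offset pair that is stashed as a cookie and later returned to the
 * application through the RDS_CMSG_RDMA_DEST control message.
 */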
static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
{
	struct rds_header *hdr = &inc->i_hdr;
	unsigned int pos = 0, type, len;
	union {
		struct rds_ext_header_version version;
		struct rds_ext_header_rdma rdma;
		struct rds_ext_header_rdma_dest rdma_dest;
	} buffer;

	while (1) {
		len = sizeof(buffer);
		type = rds_message_next_extension(hdr, &pos, &buffer, &len);
		if (type == RDS_EXTHDR_NONE)
			break;
		/* Process extension header here */
		switch (type) {
		case RDS_EXTHDR_RDMA:
			rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
			break;

		case RDS_EXTHDR_RDMA_DEST:
			/* We ignore the size for now. We could stash it
			 * somewhere and use it for error checking. */
			inc->i_usercopy.rdma_cookie = rds_rdma_make_cookie(
					be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
					be32_to_cpu(buffer.rdma_dest.h_rdma_offset));

			break;
		}
	}
}

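/*
 * Handshake probes (the mprds probe ping and its pong, identified by
 * RDS_FLAG_PROBE_PORT) carry extension headers describing the peer:
 * RDS_EXTHDR_NPATHS advertises how many multipath lanes it supports and
 * RDS_EXTHDR_GEN_NUM its generation number.  c_npaths is clamped to
 * RDS_MPATH_WORKERS and defaults to 1 when the peer advertises nothing.
 */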
static void rds_recv_hs_exthdrs(struct rds_header *hdr,
				struct rds_connection *conn)
{
	unsigned int pos = 0, type, len;
	union {
		struct rds_ext_header_version version;
		u16 rds_npaths;
		u32 rds_gen_num;
	} buffer;
	u32 new_peer_gen_num = 0;

	while (1) {
		len = sizeof(buffer);
		type = rds_message_next_extension(hdr, &pos, &buffer, &len);
		if (type == RDS_EXTHDR_NONE)
			break;
		/* Process extension header here */
		switch (type) {
		case RDS_EXTHDR_NPATHS:
			conn->c_npaths = min_t(int, RDS_MPATH_WORKERS,
					       be16_to_cpu(buffer.rds_npaths));
			break;
		case RDS_EXTHDR_GEN_NUM:
			new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num);
			break;
		default:
			pr_warn_ratelimited("ignoring unknown exthdr type "
					     "0x%x\n", type);
		}
	}
	/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
	conn->c_npaths = max_t(int, conn->c_npaths, 1);
	conn->c_ping_triggered = 0;
	rds_conn_peer_gen_update(conn, new_peer_gen_num);
}

/* rds_start_mprds() will synchronously start multiple paths when appropriate.
 * The scheme is based on the following rules:
 *
 * 1. rds_sendmsg on first connect attempt sends the probe ping, with the
 *    sender's npaths (s_npaths)
 * 2. rcvr of probe-ping knows the mprds_paths = min(s_npaths, r_npaths). It
 *    sends back a probe-pong with r_npaths. After that, if rcvr is the
 *    smaller ip addr, it starts rds_conn_path_connect_if_down on all
 *    mprds_paths.
 * 3. sender gets woken up, and can move to rds_conn_path_connect_if_down.
 *    If it is the smaller ipaddr, rds_conn_path_connect_if_down can be
 *    called after reception of the probe-pong on all mprds_paths.
 *    Otherwise (sender of probe-ping is not the smaller ip addr): just call
 *    rds_conn_path_connect_if_down on the hashed path. (see rule 4)
 * 4. rds_connect_worker must only trigger a connection if laddr < faddr.
 * 5. sender may end up queuing the packet on the cp; it will get sent out
 *    later, when the connection is completed.
 */
static void rds_start_mprds(struct rds_connection *conn)
{
	int i;
	struct rds_conn_path *cp;

	if (conn->c_npaths > 1 &&
	    rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) {
		for (i = 0; i < conn->c_npaths; i++) {
			cp = &conn->c_path[i];
			rds_conn_path_connect_if_down(cp);
		}
	}
}

/*
 * The transport must make sure that this is serialized against other
 * rx and conn reset on this specific conn.
 *
 * We currently assert that only one fragmented message will be sent
 * down a connection at a time.  This lets us reassemble in the conn
 * instead of per-flow which means that we don't have to go digging through
 * flows to tear down partial reassembly progress on conn failure and
 * we save flow lookup and locking for each frag arrival.  It does mean
 * that small messages will wait behind large ones.  Fragmenting at all
 * is only to reduce the memory consumption of pre-posted buffers.
 *
 * The caller passes in saddr and daddr instead of us getting it from the
 * conn.  This lets loopback, who only has one conn for both directions,
 * tell us which roles the addrs in the conn are playing for this message.
 */
void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
		       struct in6_addr *daddr,
		       struct rds_incoming *inc, gfp_t gfp)
{
	struct rds_sock *rs = NULL;
	struct sock *sk;
	unsigned long flags;
	struct rds_conn_path *cp;

	inc->i_conn = conn;
	inc->i_rx_jiffies = jiffies;
	if (conn->c_trans->t_mp_capable)
		cp = inc->i_conn_path;
	else
		cp = &conn->c_path[0];

	rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
		 "flags 0x%x rx_jiffies %lu\n", conn,
		 (unsigned long long)cp->cp_next_rx_seq,
		 inc,
		 (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
		 be32_to_cpu(inc->i_hdr.h_len),
		 be16_to_cpu(inc->i_hdr.h_sport),
		 be16_to_cpu(inc->i_hdr.h_dport),
		 inc->i_hdr.h_flags,
		 inc->i_rx_jiffies);

	/*
	 * Sequence numbers should only increase.  Messages get their
	 * sequence number as they're queued in a sending conn.  They
	 * can be dropped, though, if the sending socket is closed before
	 * they hit the wire.  So sequence numbers can skip forward
	 * under normal operation.  They can also drop back in the conn
	 * failover case as previously sent messages are resent down the
	 * new instance of a conn.  We drop those, otherwise we have
	 * to assume that the next valid seq does not come after a
	 * hole in the fragment stream.
	 *
	 * The headers don't give us a way to realize if fragments of
	 * a message have been dropped.  We assume that frags that arrive
	 * to a flow are part of the current message on the flow that is
	 * being reassembled.  This means that senders can't drop messages
	 * from the sending conn until all their frags are sent.
	 *
	 * XXX we could spend more on the wire to get more robust failure
	 * detection, arguably worth it to avoid data corruption.
	 */
	if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq &&
	    (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
		rds_stats_inc(s_recv_drop_old_seq);
		goto out;
	}
	cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;

	if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
		if (inc->i_hdr.h_sport == 0) {
			rdsdebug("ignore ping with 0 sport from %pI6c\n",
				 saddr);
			goto out;
		}
		rds_stats_inc(s_recv_ping);
		rds_send_pong(cp, inc->i_hdr.h_sport);
		/* if this is a handshake ping, start multipath if necessary */
		if (RDS_HS_PROBE(be16_to_cpu(inc->i_hdr.h_sport),
				 be16_to_cpu(inc->i_hdr.h_dport))) {
			rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
			rds_start_mprds(cp->cp_conn);
		}
		goto out;
	}

	if (be16_to_cpu(inc->i_hdr.h_dport) ==  RDS_FLAG_PROBE_PORT &&
	    inc->i_hdr.h_sport == 0) {
		rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
		/* if this is a handshake pong, start multipath if necessary */
		rds_start_mprds(cp->cp_conn);
		wake_up(&cp->cp_conn->c_hs_waitq);
		goto out;
	}

	rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if);
	if (!rs) {
		rds_stats_inc(s_recv_drop_no_sock);
		goto out;
	}

	/* Process extension headers */
	rds_recv_incoming_exthdrs(inc, rs);

	/* We can be racing with rds_release() which marks the socket dead. */
	sk = rds_rs_to_sk(rs);

	/* serialize with rds_release -> sock_orphan */
	write_lock_irqsave(&rs->rs_recv_lock, flags);
	if (!sock_flag(sk, SOCK_DEAD)) {
		rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
		rds_stats_inc(s_recv_queued);
		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
				      be32_to_cpu(inc->i_hdr.h_len),
				      inc->i_hdr.h_dport);
		if (sock_flag(sk, SOCK_RCVTSTAMP))
			inc->i_usercopy.rx_tstamp = ktime_get_real();
		rds_inc_addref(inc);
		inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock();
		list_add_tail(&inc->i_item, &rs->rs_recv_queue);
		__rds_wake_sk_sleep(sk);
	} else {
		rds_stats_inc(s_recv_drop_dead_sock);
	}
	write_unlock_irqrestore(&rs->rs_recv_lock, flags);

out:
	if (rs)
		rds_sock_put(rs);
}
EXPORT_SYMBOL_GPL(rds_recv_incoming);

/*
 * be very careful here.  This is called as the condition in wait_event_*()
 * and so needs to cope with being called many times.
 */
static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
{
	unsigned long flags;

	if (!*inc) {
		read_lock_irqsave(&rs->rs_recv_lock, flags);
		if (!list_empty(&rs->rs_recv_queue)) {
			*inc = list_entry(rs->rs_recv_queue.next,
					  struct rds_incoming,
					  i_item);
			rds_inc_addref(*inc);
		}
		read_unlock_irqrestore(&rs->rs_recv_lock, flags);
	}

	return *inc != NULL;
}

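/*
 * After copying a message to userspace, rds_recvmsg() calls this to check
 * that the message is still on the receive queue, i.e. that no concurrent
 * reader beat us to it.  If it is still queued and we are not peeking, it
 * is unlinked and its rcvbuf accounting undone under rs_recv_lock; the
 * queue's reference is then dropped outside the lock via to_drop.
 */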
static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
			    int drop)
{
	struct sock *sk = rds_rs_to_sk(rs);
	int ret = 0;
	unsigned long flags;
	struct rds_incoming *to_drop = NULL;

	write_lock_irqsave(&rs->rs_recv_lock, flags);
	if (!list_empty(&inc->i_item)) {
		ret = 1;
		if (drop) {
			/* XXX make sure this i_conn is reliable */
			rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
					      -be32_to_cpu(inc->i_hdr.h_len),
					      inc->i_hdr.h_dport);
			list_del_init(&inc->i_item);
			to_drop = inc;
		}
	}
	write_unlock_irqrestore(&rs->rs_recv_lock, flags);

	if (to_drop)
		rds_inc_put(to_drop);

	rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
	return ret;
}

/*
 * Pull errors off the error queue.
 * If msghdr is NULL, we will just purge the error queue.
 */
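/*
 * The notifications are delivered to userspace as RDS_CMSG_RDMA_STATUS
 * control messages.  A rough sketch of how a consumer might read them
 * (illustrative only, not part of this file; 'fd' is an RDS socket):
 *
 *	char cbuf[CMSG_SPACE(sizeof(struct rds_rdma_notify))];
 *	struct msghdr mh = { .msg_control = cbuf,
 *			     .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *c;
 *
 *	recvmsg(fd, &mh, 0);
 *	for (c = CMSG_FIRSTHDR(&mh); c; c = CMSG_NXTHDR(&mh, c)) {
 *		struct rds_rdma_notify note;
 *
 *		if (c->cmsg_level != SOL_RDS ||
 *		    c->cmsg_type != RDS_CMSG_RDMA_STATUS)
 *			continue;
 *		memcpy(&note, CMSG_DATA(c), sizeof(note));
 *		// note.user_token identifies the RDMA op, note.status its result
 *	}
 */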
int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
{
	struct rds_notifier *notifier;
	struct rds_rdma_notify cmsg;
	unsigned int count = 0, max_messages = ~0U;
	unsigned long flags;
	LIST_HEAD(copy);
	int err = 0;

	memset(&cmsg, 0, sizeof(cmsg));	/* fill holes with zero */

	/* put_cmsg copies to user space and thus may sleep. We can't do this
	 * with rs_lock held, so first grab as many notifications as we can stuff
	 * in the user provided cmsg buffer. We don't try to copy more, to avoid
	 * losing notifications - except when the buffer is so small that it wouldn't
	 * even hold a single notification. Then we give him as much of this single
	 * msg as we can squeeze in, and set MSG_CTRUNC.
	 */
	if (msghdr) {
		max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
		if (!max_messages)
			max_messages = 1;
	}

	spin_lock_irqsave(&rs->rs_lock, flags);
	while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
		notifier = list_entry(rs->rs_notify_queue.next,
				struct rds_notifier, n_list);
		list_move(&notifier->n_list, &copy);
		count++;
	}
	spin_unlock_irqrestore(&rs->rs_lock, flags);

	if (!count)
		return 0;

	while (!list_empty(&copy)) {
		notifier = list_entry(copy.next, struct rds_notifier, n_list);

		if (msghdr) {
			cmsg.user_token = notifier->n_user_token;
			cmsg.status = notifier->n_status;

			err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
				       sizeof(cmsg), &cmsg);
			if (err)
				break;
		}

		list_del_init(&notifier->n_list);
		kfree(notifier);
	}

	/* If we bailed out because of an error in put_cmsg,
	 * we may be left with one or more notifications that we
	 * didn't process. Return them to the head of the list. */
	if (!list_empty(&copy)) {
		spin_lock_irqsave(&rs->rs_lock, flags);
		list_splice(&copy, &rs->rs_notify_queue);
		spin_unlock_irqrestore(&rs->rs_lock, flags);
	}

	return err;
}

/*
 * Queue a congestion notification
 */
static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
{
	uint64_t notify = rs->rs_cong_notify;
	unsigned long flags;
	int err;

	err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
			sizeof(notify), &notify);
	if (err)
		return err;

	spin_lock_irqsave(&rs->rs_lock, flags);
	rs->rs_cong_notify &= ~notify;
	spin_unlock_irqrestore(&rs->rs_lock, flags);

	return 0;
}

/*
 * Receive any control messages.
 */
static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
			 struct rds_sock *rs)
{
	int ret = 0;

	if (inc->i_usercopy.rdma_cookie) {
		ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
				sizeof(inc->i_usercopy.rdma_cookie),
				&inc->i_usercopy.rdma_cookie);
		if (ret)
			goto out;
	}

	if ((inc->i_usercopy.rx_tstamp != 0) &&
	    sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) {
		struct __kernel_old_timeval tv =
			ns_to_kernel_old_timeval(inc->i_usercopy.rx_tstamp);

		if (!sock_flag(rds_rs_to_sk(rs), SOCK_TSTAMP_NEW)) {
			ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
				       sizeof(tv), &tv);
		} else {
			struct __kernel_sock_timeval sk_tv;

			sk_tv.tv_sec = tv.tv_sec;
			sk_tv.tv_usec = tv.tv_usec;

			ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
				       sizeof(sk_tv), &sk_tv);
		}

		if (ret)
			goto out;
	}

	if (rs->rs_rx_traces) {
		struct rds_cmsg_rx_trace t;
		int i, j;

		memset(&t, 0, sizeof(t));
		inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock();
		t.rx_traces =  rs->rs_rx_traces;
		for (i = 0; i < rs->rs_rx_traces; i++) {
			j = rs->rs_rx_trace[i];
			t.rx_trace_pos[i] = j;
			t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] -
					  inc->i_rx_lat_trace[j];
		}

		ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY,
			       sizeof(t), &t);
		if (ret)
			goto out;
	}

out:
	return ret;
}

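/*
 * Zerocopy completions are queued on rs_zcookie_queue as the transport
 * finishes with the sender's pages.  If the caller left room for control
 * messages, hand one batch of cookies back as an RDS_CMSG_ZCOPY_COMPLETION
 * cmsg; if put_cmsg() fails the batch is pushed back onto the queue so the
 * cookies aren't lost.  Returns true when a cmsg was delivered.
 */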
static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg)
{
	struct rds_msg_zcopy_queue *q = &rs->rs_zcookie_queue;
	struct rds_msg_zcopy_info *info = NULL;
	struct rds_zcopy_cookies *done;
	unsigned long flags;

	if (!msg->msg_control)
		return false;

	if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) ||
	    msg->msg_controllen < CMSG_SPACE(sizeof(*done)))
		return false;

	spin_lock_irqsave(&q->lock, flags);
	if (!list_empty(&q->zcookie_head)) {
		info = list_entry(q->zcookie_head.next,
				  struct rds_msg_zcopy_info, rs_zcookie_next);
		list_del(&info->rs_zcookie_next);
	}
	spin_unlock_irqrestore(&q->lock, flags);
	if (!info)
		return false;
	done = &info->zcookies;
	if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done),
		     done)) {
		spin_lock_irqsave(&q->lock, flags);
		list_add(&info->rs_zcookie_next, &q->zcookie_head);
		spin_unlock_irqrestore(&q->lock, flags);
		return false;
	}
	kfree(info);
	return true;
}

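/*
 * recvmsg() entry point for RDS sockets.  Pending RDMA notifications and
 * congestion updates are reported ahead of any data.  A message is copied
 * to userspace first and only then atomically removed from the receive
 * queue; if another reader dequeued it in the meantime the iov_iter is
 * reverted and we go around again, which keeps MSG_PEEK and concurrent
 * readers consistent.
 */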
int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
		int msg_flags)
{
	struct sock *sk = sock->sk;
	struct rds_sock *rs = rds_sk_to_rs(sk);
	long timeo;
	int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
	DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
	struct rds_incoming *inc = NULL;

	/* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
	timeo = sock_rcvtimeo(sk, nonblock);

	rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);

	if (msg_flags & MSG_OOB)
		goto out;
	if (msg_flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR);

	while (1) {
		/* If there are pending notifications, do those - and nothing else */
		if (!list_empty(&rs->rs_notify_queue)) {
			ret = rds_notify_queue_get(rs, msg);
			break;
		}

		if (rs->rs_cong_notify) {
			ret = rds_notify_cong(rs, msg);
			break;
		}

		if (!rds_next_incoming(rs, &inc)) {
			if (nonblock) {
				bool reaped = rds_recvmsg_zcookie(rs, msg);

				ret = reaped ?  0 : -EAGAIN;
				break;
			}

			timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
					(!list_empty(&rs->rs_notify_queue) ||
					 rs->rs_cong_notify ||
					 rds_next_incoming(rs, &inc)), timeo);
			rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
				 timeo);
			if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
				continue;

			ret = timeo;
			if (ret == 0)
				ret = -ETIMEDOUT;
			break;
		}

		rdsdebug("copying inc %p from %pI6c:%u to user\n", inc,
			 &inc->i_conn->c_faddr,
			 ntohs(inc->i_hdr.h_sport));
		ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
		if (ret < 0)
			break;

		/*
		 * if the message we just copied isn't at the head of the
		 * recv queue then someone else raced us to return it, try
		 * to get the next message.
		 */
		if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
			rds_inc_put(inc);
			inc = NULL;
			rds_stats_inc(s_recv_deliver_raced);
			iov_iter_revert(&msg->msg_iter, ret);
			continue;
		}

		if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
			if (msg_flags & MSG_TRUNC)
				ret = be32_to_cpu(inc->i_hdr.h_len);
			msg->msg_flags |= MSG_TRUNC;
		}

		if (rds_cmsg_recv(inc, msg, rs)) {
			ret = -EFAULT;
			break;
		}
		rds_recvmsg_zcookie(rs, msg);

		rds_stats_inc(s_recv_delivered);

		if (msg->msg_name) {
			if (ipv6_addr_v4mapped(&inc->i_saddr)) {
				sin->sin_family = AF_INET;
				sin->sin_port = inc->i_hdr.h_sport;
				sin->sin_addr.s_addr =
				    inc->i_saddr.s6_addr32[3];
				memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
				msg->msg_namelen = sizeof(*sin);
			} else {
				sin6->sin6_family = AF_INET6;
				sin6->sin6_port = inc->i_hdr.h_sport;
				sin6->sin6_addr = inc->i_saddr;
				sin6->sin6_flowinfo = 0;
				sin6->sin6_scope_id = rs->rs_bound_scope_id;
				msg->msg_namelen = sizeof(*sin6);
			}
		}
		break;
	}

	if (inc)
		rds_inc_put(inc);

out:
	return ret;
}

/*
 * The socket is being shut down and we're asked to drop messages that were
 * queued for recvmsg.  The caller has unbound the socket so the receive path
 * won't queue any more incoming fragments or messages on the socket.
 */
void rds_clear_recv_queue(struct rds_sock *rs)
{
	struct sock *sk = rds_rs_to_sk(rs);
	struct rds_incoming *inc, *tmp;
	unsigned long flags;
	LIST_HEAD(to_drop);

	write_lock_irqsave(&rs->rs_recv_lock, flags);
	list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
				      -be32_to_cpu(inc->i_hdr.h_len),
				      inc->i_hdr.h_dport);
		list_move(&inc->i_item, &to_drop);
	}
	write_unlock_irqrestore(&rs->rs_recv_lock, flags);

	list_for_each_entry_safe(inc, tmp, &to_drop, i_item) {
		list_del_init(&inc->i_item);
		rds_inc_put(inc);
	}
}

/*
 * inc->i_saddr isn't used here because it is only set in the receive
 * path.
 */
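/*
 * Fill in a struct rds_info_message describing this incoming message for
 * the rds-info reporting interface.  'flip' swaps the local and foreign
 * address/port so the same helper can be used whichever direction the
 * message is travelling.
 */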
void rds_inc_info_copy(struct rds_incoming *inc,
		       struct rds_info_iterator *iter,
		       __be32 saddr, __be32 daddr, int flip)
{
	struct rds_info_message minfo;

	minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
	minfo.len = be32_to_cpu(inc->i_hdr.h_len);
	minfo.tos = inc->i_conn->c_tos;

	if (flip) {
		minfo.laddr = daddr;
		minfo.faddr = saddr;
		minfo.lport = inc->i_hdr.h_dport;
		minfo.fport = inc->i_hdr.h_sport;
	} else {
		minfo.laddr = saddr;
		minfo.faddr = daddr;
		minfo.lport = inc->i_hdr.h_sport;
		minfo.fport = inc->i_hdr.h_dport;
	}

	minfo.flags = 0;

	rds_info_copy(iter, &minfo, sizeof(minfo));
}

#if IS_ENABLED(CONFIG_IPV6)
void rds6_inc_info_copy(struct rds_incoming *inc,
			struct rds_info_iterator *iter,
			struct in6_addr *saddr, struct in6_addr *daddr,
			int flip)
{
	struct rds6_info_message minfo6;

	minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence);
	minfo6.len = be32_to_cpu(inc->i_hdr.h_len);
	minfo6.tos = inc->i_conn->c_tos;

	if (flip) {
		minfo6.laddr = *daddr;
		minfo6.faddr = *saddr;
		minfo6.lport = inc->i_hdr.h_dport;
		minfo6.fport = inc->i_hdr.h_sport;
	} else {
		minfo6.laddr = *saddr;
		minfo6.faddr = *daddr;
		minfo6.lport = inc->i_hdr.h_sport;
		minfo6.fport = inc->i_hdr.h_dport;
	}

	minfo6.flags = 0;

	rds_info_copy(iter, &minfo6, sizeof(minfo6));
}
#endif