1bdbe6fbcSAndy Grover /*
27d0a0658SKa-Cheong Poon * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
3bdbe6fbcSAndy Grover *
4bdbe6fbcSAndy Grover * This software is available to you under a choice of one of two
5bdbe6fbcSAndy Grover * licenses. You may choose to be licensed under the terms of the GNU
6bdbe6fbcSAndy Grover * General Public License (GPL) Version 2, available from the file
7bdbe6fbcSAndy Grover * COPYING in the main directory of this source tree, or the
8bdbe6fbcSAndy Grover * OpenIB.org BSD license below:
9bdbe6fbcSAndy Grover *
10bdbe6fbcSAndy Grover * Redistribution and use in source and binary forms, with or
11bdbe6fbcSAndy Grover * without modification, are permitted provided that the following
12bdbe6fbcSAndy Grover * conditions are met:
13bdbe6fbcSAndy Grover *
14bdbe6fbcSAndy Grover * - Redistributions of source code must retain the above
15bdbe6fbcSAndy Grover * copyright notice, this list of conditions and the following
16bdbe6fbcSAndy Grover * disclaimer.
17bdbe6fbcSAndy Grover *
18bdbe6fbcSAndy Grover * - Redistributions in binary form must reproduce the above
19bdbe6fbcSAndy Grover * copyright notice, this list of conditions and the following
20bdbe6fbcSAndy Grover * disclaimer in the documentation and/or other materials
21bdbe6fbcSAndy Grover * provided with the distribution.
22bdbe6fbcSAndy Grover *
23bdbe6fbcSAndy Grover * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24bdbe6fbcSAndy Grover * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25bdbe6fbcSAndy Grover * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26bdbe6fbcSAndy Grover * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27bdbe6fbcSAndy Grover * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28bdbe6fbcSAndy Grover * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29bdbe6fbcSAndy Grover * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30bdbe6fbcSAndy Grover * SOFTWARE.
31bdbe6fbcSAndy Grover *
32bdbe6fbcSAndy Grover */
33bdbe6fbcSAndy Grover #include <linux/kernel.h>
345a0e3ad6STejun Heo #include <linux/slab.h>
35bdbe6fbcSAndy Grover #include <net/sock.h>
36bdbe6fbcSAndy Grover #include <linux/in.h>
37bc3b2d7fSPaul Gortmaker #include <linux/export.h>
382870c4d6SJakub Kicinski #include <linux/sched/clock.h>
395711f8b3Ssantosh.shilimkar@oracle.com #include <linux/time.h>
405711f8b3Ssantosh.shilimkar@oracle.com #include <linux/rds.h>
41bdbe6fbcSAndy Grover
42bdbe6fbcSAndy Grover #include "rds.h"
43bdbe6fbcSAndy Grover
rds_inc_init(struct rds_incoming * inc,struct rds_connection * conn,struct in6_addr * saddr)44bdbe6fbcSAndy Grover void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
45eee2fa6aSKa-Cheong Poon struct in6_addr *saddr)
46bdbe6fbcSAndy Grover {
47b7f02920SReshetova, Elena refcount_set(&inc->i_refcount, 1);
48bdbe6fbcSAndy Grover INIT_LIST_HEAD(&inc->i_item);
49bdbe6fbcSAndy Grover inc->i_conn = conn;
50eee2fa6aSKa-Cheong Poon inc->i_saddr = *saddr;
51bf1867dbSDag Moxnes inc->i_usercopy.rdma_cookie = 0;
52bf1867dbSDag Moxnes inc->i_usercopy.rx_tstamp = ktime_set(0, 0);
533289025aSSantosh Shilimkar
541635bb54SZhu Yanjun memset(inc->i_rx_lat_trace, 0, sizeof(inc->i_rx_lat_trace));
55bdbe6fbcSAndy Grover }
56616b757aSAndy Grover EXPORT_SYMBOL_GPL(rds_inc_init);
57bdbe6fbcSAndy Grover
rds_inc_path_init(struct rds_incoming * inc,struct rds_conn_path * cp,struct in6_addr * saddr)585e833e02SSowmini Varadhan void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp,
59eee2fa6aSKa-Cheong Poon struct in6_addr *saddr)
605e833e02SSowmini Varadhan {
61b7f02920SReshetova, Elena refcount_set(&inc->i_refcount, 1);
625e833e02SSowmini Varadhan INIT_LIST_HEAD(&inc->i_item);
635e833e02SSowmini Varadhan inc->i_conn = cp->cp_conn;
645e833e02SSowmini Varadhan inc->i_conn_path = cp;
65eee2fa6aSKa-Cheong Poon inc->i_saddr = *saddr;
66bf1867dbSDag Moxnes inc->i_usercopy.rdma_cookie = 0;
67bf1867dbSDag Moxnes inc->i_usercopy.rx_tstamp = ktime_set(0, 0);
685e833e02SSowmini Varadhan }
695e833e02SSowmini Varadhan EXPORT_SYMBOL_GPL(rds_inc_path_init);
705e833e02SSowmini Varadhan
/* Take an additional reference on @inc; paired with rds_inc_put(). */
static void rds_inc_addref(struct rds_incoming *inc)
{
	rdsdebug("addref inc %p ref %d\n", inc, refcount_read(&inc->i_refcount));
	refcount_inc(&inc->i_refcount);
}
76bdbe6fbcSAndy Grover
/* Drop a reference on @inc.  When the last reference goes away the
 * message is handed back to its transport via the inc_free() callback.
 * By that point it must no longer sit on any socket's receive queue
 * (hence the BUG_ON on a non-empty i_item).
 */
void rds_inc_put(struct rds_incoming *inc)
{
	rdsdebug("put inc %p ref %d\n", inc, refcount_read(&inc->i_refcount));
	if (refcount_dec_and_test(&inc->i_refcount)) {
		BUG_ON(!list_empty(&inc->i_item));

		inc->i_conn->c_trans->inc_free(inc);
	}
}
EXPORT_SYMBOL_GPL(rds_inc_put);
87bdbe6fbcSAndy Grover
/* Account @delta bytes added to (or removed from) @rs's receive
 * buffer and update the local congestion map bit for @port when the
 * socket crosses into or out of the congested state.
 *
 * Must be called with rs_recv_lock held (callers serialize queue
 * changes and this accounting together).
 */
static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
				  struct rds_cong_map *map,
				  int delta, __be16 port)
{
	int rcvbuf;
	int congested;

	if (!delta)
		return;

	rs->rs_rcv_bytes += delta;
	if (delta > 0)
		rds_stats_add(s_recv_bytes_added_to_socket, delta);
	else
		rds_stats_add(s_recv_bytes_removed_from_socket, -delta);

	/* loop transport doesn't send/recv congestion updates */
	if (rs->rs_transport->t_type == RDS_TRANS_LOOP)
		return;

	rcvbuf = rds_sk_rcvbuf(rs);
	congested = rs->rs_rcv_bytes > rcvbuf;

	rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d "
		 "now_cong %d delta %d\n",
		 rs, &rs->rs_bound_addr,
		 ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
		 rcvbuf, congested, delta);

	if (!rs->rs_congested && congested) {
		/* wasn't -> am congested */
		rs->rs_congested = 1;
		rds_cong_set_bit(map, port);
		rds_cong_queue_updates(map);
	} else if (rs->rs_congested && rs->rs_rcv_bytes < rcvbuf / 2) {
		/* was -> aren't congested.  Require extra free space
		 * before reporting uncongested to prevent bouncing the
		 * cong/uncong state too often. */
		rs->rs_congested = 0;
		rds_cong_clear_bit(map, port);
		rds_cong_queue_updates(map);
	}
	/* otherwise: no change in congestion state, nothing to do */
}
132bdbe6fbcSAndy Grover
/* React to a change in the peer's generation number (RDS/TCP only):
 * a new, different, nonzero generation number indicates the peer has
 * restarted, so every path's sequence counters are reset and all
 * messages still on the retransmit queues are flagged RDS_MSG_FLUSH
 * so they get dropped instead of resent to the rebooted peer.
 *
 * A @peer_gen_num of 0 means the peer sent no generation-number
 * extension header; in that case nothing can be concluded and the
 * stored value is left untouched.
 */
static void rds_conn_peer_gen_update(struct rds_connection *conn,
				     u32 peer_gen_num)
{
	int i;
	struct rds_message *rm, *tmp;
	unsigned long flags;

	/* generation numbers are only exchanged by the TCP transport */
	WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP);
	if (peer_gen_num != 0) {
		if (conn->c_peer_gen_num != 0 &&
		    peer_gen_num != conn->c_peer_gen_num) {
			for (i = 0; i < RDS_MPATH_WORKERS; i++) {
				struct rds_conn_path *cp;

				cp = &conn->c_path[i];
				/* cp_lock guards the seq counters and
				 * the retransmit list on this path */
				spin_lock_irqsave(&cp->cp_lock, flags);
				cp->cp_next_tx_seq = 1;
				cp->cp_next_rx_seq = 0;
				list_for_each_entry_safe(rm, tmp,
							 &cp->cp_retrans,
							 m_conn_item) {
					set_bit(RDS_MSG_FLUSH, &rm->m_flags);
				}
				spin_unlock_irqrestore(&cp->cp_lock, flags);
			}
		}
		conn->c_peer_gen_num = peer_gen_num;
	}
}
162905dd418SSowmini Varadhan
163bdbe6fbcSAndy Grover /*
164bdbe6fbcSAndy Grover * Process all extension headers that come with this message.
165bdbe6fbcSAndy Grover */
rds_recv_incoming_exthdrs(struct rds_incoming * inc,struct rds_sock * rs)166bdbe6fbcSAndy Grover static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
167bdbe6fbcSAndy Grover {
168bdbe6fbcSAndy Grover struct rds_header *hdr = &inc->i_hdr;
169bdbe6fbcSAndy Grover unsigned int pos = 0, type, len;
170bdbe6fbcSAndy Grover union {
171bdbe6fbcSAndy Grover struct rds_ext_header_version version;
172bdbe6fbcSAndy Grover struct rds_ext_header_rdma rdma;
173bdbe6fbcSAndy Grover struct rds_ext_header_rdma_dest rdma_dest;
174bdbe6fbcSAndy Grover } buffer;
175bdbe6fbcSAndy Grover
176bdbe6fbcSAndy Grover while (1) {
177bdbe6fbcSAndy Grover len = sizeof(buffer);
178bdbe6fbcSAndy Grover type = rds_message_next_extension(hdr, &pos, &buffer, &len);
179bdbe6fbcSAndy Grover if (type == RDS_EXTHDR_NONE)
180bdbe6fbcSAndy Grover break;
181bdbe6fbcSAndy Grover /* Process extension header here */
182bdbe6fbcSAndy Grover switch (type) {
183bdbe6fbcSAndy Grover case RDS_EXTHDR_RDMA:
184bdbe6fbcSAndy Grover rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
185bdbe6fbcSAndy Grover break;
186bdbe6fbcSAndy Grover
187bdbe6fbcSAndy Grover case RDS_EXTHDR_RDMA_DEST:
188bdbe6fbcSAndy Grover /* We ignore the size for now. We could stash it
189bdbe6fbcSAndy Grover * somewhere and use it for error checking. */
190bf1867dbSDag Moxnes inc->i_usercopy.rdma_cookie = rds_rdma_make_cookie(
191bdbe6fbcSAndy Grover be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
192bdbe6fbcSAndy Grover be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
193bdbe6fbcSAndy Grover
194bdbe6fbcSAndy Grover break;
195bdbe6fbcSAndy Grover }
196bdbe6fbcSAndy Grover }
197bdbe6fbcSAndy Grover }
198bdbe6fbcSAndy Grover
/* Parse the extension headers carried on a handshake probe ping/pong:
 * RDS_EXTHDR_NPATHS holds the sender's path count (capped at
 * RDS_MPATH_WORKERS) and RDS_EXTHDR_GEN_NUM holds the peer's
 * generation number, used to detect peer restarts.
 */
static void rds_recv_hs_exthdrs(struct rds_header *hdr,
				struct rds_connection *conn)
{
	unsigned int pos = 0, type, len;
	union {
		struct rds_ext_header_version version;
		u16 rds_npaths;
		u32 rds_gen_num;
	} buffer;
	u32 new_peer_gen_num = 0;

	while (1) {
		len = sizeof(buffer);
		type = rds_message_next_extension(hdr, &pos, &buffer, &len);
		if (type == RDS_EXTHDR_NONE)
			break;
		/* Process extension header here */
		switch (type) {
		case RDS_EXTHDR_NPATHS:
			conn->c_npaths = min_t(int, RDS_MPATH_WORKERS,
					       be16_to_cpu(buffer.rds_npaths));
			break;
		case RDS_EXTHDR_GEN_NUM:
			new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num);
			break;
		default:
			pr_warn_ratelimited("ignoring unknown exthdr type "
					    "0x%x\n", type);
		}
	}
	/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
	conn->c_npaths = max_t(int, conn->c_npaths, 1);
	conn->c_ping_triggered = 0;
	rds_conn_peer_gen_update(conn, new_peer_gen_num);
}
2345916e2c1SSowmini Varadhan
2355916e2c1SSowmini Varadhan /* rds_start_mprds() will synchronously start multiple paths when appropriate.
2365916e2c1SSowmini Varadhan * The scheme is based on the following rules:
2375916e2c1SSowmini Varadhan *
2385916e2c1SSowmini Varadhan * 1. rds_sendmsg on first connect attempt sends the probe ping, with the
2395916e2c1SSowmini Varadhan * sender's npaths (s_npaths)
2405916e2c1SSowmini Varadhan * 2. rcvr of probe-ping knows the mprds_paths = min(s_npaths, r_npaths). It
2415916e2c1SSowmini Varadhan * sends back a probe-pong with r_npaths. After that, if rcvr is the
2425916e2c1SSowmini Varadhan * smaller ip addr, it starts rds_conn_path_connect_if_down on all
2435916e2c1SSowmini Varadhan * mprds_paths.
2445916e2c1SSowmini Varadhan * 3. sender gets woken up, and can move to rds_conn_path_connect_if_down.
2455916e2c1SSowmini Varadhan * If it is the smaller ipaddr, rds_conn_path_connect_if_down can be
2465916e2c1SSowmini Varadhan * called after reception of the probe-pong on all mprds_paths.
2475916e2c1SSowmini Varadhan * Otherwise (sender of probe-ping is not the smaller ip addr): just call
2485916e2c1SSowmini Varadhan * rds_conn_path_connect_if_down on the hashed path. (see rule 4)
24969b92b5bSSowmini Varadhan * 4. rds_connect_worker must only trigger a connection if laddr < faddr.
2505916e2c1SSowmini Varadhan * 5. sender may end up queuing the packet on the cp. will get sent out later.
2515916e2c1SSowmini Varadhan * when connection is completed.
2525916e2c1SSowmini Varadhan */
rds_start_mprds(struct rds_connection * conn)2535916e2c1SSowmini Varadhan static void rds_start_mprds(struct rds_connection *conn)
2545916e2c1SSowmini Varadhan {
2555916e2c1SSowmini Varadhan int i;
2565916e2c1SSowmini Varadhan struct rds_conn_path *cp;
2575916e2c1SSowmini Varadhan
25800354de5SSowmini Varadhan if (conn->c_npaths > 1 &&
259eee2fa6aSKa-Cheong Poon rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) {
26069b92b5bSSowmini Varadhan for (i = 0; i < conn->c_npaths; i++) {
2615916e2c1SSowmini Varadhan cp = &conn->c_path[i];
2625916e2c1SSowmini Varadhan rds_conn_path_connect_if_down(cp);
2635916e2c1SSowmini Varadhan }
2645916e2c1SSowmini Varadhan }
2655916e2c1SSowmini Varadhan }
2665916e2c1SSowmini Varadhan
267bdbe6fbcSAndy Grover /*
268bdbe6fbcSAndy Grover * The transport must make sure that this is serialized against other
269bdbe6fbcSAndy Grover * rx and conn reset on this specific conn.
270bdbe6fbcSAndy Grover *
271bdbe6fbcSAndy Grover * We currently assert that only one fragmented message will be sent
272bdbe6fbcSAndy Grover * down a connection at a time. This lets us reassemble in the conn
273bdbe6fbcSAndy Grover * instead of per-flow which means that we don't have to go digging through
274bdbe6fbcSAndy Grover * flows to tear down partial reassembly progress on conn failure and
275bdbe6fbcSAndy Grover * we save flow lookup and locking for each frag arrival. It does mean
276bdbe6fbcSAndy Grover * that small messages will wait behind large ones. Fragmenting at all
277bdbe6fbcSAndy Grover * is only to reduce the memory consumption of pre-posted buffers.
278bdbe6fbcSAndy Grover *
279bdbe6fbcSAndy Grover * The caller passes in saddr and daddr instead of us getting it from the
280bdbe6fbcSAndy Grover * conn. This lets loopback, who only has one conn for both directions,
281bdbe6fbcSAndy Grover * tell us which roles the addrs in the conn are playing for this message.
282bdbe6fbcSAndy Grover */
/* Entry point for a fully received message from any transport.
 * Handles sequence-number filtering, ping/pong and handshake-probe
 * traffic, then queues the message on the bound socket's receive
 * queue (taking a reference) and wakes any reader.
 *
 * NOTE(review): @gfp is unused in this function body — presumably kept
 * for the transport-facing API; confirm against callers before removing.
 */
void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
		       struct in6_addr *daddr,
		       struct rds_incoming *inc, gfp_t gfp)
{
	struct rds_sock *rs = NULL;
	struct sock *sk;
	unsigned long flags;
	struct rds_conn_path *cp;

	inc->i_conn = conn;
	inc->i_rx_jiffies = jiffies;
	/* mp-capable transports set i_conn_path in rds_inc_path_init();
	 * everyone else uses the single path 0 */
	if (conn->c_trans->t_mp_capable)
		cp = inc->i_conn_path;
	else
		cp = &conn->c_path[0];

	rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
		 "flags 0x%x rx_jiffies %lu\n", conn,
		 (unsigned long long)cp->cp_next_rx_seq,
		 inc,
		 (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
		 be32_to_cpu(inc->i_hdr.h_len),
		 be16_to_cpu(inc->i_hdr.h_sport),
		 be16_to_cpu(inc->i_hdr.h_dport),
		 inc->i_hdr.h_flags,
		 inc->i_rx_jiffies);

	/*
	 * Sequence numbers should only increase.  Messages get their
	 * sequence number as they're queued in a sending conn.  They
	 * can be dropped, though, if the sending socket is closed before
	 * they hit the wire.  So sequence numbers can skip forward
	 * under normal operation.  They can also drop back in the conn
	 * failover case as previously sent messages are resent down the
	 * new instance of a conn.  We drop those, otherwise we have
	 * to assume that the next valid seq does not come after a
	 * hole in the fragment stream.
	 *
	 * The headers don't give us a way to realize if fragments of
	 * a message have been dropped.  We assume that frags that arrive
	 * to a flow are part of the current message on the flow that is
	 * being reassembled.  This means that senders can't drop messages
	 * from the sending conn until all their frags are sent.
	 *
	 * XXX we could spend more on the wire to get more robust failure
	 * detection, arguably worth it to avoid data corruption.
	 */
	if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq &&
	    (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
		rds_stats_inc(s_recv_drop_old_seq);
		goto out;
	}
	cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;

	/* dport 0 is a ping; answer it with a pong and never deliver it */
	if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
		if (inc->i_hdr.h_sport == 0) {
			rdsdebug("ignore ping with 0 sport from %pI6c\n",
				 saddr);
			goto out;
		}
		rds_stats_inc(s_recv_ping);
		rds_send_pong(cp, inc->i_hdr.h_sport);
		/* if this is a handshake ping, start multipath if necessary */
		if (RDS_HS_PROBE(be16_to_cpu(inc->i_hdr.h_sport),
				 be16_to_cpu(inc->i_hdr.h_dport))) {
			rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
			rds_start_mprds(cp->cp_conn);
		}
		goto out;
	}

	if (be16_to_cpu(inc->i_hdr.h_dport) == RDS_FLAG_PROBE_PORT &&
	    inc->i_hdr.h_sport == 0) {
		rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
		/* if this is a handshake pong, start multipath if necessary */
		rds_start_mprds(cp->cp_conn);
		wake_up(&cp->cp_conn->c_hs_waitq);
		goto out;
	}

	/* find the bound socket; takes a reference dropped at "out" */
	rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if);
	if (!rs) {
		rds_stats_inc(s_recv_drop_no_sock);
		goto out;
	}

	/* Process extension headers */
	rds_recv_incoming_exthdrs(inc, rs);

	/* We can be racing with rds_release() which marks the socket dead. */
	sk = rds_rs_to_sk(rs);

	/* serialize with rds_release -> sock_orphan */
	write_lock_irqsave(&rs->rs_recv_lock, flags);
	if (!sock_flag(sk, SOCK_DEAD)) {
		rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
		rds_stats_inc(s_recv_queued);
		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
				      be32_to_cpu(inc->i_hdr.h_len),
				      inc->i_hdr.h_dport);
		if (sock_flag(sk, SOCK_RCVTSTAMP))
			inc->i_usercopy.rx_tstamp = ktime_get_real();
		/* the recv queue holds its own reference on the message */
		rds_inc_addref(inc);
		inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock();
		list_add_tail(&inc->i_item, &rs->rs_recv_queue);
		__rds_wake_sk_sleep(sk);
	} else {
		rds_stats_inc(s_recv_drop_dead_sock);
	}
	write_unlock_irqrestore(&rs->rs_recv_lock, flags);

out:
	if (rs)
		rds_sock_put(rs);
}
EXPORT_SYMBOL_GPL(rds_recv_incoming);
399bdbe6fbcSAndy Grover
400bdbe6fbcSAndy Grover /*
401bdbe6fbcSAndy Grover * be very careful here. This is being called as the condition in
402bdbe6fbcSAndy Grover * wait_event_*() needs to cope with being called many times.
403bdbe6fbcSAndy Grover */
rds_next_incoming(struct rds_sock * rs,struct rds_incoming ** inc)404bdbe6fbcSAndy Grover static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
405bdbe6fbcSAndy Grover {
406bdbe6fbcSAndy Grover unsigned long flags;
407bdbe6fbcSAndy Grover
4088690bfa1SAndy Grover if (!*inc) {
409bdbe6fbcSAndy Grover read_lock_irqsave(&rs->rs_recv_lock, flags);
410bdbe6fbcSAndy Grover if (!list_empty(&rs->rs_recv_queue)) {
411bdbe6fbcSAndy Grover *inc = list_entry(rs->rs_recv_queue.next,
412bdbe6fbcSAndy Grover struct rds_incoming,
413bdbe6fbcSAndy Grover i_item);
414bdbe6fbcSAndy Grover rds_inc_addref(*inc);
415bdbe6fbcSAndy Grover }
416bdbe6fbcSAndy Grover read_unlock_irqrestore(&rs->rs_recv_lock, flags);
417bdbe6fbcSAndy Grover }
418bdbe6fbcSAndy Grover
419bdbe6fbcSAndy Grover return *inc != NULL;
420bdbe6fbcSAndy Grover }
421bdbe6fbcSAndy Grover
/* Check whether @inc is still on @rs's receive queue; if @drop is set,
 * also remove it and give back the receive-buffer space it accounted.
 *
 * Returns 1 if the message was still queued on entry, 0 otherwise.
 * The queue's reference is released via rds_inc_put() only after
 * rs_recv_lock has been dropped, so the transport's inc_free() never
 * runs under the lock.
 */
static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
			    int drop)
{
	struct sock *sk = rds_rs_to_sk(rs);
	int ret = 0;
	unsigned long flags;
	struct rds_incoming *to_drop = NULL;

	write_lock_irqsave(&rs->rs_recv_lock, flags);
	if (!list_empty(&inc->i_item)) {
		ret = 1;
		if (drop) {
			/* XXX make sure this i_conn is reliable */
			rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
					      -be32_to_cpu(inc->i_hdr.h_len),
					      inc->i_hdr.h_dport);
			list_del_init(&inc->i_item);
			to_drop = inc;
		}
	}
	write_unlock_irqrestore(&rs->rs_recv_lock, flags);

	if (to_drop)
		rds_inc_put(to_drop);

	rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
	return ret;
}
450bdbe6fbcSAndy Grover
451bdbe6fbcSAndy Grover /*
452bdbe6fbcSAndy Grover * Pull errors off the error queue.
453bdbe6fbcSAndy Grover * If msghdr is NULL, we will just purge the error queue.
454bdbe6fbcSAndy Grover */
/*
 * Pull errors off the error queue.
 * If msghdr is NULL, we will just purge the error queue.
 *
 * Returns 0 on success (including an empty queue) or the put_cmsg()
 * error that stopped delivery; any undelivered notifications are
 * returned to the head of the queue.
 */
int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
{
	struct rds_notifier *notifier;
	struct rds_rdma_notify cmsg;
	unsigned int count = 0, max_messages = ~0U;
	unsigned long flags;
	LIST_HEAD(copy);
	int err = 0;

	memset(&cmsg, 0, sizeof(cmsg));	/* fill holes with zero */

	/* put_cmsg copies to user space and thus may sleep. We can't do this
	 * with rs_lock held, so first grab as many notifications as we can stuff
	 * in the user provided cmsg buffer. We don't try to copy more, to avoid
	 * losing notifications - except when the buffer is so small that it wouldn't
	 * even hold a single notification. Then we give him as much of this single
	 * msg as we can squeeze in, and set MSG_CTRUNC.
	 */
	if (msghdr) {
		max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
		if (!max_messages)
			max_messages = 1;
	}

	/* phase 1: detach up to max_messages notifiers onto a private list
	 * under the lock */
	spin_lock_irqsave(&rs->rs_lock, flags);
	while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
		notifier = list_entry(rs->rs_notify_queue.next,
				      struct rds_notifier, n_list);
		list_move(&notifier->n_list, &copy);
		count++;
	}
	spin_unlock_irqrestore(&rs->rs_lock, flags);

	if (!count)
		return 0;

	/* phase 2: copy each notification to user space (or just purge
	 * when msghdr is NULL), freeing it once delivered */
	while (!list_empty(&copy)) {
		notifier = list_entry(copy.next, struct rds_notifier, n_list);

		if (msghdr) {
			cmsg.user_token = notifier->n_user_token;
			cmsg.status = notifier->n_status;

			err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
				       sizeof(cmsg), &cmsg);
			if (err)
				break;
		}

		list_del_init(&notifier->n_list);
		kfree(notifier);
	}

	/* If we bailed out because of an error in put_cmsg,
	 * we may be left with one or more notifications that we
	 * didn't process. Return them to the head of the list. */
	if (!list_empty(&copy)) {
		spin_lock_irqsave(&rs->rs_lock, flags);
		list_splice(&copy, &rs->rs_notify_queue);
		spin_unlock_irqrestore(&rs->rs_lock, flags);
	}

	return err;
}
519bdbe6fbcSAndy Grover
520bdbe6fbcSAndy Grover /*
521bdbe6fbcSAndy Grover * Queue a congestion notification
522bdbe6fbcSAndy Grover */
rds_notify_cong(struct rds_sock * rs,struct msghdr * msghdr)523bdbe6fbcSAndy Grover static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
524bdbe6fbcSAndy Grover {
525bdbe6fbcSAndy Grover uint64_t notify = rs->rs_cong_notify;
526bdbe6fbcSAndy Grover unsigned long flags;
527bdbe6fbcSAndy Grover int err;
528bdbe6fbcSAndy Grover
529bdbe6fbcSAndy Grover err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
530bdbe6fbcSAndy Grover sizeof(notify), ¬ify);
531bdbe6fbcSAndy Grover if (err)
532bdbe6fbcSAndy Grover return err;
533bdbe6fbcSAndy Grover
534bdbe6fbcSAndy Grover spin_lock_irqsave(&rs->rs_lock, flags);
535bdbe6fbcSAndy Grover rs->rs_cong_notify &= ~notify;
536bdbe6fbcSAndy Grover spin_unlock_irqrestore(&rs->rs_lock, flags);
537bdbe6fbcSAndy Grover
538bdbe6fbcSAndy Grover return 0;
539bdbe6fbcSAndy Grover }
540bdbe6fbcSAndy Grover
541bdbe6fbcSAndy Grover /*
542bdbe6fbcSAndy Grover * Receive any control messages.
543bdbe6fbcSAndy Grover */
/*
 * Receive any control messages: attach this message's ancillary data
 * (RDMA destination cookie, receive timestamp, rx-path latency trace)
 * to the user's msghdr via put_cmsg().
 *
 * Returns 0 on success or the first put_cmsg() error.
 */
static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
			 struct rds_sock *rs)
{
	int ret = 0;

	/* cookie stashed by rds_recv_incoming_exthdrs(), if any */
	if (inc->i_usercopy.rdma_cookie) {
		ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
			       sizeof(inc->i_usercopy.rdma_cookie),
			       &inc->i_usercopy.rdma_cookie);
		if (ret)
			goto out;
	}

	/* receive timestamp, in the old or new (y2038-safe) layout
	 * depending on which SO_TIMESTAMP variant the socket selected */
	if ((inc->i_usercopy.rx_tstamp != 0) &&
	    sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) {
		struct __kernel_old_timeval tv =
			ns_to_kernel_old_timeval(inc->i_usercopy.rx_tstamp);

		if (!sock_flag(rds_rs_to_sk(rs), SOCK_TSTAMP_NEW)) {
			ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
				       sizeof(tv), &tv);
		} else {
			struct __kernel_sock_timeval sk_tv;

			sk_tv.tv_sec = tv.tv_sec;
			sk_tv.tv_usec = tv.tv_usec;

			ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
				       sizeof(sk_tv), &sk_tv);
		}

		if (ret)
			goto out;
	}

	/* optional rx-path latency trace, enabled per-socket */
	if (rs->rs_rx_traces) {
		struct rds_cmsg_rx_trace t;
		int i, j;

		memset(&t, 0, sizeof(t));
		inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock();
		t.rx_traces = rs->rs_rx_traces;
		for (i = 0; i < rs->rs_rx_traces; i++) {
			/* report the delta between consecutive trace points */
			j = rs->rs_rx_trace[i];
			t.rx_trace_pos[i] = j;
			t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] -
					inc->i_rx_lat_trace[j];
		}

		ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY,
			       sizeof(t), &t);
		if (ret)
			goto out;
	}

out:
	return ret;
}
602bdbe6fbcSAndy Grover
rds_recvmsg_zcookie(struct rds_sock * rs,struct msghdr * msg)603401910dbSSowmini Varadhan static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg)
604401910dbSSowmini Varadhan {
6059426bbc6SSowmini Varadhan struct rds_msg_zcopy_queue *q = &rs->rs_zcookie_queue;
6069426bbc6SSowmini Varadhan struct rds_msg_zcopy_info *info = NULL;
607401910dbSSowmini Varadhan struct rds_zcopy_cookies *done;
6089426bbc6SSowmini Varadhan unsigned long flags;
609401910dbSSowmini Varadhan
610401910dbSSowmini Varadhan if (!msg->msg_control)
611401910dbSSowmini Varadhan return false;
612401910dbSSowmini Varadhan
613401910dbSSowmini Varadhan if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) ||
614401910dbSSowmini Varadhan msg->msg_controllen < CMSG_SPACE(sizeof(*done)))
615401910dbSSowmini Varadhan return false;
616401910dbSSowmini Varadhan
6179426bbc6SSowmini Varadhan spin_lock_irqsave(&q->lock, flags);
6189426bbc6SSowmini Varadhan if (!list_empty(&q->zcookie_head)) {
6199426bbc6SSowmini Varadhan info = list_entry(q->zcookie_head.next,
6209426bbc6SSowmini Varadhan struct rds_msg_zcopy_info, rs_zcookie_next);
6219426bbc6SSowmini Varadhan list_del(&info->rs_zcookie_next);
6229426bbc6SSowmini Varadhan }
6239426bbc6SSowmini Varadhan spin_unlock_irqrestore(&q->lock, flags);
6249426bbc6SSowmini Varadhan if (!info)
625401910dbSSowmini Varadhan return false;
6269426bbc6SSowmini Varadhan done = &info->zcookies;
627401910dbSSowmini Varadhan if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done),
628401910dbSSowmini Varadhan done)) {
6299426bbc6SSowmini Varadhan spin_lock_irqsave(&q->lock, flags);
6309426bbc6SSowmini Varadhan list_add(&info->rs_zcookie_next, &q->zcookie_head);
6319426bbc6SSowmini Varadhan spin_unlock_irqrestore(&q->lock, flags);
632401910dbSSowmini Varadhan return false;
633401910dbSSowmini Varadhan }
6349426bbc6SSowmini Varadhan kfree(info);
635401910dbSSowmini Varadhan return true;
636401910dbSSowmini Varadhan }
637401910dbSSowmini Varadhan
/*
 * Receive one message on an RDS socket.
 *
 * Delivery priority on each pass of the loop: pending RDMA notifications
 * first, then congestion notifications, then queued incoming messages.
 * Blocks up to the socket receive timeout unless MSG_DONTWAIT is set.
 *
 * Returns the number of bytes copied to userspace (possibly the full
 * message length when MSG_TRUNC is set), or a negative errno.
 */
int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
		int msg_flags)
{
	struct sock *sk = sock->sk;
	struct rds_sock *rs = rds_sk_to_rs(sk);
	long timeo;
	int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
	DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
	struct rds_incoming *inc = NULL;

	/* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
	timeo = sock_rcvtimeo(sk, nonblock);

	rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);

	/* RDS has no out-of-band data; ret is still 0 here, so this
	 * returns 0 for MSG_OOB. */
	if (msg_flags & MSG_OOB)
		goto out;
	if (msg_flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR);

	while (1) {
		/* If there are pending notifications, do those - and nothing else */
		if (!list_empty(&rs->rs_notify_queue)) {
			ret = rds_notify_queue_get(rs, msg);
			break;
		}

		if (rs->rs_cong_notify) {
			ret = rds_notify_cong(rs, msg);
			break;
		}

		if (!rds_next_incoming(rs, &inc)) {
			if (nonblock) {
				/* Nothing queued, but delivering a zerocopy
				 * completion cookie still counts as progress
				 * for a nonblocking caller. */
				bool reaped = rds_recvmsg_zcookie(rs, msg);

				ret = reaped ? 0 : -EAGAIN;
				break;
			}

			/* Sleep until a notification or message arrives,
			 * or the receive timeout expires. */
			timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
					(!list_empty(&rs->rs_notify_queue) ||
					 rs->rs_cong_notify ||
					 rds_next_incoming(rs, &inc)), timeo);
			rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
				 timeo);
			if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
				continue;

			/* timeo < 0 is -ERESTARTSYS from a signal;
			 * timeo == 0 means the timeout elapsed. */
			ret = timeo;
			if (ret == 0)
				ret = -ETIMEDOUT;
			break;
		}

		rdsdebug("copying inc %p from %pI6c:%u to user\n", inc,
			 &inc->i_conn->c_faddr,
			 ntohs(inc->i_hdr.h_sport));
		ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
		if (ret < 0)
			break;

		/*
		 * if the message we just copied isn't at the head of the
		 * recv queue then someone else raced us to return it, try
		 * to get the next message.
		 */
		if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
			rds_inc_put(inc);
			inc = NULL;
			rds_stats_inc(s_recv_deliver_raced);
			/* Undo the partial copy into the user iov before
			 * retrying with the next message. */
			iov_iter_revert(&msg->msg_iter, ret);
			continue;
		}

		/* Message was bigger than the caller's buffer: flag the
		 * truncation, and report the full length if asked to. */
		if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
			if (msg_flags & MSG_TRUNC)
				ret = be32_to_cpu(inc->i_hdr.h_len);
			msg->msg_flags |= MSG_TRUNC;
		}

		if (rds_cmsg_recv(inc, msg, rs)) {
			ret = -EFAULT;
			break;
		}
		/* Opportunistically piggy-back any pending zerocopy
		 * completion cookies; failure here is not an error. */
		rds_recvmsg_zcookie(rs, msg);

		rds_stats_inc(s_recv_delivered);

		/* Fill in the sender's address for the caller, in the
		 * family matching the message's source address. */
		if (msg->msg_name) {
			if (ipv6_addr_v4mapped(&inc->i_saddr)) {
				sin->sin_family = AF_INET;
				sin->sin_port = inc->i_hdr.h_sport;
				sin->sin_addr.s_addr =
				    inc->i_saddr.s6_addr32[3];
				memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
				msg->msg_namelen = sizeof(*sin);
			} else {
				sin6->sin6_family = AF_INET6;
				sin6->sin6_port = inc->i_hdr.h_sport;
				sin6->sin6_addr = inc->i_saddr;
				sin6->sin6_flowinfo = 0;
				sin6->sin6_scope_id = rs->rs_bound_scope_id;
				msg->msg_namelen = sizeof(*sin6);
			}
		}
		break;
	}

	if (inc)
		rds_inc_put(inc);

out:
	return ret;
}
754bdbe6fbcSAndy Grover
755bdbe6fbcSAndy Grover /*
756bdbe6fbcSAndy Grover * The socket is being shut down and we're asked to drop messages that were
757bdbe6fbcSAndy Grover * queued for recvmsg. The caller has unbound the socket so the receive path
758bdbe6fbcSAndy Grover * won't queue any more incoming fragments or messages on the socket.
759bdbe6fbcSAndy Grover */
rds_clear_recv_queue(struct rds_sock * rs)760bdbe6fbcSAndy Grover void rds_clear_recv_queue(struct rds_sock *rs)
761bdbe6fbcSAndy Grover {
762bdbe6fbcSAndy Grover struct sock *sk = rds_rs_to_sk(rs);
763bdbe6fbcSAndy Grover struct rds_incoming *inc, *tmp;
764bdbe6fbcSAndy Grover unsigned long flags;
765*f1acf1acSAllison Henderson LIST_HEAD(to_drop);
766bdbe6fbcSAndy Grover
767bdbe6fbcSAndy Grover write_lock_irqsave(&rs->rs_recv_lock, flags);
768bdbe6fbcSAndy Grover list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
769bdbe6fbcSAndy Grover rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
770bdbe6fbcSAndy Grover -be32_to_cpu(inc->i_hdr.h_len),
771bdbe6fbcSAndy Grover inc->i_hdr.h_dport);
772*f1acf1acSAllison Henderson list_move(&inc->i_item, &to_drop);
773*f1acf1acSAllison Henderson }
774*f1acf1acSAllison Henderson write_unlock_irqrestore(&rs->rs_recv_lock, flags);
775*f1acf1acSAllison Henderson
776*f1acf1acSAllison Henderson list_for_each_entry_safe(inc, tmp, &to_drop, i_item) {
777bdbe6fbcSAndy Grover list_del_init(&inc->i_item);
778bdbe6fbcSAndy Grover rds_inc_put(inc);
779bdbe6fbcSAndy Grover }
780bdbe6fbcSAndy Grover }
781bdbe6fbcSAndy Grover
782bdbe6fbcSAndy Grover /*
783bdbe6fbcSAndy Grover * inc->i_saddr isn't used here because it is only set in the receive
784bdbe6fbcSAndy Grover * path.
785bdbe6fbcSAndy Grover */
rds_inc_info_copy(struct rds_incoming * inc,struct rds_info_iterator * iter,__be32 saddr,__be32 daddr,int flip)786bdbe6fbcSAndy Grover void rds_inc_info_copy(struct rds_incoming *inc,
787bdbe6fbcSAndy Grover struct rds_info_iterator *iter,
788bdbe6fbcSAndy Grover __be32 saddr, __be32 daddr, int flip)
789bdbe6fbcSAndy Grover {
790bdbe6fbcSAndy Grover struct rds_info_message minfo;
791bdbe6fbcSAndy Grover
792bdbe6fbcSAndy Grover minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
793bdbe6fbcSAndy Grover minfo.len = be32_to_cpu(inc->i_hdr.h_len);
7943eb45036SSantosh Shilimkar minfo.tos = inc->i_conn->c_tos;
795bdbe6fbcSAndy Grover
796bdbe6fbcSAndy Grover if (flip) {
797bdbe6fbcSAndy Grover minfo.laddr = daddr;
798bdbe6fbcSAndy Grover minfo.faddr = saddr;
799bdbe6fbcSAndy Grover minfo.lport = inc->i_hdr.h_dport;
800bdbe6fbcSAndy Grover minfo.fport = inc->i_hdr.h_sport;
801bdbe6fbcSAndy Grover } else {
802bdbe6fbcSAndy Grover minfo.laddr = saddr;
803bdbe6fbcSAndy Grover minfo.faddr = daddr;
804bdbe6fbcSAndy Grover minfo.lport = inc->i_hdr.h_sport;
805bdbe6fbcSAndy Grover minfo.fport = inc->i_hdr.h_dport;
806bdbe6fbcSAndy Grover }
807bdbe6fbcSAndy Grover
8084116def2SKangjie Lu minfo.flags = 0;
8094116def2SKangjie Lu
810bdbe6fbcSAndy Grover rds_info_copy(iter, &minfo, sizeof(minfo));
811bdbe6fbcSAndy Grover }
812b7ff8b10SKa-Cheong Poon
813e65d4d96SKa-Cheong Poon #if IS_ENABLED(CONFIG_IPV6)
rds6_inc_info_copy(struct rds_incoming * inc,struct rds_info_iterator * iter,struct in6_addr * saddr,struct in6_addr * daddr,int flip)814b7ff8b10SKa-Cheong Poon void rds6_inc_info_copy(struct rds_incoming *inc,
815b7ff8b10SKa-Cheong Poon struct rds_info_iterator *iter,
816b7ff8b10SKa-Cheong Poon struct in6_addr *saddr, struct in6_addr *daddr,
817b7ff8b10SKa-Cheong Poon int flip)
818b7ff8b10SKa-Cheong Poon {
819b7ff8b10SKa-Cheong Poon struct rds6_info_message minfo6;
820b7ff8b10SKa-Cheong Poon
821b7ff8b10SKa-Cheong Poon minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence);
822b7ff8b10SKa-Cheong Poon minfo6.len = be32_to_cpu(inc->i_hdr.h_len);
8237d0a0658SKa-Cheong Poon minfo6.tos = inc->i_conn->c_tos;
824b7ff8b10SKa-Cheong Poon
825b7ff8b10SKa-Cheong Poon if (flip) {
826b7ff8b10SKa-Cheong Poon minfo6.laddr = *daddr;
827b7ff8b10SKa-Cheong Poon minfo6.faddr = *saddr;
828b7ff8b10SKa-Cheong Poon minfo6.lport = inc->i_hdr.h_dport;
829b7ff8b10SKa-Cheong Poon minfo6.fport = inc->i_hdr.h_sport;
830b7ff8b10SKa-Cheong Poon } else {
831b7ff8b10SKa-Cheong Poon minfo6.laddr = *saddr;
832b7ff8b10SKa-Cheong Poon minfo6.faddr = *daddr;
833b7ff8b10SKa-Cheong Poon minfo6.lport = inc->i_hdr.h_sport;
834b7ff8b10SKa-Cheong Poon minfo6.fport = inc->i_hdr.h_dport;
835b7ff8b10SKa-Cheong Poon }
836b7ff8b10SKa-Cheong Poon
8377d0a0658SKa-Cheong Poon minfo6.flags = 0;
8387d0a0658SKa-Cheong Poon
839b7ff8b10SKa-Cheong Poon rds_info_copy(iter, &minfo6, sizeof(minfo6));
840b7ff8b10SKa-Cheong Poon }
841e65d4d96SKa-Cheong Poon #endif
842