xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rdsv3/threads.c (revision 94c3dad2979525d0a82595f3d8350a6116aba8ed)
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */
4c0dd49bdSEiji Ota 
/*
 * This file contains code imported from the OFED rds source file threads.c
 * Oracle elects to have and use the contents of threads.c under and governed
 * by the OpenIB.org BSD license (see below for full license text). However,
 * the following notice accompanied the original version of this file:
 */
1116e76cddSagiri 
/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
44c0dd49bdSEiji Ota #include <sys/rds.h>
45c0dd49bdSEiji Ota #include <sys/sunddi.h>
46*94c3dad2SToomas Soome #include <sys/containerof.h>
47c0dd49bdSEiji Ota 
48c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3.h>
49c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
50c0dd49bdSEiji Ota 
/*
 * All of connection management is simplified by serializing it through
 * work queues that execute in a connection managing thread.
 *
 * TCP wants to send acks through sendpage() in response to data_ready(),
 * but it needs a process context to do so.
 *
 * The receive paths need to allocate but can't drop packets (!) so we have
 * a thread around to block allocating if the receive fast path sees an
 * allocation failure.
 */
62c0dd49bdSEiji Ota 
/*
 * Grand Unified Theory of connection life cycle:
 * At any point in time, the connection can be in one of these states:
 * DOWN, CONNECTING, UP, DISCONNECTING, ERROR
 *
 * The following transitions are possible:
 *  ANY		  -> ERROR
 *  UP		  -> DISCONNECTING
 *  ERROR	  -> DISCONNECTING
 *  DISCONNECTING -> DOWN
 *  DOWN	  -> CONNECTING
 *  CONNECTING	  -> UP
 *
 * Transition to state DISCONNECTING/DOWN:
 *  -	Inside the shutdown worker; synchronizes with xmit path
 *	through c_send_lock, and with connection management callbacks
 *	via c_cm_lock.
 *
 *	For receive callbacks, we rely on the underlying transport
 *	(TCP, IB/RDMA) to provide the necessary synchronisation.
 */
/*
 * Work queue shared by every worker in this file.  It is created by
 * rdsv3_threads_init() under the name "krdsd" and torn down by
 * rdsv3_threads_exit(); all rdsv3_queue_work() and
 * rdsv3_queue_delayed_work() calls below target it.
 */
struct rdsv3_workqueue_struct_s *rdsv3_wq;
85c0dd49bdSEiji Ota 
86c0dd49bdSEiji Ota void
rdsv3_connect_complete(struct rdsv3_connection * conn)87c0dd49bdSEiji Ota rdsv3_connect_complete(struct rdsv3_connection *conn)
88c0dd49bdSEiji Ota {
89c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_connect_complete", "Enter(conn: %p)", conn);
90c0dd49bdSEiji Ota 
91c0dd49bdSEiji Ota 	if (!rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING,
92c0dd49bdSEiji Ota 	    RDSV3_CONN_UP)) {
93c0dd49bdSEiji Ota #ifndef __lock_lint
946e18d381Sagiri 		RDSV3_DPRINTF2("rdsv3_connect_complete",
95c0dd49bdSEiji Ota 		    "%s: Cannot transition to state UP, "
96c0dd49bdSEiji Ota 		    "current state is %d",
97c0dd49bdSEiji Ota 		    __func__,
98c0dd49bdSEiji Ota 		    atomic_get(&conn->c_state));
99c0dd49bdSEiji Ota #endif
100c0dd49bdSEiji Ota 		conn->c_state = RDSV3_CONN_ERROR;
101c0dd49bdSEiji Ota 		rdsv3_queue_work(rdsv3_wq, &conn->c_down_w);
102c0dd49bdSEiji Ota 		return;
103c0dd49bdSEiji Ota 	}
104c0dd49bdSEiji Ota 
105c0dd49bdSEiji Ota 	RDSV3_DPRINTF2("rdsv3_connect_complete",
106c0dd49bdSEiji Ota 	    "conn %p for %u.%u.%u.%u to %u.%u.%u.%u complete",
107c0dd49bdSEiji Ota 	    conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));
108c0dd49bdSEiji Ota 
109c0dd49bdSEiji Ota 	conn->c_reconnect_jiffies = 0;
1105d5562f5SEiji Ota 	conn->c_last_connect_jiffies = ddi_get_lbolt();
1115d5562f5SEiji Ota 
112c0dd49bdSEiji Ota 	set_bit(0, &conn->c_map_queued);
113c0dd49bdSEiji Ota 	rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
114c0dd49bdSEiji Ota 	rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0);
115c0dd49bdSEiji Ota 
116c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_connect_complete", "Return(conn: %p)", conn);
117c0dd49bdSEiji Ota }
118c0dd49bdSEiji Ota 
/*
 * This random exponential backoff is relied on to eventually resolve racing
 * connects.
 *
 * If connect attempts race then both parties drop both connections and come
 * here to wait for a random amount of time before trying again.  Eventually
 * the backoff range will be so much greater than the time it takes to
 * establish a connection that one of the pair will establish the connection
 * before the other's random delay fires.
 *
 * Connection attempts that arrive while a connection is already established
 * are also considered to be racing connects.  This lets a connection from
 * a rebooted machine replace an existing stale connection before the transport
 * notices that the connection has failed.
 *
 * We should *always* start with a random backoff; otherwise a broken connection
 * will always take several iterations to be re-established.
 */
1375d5562f5SEiji Ota void
rdsv3_queue_reconnect(struct rdsv3_connection * conn)138c0dd49bdSEiji Ota rdsv3_queue_reconnect(struct rdsv3_connection *conn)
139c0dd49bdSEiji Ota {
140c0dd49bdSEiji Ota 	unsigned long rand;
141c0dd49bdSEiji Ota 
142c0dd49bdSEiji Ota 	RDSV3_DPRINTF2("rdsv3_queue_reconnect",
143c0dd49bdSEiji Ota 	    "conn %p for %u.%u.%u.%u to %u.%u.%u.%u reconnect jiffies %lu",
144c0dd49bdSEiji Ota 	    conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
145c0dd49bdSEiji Ota 	    conn->c_reconnect_jiffies);
146c0dd49bdSEiji Ota 
147c0dd49bdSEiji Ota 	set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags);
148c0dd49bdSEiji Ota 	if (conn->c_reconnect_jiffies == 0) {
149c0dd49bdSEiji Ota 		conn->c_reconnect_jiffies = rdsv3_sysctl_reconnect_min_jiffies;
150c0dd49bdSEiji Ota 		rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 0);
151c0dd49bdSEiji Ota 		return;
152c0dd49bdSEiji Ota 	}
153c0dd49bdSEiji Ota 
154c0dd49bdSEiji Ota 	(void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand));
155cadbfdc3SEiji Ota 
156c0dd49bdSEiji Ota 	RDSV3_DPRINTF5("rdsv3",
157c0dd49bdSEiji Ota 	    "%lu delay %lu ceil conn %p for %u.%u.%u.%u -> %u.%u.%u.%u",
158c0dd49bdSEiji Ota 	    rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
159c0dd49bdSEiji Ota 	    conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));
160cadbfdc3SEiji Ota 
161c0dd49bdSEiji Ota 	rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w,
162c0dd49bdSEiji Ota 	    rand % conn->c_reconnect_jiffies);
163c0dd49bdSEiji Ota 
164c0dd49bdSEiji Ota 	conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2,
165c0dd49bdSEiji Ota 	    rdsv3_sysctl_reconnect_max_jiffies);
166c0dd49bdSEiji Ota }
167c0dd49bdSEiji Ota 
168c0dd49bdSEiji Ota void
rdsv3_connect_worker(struct rdsv3_work_s * work)169c0dd49bdSEiji Ota rdsv3_connect_worker(struct rdsv3_work_s *work)
170c0dd49bdSEiji Ota {
171*94c3dad2SToomas Soome 	struct rdsv3_connection *conn = __containerof(work,
172c0dd49bdSEiji Ota 	    struct rdsv3_connection, c_conn_w.work);
173c0dd49bdSEiji Ota 	int ret;
174c0dd49bdSEiji Ota 
175c0dd49bdSEiji Ota 	RDSV3_DPRINTF2("rdsv3_connect_worker", "Enter(work: %p)", work);
176c0dd49bdSEiji Ota 
177c0dd49bdSEiji Ota 	clear_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags);
178c0dd49bdSEiji Ota 	if (rdsv3_conn_transition(conn, RDSV3_CONN_DOWN,
179c0dd49bdSEiji Ota 	    RDSV3_CONN_CONNECTING)) {
180c0dd49bdSEiji Ota 		ret = conn->c_trans->conn_connect(conn);
181cadbfdc3SEiji Ota 
182c0dd49bdSEiji Ota 		RDSV3_DPRINTF5("rdsv3",
183c0dd49bdSEiji Ota 		    "connect conn %p for %u.%u.%u.%u -> %u.%u.%u.%u "
184c0dd49bdSEiji Ota 		    "ret %d", conn, NIPQUAD(conn->c_laddr),
185c0dd49bdSEiji Ota 		    NIPQUAD(conn->c_faddr), ret);
186cadbfdc3SEiji Ota 
187c0dd49bdSEiji Ota 		RDSV3_DPRINTF2("rdsv3_connect_worker",
188c0dd49bdSEiji Ota 		    "conn %p for %u.%u.%u.%u to %u.%u.%u.%u dispatched, ret %d",
189c0dd49bdSEiji Ota 		    conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr), ret);
190c0dd49bdSEiji Ota 
191c0dd49bdSEiji Ota 		if (ret) {
192c0dd49bdSEiji Ota 			if (rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING,
193c0dd49bdSEiji Ota 			    RDSV3_CONN_DOWN))
194c0dd49bdSEiji Ota 				rdsv3_queue_reconnect(conn);
195c0dd49bdSEiji Ota 			else {
196c0dd49bdSEiji Ota 				RDSV3_DPRINTF2("rdsv3_connect_worker",
197c0dd49bdSEiji Ota 				    "RDS: connect failed: %p", conn);
198c0dd49bdSEiji Ota 				rdsv3_conn_drop(conn);
199c0dd49bdSEiji Ota 			}
200c0dd49bdSEiji Ota 		}
201c0dd49bdSEiji Ota 	}
202c0dd49bdSEiji Ota 
203c0dd49bdSEiji Ota 	RDSV3_DPRINTF2("rdsv3_connect_worker", "Return(work: %p)", work);
204c0dd49bdSEiji Ota }
205c0dd49bdSEiji Ota 
206c0dd49bdSEiji Ota void
rdsv3_send_worker(struct rdsv3_work_s * work)207c0dd49bdSEiji Ota rdsv3_send_worker(struct rdsv3_work_s *work)
208c0dd49bdSEiji Ota {
209*94c3dad2SToomas Soome 	struct rdsv3_connection *conn = __containerof(work,
210c0dd49bdSEiji Ota 	    struct rdsv3_connection, c_send_w.work);
211c0dd49bdSEiji Ota 	int ret;
212c0dd49bdSEiji Ota 
213c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_send_worker", "Enter(work: %p)", work);
214c0dd49bdSEiji Ota 
215c0dd49bdSEiji Ota 	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
216c0dd49bdSEiji Ota 		ret = rdsv3_send_xmit(conn);
217c0dd49bdSEiji Ota 		RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret);
218c0dd49bdSEiji Ota 		switch (ret) {
219c0dd49bdSEiji Ota 		case -EAGAIN:
220c0dd49bdSEiji Ota 			rdsv3_stats_inc(s_send_immediate_retry);
221c0dd49bdSEiji Ota 			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
222c0dd49bdSEiji Ota 			break;
223c0dd49bdSEiji Ota 		case -ENOMEM:
224c0dd49bdSEiji Ota 			rdsv3_stats_inc(s_send_delayed_retry);
225c0dd49bdSEiji Ota 			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 2);
226c0dd49bdSEiji Ota 		default:
227c0dd49bdSEiji Ota 			break;
228c0dd49bdSEiji Ota 		}
229c0dd49bdSEiji Ota 	}
230c0dd49bdSEiji Ota 
231c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_send_worker", "Return(work: %p)", work);
232c0dd49bdSEiji Ota }
233c0dd49bdSEiji Ota 
234c0dd49bdSEiji Ota void
rdsv3_recv_worker(struct rdsv3_work_s * work)235c0dd49bdSEiji Ota rdsv3_recv_worker(struct rdsv3_work_s *work)
236c0dd49bdSEiji Ota {
237*94c3dad2SToomas Soome 	struct rdsv3_connection *conn = __containerof(work,
238c0dd49bdSEiji Ota 	    struct rdsv3_connection, c_recv_w.work);
239c0dd49bdSEiji Ota 	int ret;
240c0dd49bdSEiji Ota 
241c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_recv_worker", "Enter(work: %p)", work);
242c0dd49bdSEiji Ota 
243c0dd49bdSEiji Ota 	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
244c0dd49bdSEiji Ota 		ret = conn->c_trans->recv(conn);
245c0dd49bdSEiji Ota 		RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret);
246c0dd49bdSEiji Ota 		switch (ret) {
247c0dd49bdSEiji Ota 		case -EAGAIN:
248c0dd49bdSEiji Ota 			rdsv3_stats_inc(s_recv_immediate_retry);
249c0dd49bdSEiji Ota 			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0);
250c0dd49bdSEiji Ota 			break;
251c0dd49bdSEiji Ota 		case -ENOMEM:
252c0dd49bdSEiji Ota 			rdsv3_stats_inc(s_recv_delayed_retry);
253c0dd49bdSEiji Ota 			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 2);
254c0dd49bdSEiji Ota 		default:
255c0dd49bdSEiji Ota 			break;
256c0dd49bdSEiji Ota 		}
257c0dd49bdSEiji Ota 	}
258c0dd49bdSEiji Ota 
259c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_recv_worker", "Return(work: %p)", work);
260c0dd49bdSEiji Ota }
261c0dd49bdSEiji Ota 
262c0dd49bdSEiji Ota void
rdsv3_shutdown_worker(struct rdsv3_work_s * work)2635d5562f5SEiji Ota rdsv3_shutdown_worker(struct rdsv3_work_s *work)
2645d5562f5SEiji Ota {
265*94c3dad2SToomas Soome 	struct rdsv3_connection *conn = __containerof(work,
2665d5562f5SEiji Ota 	    struct rdsv3_connection, c_down_w);
2675d5562f5SEiji Ota 	rdsv3_conn_shutdown(conn);
2685d5562f5SEiji Ota }
2695d5562f5SEiji Ota 
/*
 * time_after(a, b) is true when tick count a is later than tick count b.
 *
 * The difference is formed in unsigned arithmetic and only then
 * reinterpreted as signed, so the comparison stays well defined when the
 * two values straddle a wrap of the tick counter, rather than depending on
 * signed overflow.
 */
#define	time_after(a, b)	\
	((long)((unsigned long)(b) - (unsigned long)(a)) < 0)
2715d5562f5SEiji Ota 
2725d5562f5SEiji Ota void
rdsv3_reaper_worker(struct rdsv3_work_s * work)2735d5562f5SEiji Ota rdsv3_reaper_worker(struct rdsv3_work_s *work)
2745d5562f5SEiji Ota {
275*94c3dad2SToomas Soome 	struct rdsv3_connection *conn = __containerof(work,
2765d5562f5SEiji Ota 	    struct rdsv3_connection, c_reap_w.work);
2775d5562f5SEiji Ota 
2785d5562f5SEiji Ota 	if (rdsv3_conn_state(conn) != RDSV3_CONN_UP &&
2795d5562f5SEiji Ota 	    !time_after(conn->c_last_connect_jiffies,
2805d5562f5SEiji Ota 	    ddi_get_lbolt() - RDSV3_REAPER_WAIT_JIFFIES)) {
2815d5562f5SEiji Ota 		rdsv3_conn_destroy(conn);
2825d5562f5SEiji Ota 	} else {
2835d5562f5SEiji Ota 		rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_reap_w,
2845d5562f5SEiji Ota 		    RDSV3_REAPER_WAIT_JIFFIES);
2855d5562f5SEiji Ota 	}
2865d5562f5SEiji Ota }
2875d5562f5SEiji Ota 
2885d5562f5SEiji Ota void
rdsv3_threads_exit(void)289c0dd49bdSEiji Ota rdsv3_threads_exit(void)
290c0dd49bdSEiji Ota {
291c0dd49bdSEiji Ota 	rdsv3_destroy_task_workqueue(rdsv3_wq);
292c0dd49bdSEiji Ota }
293c0dd49bdSEiji Ota 
294c0dd49bdSEiji Ota int
rdsv3_threads_init(void)295c0dd49bdSEiji Ota rdsv3_threads_init(void)
296c0dd49bdSEiji Ota {
297c0dd49bdSEiji Ota 	rdsv3_wq = rdsv3_create_task_workqueue("krdsd");
2985d5562f5SEiji Ota 	if (!rdsv3_wq)
299c0dd49bdSEiji Ota 		return (-ENOMEM);
300c0dd49bdSEiji Ota 
301c0dd49bdSEiji Ota 	return (0);
302c0dd49bdSEiji Ota }
303