1 /* 2 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 3 */ 4 5 /* 6 * This file contains code imported from the OFED rds source file threads.c 7 * Oracle elects to have and use the contents of threads.c under and governed 8 * by the OpenIB.org BSD license (see below for full license text). However, 9 * the following notice accompanied the original version of this file: 10 */ 11 12 /* 13 * Copyright (c) 2006 Oracle. All rights reserved. 14 * 15 * This software is available to you under a choice of one of two 16 * licenses. You may choose to be licensed under the terms of the GNU 17 * General Public License (GPL) Version 2, available from the file 18 * COPYING in the main directory of this source tree, or the 19 * OpenIB.org BSD license below: 20 * 21 * Redistribution and use in source and binary forms, with or 22 * without modification, are permitted provided that the following 23 * conditions are met: 24 * 25 * - Redistributions of source code must retain the above 26 * copyright notice, this list of conditions and the following 27 * disclaimer. 28 * 29 * - Redistributions in binary form must reproduce the above 30 * copyright notice, this list of conditions and the following 31 * disclaimer in the documentation and/or other materials 32 * provided with the distribution. 33 * 34 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 35 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 36 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 37 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 38 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 39 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 40 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 41 * SOFTWARE. 42 * 43 */ 44 #include <sys/rds.h> 45 #include <sys/sunddi.h> 46 #include <sys/containerof.h> 47 48 #include <sys/ib/clients/rdsv3/rdsv3.h> 49 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 50 51 /* 52 * All of connection management is simplified by serializing it through 53 * work queues that execute in a connection managing thread. 54 * 55 * TCP wants to send acks through sendpage() in response to data_ready(), 56 * but it needs a process context to do so. 57 * 58 * The receive paths need to allocate but can't drop packets (!) so we have 59 * a thread around to block allocating if the receive fast path sees an 60 * allocation failure. 61 */ 62 63 /* 64 * Grand Unified Theory of connection life cycle: 65 * At any point in time, the connection can be in one of these states: 66 * DOWN, CONNECTING, UP, DISCONNECTING, ERROR 67 * 68 * The following transitions are possible: 69 * ANY -> ERROR 70 * UP -> DISCONNECTING 71 * ERROR -> DISCONNECTING 72 * DISCONNECTING -> DOWN 73 * DOWN -> CONNECTING 74 * CONNECTING -> UP 75 * 76 * Transition to state DISCONNECTING/DOWN: 77 * - Inside the shutdown worker; synchronizes with xmit path 78 * through c_send_lock, and with connection management callbacks 79 * via c_cm_lock. 80 * 81 * For receive callbacks, we rely on the underlying transport 82 * (TCP, IB/RDMA) to provide the necessary synchronisation. 83 */ 84 struct rdsv3_workqueue_struct_s *rdsv3_wq; 85 86 void 87 rdsv3_connect_complete(struct rdsv3_connection *conn) 88 { 89 RDSV3_DPRINTF4("rdsv3_connect_complete", "Enter(conn: %p)", conn); 90 91 if (!rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING, 92 RDSV3_CONN_UP)) { 93 #ifndef __lock_lint 94 RDSV3_DPRINTF2("rdsv3_connect_complete", 95 "%s: Cannot transition to state UP, " 96 "current state is %d", 97 __func__, 98 atomic_get(&conn->c_state)); 99 #endif 100 conn->c_state = RDSV3_CONN_ERROR; 101 rdsv3_queue_work(rdsv3_wq, &conn->c_down_w); 102 return; 103 } 104 105 RDSV3_DPRINTF2("rdsv3_connect_complete", 106 "conn %p for %u.%u.%u.%u to %u.%u.%u.%u complete", 107 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); 108 109 conn->c_reconnect_jiffies = 0; 110 conn->c_last_connect_jiffies = ddi_get_lbolt(); 111 112 set_bit(0, &conn->c_map_queued); 113 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); 114 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0); 115 116 RDSV3_DPRINTF4("rdsv3_connect_complete", "Return(conn: %p)", conn); 117 } 118 119 /* 120 * This random exponential backoff is relied on to eventually resolve racing 121 * connects. 122 * 123 * If connect attempts race then both parties drop both connections and come 124 * here to wait for a random amount of time before trying again. Eventually 125 * the backoff range will be so much greater than the time it takes to 126 * establish a connection that one of the pair will establish the connection 127 * before the other's random delay fires. 128 * 129 * Connection attempts that arrive while a connection is already established 130 * are also considered to be racing connects. This lets a connection from 131 * a rebooted machine replace an existing stale connection before the transport 132 * notices that the connection has failed. 133 * 134 * We should *always* start with a random backoff; otherwise a broken connection 135 * will always take several iterations to be re-established. 136 */ 137 void 138 rdsv3_queue_reconnect(struct rdsv3_connection *conn) 139 { 140 unsigned long rand; 141 142 RDSV3_DPRINTF2("rdsv3_queue_reconnect", 143 "conn %p for %u.%u.%u.%u to %u.%u.%u.%u reconnect jiffies %lu", 144 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr), 145 conn->c_reconnect_jiffies); 146 147 set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags); 148 if (conn->c_reconnect_jiffies == 0) { 149 conn->c_reconnect_jiffies = rdsv3_sysctl_reconnect_min_jiffies; 150 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 0); 151 return; 152 } 153 154 (void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand)); 155 156 RDSV3_DPRINTF5("rdsv3", 157 "%lu delay %lu ceil conn %p for %u.%u.%u.%u -> %u.%u.%u.%u", 158 rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies, 159 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); 160 161 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 162 rand % conn->c_reconnect_jiffies); 163 164 conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2, 165 rdsv3_sysctl_reconnect_max_jiffies); 166 } 167 168 void 169 rdsv3_connect_worker(struct rdsv3_work_s *work) 170 { 171 struct rdsv3_connection *conn = __containerof(work, 172 struct rdsv3_connection, c_conn_w.work); 173 int ret; 174 175 RDSV3_DPRINTF2("rdsv3_connect_worker", "Enter(work: %p)", work); 176 177 clear_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags); 178 if (rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, 179 RDSV3_CONN_CONNECTING)) { 180 ret = conn->c_trans->conn_connect(conn); 181 182 RDSV3_DPRINTF5("rdsv3", 183 "connect conn %p for %u.%u.%u.%u -> %u.%u.%u.%u " 184 "ret %d", conn, NIPQUAD(conn->c_laddr), 185 NIPQUAD(conn->c_faddr), ret); 186 187 RDSV3_DPRINTF2("rdsv3_connect_worker", 188 "conn %p for %u.%u.%u.%u to %u.%u.%u.%u dispatched, ret %d", 189 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr), ret); 190 191 if (ret) { 192 if (rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING, 193 RDSV3_CONN_DOWN)) 194 rdsv3_queue_reconnect(conn); 195 else { 196 RDSV3_DPRINTF2("rdsv3_connect_worker", 197 "RDS: connect failed: %p", conn); 198 rdsv3_conn_drop(conn); 199 } 200 } 201 } 202 203 RDSV3_DPRINTF2("rdsv3_connect_worker", "Return(work: %p)", work); 204 } 205 206 void 207 rdsv3_send_worker(struct rdsv3_work_s *work) 208 { 209 struct rdsv3_connection *conn = __containerof(work, 210 struct rdsv3_connection, c_send_w.work); 211 int ret; 212 213 RDSV3_DPRINTF4("rdsv3_send_worker", "Enter(work: %p)", work); 214 215 if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { 216 ret = rdsv3_send_xmit(conn); 217 RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret); 218 switch (ret) { 219 case -EAGAIN: 220 rdsv3_stats_inc(s_send_immediate_retry); 221 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); 222 break; 223 case -ENOMEM: 224 rdsv3_stats_inc(s_send_delayed_retry); 225 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 2); 226 default: 227 break; 228 } 229 } 230 231 RDSV3_DPRINTF4("rdsv3_send_worker", "Return(work: %p)", work); 232 } 233 234 void 235 rdsv3_recv_worker(struct rdsv3_work_s *work) 236 { 237 struct rdsv3_connection *conn = __containerof(work, 238 struct rdsv3_connection, c_recv_w.work); 239 int ret; 240 241 RDSV3_DPRINTF4("rdsv3_recv_worker", "Enter(work: %p)", work); 242 243 if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { 244 ret = conn->c_trans->recv(conn); 245 RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret); 246 switch (ret) { 247 case -EAGAIN: 248 rdsv3_stats_inc(s_recv_immediate_retry); 249 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0); 250 break; 251 case -ENOMEM: 252 rdsv3_stats_inc(s_recv_delayed_retry); 253 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 2); 254 default: 255 break; 256 } 257 } 258 259 RDSV3_DPRINTF4("rdsv3_recv_worker", "Return(work: %p)", work); 260 } 261 262 void 263 rdsv3_shutdown_worker(struct rdsv3_work_s *work) 264 { 265 struct rdsv3_connection *conn = __containerof(work, 266 struct rdsv3_connection, c_down_w); 267 rdsv3_conn_shutdown(conn); 268 } 269 270 #define time_after(a, b) ((long)(b) - (long)(a) < 0) 271 272 void 273 rdsv3_reaper_worker(struct rdsv3_work_s *work) 274 { 275 struct rdsv3_connection *conn = __containerof(work, 276 struct rdsv3_connection, c_reap_w.work); 277 278 if (rdsv3_conn_state(conn) != RDSV3_CONN_UP && 279 !time_after(conn->c_last_connect_jiffies, 280 ddi_get_lbolt() - RDSV3_REAPER_WAIT_JIFFIES)) { 281 rdsv3_conn_destroy(conn); 282 } else { 283 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_reap_w, 284 RDSV3_REAPER_WAIT_JIFFIES); 285 } 286 } 287 288 void 289 rdsv3_threads_exit(void) 290 { 291 rdsv3_destroy_task_workqueue(rdsv3_wq); 292 } 293 294 int 295 rdsv3_threads_init(void) 296 { 297 rdsv3_wq = rdsv3_create_task_workqueue("krdsd"); 298 if (!rdsv3_wq) 299 return (-ENOMEM); 300 301 return (0); 302 } 303