1 /* 2 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 3 */ 4 5 /* 6 * This file contains code imported from the OFED rds source file threads.c 7 * Oracle elects to have and use the contents of threads.c under and governed 8 * by the OpenIB.org BSD license (see below for full license text). However, 9 * the following notice accompanied the original version of this file: 10 */ 11 12 /* 13 * Copyright (c) 2006 Oracle. All rights reserved. 14 * 15 * This software is available to you under a choice of one of two 16 * licenses. You may choose to be licensed under the terms of the GNU 17 * General Public License (GPL) Version 2, available from the file 18 * COPYING in the main directory of this source tree, or the 19 * OpenIB.org BSD license below: 20 * 21 * Redistribution and use in source and binary forms, with or 22 * without modification, are permitted provided that the following 23 * conditions are met: 24 * 25 * - Redistributions of source code must retain the above 26 * copyright notice, this list of conditions and the following 27 * disclaimer. 28 * 29 * - Redistributions in binary form must reproduce the above 30 * copyright notice, this list of conditions and the following 31 * disclaimer in the documentation and/or other materials 32 * provided with the distribution. 33 * 34 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 35 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 36 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 37 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 38 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 39 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 40 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 41 * SOFTWARE. 42 * 43 */ 44 #include <sys/rds.h> 45 #include <sys/sunddi.h> 46 47 #include <sys/ib/clients/rdsv3/rdsv3.h> 48 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 49 50 /* 51 * All of connection management is simplified by serializing it through 52 * work queues that execute in a connection managing thread. 53 * 54 * TCP wants to send acks through sendpage() in response to data_ready(), 55 * but it needs a process context to do so. 56 * 57 * The receive paths need to allocate but can't drop packets (!) so we have 58 * a thread around to block allocating if the receive fast path sees an 59 * allocation failure. 60 */ 61 62 /* 63 * Grand Unified Theory of connection life cycle: 64 * At any point in time, the connection can be in one of these states: 65 * DOWN, CONNECTING, UP, DISCONNECTING, ERROR 66 * 67 * The following transitions are possible: 68 * ANY -> ERROR 69 * UP -> DISCONNECTING 70 * ERROR -> DISCONNECTING 71 * DISCONNECTING -> DOWN 72 * DOWN -> CONNECTING 73 * CONNECTING -> UP 74 * 75 * Transition to state DISCONNECTING/DOWN: 76 * - Inside the shutdown worker; synchronizes with xmit path 77 * through c_send_lock, and with connection management callbacks 78 * via c_cm_lock. 79 * 80 * For receive callbacks, we rely on the underlying transport 81 * (TCP, IB/RDMA) to provide the necessary synchronisation. 82 */ 83 struct rdsv3_workqueue_struct_s *rdsv3_wq; 84 85 void 86 rdsv3_connect_complete(struct rdsv3_connection *conn) 87 { 88 RDSV3_DPRINTF4("rdsv3_connect_complete", "Enter(conn: %p)", conn); 89 90 if (!rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING, 91 RDSV3_CONN_UP)) { 92 #ifndef __lock_lint 93 RDSV3_DPRINTF2("rdsv3_connect_complete", 94 "%s: Cannot transition to state UP, " 95 "current state is %d", 96 __func__, 97 atomic_get(&conn->c_state)); 98 #endif 99 conn->c_state = RDSV3_CONN_ERROR; 100 rdsv3_queue_work(rdsv3_wq, &conn->c_down_w); 101 return; 102 } 103 104 RDSV3_DPRINTF2("rdsv3_connect_complete", 105 "conn %p for %u.%u.%u.%u to %u.%u.%u.%u complete", 106 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); 107 108 conn->c_reconnect_jiffies = 0; 109 conn->c_last_connect_jiffies = ddi_get_lbolt(); 110 111 set_bit(0, &conn->c_map_queued); 112 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); 113 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0); 114 115 RDSV3_DPRINTF4("rdsv3_connect_complete", "Return(conn: %p)", conn); 116 } 117 118 /* 119 * This random exponential backoff is relied on to eventually resolve racing 120 * connects. 121 * 122 * If connect attempts race then both parties drop both connections and come 123 * here to wait for a random amount of time before trying again. Eventually 124 * the backoff range will be so much greater than the time it takes to 125 * establish a connection that one of the pair will establish the connection 126 * before the other's random delay fires. 127 * 128 * Connection attempts that arrive while a connection is already established 129 * are also considered to be racing connects. This lets a connection from 130 * a rebooted machine replace an existing stale connection before the transport 131 * notices that the connection has failed. 132 * 133 * We should *always* start with a random backoff; otherwise a broken connection 134 * will always take several iterations to be re-established. 135 */ 136 void 137 rdsv3_queue_reconnect(struct rdsv3_connection *conn) 138 { 139 unsigned long rand; 140 141 RDSV3_DPRINTF2("rdsv3_queue_reconnect", 142 "conn %p for %u.%u.%u.%u to %u.%u.%u.%u reconnect jiffies %lu", 143 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr), 144 conn->c_reconnect_jiffies); 145 146 set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags); 147 if (conn->c_reconnect_jiffies == 0) { 148 conn->c_reconnect_jiffies = rdsv3_sysctl_reconnect_min_jiffies; 149 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 0); 150 return; 151 } 152 153 (void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand)); 154 155 RDSV3_DPRINTF5("rdsv3", 156 "%lu delay %lu ceil conn %p for %u.%u.%u.%u -> %u.%u.%u.%u", 157 rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies, 158 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); 159 160 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 161 rand % conn->c_reconnect_jiffies); 162 163 conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2, 164 rdsv3_sysctl_reconnect_max_jiffies); 165 } 166 167 void 168 rdsv3_connect_worker(struct rdsv3_work_s *work) 169 { 170 struct rdsv3_connection *conn = container_of(work, 171 struct rdsv3_connection, c_conn_w.work); 172 int ret; 173 174 RDSV3_DPRINTF2("rdsv3_connect_worker", "Enter(work: %p)", work); 175 176 clear_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags); 177 if (rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, 178 RDSV3_CONN_CONNECTING)) { 179 ret = conn->c_trans->conn_connect(conn); 180 181 RDSV3_DPRINTF5("rdsv3", 182 "connect conn %p for %u.%u.%u.%u -> %u.%u.%u.%u " 183 "ret %d", conn, NIPQUAD(conn->c_laddr), 184 NIPQUAD(conn->c_faddr), ret); 185 186 RDSV3_DPRINTF2("rdsv3_connect_worker", 187 "conn %p for %u.%u.%u.%u to %u.%u.%u.%u dispatched, ret %d", 188 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr), ret); 189 190 if (ret) { 191 if (rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING, 192 RDSV3_CONN_DOWN)) 193 rdsv3_queue_reconnect(conn); 194 else { 195 RDSV3_DPRINTF2("rdsv3_connect_worker", 196 "RDS: connect failed: %p", conn); 197 rdsv3_conn_drop(conn); 198 } 199 } 200 } 201 202 RDSV3_DPRINTF2("rdsv3_connect_worker", "Return(work: %p)", work); 203 } 204 205 void 206 rdsv3_send_worker(struct rdsv3_work_s *work) 207 { 208 struct rdsv3_connection *conn = container_of(work, 209 struct rdsv3_connection, c_send_w.work); 210 int ret; 211 212 RDSV3_DPRINTF4("rdsv3_send_worker", "Enter(work: %p)", work); 213 214 if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { 215 ret = rdsv3_send_xmit(conn); 216 RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret); 217 switch (ret) { 218 case -EAGAIN: 219 rdsv3_stats_inc(s_send_immediate_retry); 220 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); 221 break; 222 case -ENOMEM: 223 rdsv3_stats_inc(s_send_delayed_retry); 224 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 2); 225 default: 226 break; 227 } 228 } 229 230 RDSV3_DPRINTF4("rdsv3_send_worker", "Return(work: %p)", work); 231 } 232 233 void 234 rdsv3_recv_worker(struct rdsv3_work_s *work) 235 { 236 struct rdsv3_connection *conn = container_of(work, 237 struct rdsv3_connection, c_recv_w.work); 238 int ret; 239 240 RDSV3_DPRINTF4("rdsv3_recv_worker", "Enter(work: %p)", work); 241 242 if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { 243 ret = conn->c_trans->recv(conn); 244 RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret); 245 switch (ret) { 246 case -EAGAIN: 247 rdsv3_stats_inc(s_recv_immediate_retry); 248 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0); 249 break; 250 case -ENOMEM: 251 rdsv3_stats_inc(s_recv_delayed_retry); 252 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 2); 253 default: 254 break; 255 } 256 } 257 258 RDSV3_DPRINTF4("rdsv3_recv_worker", "Return(work: %p)", work); 259 } 260 261 void 262 rdsv3_shutdown_worker(struct rdsv3_work_s *work) 263 { 264 struct rdsv3_connection *conn = container_of(work, 265 struct rdsv3_connection, c_down_w); 266 rdsv3_conn_shutdown(conn); 267 } 268 269 #define time_after(a, b) ((long)(b) - (long)(a) < 0) 270 271 void 272 rdsv3_reaper_worker(struct rdsv3_work_s *work) 273 { 274 struct rdsv3_connection *conn = container_of(work, 275 struct rdsv3_connection, c_reap_w.work); 276 277 if (rdsv3_conn_state(conn) != RDSV3_CONN_UP && 278 !time_after(conn->c_last_connect_jiffies, 279 ddi_get_lbolt() - RDSV3_REAPER_WAIT_JIFFIES)) { 280 rdsv3_conn_destroy(conn); 281 } else { 282 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_reap_w, 283 RDSV3_REAPER_WAIT_JIFFIES); 284 } 285 } 286 287 void 288 rdsv3_threads_exit(void) 289 { 290 rdsv3_destroy_task_workqueue(rdsv3_wq); 291 } 292 293 int 294 rdsv3_threads_init(void) 295 { 296 rdsv3_wq = rdsv3_create_task_workqueue("krdsd"); 297 if (!rdsv3_wq) 298 return (-ENOMEM); 299 300 return (0); 301 } 302