1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Copyright (c) 2006 Oracle. All rights reserved. 27 * 28 * This software is available to you under a choice of one of two 29 * licenses. You may choose to be licensed under the terms of the GNU 30 * General Public License (GPL) Version 2, available from the file 31 * COPYING in the main directory of this source tree, or the 32 * OpenIB.org BSD license below: 33 * 34 * Redistribution and use in source and binary forms, with or 35 * without modification, are permitted provided that the following 36 * conditions are met: 37 * 38 * - Redistributions of source code must retain the above 39 * copyright notice, this list of conditions and the following 40 * disclaimer. 41 * 42 * - Redistributions in binary form must reproduce the above 43 * copyright notice, this list of conditions and the following 44 * disclaimer in the documentation and/or other materials 45 * provided with the distribution. 46 * 47 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 48 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 49 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 50 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 51 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 52 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 53 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 54 * SOFTWARE. 55 * 56 */ 57 #include <sys/rds.h> 58 #include <sys/sunddi.h> 59 60 #include <sys/ib/clients/rdsv3/rdsv3.h> 61 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 62 63 /* 64 * All of connection management is simplified by serializing it through 65 * work queues that execute in a connection managing thread. 66 * 67 * TCP wants to send acks through sendpage() in response to data_ready(), 68 * but it needs a process context to do so. 69 * 70 * The receive paths need to allocate but can't drop packets (!) so we have 71 * a thread around to block allocating if the receive fast path sees an 72 * allocation failure. 73 */ 74 75 /* 76 * Grand Unified Theory of connection life cycle: 77 * At any point in time, the connection can be in one of these states: 78 * DOWN, CONNECTING, UP, DISCONNECTING, ERROR 79 * 80 * The following transitions are possible: 81 * ANY -> ERROR 82 * UP -> DISCONNECTING 83 * ERROR -> DISCONNECTING 84 * DISCONNECTING -> DOWN 85 * DOWN -> CONNECTING 86 * CONNECTING -> UP 87 * 88 * Transition to state DISCONNECTING/DOWN: 89 * - Inside the shutdown worker; synchronizes with xmit path 90 * through c_send_lock, and with connection management callbacks 91 * via c_cm_lock. 92 * 93 * For receive callbacks, we rely on the underlying transport 94 * (TCP, IB/RDMA) to provide the necessary synchronisation. 95 */ 96 struct rdsv3_workqueue_struct_s *rdsv3_wq; 97 98 void 99 rdsv3_connect_complete(struct rdsv3_connection *conn) 100 { 101 RDSV3_DPRINTF4("rdsv3_connect_complete", "Enter(conn: %p)", conn); 102 103 if (!rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING, 104 RDSV3_CONN_UP)) { 105 #ifndef __lock_lint 106 RDSV3_DPRINTF2("rdsv3_connect_complete", 107 "%s: Cannot transition to state UP, " 108 "current state is %d", 109 __func__, 110 atomic_get(&conn->c_state)); 111 #endif 112 conn->c_state = RDSV3_CONN_ERROR; 113 rdsv3_queue_work(rdsv3_wq, &conn->c_down_w); 114 return; 115 } 116 117 RDSV3_DPRINTF2("rdsv3_connect_complete", 118 "conn %p for %u.%u.%u.%u to %u.%u.%u.%u complete", 119 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); 120 121 conn->c_reconnect_jiffies = 0; 122 set_bit(0, &conn->c_map_queued); 123 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); 124 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0); 125 126 RDSV3_DPRINTF4("rdsv3_connect_complete", "Return(conn: %p)", conn); 127 } 128 129 /* 130 * This random exponential backoff is relied on to eventually resolve racing 131 * connects. 132 * 133 * If connect attempts race then both parties drop both connections and come 134 * here to wait for a random amount of time before trying again. Eventually 135 * the backoff range will be so much greater than the time it takes to 136 * establish a connection that one of the pair will establish the connection 137 * before the other's random delay fires. 138 * 139 * Connection attempts that arrive while a connection is already established 140 * are also considered to be racing connects. This lets a connection from 141 * a rebooted machine replace an existing stale connection before the transport 142 * notices that the connection has failed. 143 * 144 * We should *always* start with a random backoff; otherwise a broken connection 145 * will always take several iterations to be re-established. 146 */ 147 static void 148 rdsv3_queue_reconnect(struct rdsv3_connection *conn) 149 { 150 unsigned long rand; 151 152 RDSV3_DPRINTF2("rdsv3_queue_reconnect", 153 "conn %p for %u.%u.%u.%u to %u.%u.%u.%u reconnect jiffies %lu", 154 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr), 155 conn->c_reconnect_jiffies); 156 157 set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags); 158 if (conn->c_reconnect_jiffies == 0) { 159 conn->c_reconnect_jiffies = rdsv3_sysctl_reconnect_min_jiffies; 160 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 0); 161 return; 162 } 163 164 (void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand)); 165 RDSV3_DPRINTF5("rdsv3", 166 "%lu delay %lu ceil conn %p for %u.%u.%u.%u -> %u.%u.%u.%u", 167 rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies, 168 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); 169 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 170 rand % conn->c_reconnect_jiffies); 171 172 conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2, 173 rdsv3_sysctl_reconnect_max_jiffies); 174 } 175 176 void 177 rdsv3_connect_worker(struct rdsv3_work_s *work) 178 { 179 struct rdsv3_connection *conn = container_of(work, 180 struct rdsv3_connection, c_conn_w.work); 181 int ret; 182 183 RDSV3_DPRINTF2("rdsv3_connect_worker", "Enter(work: %p)", work); 184 185 clear_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags); 186 if (rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, 187 RDSV3_CONN_CONNECTING)) { 188 ret = conn->c_trans->conn_connect(conn); 189 RDSV3_DPRINTF5("rdsv3", 190 "connect conn %p for %u.%u.%u.%u -> %u.%u.%u.%u " 191 "ret %d", conn, NIPQUAD(conn->c_laddr), 192 NIPQUAD(conn->c_faddr), ret); 193 RDSV3_DPRINTF2("rdsv3_connect_worker", 194 "conn %p for %u.%u.%u.%u to %u.%u.%u.%u dispatched, ret %d", 195 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr), ret); 196 197 if (ret) { 198 if (rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING, 199 RDSV3_CONN_DOWN)) 200 rdsv3_queue_reconnect(conn); 201 else { 202 RDSV3_DPRINTF2("rdsv3_connect_worker", 203 "RDS: connect failed: %p", conn); 204 rdsv3_conn_drop(conn); 205 } 206 } 207 } 208 209 RDSV3_DPRINTF2("rdsv3_connect_worker", "Return(work: %p)", work); 210 } 211 212 extern struct avl_tree rdsv3_conn_hash; 213 214 void 215 rdsv3_shutdown_worker(struct rdsv3_work_s *work) 216 { 217 struct rdsv3_connection *conn = container_of(work, 218 struct rdsv3_connection, c_down_w); 219 struct rdsv3_conn_info_s conn_info; 220 221 RDSV3_DPRINTF2("rdsv3_shutdown_worker", "Enter(work: %p)", work); 222 223 /* shut it down unless it's down already */ 224 if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, RDSV3_CONN_DOWN)) { 225 /* 226 * Quiesce the connection mgmt handlers before we start tearing 227 * things down. We don't hold the mutex for the entire 228 * duration of the shutdown operation, else we may be 229 * deadlocking with the CM handler. Instead, the CM event 230 * handler is supposed to check for state DISCONNECTING 231 */ 232 mutex_enter(&conn->c_cm_lock); 233 if (!rdsv3_conn_transition(conn, RDSV3_CONN_UP, 234 RDSV3_CONN_DISCONNECTING) && 235 !rdsv3_conn_transition(conn, RDSV3_CONN_ERROR, 236 RDSV3_CONN_DISCONNECTING)) { 237 RDSV3_DPRINTF2("rdsv3_shutdown_worker", 238 "RDS: connect failed: conn: %p, state: %d", 239 conn, atomic_get(&conn->c_state)); 240 rdsv3_conn_drop(conn); 241 mutex_exit(&conn->c_cm_lock); 242 return; 243 } 244 mutex_exit(&conn->c_cm_lock); 245 246 mutex_enter(&conn->c_send_lock); 247 conn->c_trans->conn_shutdown(conn); 248 rdsv3_conn_reset(conn); 249 mutex_exit(&conn->c_send_lock); 250 251 if (!rdsv3_conn_transition(conn, RDSV3_CONN_DISCONNECTING, 252 RDSV3_CONN_DOWN)) { 253 /* 254 * This can happen - eg when we're in the middle of 255 * tearing down the connection, and someone unloads 256 * the rds module. Quite reproduceable with loopback 257 * connections. Mostly harmless. 258 */ 259 #ifndef __lock_lint 260 RDSV3_DPRINTF2("rdsv3_shutdown_worker", 261 "failed to transition to state DOWN, " 262 "current statis is: %d conn: %p", 263 atomic_get(&conn->c_state), conn); 264 rdsv3_conn_drop(conn); 265 #endif 266 return; 267 } 268 } 269 270 /* 271 * Then reconnect if it's still live. 272 * The passive side of an IB loopback connection is never added 273 * to the conn hash, so we never trigger a reconnect on this 274 * conn - the reconnect is always triggered by the active peer. 275 */ 276 rdsv3_cancel_delayed_work(&conn->c_conn_w); 277 278 conn_info.c_laddr = conn->c_laddr; 279 conn_info.c_faddr = conn->c_faddr; 280 if (avl_find(&rdsv3_conn_hash, &conn_info, NULL) == conn) 281 rdsv3_queue_reconnect(conn); 282 283 RDSV3_DPRINTF2("rdsv3_shutdown_worker", "Return(work: %p)", work); 284 } 285 286 void 287 rdsv3_send_worker(struct rdsv3_work_s *work) 288 { 289 struct rdsv3_connection *conn = container_of(work, 290 struct rdsv3_connection, c_send_w.work); 291 int ret; 292 293 RDSV3_DPRINTF4("rdsv3_send_worker", "Enter(work: %p)", work); 294 295 if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { 296 ret = rdsv3_send_xmit(conn); 297 RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret); 298 switch (ret) { 299 case -EAGAIN: 300 rdsv3_stats_inc(s_send_immediate_retry); 301 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); 302 break; 303 case -ENOMEM: 304 rdsv3_stats_inc(s_send_delayed_retry); 305 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 2); 306 default: 307 break; 308 } 309 } 310 311 RDSV3_DPRINTF4("rdsv3_send_worker", "Return(work: %p)", work); 312 } 313 314 void 315 rdsv3_recv_worker(struct rdsv3_work_s *work) 316 { 317 struct rdsv3_connection *conn = container_of(work, 318 struct rdsv3_connection, c_recv_w.work); 319 int ret; 320 321 RDSV3_DPRINTF4("rdsv3_recv_worker", "Enter(work: %p)", work); 322 323 if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { 324 ret = conn->c_trans->recv(conn); 325 RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret); 326 switch (ret) { 327 case -EAGAIN: 328 rdsv3_stats_inc(s_recv_immediate_retry); 329 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0); 330 break; 331 case -ENOMEM: 332 rdsv3_stats_inc(s_recv_delayed_retry); 333 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 2); 334 default: 335 break; 336 } 337 } 338 339 RDSV3_DPRINTF4("rdsv3_recv_worker", "Return(work: %p)", work); 340 } 341 342 void 343 rdsv3_threads_exit(void) 344 { 345 rdsv3_destroy_task_workqueue(rdsv3_wq); 346 } 347 348 int 349 rdsv3_threads_init(void) 350 { 351 rdsv3_wq = rdsv3_create_task_workqueue("krdsd"); 352 if (rdsv3_wq == NULL) 353 return (-ENOMEM); 354 355 return (0); 356 } 357