/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 *  - Redistributions of source code must retain the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer.
 *
 *  - Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials
 *    provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <sys/rds.h>
#include <sys/sunddi.h>

#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>

/*
 * All of connection management is simplified by serializing it through
 * work queues that execute in a connection managing thread.
 *
 * TCP wants to send acks through sendpage() in response to data_ready(),
 * but it needs a process context to do so.
 *
 * The receive paths need to allocate but can't drop packets (!) so we have
 * a thread around to block allocating if the receive fast path sees an
 * allocation failure.
 */

/*
 * Grand Unified Theory of connection life cycle:
 * At any point in time, the connection can be in one of these states:
 * DOWN, CONNECTING, UP, DISCONNECTING, ERROR
 *
 * The following transitions are possible:
 * ANY -> ERROR
 * UP -> DISCONNECTING
 * ERROR -> DISCONNECTING
 * DISCONNECTING -> DOWN
 * DOWN -> CONNECTING
 * CONNECTING -> UP
 *
 * Transition to state DISCONNECTING/DOWN:
 *  - Inside the shutdown worker; synchronizes with xmit path
 *    through c_send_lock, and with connection management callbacks
 *    via c_cm_lock.
 *
 *    For receive callbacks, we rely on the underlying transport
 *    (TCP, IB/RDMA) to provide the necessary synchronisation.
 */
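/*
 * Illustrative sketch (not additional driver logic): each transition above
 * is attempted through rdsv3_conn_transition(conn, old, new), which -
 * judging by its use throughout this file - succeeds only while the
 * connection is still in the expected 'old' state, presumably as an atomic
 * compare-and-swap on c_state. A typical caller therefore looks like:
 *
 *	if (rdsv3_conn_transition(conn, RDSV3_CONN_DOWN,
 *	    RDSV3_CONN_CONNECTING)) {
 *		(this caller won the race and owns the connect attempt)
 *	}
 *
 * Callers that lose such a race either requeue themselves or drop the
 * connection (see the rdsv3_conn_drop() calls below).
 */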
struct rdsv3_workqueue_struct_s *rdsv3_wq;

void
rdsv3_connect_complete(struct rdsv3_connection *conn)
{
	RDSV3_DPRINTF4("rdsv3_connect_complete", "Enter(conn: %p)", conn);

	if (!rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING,
	    RDSV3_CONN_UP)) {
#ifndef __lock_lint
		RDSV3_DPRINTF2("rdsv3_connect_complete",
		    "%s: Cannot transition to state UP, "
		    "current state is %d",
		    __func__,
		    atomic_get(&conn->c_state));
#endif
		conn->c_state = RDSV3_CONN_ERROR;
		rdsv3_queue_work(rdsv3_wq, &conn->c_down_w);
		return;
	}

	RDSV3_DPRINTF2("rdsv3_connect_complete",
	    "conn %p for %u.%u.%u.%u to %u.%u.%u.%u complete",
	    conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));

	conn->c_reconnect_jiffies = 0;
	set_bit(0, &conn->c_map_queued);
	rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
	rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0);

	RDSV3_DPRINTF4("rdsv3_connect_complete", "Return(conn: %p)", conn);
}

/*
 * This random exponential backoff is relied on to eventually resolve racing
 * connects.
 *
 * If connect attempts race then both parties drop both connections and come
 * here to wait for a random amount of time before trying again. Eventually
 * the backoff range will be so much greater than the time it takes to
 * establish a connection that one of the pair will establish the connection
 * before the other's random delay fires.
 *
 * Connection attempts that arrive while a connection is already established
 * are also considered to be racing connects. This lets a connection from
 * a rebooted machine replace an existing stale connection before the
 * transport notices that the connection has failed.
 *
 * We should *always* start with a random backoff; otherwise a broken
 * connection will always take several iterations to be re-established.
 */
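/*
 * Concretely, the schedule implemented below is: the first reconnect after
 * a connection was up is queued immediately (c_reconnect_jiffies is reset
 * to 0 in rdsv3_connect_complete()); every later attempt waits a uniformly
 * random number of jiffies below the current backoff ceiling, and the
 * ceiling then doubles:
 *
 *	rdsv3_sysctl_reconnect_min_jiffies, 2x, 4x, ...
 *	capped at rdsv3_sysctl_reconnect_max_jiffies
 */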
static void
rdsv3_queue_reconnect(struct rdsv3_connection *conn)
{
	unsigned long rand;

	RDSV3_DPRINTF2("rdsv3_queue_reconnect",
	    "conn %p for %u.%u.%u.%u to %u.%u.%u.%u reconnect jiffies %lu",
	    conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
	    conn->c_reconnect_jiffies);

	set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags);
	if (conn->c_reconnect_jiffies == 0) {
		conn->c_reconnect_jiffies = rdsv3_sysctl_reconnect_min_jiffies;
		rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 0);
		return;
	}

	(void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand));

	RDSV3_DPRINTF5("rdsv3",
	    "%lu delay %lu ceil conn %p for %u.%u.%u.%u -> %u.%u.%u.%u",
	    rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
	    conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));

	rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w,
	    rand % conn->c_reconnect_jiffies);

	conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2,
	    rdsv3_sysctl_reconnect_max_jiffies);
}

void
rdsv3_connect_worker(struct rdsv3_work_s *work)
{
	struct rdsv3_connection *conn = container_of(work,
	    struct rdsv3_connection, c_conn_w.work);
	int ret;

	RDSV3_DPRINTF2("rdsv3_connect_worker", "Enter(work: %p)", work);

	clear_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags);
	if (rdsv3_conn_transition(conn, RDSV3_CONN_DOWN,
	    RDSV3_CONN_CONNECTING)) {
		ret = conn->c_trans->conn_connect(conn);

		RDSV3_DPRINTF5("rdsv3",
		    "connect conn %p for %u.%u.%u.%u -> %u.%u.%u.%u "
		    "ret %d", conn, NIPQUAD(conn->c_laddr),
		    NIPQUAD(conn->c_faddr), ret);

		RDSV3_DPRINTF2("rdsv3_connect_worker",
		    "conn %p for %u.%u.%u.%u to %u.%u.%u.%u dispatched, ret %d",
		    conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr), ret);

		if (ret) {
			if (rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING,
			    RDSV3_CONN_DOWN))
				rdsv3_queue_reconnect(conn);
			else {
				RDSV3_DPRINTF2("rdsv3_connect_worker",
				    "RDS: connect failed: %p", conn);
				rdsv3_conn_drop(conn);
			}
		}
	}

	RDSV3_DPRINTF2("rdsv3_connect_worker", "Return(work: %p)", work);
}

extern struct avl_tree rdsv3_conn_hash;
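/*
 * Shutdown, in outline: quiesce the connection management callbacks under
 * c_cm_lock while moving the connection to DISCONNECTING, let the transport
 * tear the connection down under c_send_lock so the xmit path stays out,
 * transition to DOWN, and finally requeue a reconnect if this connection is
 * still the one registered in the connection hash.
 */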
void
rdsv3_shutdown_worker(struct rdsv3_work_s *work)
{
	struct rdsv3_connection *conn = container_of(work,
	    struct rdsv3_connection, c_down_w);
	struct rdsv3_conn_info_s conn_info;

	RDSV3_DPRINTF2("rdsv3_shutdown_worker", "Enter(work: %p)", work);

	/* shut it down unless it's down already */
	if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, RDSV3_CONN_DOWN)) {
		/*
		 * Quiesce the connection mgmt handlers before we start
		 * tearing things down. We don't hold the mutex for the
		 * entire duration of the shutdown operation, else we may
		 * deadlock with the CM handler. Instead, the CM event
		 * handler is supposed to check for state DISCONNECTING.
		 */
		mutex_enter(&conn->c_cm_lock);
		if (!rdsv3_conn_transition(conn, RDSV3_CONN_UP,
		    RDSV3_CONN_DISCONNECTING) &&
		    !rdsv3_conn_transition(conn, RDSV3_CONN_ERROR,
		    RDSV3_CONN_DISCONNECTING)) {
			RDSV3_DPRINTF2("rdsv3_shutdown_worker",
			    "RDS: failed to transition to state "
			    "DISCONNECTING: conn: %p, state: %d",
			    conn, atomic_get(&conn->c_state));
			rdsv3_conn_drop(conn);
			mutex_exit(&conn->c_cm_lock);
			return;
		}
		mutex_exit(&conn->c_cm_lock);

		mutex_enter(&conn->c_send_lock);
		conn->c_trans->conn_shutdown(conn);
		rdsv3_conn_reset(conn);
		mutex_exit(&conn->c_send_lock);

		if (!rdsv3_conn_transition(conn, RDSV3_CONN_DISCONNECTING,
		    RDSV3_CONN_DOWN)) {
			/*
			 * This can happen - e.g. when we're in the middle of
			 * tearing down the connection, and someone unloads
			 * the rds module. Quite reproducible with loopback
			 * connections. Mostly harmless.
			 */
#ifndef __lock_lint
			RDSV3_DPRINTF2("rdsv3_shutdown_worker",
			    "failed to transition to state DOWN, "
			    "current state is: %d conn: %p",
			    atomic_get(&conn->c_state), conn);
			rdsv3_conn_drop(conn);
#endif
			return;
		}
	}

	/*
	 * Then reconnect if it's still live.
	 * The passive side of an IB loopback connection is never added
	 * to the conn hash, so we never trigger a reconnect on this
	 * conn - the reconnect is always triggered by the active peer.
	 */
	rdsv3_cancel_delayed_work(&conn->c_conn_w);

	conn_info.c_laddr = conn->c_laddr;
	conn_info.c_faddr = conn->c_faddr;
	if (avl_find(&rdsv3_conn_hash, &conn_info, NULL) == conn)
		rdsv3_queue_reconnect(conn);

	RDSV3_DPRINTF2("rdsv3_shutdown_worker", "Return(work: %p)", work);
}
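/*
 * The send and receive workers below share one retry policy: -EAGAIN from
 * the transport is retried immediately (the work is requeued with no
 * delay), while -ENOMEM is retried after a short two-jiffy backoff.
 */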
void
rdsv3_send_worker(struct rdsv3_work_s *work)
{
	struct rdsv3_connection *conn = container_of(work,
	    struct rdsv3_connection, c_send_w.work);
	int ret;

	RDSV3_DPRINTF4("rdsv3_send_worker", "Enter(work: %p)", work);

	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
		ret = rdsv3_send_xmit(conn);
		RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret);
		switch (ret) {
		case -EAGAIN:
			rdsv3_stats_inc(s_send_immediate_retry);
			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
			break;
		case -ENOMEM:
			rdsv3_stats_inc(s_send_delayed_retry);
			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 2);
			/* FALLTHROUGH */
		default:
			break;
		}
	}

	RDSV3_DPRINTF4("rdsv3_send_worker", "Return(work: %p)", work);
}

void
rdsv3_recv_worker(struct rdsv3_work_s *work)
{
	struct rdsv3_connection *conn = container_of(work,
	    struct rdsv3_connection, c_recv_w.work);
	int ret;

	RDSV3_DPRINTF4("rdsv3_recv_worker", "Enter(work: %p)", work);

	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
		ret = conn->c_trans->recv(conn);
		RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret);
		switch (ret) {
		case -EAGAIN:
			rdsv3_stats_inc(s_recv_immediate_retry);
			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0);
			break;
		case -ENOMEM:
			rdsv3_stats_inc(s_recv_delayed_retry);
			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 2);
			/* FALLTHROUGH */
		default:
			break;
		}
	}

	RDSV3_DPRINTF4("rdsv3_recv_worker", "Return(work: %p)", work);
}

void
rdsv3_threads_exit(void)
{
	rdsv3_destroy_task_workqueue(rdsv3_wq);
}

int
rdsv3_threads_init(void)
{
	rdsv3_wq = rdsv3_create_task_workqueue("krdsd");
	if (rdsv3_wq == NULL)
		return (-ENOMEM);

	return (0);
}