1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Copyright (c) 2006 Oracle. All rights reserved. 27 * 28 * This software is available to you under a choice of one of two 29 * licenses. You may choose to be licensed under the terms of the GNU 30 * General Public License (GPL) Version 2, available from the file 31 * COPYING in the main directory of this source tree, or the 32 * OpenIB.org BSD license below: 33 * 34 * Redistribution and use in source and binary forms, with or 35 * without modification, are permitted provided that the following 36 * conditions are met: 37 * 38 * - Redistributions of source code must retain the above 39 * copyright notice, this list of conditions and the following 40 * disclaimer. 41 * 42 * - Redistributions in binary form must reproduce the above 43 * copyright notice, this list of conditions and the following 44 * disclaimer in the documentation and/or other materials 45 * provided with the distribution. 
46 * 47 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 48 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 49 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 50 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 51 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 52 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 53 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 54 * SOFTWARE. 55 * 56 */ 57 #include <sys/rds.h> 58 #include <sys/sunddi.h> 59 60 #include <sys/ib/clients/rdsv3/rdsv3.h> 61 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 62 63 /* 64 * All of connection management is simplified by serializing it through 65 * work queues that execute in a connection managing thread. 66 * 67 * TCP wants to send acks through sendpage() in response to data_ready(), 68 * but it needs a process context to do so. 69 * 70 * The receive paths need to allocate but can't drop packets (!) so we have 71 * a thread around to block allocating if the receive fast path sees an 72 * allocation failure. 73 */ 74 75 /* 76 * Grand Unified Theory of connection life cycle: 77 * At any point in time, the connection can be in one of these states: 78 * DOWN, CONNECTING, UP, DISCONNECTING, ERROR 79 * 80 * The following transitions are possible: 81 * ANY -> ERROR 82 * UP -> DISCONNECTING 83 * ERROR -> DISCONNECTING 84 * DISCONNECTING -> DOWN 85 * DOWN -> CONNECTING 86 * CONNECTING -> UP 87 * 88 * Transition to state DISCONNECTING/DOWN: 89 * - Inside the shutdown worker; synchronizes with xmit path 90 * through c_send_lock, and with connection management callbacks 91 * via c_cm_lock. 92 * 93 * For receive callbacks, we rely on the underlying transport 94 * (TCP, IB/RDMA) to provide the necessary synchronisation. 
 */

/* Single task queue through which all connection management work is funneled */
struct rdsv3_workqueue_struct_s *rdsv3_wq;

/*
 * Transition a connection from CONNECTING to UP and kick the send and
 * receive workers.  Called by the transport once its connection attempt
 * has succeeded.  If the CONNECTING -> UP transition fails (some other
 * path already moved the connection out of CONNECTING), the connection
 * is marked ERROR and handed to the shutdown worker for teardown.
 */
void
rdsv3_connect_complete(struct rdsv3_connection *conn)
{
	RDSV3_DPRINTF4("rdsv3_connect_complete", "Enter(conn: %p)", conn);

	if (!rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING,
	    RDSV3_CONN_UP)) {
#ifndef __lock_lint
		RDSV3_DPRINTF2("rdsv3_connect_complete",
		    "%s: Cannot transition to state UP, "
		    "current state is %d",
		    __func__,
		    atomic_get(&conn->c_state));
#endif
		/*
		 * NOTE(review): c_state is stored directly here rather than
		 * via rdsv3_conn_transition(); presumably acceptable because
		 * any state may move to ERROR (see the state diagram above)
		 * and the shutdown worker queued next owns the teardown --
		 * confirm against c_cm_lock usage in rdsv3_conn_shutdown().
		 */
		conn->c_state = RDSV3_CONN_ERROR;
		rdsv3_queue_work(rdsv3_wq, &conn->c_down_w);
		return;
	}

	RDSV3_DPRINTF2("rdsv3_connect_complete",
	    "conn %p for %u.%u.%u.%u to %u.%u.%u.%u complete",
	    conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));

	/* A fresh connection starts with no reconnect backoff accumulated */
	conn->c_reconnect_jiffies = 0;
	conn->c_last_connect_jiffies = ddi_get_lbolt();

	set_bit(0, &conn->c_map_queued);
	/* Dispatch both data-path workers immediately (zero delay) */
	rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
	rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0);

	RDSV3_DPRINTF4("rdsv3_connect_complete", "Return(conn: %p)", conn);
}

/*
 * This random exponential backoff is relied on to eventually resolve racing
 * connects.
 *
 * If connect attempts race then both parties drop both connections and come
 * here to wait for a random amount of time before trying again. Eventually
 * the backoff range will be so much greater than the time it takes to
 * establish a connection that one of the pair will establish the connection
 * before the other's random delay fires.
 *
 * Connection attempts that arrive while a connection is already established
 * are also considered to be racing connects. This lets a connection from
 * a rebooted machine replace an existing stale connection before the transport
 * notices that the connection has failed.
 *
 * We should *always* start with a random backoff; otherwise a broken connection
 * will always take several iterations to be re-established.
 */
void
rdsv3_queue_reconnect(struct rdsv3_connection *conn)
{
	unsigned long rand;

	RDSV3_DPRINTF2("rdsv3_queue_reconnect",
	    "conn %p for %u.%u.%u.%u to %u.%u.%u.%u reconnect jiffies %lu",
	    conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
	    conn->c_reconnect_jiffies);

	set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags);
	if (conn->c_reconnect_jiffies == 0) {
		/*
		 * First attempt since the connection came up: seed the
		 * backoff window and reconnect immediately.
		 */
		conn->c_reconnect_jiffies = rdsv3_sysctl_reconnect_min_jiffies;
		rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 0);
		return;
	}

	/*
	 * Pick a random delay in [0, c_reconnect_jiffies); failure of
	 * random_get_pseudo_bytes() is ignored, leaving whatever happens
	 * to be in 'rand' -- any value still yields a valid delay after
	 * the modulo below.
	 */
	(void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand));

	RDSV3_DPRINTF5("rdsv3",
	    "%lu delay %lu ceil conn %p for %u.%u.%u.%u -> %u.%u.%u.%u",
	    rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
	    conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));

	rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w,
	    rand % conn->c_reconnect_jiffies);

	/* Double the backoff ceiling for next time, bounded by the sysctl */
	conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2,
	    rdsv3_sysctl_reconnect_max_jiffies);
}

/*
 * Work-queue handler for c_conn_w: attempt to bring a DOWN connection up
 * via the transport's conn_connect().  On failure, either arm a delayed
 * reconnect (normal case: CONNECTING -> DOWN succeeds) or, if someone
 * else moved the state underneath us, drop the connection.
 */
void
rdsv3_connect_worker(struct rdsv3_work_s *work)
{
	struct rdsv3_connection *conn = container_of(work,
	    struct rdsv3_connection, c_conn_w.work);
	int ret;

	RDSV3_DPRINTF2("rdsv3_connect_worker", "Enter(work: %p)", work);

	clear_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags);
	if (rdsv3_conn_transition(conn, RDSV3_CONN_DOWN,
	    RDSV3_CONN_CONNECTING)) {
		ret = conn->c_trans->conn_connect(conn);

		RDSV3_DPRINTF5("rdsv3",
		    "connect conn %p for %u.%u.%u.%u -> %u.%u.%u.%u "
		    "ret %d", conn, NIPQUAD(conn->c_laddr),
		    NIPQUAD(conn->c_faddr), ret);

		RDSV3_DPRINTF2("rdsv3_connect_worker",
		    "conn %p for %u.%u.%u.%u to %u.%u.%u.%u dispatched, ret %d",
		    conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr), ret);

		if (ret) {
			if (rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING,
			    RDSV3_CONN_DOWN))
				rdsv3_queue_reconnect(conn);
			else {
				RDSV3_DPRINTF2("rdsv3_connect_worker",
				    "RDS: connect failed: %p", conn);
				rdsv3_conn_drop(conn);
			}
		}
	}

	RDSV3_DPRINTF2("rdsv3_connect_worker", "Return(work: %p)", work);
}

/*
 * Work-queue handler for c_send_w: drain the send queue for an UP
 * connection.  -EAGAIN requeues immediately (more work pending);
 * -ENOMEM backs off by 2 ticks before retrying.  Any other return,
 * including success, ends this pass.
 */
void
rdsv3_send_worker(struct rdsv3_work_s *work)
{
	struct rdsv3_connection *conn = container_of(work,
	    struct rdsv3_connection, c_send_w.work);
	int ret;

	RDSV3_DPRINTF4("rdsv3_send_worker", "Enter(work: %p)", work);

	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
		ret = rdsv3_send_xmit(conn);
		RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret);
		switch (ret) {
		case -EAGAIN:
			rdsv3_stats_inc(s_send_immediate_retry);
			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
			break;
		case -ENOMEM:
			rdsv3_stats_inc(s_send_delayed_retry);
			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 2);
			/* FALLTHROUGH */
		default:
			break;
		}
	}

	RDSV3_DPRINTF4("rdsv3_send_worker", "Return(work: %p)", work);
}

/*
 * Work-queue handler for c_recv_w: run the transport's receive path for
 * an UP connection.  Retry policy mirrors rdsv3_send_worker(): -EAGAIN
 * requeues immediately, -ENOMEM requeues after a 2-tick delay.
 */
void
rdsv3_recv_worker(struct rdsv3_work_s *work)
{
	struct rdsv3_connection *conn = container_of(work,
	    struct rdsv3_connection, c_recv_w.work);
	int ret;

	RDSV3_DPRINTF4("rdsv3_recv_worker", "Enter(work: %p)", work);

	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
		ret = conn->c_trans->recv(conn);
		RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret);
		switch (ret) {
		case -EAGAIN:
			rdsv3_stats_inc(s_recv_immediate_retry);
			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0);
			break;
		case -ENOMEM:
			rdsv3_stats_inc(s_recv_delayed_retry);
			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 2);
			/* FALLTHROUGH */
		default:
			break;
		}
	}

	RDSV3_DPRINTF4("rdsv3_recv_worker", "Return(work: %p)", work);
}

/*
 * Work-queue handler for c_down_w: serialize connection teardown through
 * the worker thread (see the state-transition comment at the top of this
 * file for the synchronization it provides).
 */
void
rdsv3_shutdown_worker(struct rdsv3_work_s *work)
{
	struct rdsv3_connection *conn = container_of(work,
	    struct rdsv3_connection, c_down_w);
	rdsv3_conn_shutdown(conn);
}

/*
 * Wrap-safe tick comparison: true if 'a' is later than 'b', computed via
 * signed subtraction so it stays correct when the lbolt counter wraps.
 */
#define	time_after(a, b)	((long)(b) - (long)(a) < 0)

/*
 * Work-queue handler for c_reap_w: destroy a connection that is not UP
 * and has not (re)connected within the last RDSV3_REAPER_WAIT_JIFFIES;
 * otherwise re-arm the reaper to check again after that interval.
 */
void
rdsv3_reaper_worker(struct rdsv3_work_s *work)
{
	struct rdsv3_connection *conn = container_of(work,
	    struct rdsv3_connection, c_reap_w.work);

	if (rdsv3_conn_state(conn) != RDSV3_CONN_UP &&
	    !time_after(conn->c_last_connect_jiffies,
	    ddi_get_lbolt() - RDSV3_REAPER_WAIT_JIFFIES)) {
		rdsv3_conn_destroy(conn);
	} else {
		rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_reap_w,
		    RDSV3_REAPER_WAIT_JIFFIES);
	}
}

/*
 * Tear down the connection-management work queue.  Counterpart to
 * rdsv3_threads_init(); callers must ensure no further work is queued.
 */
void
rdsv3_threads_exit(void)
{
	rdsv3_destroy_task_workqueue(rdsv3_wq);
}

/*
 * Create the "krdsd" work queue that serializes all connection
 * management.  Returns 0 on success or -ENOMEM if the queue cannot
 * be created.
 */
int
rdsv3_threads_init(void)
{
	rdsv3_wq = rdsv3_create_task_workqueue("krdsd");
	if (!rdsv3_wq)
		return (-ENOMEM);

	return (0);
}