/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above
 * copyright notice, this list of conditions and the following
 * disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following
 * disclaimer in the documentation and/or other materials
 * provided with the distribution.
46 * 47 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 48 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 49 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 50 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 51 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 52 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 53 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 54 * SOFTWARE. 55 * 56 */ 57 #include <sys/types.h> 58 #include <sys/kmem.h> 59 #include <sys/rds.h> 60 61 #include <sys/ib/clients/rdsv3/rdsv3.h> 62 #include <sys/ib/clients/rdsv3/loop.h> 63 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 64 65 /* converting this to RCU is a chore for another day.. */ 66 static krwlock_t rdsv3_conn_lock; 67 static unsigned long rdsv3_conn_count; 68 struct avl_tree rdsv3_conn_hash; 69 static struct kmem_cache *rdsv3_conn_slab = NULL; 70 71 #define rdsv3_conn_info_set(var, test, suffix) do { \ 72 if (test) \ 73 var |= RDSV3_INFO_CONNECTION_FLAG_##suffix; \ 74 } while (0) 75 76 static inline int 77 rdsv3_conn_is_sending(struct rdsv3_connection *conn) 78 { 79 int ret = 0; 80 81 if (!mutex_tryenter(&conn->c_send_lock)) 82 ret = 1; 83 else 84 mutex_exit(&conn->c_send_lock); 85 86 return (ret); 87 } 88 89 static struct rdsv3_connection * 90 rdsv3_conn_lookup(uint32_be_t laddr, uint32_be_t faddr, avl_index_t *pos) 91 { 92 struct rdsv3_connection *conn; 93 struct rdsv3_conn_info_s conn_info; 94 avl_index_t place = 0; 95 96 conn_info.c_laddr = laddr; 97 conn_info.c_faddr = faddr; 98 99 conn = avl_find(&rdsv3_conn_hash, &conn_info, &place); 100 101 RDSV3_DPRINTF5("rdsv3_conn_lookup", 102 "returning conn %p for %u.%u.%u.%u -> %u.%u.%u.%u", 103 conn, NIPQUAD(laddr), NIPQUAD(faddr)); 104 105 if (pos != NULL) 106 *pos = place; 107 108 return (conn); 109 } 110 111 /* 112 * This is called by transports as they're bringing down a connection. 
113 * It clears partial message state so that the transport can start sending 114 * and receiving over this connection again in the future. It is up to 115 * the transport to have serialized this call with its send and recv. 116 */ 117 void 118 rdsv3_conn_reset(struct rdsv3_connection *conn) 119 { 120 RDSV3_DPRINTF2("rdsv3_conn_reset", 121 "connection %u.%u.%u.%u to %u.%u.%u.%u reset", 122 NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); 123 124 rdsv3_stats_inc(s_conn_reset); 125 rdsv3_send_reset(conn); 126 conn->c_flags = 0; 127 128 /* 129 * Do not clear next_rx_seq here, else we cannot distinguish 130 * retransmitted packets from new packets, and will hand all 131 * of them to the application. That is not consistent with the 132 * reliability guarantees of RDS. 133 */ 134 } 135 136 /* 137 * There is only every one 'conn' for a given pair of addresses in the 138 * system at a time. They contain messages to be retransmitted and so 139 * span the lifetime of the actual underlying transport connections. 140 * 141 * For now they are not garbage collected once they're created. They 142 * are torn down as the module is removed, if ever. 143 */ 144 static struct rdsv3_connection * 145 __rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr, 146 struct rdsv3_transport *trans, int gfp, 147 int is_outgoing) 148 { 149 struct rdsv3_connection *conn, *parent = NULL; 150 avl_index_t pos; 151 int ret; 152 153 rw_enter(&rdsv3_conn_lock, RW_READER); 154 conn = rdsv3_conn_lookup(laddr, faddr, &pos); 155 if (conn && 156 conn->c_loopback && 157 conn->c_trans != &rdsv3_loop_transport && 158 !is_outgoing) { 159 /* 160 * This is a looped back IB connection, and we're 161 * called by the code handling the incoming connect. 162 * We need a second connection object into which we 163 * can stick the other QP. 
164 */ 165 parent = conn; 166 conn = parent->c_passive; 167 } 168 rw_exit(&rdsv3_conn_lock); 169 if (conn) 170 goto out; 171 172 RDSV3_DPRINTF2("__rdsv3_conn_create", "Enter(%x -> %x)", 173 ntohl(laddr), ntohl(faddr)); 174 175 conn = kmem_cache_alloc(rdsv3_conn_slab, gfp); 176 if (conn == NULL) { 177 conn = ERR_PTR(-ENOMEM); 178 goto out; 179 } 180 181 /* see rdsv3_conn_constructor */ 182 conn->c_laddr = laddr; 183 conn->c_faddr = faddr; 184 185 ret = rdsv3_cong_get_maps(conn); 186 if (ret) { 187 kmem_cache_free(rdsv3_conn_slab, conn); 188 conn = ERR_PTR(ret); 189 goto out; 190 } 191 192 /* 193 * This is where a connection becomes loopback. If *any* RDS sockets 194 * can bind to the destination address then we'd rather the messages 195 * flow through loopback rather than either transport. 196 */ 197 if (rdsv3_trans_get_preferred(faddr)) { 198 conn->c_loopback = 1; 199 if (is_outgoing && trans->t_prefer_loopback) { 200 /* 201 * "outgoing" connection - and the transport 202 * says it wants the connection handled by the 203 * loopback transport. This is what TCP does. 204 */ 205 trans = &rdsv3_loop_transport; 206 } 207 } 208 209 conn->c_trans = trans; 210 211 ret = trans->conn_alloc(conn, gfp); 212 if (ret) { 213 kmem_cache_free(rdsv3_conn_slab, conn); 214 conn = ERR_PTR(ret); 215 goto out; 216 } 217 218 conn->c_state = RDSV3_CONN_DOWN; 219 conn->c_reconnect_jiffies = 0; 220 RDSV3_INIT_DELAYED_WORK(&conn->c_send_w, rdsv3_send_worker); 221 RDSV3_INIT_DELAYED_WORK(&conn->c_recv_w, rdsv3_recv_worker); 222 RDSV3_INIT_DELAYED_WORK(&conn->c_conn_w, rdsv3_connect_worker); 223 RDSV3_INIT_WORK(&conn->c_down_w, rdsv3_shutdown_worker); 224 mutex_init(&conn->c_cm_lock, NULL, MUTEX_DRIVER, NULL); 225 conn->c_flags = 0; 226 227 RDSV3_DPRINTF2("__rdsv3_conn_create", 228 "allocated conn %p for %u.%u.%u.%u -> %u.%u.%u.%u over %s %s", 229 conn, NIPQUAD(laddr), NIPQUAD(faddr), 230 trans->t_name ? trans->t_name : "[unknown]", 231 is_outgoing ? 
"(outgoing)" : ""); 232 233 /* 234 * Since we ran without holding the conn lock, someone could 235 * have created the same conn (either normal or passive) in the 236 * interim. We check while holding the lock. If we won, we complete 237 * init and return our conn. If we lost, we rollback and return the 238 * other one. 239 */ 240 rw_enter(&rdsv3_conn_lock, RW_WRITER); 241 if (parent) { 242 /* Creating passive conn */ 243 if (parent->c_passive) { 244 trans->conn_free(conn->c_transport_data); 245 kmem_cache_free(rdsv3_conn_slab, conn); 246 conn = parent->c_passive; 247 } else { 248 parent->c_passive = conn; 249 rdsv3_cong_add_conn(conn); 250 rdsv3_conn_count++; 251 } 252 } else { 253 /* Creating normal conn */ 254 struct rdsv3_connection *found; 255 256 found = rdsv3_conn_lookup(laddr, faddr, &pos); 257 if (found) { 258 trans->conn_free(conn->c_transport_data); 259 kmem_cache_free(rdsv3_conn_slab, conn); 260 conn = found; 261 } else { 262 avl_insert(&rdsv3_conn_hash, conn, pos); 263 rdsv3_cong_add_conn(conn); 264 rdsv3_conn_count++; 265 } 266 } 267 268 rw_exit(&rdsv3_conn_lock); 269 270 RDSV3_DPRINTF2("__rdsv3_conn_create", "Return(conn: %p)", conn); 271 272 out: 273 return (conn); 274 } 275 276 struct rdsv3_connection * 277 rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr, 278 struct rdsv3_transport *trans, int gfp) 279 { 280 return (__rdsv3_conn_create(laddr, faddr, trans, gfp, 0)); 281 } 282 283 struct rdsv3_connection * 284 rdsv3_conn_create_outgoing(uint32_be_t laddr, uint32_be_t faddr, 285 struct rdsv3_transport *trans, int gfp) 286 { 287 return (__rdsv3_conn_create(laddr, faddr, trans, gfp, 1)); 288 } 289 290 void 291 rdsv3_conn_destroy(struct rdsv3_connection *conn) 292 { 293 struct rdsv3_message *rm, *rtmp; 294 295 RDSV3_DPRINTF4("rdsv3_conn_destroy", 296 "freeing conn %p for %u.%u.%u.%u -> %u.%u.%u.%u", 297 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); 298 299 avl_remove(&rdsv3_conn_hash, conn); 300 301 /* wait for the rds thread to shut it 
down */ 302 conn->c_state = RDSV3_CONN_ERROR; 303 rdsv3_cancel_delayed_work(&conn->c_conn_w); 304 rdsv3_cancel_delayed_work(&conn->c_send_w); 305 rdsv3_cancel_delayed_work(&conn->c_recv_w); 306 rdsv3_shutdown_worker(&conn->c_down_w); 307 rdsv3_flush_workqueue(rdsv3_wq); 308 309 /* tear down queued messages */ 310 RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, rtmp, 311 &conn->c_send_queue, 312 m_conn_item) { 313 list_remove_node(&rm->m_conn_item); 314 ASSERT(!list_link_active(&rm->m_sock_item)); 315 rdsv3_message_put(rm); 316 } 317 if (conn->c_xmit_rm) 318 rdsv3_message_put(conn->c_xmit_rm); 319 320 conn->c_trans->conn_free(conn->c_transport_data); 321 322 /* 323 * The congestion maps aren't freed up here. They're 324 * freed by rdsv3_cong_exit() after all the connections 325 * have been freed. 326 */ 327 rdsv3_cong_remove_conn(conn); 328 329 ASSERT(list_is_empty(&conn->c_retrans)); 330 kmem_cache_free(rdsv3_conn_slab, conn); 331 332 rdsv3_conn_count--; 333 } 334 335 /* ARGSUSED */ 336 static void 337 rdsv3_conn_message_info(struct rsock *sock, unsigned int len, 338 struct rdsv3_info_iterator *iter, 339 struct rdsv3_info_lengths *lens, 340 int want_send) 341 { 342 struct list *list; 343 struct rdsv3_connection *conn; 344 struct rdsv3_message *rm; 345 unsigned int total = 0; 346 347 RDSV3_DPRINTF4("rdsv3_conn_message_info", "Enter"); 348 349 len /= sizeof (struct rdsv3_info_message); 350 351 rw_enter(&rdsv3_conn_lock, RW_READER); 352 353 if (avl_is_empty(&rdsv3_conn_hash)) { 354 /* no connections */ 355 rw_exit(&rdsv3_conn_lock); 356 return; 357 } 358 359 conn = (struct rdsv3_connection *)avl_first(&rdsv3_conn_hash); 360 361 do { 362 if (want_send) 363 list = &conn->c_send_queue; 364 else 365 list = &conn->c_retrans; 366 367 mutex_enter(&conn->c_lock); 368 369 /* XXX too lazy to maintain counts.. 
*/ 370 RDSV3_FOR_EACH_LIST_NODE(rm, list, m_conn_item) { 371 total++; 372 if (total <= len) 373 rdsv3_inc_info_copy(&rm->m_inc, iter, 374 conn->c_laddr, conn->c_faddr, 0); 375 } 376 377 mutex_exit(&conn->c_lock); 378 379 conn = AVL_NEXT(&rdsv3_conn_hash, conn); 380 } while (conn != NULL); 381 382 rw_exit(&rdsv3_conn_lock); 383 384 lens->nr = total; 385 lens->each = sizeof (struct rdsv3_info_message); 386 387 RDSV3_DPRINTF4("rdsv3_conn_message_info", "Return"); 388 } 389 390 static void 391 rdsv3_conn_message_info_send(struct rsock *sock, unsigned int len, 392 struct rdsv3_info_iterator *iter, 393 struct rdsv3_info_lengths *lens) 394 { 395 rdsv3_conn_message_info(sock, len, iter, lens, 1); 396 } 397 398 static void 399 rdsv3_conn_message_info_retrans(struct rsock *sock, 400 unsigned int len, 401 struct rdsv3_info_iterator *iter, 402 struct rdsv3_info_lengths *lens) 403 { 404 rdsv3_conn_message_info(sock, len, iter, lens, 0); 405 } 406 407 /* ARGSUSED */ 408 void 409 rdsv3_for_each_conn_info(struct rsock *sock, unsigned int len, 410 struct rdsv3_info_iterator *iter, 411 struct rdsv3_info_lengths *lens, 412 int (*visitor)(struct rdsv3_connection *, void *), 413 size_t item_len) 414 { 415 #if !defined(__lock_lint) && !defined(__GNUC__) 416 uint64_t buffer[(item_len + 7) / 8]; 417 #else 418 uint64_t buffer[256]; 419 #endif 420 struct rdsv3_connection *conn; 421 422 rw_enter(&rdsv3_conn_lock, RW_READER); 423 424 lens->nr = 0; 425 lens->each = item_len; 426 427 if (avl_is_empty(&rdsv3_conn_hash)) { 428 /* no connections */ 429 rw_exit(&rdsv3_conn_lock); 430 return; 431 } 432 433 conn = (struct rdsv3_connection *)avl_first(&rdsv3_conn_hash); 434 435 do { 436 /* XXX no c_lock usage.. */ 437 if (!visitor(conn, buffer)) 438 continue; 439 440 /* 441 * We copy as much as we can fit in the buffer, 442 * but we count all items so that the caller 443 * can resize the buffer. 
444 */ 445 if (len >= item_len) { 446 rdsv3_info_copy(iter, buffer, item_len); 447 len -= item_len; 448 } 449 lens->nr++; 450 conn = AVL_NEXT(&rdsv3_conn_hash, conn); 451 } while (conn != NULL); 452 453 rw_exit(&rdsv3_conn_lock); 454 } 455 456 static int 457 rdsv3_conn_info_visitor(struct rdsv3_connection *conn, void *buffer) 458 { 459 struct rdsv3_info_connection *cinfo = buffer; 460 461 cinfo->next_tx_seq = conn->c_next_tx_seq; 462 cinfo->next_rx_seq = conn->c_next_rx_seq; 463 cinfo->laddr = conn->c_laddr; 464 cinfo->faddr = conn->c_faddr; 465 (void) strncpy((char *)cinfo->transport, conn->c_trans->t_name, 466 sizeof (cinfo->transport)); 467 cinfo->flags = 0; 468 469 rdsv3_conn_info_set(cinfo->flags, 470 rdsv3_conn_is_sending(conn), SENDING); 471 /* XXX Future: return the state rather than these funky bits */ 472 rdsv3_conn_info_set(cinfo->flags, 473 atomic_get(&conn->c_state) == RDSV3_CONN_CONNECTING, 474 CONNECTING); 475 rdsv3_conn_info_set(cinfo->flags, 476 atomic_get(&conn->c_state) == RDSV3_CONN_UP, 477 CONNECTED); 478 return (1); 479 } 480 481 static void 482 rdsv3_conn_info(struct rsock *sock, unsigned int len, 483 struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) 484 { 485 rdsv3_for_each_conn_info(sock, len, iter, lens, 486 rdsv3_conn_info_visitor, sizeof (struct rdsv3_info_connection)); 487 } 488 489 int 490 rdsv3_conn_init() 491 { 492 RDSV3_DPRINTF4("rdsv3_conn_init", "Enter"); 493 494 rdsv3_conn_slab = kmem_cache_create("rdsv3_connection", 495 sizeof (struct rdsv3_connection), 0, rdsv3_conn_constructor, 496 rdsv3_conn_destructor, NULL, NULL, NULL, 0); 497 if (rdsv3_conn_slab == NULL) { 498 RDSV3_DPRINTF2("rdsv3_conn_init", 499 "kmem_cache_create(rdsv3_conn_slab) failed"); 500 return (-1); 501 } 502 503 avl_create(&rdsv3_conn_hash, rdsv3_conn_compare, 504 sizeof (struct rdsv3_connection), offsetof(struct rdsv3_connection, 505 c_hash_node)); 506 507 rw_init(&rdsv3_conn_lock, NULL, RW_DRIVER, NULL); 508 509 rdsv3_loop_init(); 510 511 
rdsv3_info_register_func(RDSV3_INFO_CONNECTIONS, rdsv3_conn_info); 512 rdsv3_info_register_func(RDSV3_INFO_SEND_MESSAGES, 513 rdsv3_conn_message_info_send); 514 rdsv3_info_register_func(RDSV3_INFO_RETRANS_MESSAGES, 515 rdsv3_conn_message_info_retrans); 516 517 RDSV3_DPRINTF4("rdsv3_conn_init", "Return"); 518 519 return (0); 520 } 521 522 void 523 rdsv3_conn_exit() 524 { 525 RDSV3_DPRINTF4("rdsv3_conn_exit", "Enter"); 526 527 rdsv3_loop_exit(); 528 529 rw_destroy(&rdsv3_conn_lock); 530 avl_destroy(&rdsv3_conn_hash); 531 532 ASSERT(rdsv3_conn_slab); 533 kmem_cache_destroy(rdsv3_conn_slab); 534 535 RDSV3_DPRINTF4("rdsv3_conn_exit", "Return"); 536 } 537 538 /* 539 * Force a disconnect 540 */ 541 void 542 rdsv3_conn_drop(struct rdsv3_connection *conn) 543 { 544 conn->c_state = RDSV3_CONN_ERROR; 545 rdsv3_queue_work(rdsv3_wq, &conn->c_down_w); 546 } 547