1 /* 2 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 3 */ 4 5 /* 6 * This file contains code imported from the OFED rds source file connection.c 7 * Oracle elects to have and use the contents of connection.c under and governed 8 * by the OpenIB.org BSD license (see below for full license text). However, 9 * the following notice accompanied the original version of this file: 10 */ 11 12 /* 13 * Copyright (c) 2006 Oracle. All rights reserved. 14 * 15 * This software is available to you under a choice of one of two 16 * licenses. You may choose to be licensed under the terms of the GNU 17 * General Public License (GPL) Version 2, available from the file 18 * COPYING in the main directory of this source tree, or the 19 * OpenIB.org BSD license below: 20 * 21 * Redistribution and use in source and binary forms, with or 22 * without modification, are permitted provided that the following 23 * conditions are met: 24 * 25 * - Redistributions of source code must retain the above 26 * copyright notice, this list of conditions and the following 27 * disclaimer. 28 * 29 * - Redistributions in binary form must reproduce the above 30 * copyright notice, this list of conditions and the following 31 * disclaimer in the documentation and/or other materials 32 * provided with the distribution. 33 * 34 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 35 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 36 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 37 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 38 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 39 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 40 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 41 * SOFTWARE. 42 * 43 */ 44 #include <sys/types.h> 45 #include <sys/kmem.h> 46 #include <sys/rds.h> 47 48 #include <sys/ib/clients/rdsv3/rdsv3.h> 49 #include <sys/ib/clients/rdsv3/loop.h> 50 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 51 52 /* converting this to RCU is a chore for another day.. */ 53 static krwlock_t rdsv3_conn_lock; 54 static unsigned long rdsv3_conn_count; 55 struct avl_tree rdsv3_conn_hash; 56 static struct kmem_cache *rdsv3_conn_slab = NULL; 57 58 #define rdsv3_conn_info_set(var, test, suffix) do { \ 59 if (test) \ 60 var |= RDSV3_INFO_CONNECTION_FLAG_##suffix; \ 61 } while (0) 62 63 64 static struct rdsv3_connection * 65 rdsv3_conn_lookup(uint32_be_t laddr, uint32_be_t faddr, avl_index_t *pos) 66 { 67 struct rdsv3_connection *conn; 68 struct rdsv3_conn_info_s conn_info; 69 avl_index_t place = 0; 70 71 conn_info.c_laddr = laddr; 72 conn_info.c_faddr = faddr; 73 74 conn = avl_find(&rdsv3_conn_hash, &conn_info, &place); 75 76 RDSV3_DPRINTF5("rdsv3_conn_lookup", 77 "returning conn %p for %u.%u.%u.%u -> %u.%u.%u.%u", 78 conn, NIPQUAD(laddr), NIPQUAD(faddr)); 79 80 if (pos != NULL) 81 *pos = place; 82 83 return (conn); 84 } 85 86 /* 87 * This is called by transports as they're bringing down a connection. 88 * It clears partial message state so that the transport can start sending 89 * and receiving over this connection again in the future. It is up to 90 * the transport to have serialized this call with its send and recv. 91 */ 92 void 93 rdsv3_conn_reset(struct rdsv3_connection *conn) 94 { 95 RDSV3_DPRINTF2("rdsv3_conn_reset", 96 "connection %u.%u.%u.%u to %u.%u.%u.%u reset", 97 NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); 98 99 rdsv3_stats_inc(s_conn_reset); 100 rdsv3_send_reset(conn); 101 conn->c_flags = 0; 102 103 /* 104 * Do not clear next_rx_seq here, else we cannot distinguish 105 * retransmitted packets from new packets, and will hand all 106 * of them to the application. That is not consistent with the 107 * reliability guarantees of RDS. 108 */ 109 } 110 111 /* 112 * There is only every one 'conn' for a given pair of addresses in the 113 * system at a time. They contain messages to be retransmitted and so 114 * span the lifetime of the actual underlying transport connections. 115 * 116 * For now they are not garbage collected once they're created. They 117 * are torn down as the module is removed, if ever. 118 */ 119 static struct rdsv3_connection * 120 __rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr, 121 struct rdsv3_transport *trans, int gfp, int is_outgoing) 122 { 123 struct rdsv3_connection *conn, *parent = NULL; 124 avl_index_t pos; 125 int ret; 126 127 rw_enter(&rdsv3_conn_lock, RW_READER); 128 conn = rdsv3_conn_lookup(laddr, faddr, &pos); 129 if (conn && 130 conn->c_loopback && 131 conn->c_trans != &rdsv3_loop_transport && 132 !is_outgoing) { 133 /* 134 * This is a looped back IB connection, and we're 135 * called by the code handling the incoming connect. 136 * We need a second connection object into which we 137 * can stick the other QP. 138 */ 139 parent = conn; 140 conn = parent->c_passive; 141 } 142 rw_exit(&rdsv3_conn_lock); 143 if (conn) 144 goto out; 145 146 RDSV3_DPRINTF2("__rdsv3_conn_create", "Enter(%x -> %x)", 147 ntohl(laddr), ntohl(faddr)); 148 149 conn = kmem_cache_alloc(rdsv3_conn_slab, gfp); 150 if (!conn) { 151 conn = ERR_PTR(-ENOMEM); 152 goto out; 153 } 154 155 /* see rdsv3_conn_constructor */ 156 conn->c_laddr = laddr; 157 conn->c_faddr = faddr; 158 159 ret = rdsv3_cong_get_maps(conn); 160 if (ret) { 161 kmem_cache_free(rdsv3_conn_slab, conn); 162 conn = ERR_PTR(ret); 163 goto out; 164 } 165 166 /* 167 * This is where a connection becomes loopback. If *any* RDS sockets 168 * can bind to the destination address then we'd rather the messages 169 * flow through loopback rather than either transport. 170 */ 171 if (rdsv3_trans_get_preferred(faddr)) { 172 conn->c_loopback = 1; 173 if (is_outgoing && trans->t_prefer_loopback) { 174 /* 175 * "outgoing" connection - and the transport 176 * says it wants the connection handled by the 177 * loopback transport. This is what TCP does. 178 */ 179 trans = &rdsv3_loop_transport; 180 } 181 } 182 183 conn->c_trans = trans; 184 185 ret = trans->conn_alloc(conn, gfp); 186 if (ret) { 187 kmem_cache_free(rdsv3_conn_slab, conn); 188 conn = ERR_PTR(ret); 189 goto out; 190 } 191 192 conn->c_state = RDSV3_CONN_DOWN; 193 conn->c_reconnect_jiffies = 0; 194 RDSV3_INIT_DELAYED_WORK(&conn->c_send_w, rdsv3_send_worker); 195 RDSV3_INIT_DELAYED_WORK(&conn->c_recv_w, rdsv3_recv_worker); 196 RDSV3_INIT_DELAYED_WORK(&conn->c_conn_w, rdsv3_connect_worker); 197 RDSV3_INIT_DELAYED_WORK(&conn->c_reap_w, rdsv3_reaper_worker); 198 RDSV3_INIT_WORK(&conn->c_down_w, rdsv3_shutdown_worker); 199 mutex_init(&conn->c_cm_lock, NULL, MUTEX_DRIVER, NULL); 200 conn->c_flags = 0; 201 202 RDSV3_DPRINTF2("__rdsv3_conn_create", 203 "allocated conn %p for %u.%u.%u.%u -> %u.%u.%u.%u over %s %s", 204 conn, NIPQUAD(laddr), NIPQUAD(faddr), 205 trans->t_name ? trans->t_name : "[unknown]", 206 is_outgoing ? "(outgoing)" : ""); 207 208 /* 209 * Since we ran without holding the conn lock, someone could 210 * have created the same conn (either normal or passive) in the 211 * interim. We check while holding the lock. If we won, we complete 212 * init and return our conn. If we lost, we rollback and return the 213 * other one. 214 */ 215 rw_enter(&rdsv3_conn_lock, RW_WRITER); 216 if (parent) { 217 /* Creating passive conn */ 218 if (parent->c_passive) { 219 trans->conn_free(conn->c_transport_data); 220 kmem_cache_free(rdsv3_conn_slab, conn); 221 conn = parent->c_passive; 222 } else { 223 parent->c_passive = conn; 224 rdsv3_cong_add_conn(conn); 225 rdsv3_conn_count++; 226 } 227 } else { 228 /* Creating normal conn */ 229 struct rdsv3_connection *found; 230 231 found = rdsv3_conn_lookup(laddr, faddr, &pos); 232 if (found) { 233 trans->conn_free(conn->c_transport_data); 234 kmem_cache_free(rdsv3_conn_slab, conn); 235 conn = found; 236 } else { 237 avl_insert(&rdsv3_conn_hash, conn, pos); 238 rdsv3_cong_add_conn(conn); 239 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_reap_w, 240 RDSV3_REAPER_WAIT_JIFFIES); 241 rdsv3_conn_count++; 242 } 243 } 244 245 rw_exit(&rdsv3_conn_lock); 246 247 RDSV3_DPRINTF2("__rdsv3_conn_create", "Return(conn: %p)", conn); 248 249 out: 250 return (conn); 251 } 252 253 struct rdsv3_connection * 254 rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr, 255 struct rdsv3_transport *trans, int gfp) 256 { 257 return (__rdsv3_conn_create(laddr, faddr, trans, gfp, 0)); 258 } 259 260 struct rdsv3_connection * 261 rdsv3_conn_create_outgoing(uint32_be_t laddr, uint32_be_t faddr, 262 struct rdsv3_transport *trans, int gfp) 263 { 264 return (__rdsv3_conn_create(laddr, faddr, trans, gfp, 1)); 265 } 266 267 extern struct avl_tree rdsv3_conn_hash; 268 269 void 270 rdsv3_conn_shutdown(struct rdsv3_connection *conn) 271 { 272 RDSV3_DPRINTF2("rdsv3_conn_shutdown", "Enter(conn: %p)", conn); 273 274 /* shut it down unless it's down already */ 275 if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, RDSV3_CONN_DOWN)) { 276 /* 277 * Quiesce the connection mgmt handlers before we start tearing 278 * things down. We don't hold the mutex for the entire 279 * duration of the shutdown operation, else we may be 280 * deadlocking with the CM handler. Instead, the CM event 281 * handler is supposed to check for state DISCONNECTING 282 */ 283 mutex_enter(&conn->c_cm_lock); 284 if (!rdsv3_conn_transition(conn, RDSV3_CONN_UP, 285 RDSV3_CONN_DISCONNECTING) && 286 !rdsv3_conn_transition(conn, RDSV3_CONN_ERROR, 287 RDSV3_CONN_DISCONNECTING)) { 288 RDSV3_DPRINTF2("rdsv3_conn_shutdown", 289 "shutdown called in state %d", 290 atomic_get(&conn->c_state)); 291 rdsv3_conn_drop(conn); 292 mutex_exit(&conn->c_cm_lock); 293 return; 294 } 295 mutex_exit(&conn->c_cm_lock); 296 297 /* verify everybody's out of rds_send_xmit() */ 298 mutex_enter(&conn->c_send_lock); 299 while (atomic_get(&conn->c_senders)) { 300 mutex_exit(&conn->c_send_lock); 301 delay(1); 302 mutex_enter(&conn->c_send_lock); 303 } 304 305 conn->c_trans->conn_shutdown(conn); 306 rdsv3_conn_reset(conn); 307 mutex_exit(&conn->c_send_lock); 308 309 if (!rdsv3_conn_transition(conn, RDSV3_CONN_DISCONNECTING, 310 RDSV3_CONN_DOWN)) { 311 /* 312 * This can happen - eg when we're in the middle of 313 * tearing down the connection, and someone unloads 314 * the rds module. 315 * Quite reproduceable with loopback connections. 316 * Mostly harmless. 317 */ 318 #ifndef __lock_lint 319 RDSV3_DPRINTF2("rdsv3_conn_shutdown", 320 "failed to transition to state DOWN, " 321 "current statis is: %d", 322 atomic_get(&conn->c_state)); 323 rdsv3_conn_drop(conn); 324 #endif 325 return; 326 } 327 } 328 329 /* 330 * Then reconnect if it's still live. 331 * The passive side of an IB loopback connection is never added 332 * to the conn hash, so we never trigger a reconnect on this 333 * conn - the reconnect is always triggered by the active peer. 334 */ 335 rdsv3_cancel_delayed_work(&conn->c_conn_w); 336 337 { 338 struct rdsv3_conn_info_s conn_info; 339 340 conn_info.c_laddr = conn->c_laddr; 341 conn_info.c_faddr = conn->c_faddr; 342 if (avl_find(&rdsv3_conn_hash, &conn_info, NULL) == conn) 343 rdsv3_queue_reconnect(conn); 344 } 345 RDSV3_DPRINTF2("rdsv3_conn_shutdown", "Exit"); 346 } 347 348 /* 349 * Stop and free a connection. 350 */ 351 void 352 rdsv3_conn_destroy(struct rdsv3_connection *conn) 353 { 354 struct rdsv3_message *rm, *rtmp; 355 list_t to_be_dropped; 356 357 RDSV3_DPRINTF4("rdsv3_conn_destroy", 358 "freeing conn %p for %u.%u.%u.%u -> %u.%u.%u.%u", 359 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); 360 361 avl_remove(&rdsv3_conn_hash, conn); 362 363 rdsv3_cancel_delayed_work(&conn->c_reap_w); 364 rdsv3_cancel_delayed_work(&conn->c_send_w); 365 rdsv3_cancel_delayed_work(&conn->c_recv_w); 366 367 rdsv3_conn_shutdown(conn); 368 369 /* tear down queued messages */ 370 371 list_create(&to_be_dropped, sizeof (struct rdsv3_message), 372 offsetof(struct rdsv3_message, m_conn_item)); 373 374 RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, rtmp, &conn->c_retrans, m_conn_item) { 375 list_remove_node(&rm->m_conn_item); 376 list_insert_tail(&to_be_dropped, rm); 377 } 378 379 RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, rtmp, &conn->c_send_queue, 380 m_conn_item) { 381 list_remove_node(&rm->m_conn_item); 382 list_insert_tail(&to_be_dropped, rm); 383 } 384 385 RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, rtmp, &to_be_dropped, m_conn_item) { 386 clear_bit(RDSV3_MSG_ON_CONN, &rm->m_flags); 387 list_remove_node(&rm->m_conn_item); 388 rdsv3_message_put(rm); 389 } 390 391 if (conn->c_xmit_rm) 392 rdsv3_message_put(conn->c_xmit_rm); 393 394 conn->c_trans->conn_free(conn->c_transport_data); 395 396 /* 397 * The congestion maps aren't freed up here. They're 398 * freed by rdsv3_cong_exit() after all the connections 399 * have been freed. 400 */ 401 rdsv3_cong_remove_conn(conn); 402 403 ASSERT(list_is_empty(&conn->c_retrans)); 404 kmem_cache_free(rdsv3_conn_slab, conn); 405 406 rdsv3_conn_count--; 407 } 408 409 /* ARGSUSED */ 410 static void 411 rdsv3_conn_message_info(struct rsock *sock, unsigned int len, 412 struct rdsv3_info_iterator *iter, 413 struct rdsv3_info_lengths *lens, 414 int want_send) 415 { 416 struct list *list; 417 struct rdsv3_connection *conn; 418 struct rdsv3_message *rm; 419 unsigned int total = 0; 420 421 RDSV3_DPRINTF4("rdsv3_conn_message_info", "Enter"); 422 423 len /= sizeof (struct rdsv3_info_message); 424 425 rw_enter(&rdsv3_conn_lock, RW_READER); 426 427 if (avl_is_empty(&rdsv3_conn_hash)) { 428 /* no connections */ 429 rw_exit(&rdsv3_conn_lock); 430 return; 431 } 432 433 conn = (struct rdsv3_connection *)avl_first(&rdsv3_conn_hash); 434 435 do { 436 if (want_send) 437 list = &conn->c_send_queue; 438 else 439 list = &conn->c_retrans; 440 441 mutex_enter(&conn->c_lock); 442 443 /* XXX too lazy to maintain counts.. */ 444 RDSV3_FOR_EACH_LIST_NODE(rm, list, m_conn_item) { 445 total++; 446 if (total <= len) 447 rdsv3_inc_info_copy(&rm->m_inc, iter, 448 conn->c_laddr, conn->c_faddr, 0); 449 } 450 451 mutex_exit(&conn->c_lock); 452 453 conn = AVL_NEXT(&rdsv3_conn_hash, conn); 454 } while (conn != NULL); 455 rw_exit(&rdsv3_conn_lock); 456 457 lens->nr = total; 458 lens->each = sizeof (struct rdsv3_info_message); 459 460 RDSV3_DPRINTF4("rdsv3_conn_message_info", "Return"); 461 } 462 463 static void 464 rdsv3_conn_message_info_send(struct rsock *sock, unsigned int len, 465 struct rdsv3_info_iterator *iter, 466 struct rdsv3_info_lengths *lens) 467 { 468 rdsv3_conn_message_info(sock, len, iter, lens, 1); 469 } 470 471 static void 472 rdsv3_conn_message_info_retrans(struct rsock *sock, 473 unsigned int len, 474 struct rdsv3_info_iterator *iter, 475 struct rdsv3_info_lengths *lens) 476 { 477 rdsv3_conn_message_info(sock, len, iter, lens, 0); 478 } 479 480 /* ARGSUSED */ 481 void 482 rdsv3_for_each_conn_info(struct rsock *sock, unsigned int len, 483 struct rdsv3_info_iterator *iter, 484 struct rdsv3_info_lengths *lens, 485 int (*visitor)(struct rdsv3_connection *, void *), 486 size_t item_len) 487 { 488 uint8_t *buffer; 489 struct rdsv3_connection *conn; 490 491 rw_enter(&rdsv3_conn_lock, RW_READER); 492 493 lens->nr = 0; 494 lens->each = item_len; 495 496 if (avl_is_empty(&rdsv3_conn_hash)) { 497 /* no connections */ 498 rw_exit(&rdsv3_conn_lock); 499 return; 500 } 501 502 /* allocate a little extra as this can get cast to a uint64_t */ 503 buffer = kmem_zalloc(item_len + 8, KM_SLEEP); 504 505 conn = (struct rdsv3_connection *)avl_first(&rdsv3_conn_hash); 506 507 do { 508 /* XXX no c_lock usage.. */ 509 if (visitor(conn, buffer)) { 510 /* 511 * We copy as much as we can fit in the buffer, 512 * but we count all items so that the caller 513 * can resize the buffer. 514 */ 515 if (len >= item_len) { 516 RDSV3_DPRINTF4("rdsv3_for_each_conn_info", 517 "buffer: %p iter: %p bytes: %d", buffer, 518 iter->addr + iter->offset, item_len); 519 rdsv3_info_copy(iter, buffer, item_len); 520 len -= item_len; 521 } 522 lens->nr++; 523 } 524 conn = AVL_NEXT(&rdsv3_conn_hash, conn); 525 } while (conn != NULL); 526 rw_exit(&rdsv3_conn_lock); 527 528 kmem_free(buffer, item_len + 8); 529 } 530 531 static int 532 rdsv3_conn_info_visitor(struct rdsv3_connection *conn, void *buffer) 533 { 534 struct rdsv3_info_connection *cinfo = buffer; 535 536 cinfo->next_tx_seq = conn->c_next_tx_seq; 537 cinfo->next_rx_seq = conn->c_next_rx_seq; 538 cinfo->laddr = conn->c_laddr; 539 cinfo->faddr = conn->c_faddr; 540 (void) strncpy((char *)cinfo->transport, conn->c_trans->t_name, 541 sizeof (cinfo->transport)); 542 cinfo->flags = 0; 543 544 rdsv3_conn_info_set(cinfo->flags, 545 MUTEX_HELD(&conn->c_send_lock), SENDING); 546 547 /* XXX Future: return the state rather than these funky bits */ 548 rdsv3_conn_info_set(cinfo->flags, 549 atomic_get(&conn->c_state) == RDSV3_CONN_CONNECTING, 550 CONNECTING); 551 rdsv3_conn_info_set(cinfo->flags, 552 atomic_get(&conn->c_state) == RDSV3_CONN_UP, 553 CONNECTED); 554 return (1); 555 } 556 557 static void 558 rdsv3_conn_info(struct rsock *sock, unsigned int len, 559 struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) 560 { 561 rdsv3_for_each_conn_info(sock, len, iter, lens, 562 rdsv3_conn_info_visitor, sizeof (struct rdsv3_info_connection)); 563 } 564 565 int 566 rdsv3_conn_init() 567 { 568 RDSV3_DPRINTF4("rdsv3_conn_init", "Enter"); 569 570 rdsv3_conn_slab = kmem_cache_create("rdsv3_connection", 571 sizeof (struct rdsv3_connection), 0, rdsv3_conn_constructor, 572 rdsv3_conn_destructor, NULL, NULL, NULL, 0); 573 if (!rdsv3_conn_slab) { 574 RDSV3_DPRINTF2("rdsv3_conn_init", 575 "kmem_cache_create(rdsv3_conn_slab) failed"); 576 return (-ENOMEM); 577 } 578 579 avl_create(&rdsv3_conn_hash, rdsv3_conn_compare, 580 sizeof (struct rdsv3_connection), offsetof(struct rdsv3_connection, 581 c_hash_node)); 582 583 rw_init(&rdsv3_conn_lock, NULL, RW_DRIVER, NULL); 584 585 rdsv3_loop_init(); 586 587 rdsv3_info_register_func(RDSV3_INFO_CONNECTIONS, rdsv3_conn_info); 588 rdsv3_info_register_func(RDSV3_INFO_SEND_MESSAGES, 589 rdsv3_conn_message_info_send); 590 rdsv3_info_register_func(RDSV3_INFO_RETRANS_MESSAGES, 591 rdsv3_conn_message_info_retrans); 592 593 RDSV3_DPRINTF4("rdsv3_conn_init", "Return"); 594 595 return (0); 596 } 597 598 void 599 rdsv3_conn_exit() 600 { 601 RDSV3_DPRINTF4("rdsv3_conn_exit", "Enter"); 602 603 rdsv3_loop_exit(); 604 605 rw_destroy(&rdsv3_conn_lock); 606 avl_destroy(&rdsv3_conn_hash); 607 608 ASSERT(rdsv3_conn_slab); 609 kmem_cache_destroy(rdsv3_conn_slab); 610 611 RDSV3_DPRINTF4("rdsv3_conn_exit", "Return"); 612 } 613 614 /* 615 * Force a disconnect 616 */ 617 void 618 rdsv3_conn_drop(struct rdsv3_connection *conn) 619 { 620 conn->c_state = RDSV3_CONN_ERROR; 621 rdsv3_queue_work(rdsv3_wq, &conn->c_down_w); 622 } 623