1 /* 2 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 3 */ 4 5 /* 6 * This file contains code imported from the OFED rds source file ib_cm.c 7 * Oracle elects to have and use the contents of ib_cm.c under and governed 8 * by the OpenIB.org BSD license (see below for full license text). However, 9 * the following notice accompanied the original version of this file: 10 */ 11 12 /* 13 * Copyright (c) 2006 Oracle. All rights reserved. 14 * 15 * This software is available to you under a choice of one of two 16 * licenses. You may choose to be licensed under the terms of the GNU 17 * General Public License (GPL) Version 2, available from the file 18 * COPYING in the main directory of this source tree, or the 19 * OpenIB.org BSD license below: 20 * 21 * Redistribution and use in source and binary forms, with or 22 * without modification, are permitted provided that the following 23 * conditions are met: 24 * 25 * - Redistributions of source code must retain the above 26 * copyright notice, this list of conditions and the following 27 * disclaimer. 28 * 29 * - Redistributions in binary form must reproduce the above 30 * copyright notice, this list of conditions and the following 31 * disclaimer in the documentation and/or other materials 32 * provided with the distribution. 33 * 34 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 35 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 36 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 37 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 38 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 39 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 40 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 41 * SOFTWARE. 42 * 43 */ 44 #include <sys/rds.h> 45 46 #include <sys/ib/clients/of/ofed_kernel.h> 47 #include <sys/ib/clients/of/rdma/ib_addr.h> 48 #include <sys/ib/clients/of/rdma/rdma_cm.h> 49 50 #include <sys/ib/clients/rdsv3/rdsv3.h> 51 #include <sys/ib/clients/rdsv3/ib.h> 52 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 53 54 extern int rdsv3_enable_snd_cq; 55 56 /* 57 * Set the selected protocol version 58 */ 59 static void 60 rdsv3_ib_set_protocol(struct rdsv3_connection *conn, unsigned int version) 61 { 62 RDSV3_DPRINTF4("rdsv3_ib_set_protocol", "conn: %p version: %d", 63 conn, version); 64 conn->c_version = version; 65 } 66 67 /* 68 * Set up flow control 69 */ 70 static void 71 rdsv3_ib_set_flow_control(struct rdsv3_connection *conn, uint32_t credits) 72 { 73 struct rdsv3_ib_connection *ic = conn->c_transport_data; 74 75 RDSV3_DPRINTF2("rdsv3_ib_set_flow_control", 76 "Enter: conn: %p credits: %d", conn, credits); 77 78 if (rdsv3_ib_sysctl_flow_control && credits != 0) { 79 /* We're doing flow control */ 80 ic->i_flowctl = 1; 81 rdsv3_ib_send_add_credits(conn, credits); 82 } else { 83 ic->i_flowctl = 0; 84 } 85 86 RDSV3_DPRINTF2("rdsv3_ib_set_flow_control", 87 "Return: conn: %p credits: %d", 88 conn, credits); 89 } 90 91 /* 92 * Tune RNR behavior. Without flow control, we use a rather 93 * low timeout, but not the absolute minimum - this should 94 * be tunable. 95 * 96 * We already set the RNR retry count to 7 (which is the 97 * smallest infinite number :-) above. 98 * If flow control is off, we want to change this back to 0 99 * so that we learn quickly when our credit accounting is 100 * buggy. 101 * 102 * Caller passes in a qp_attr pointer - don't waste stack spacv 103 * by allocation this twice. 104 */ 105 static void 106 rdsv3_ib_tune_rnr(struct rdsv3_ib_connection *ic, struct ib_qp_attr *attr) 107 { 108 int ret; 109 110 RDSV3_DPRINTF2("rdsv3_ib_tune_rnr", "Enter ic: %p attr: %p", 111 ic, attr); 112 113 attr->min_rnr_timer = IB_RNR_TIMER_000_32; 114 ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER); 115 if (ret) 116 RDSV3_DPRINTF2("rdsv3_ib_tune_rnr", 117 "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d", -ret); 118 } 119 120 /* 121 * Connection established. 122 * We get here for both outgoing and incoming connection. 123 */ 124 void 125 rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn, 126 struct rdma_cm_event *event) 127 { 128 const struct rdsv3_ib_connect_private *dp = NULL; 129 struct rdsv3_ib_connection *ic = conn->c_transport_data; 130 struct rdsv3_ib_device *rds_ibdev = 131 ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client); 132 struct ib_qp_attr qp_attr; 133 int err; 134 135 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", 136 "Enter conn: %p event: %p", conn, event); 137 138 if (event->param.conn.private_data_len >= sizeof (*dp)) { 139 dp = event->param.conn.private_data; 140 141 /* make sure it isn't empty data */ 142 if (dp->dp_protocol_major) { 143 rdsv3_ib_set_protocol(conn, 144 RDS_PROTOCOL(dp->dp_protocol_major, 145 dp->dp_protocol_minor)); 146 rdsv3_ib_set_flow_control(conn, 147 ntohl(dp->dp_credit)); 148 } 149 } 150 151 if (conn->c_version < RDS_PROTOCOL(3, 1)) { 152 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", 153 "RDS/IB: Connection to %u.%u.%u.%u version %u.%u failed", 154 NIPQUAD(conn->c_faddr), 155 RDS_PROTOCOL_MAJOR(conn->c_version), 156 RDS_PROTOCOL_MINOR(conn->c_version)); 157 rdsv3_conn_destroy(conn); 158 return; 159 } else { 160 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", 161 "RDS/IB: connected to %u.%u.%u.%u version %u.%u%s", 162 NIPQUAD(conn->c_faddr), 163 RDS_PROTOCOL_MAJOR(conn->c_version), 164 RDS_PROTOCOL_MINOR(conn->c_version), 165 ic->i_flowctl ? ", flow control" : ""); 166 } 167 168 ASSERT(ic->i_soft_cq == NULL); 169 ic->i_soft_cq = rdsv3_af_intr_thr_create(rdsv3_ib_tasklet_fn, 170 (void *)ic, SCQ_INTR_BIND_CPU, rds_ibdev->aft_hcagp, 171 ic->i_cq->ibt_cq); 172 if (rdsv3_enable_snd_cq) { 173 ic->i_snd_soft_cq = rdsv3_af_intr_thr_create( 174 rdsv3_ib_snd_tasklet_fn, 175 (void *)ic, SCQ_INTR_BIND_CPU, rds_ibdev->aft_hcagp, 176 ic->i_snd_cq->ibt_cq); 177 } 178 ic->i_refill_rq = rdsv3_af_thr_create(rdsv3_ib_refill_fn, (void *)conn, 179 SCQ_WRK_BIND_CPU, rds_ibdev->aft_hcagp); 180 rdsv3_af_grp_draw(rds_ibdev->aft_hcagp); 181 182 (void) ib_req_notify_cq(ic->i_cq, IB_CQ_SOLICITED); 183 if (rdsv3_enable_snd_cq) { 184 (void) ib_req_notify_cq(ic->i_snd_cq, IB_CQ_NEXT_COMP); 185 } 186 187 /* 188 * Init rings and fill recv. this needs to wait until protocol 189 * negotiation 190 * is complete, since ring layout is different from 3.0 to 3.1. 191 */ 192 rdsv3_ib_send_init_ring(ic); 193 rdsv3_ib_recv_init_ring(ic); 194 /* 195 * Post receive buffers - as a side effect, this will update 196 * the posted credit count. 197 */ 198 (void) rdsv3_ib_recv_refill(conn, 1); 199 200 /* Tune RNR behavior */ 201 rdsv3_ib_tune_rnr(ic, &qp_attr); 202 203 qp_attr.qp_state = IB_QPS_RTS; 204 err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); 205 if (err) 206 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", 207 "ib_modify_qp(IB_QP_STATE, RTS): err=%d", err); 208 209 /* update ib_device with this local ipaddr & conn */ 210 err = rdsv3_ib_update_ipaddr(rds_ibdev, conn->c_laddr); 211 if (err) 212 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", 213 "rdsv3_ib_update_ipaddr failed (%d)", err); 214 rdsv3_ib_add_conn(rds_ibdev, conn); 215 216 /* 217 * If the peer gave us the last packet it saw, process this as if 218 * we had received a regular ACK. 219 */ 220 if (dp && dp->dp_ack_seq) 221 rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL); 222 223 rdsv3_connect_complete(conn); 224 225 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", 226 "Return conn: %p event: %p", 227 conn, event); 228 } 229 230 static void 231 rdsv3_ib_cm_fill_conn_param(struct rdsv3_connection *conn, 232 struct rdma_conn_param *conn_param, 233 struct rdsv3_ib_connect_private *dp, 234 uint32_t protocol_version, 235 uint32_t max_responder_resources, 236 uint32_t max_initiator_depth) 237 { 238 struct rdsv3_ib_connection *ic = conn->c_transport_data; 239 struct rdsv3_ib_device *rds_ibdev; 240 241 RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param", 242 "Enter conn: %p conn_param: %p private: %p version: %d", 243 conn, conn_param, dp, protocol_version); 244 245 (void) memset(conn_param, 0, sizeof (struct rdma_conn_param)); 246 247 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client); 248 249 conn_param->responder_resources = 250 MIN(rds_ibdev->max_responder_resources, max_responder_resources); 251 conn_param->initiator_depth = 252 MIN(rds_ibdev->max_initiator_depth, max_initiator_depth); 253 conn_param->retry_count = min(rdsv3_ib_retry_count, 7); 254 conn_param->rnr_retry_count = 7; 255 256 if (dp) { 257 (void) memset(dp, 0, sizeof (*dp)); 258 dp->dp_saddr = conn->c_laddr; 259 dp->dp_daddr = conn->c_faddr; 260 dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); 261 dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); 262 dp->dp_protocol_minor_mask = 263 htons(RDSV3_IB_SUPPORTED_PROTOCOLS); 264 dp->dp_ack_seq = rdsv3_ib_piggyb_ack(ic); 265 266 /* Advertise flow control */ 267 if (ic->i_flowctl) { 268 unsigned int credits; 269 270 credits = IB_GET_POST_CREDITS( 271 atomic_get(&ic->i_credits)); 272 dp->dp_credit = htonl(credits); 273 atomic_add_32(&ic->i_credits, 274 -IB_SET_POST_CREDITS(credits)); 275 } 276 277 conn_param->private_data = dp; 278 conn_param->private_data_len = sizeof (*dp); 279 } 280 281 RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param", 282 "Return conn: %p conn_param: %p private: %p version: %d", 283 conn, conn_param, dp, protocol_version); 284 } 285 286 static void 287 rdsv3_ib_cq_event_handler(struct ib_event *event, void *data) 288 { 289 RDSV3_DPRINTF3("rdsv3_ib_cq_event_handler", "event %u data %p", 290 event->event, data); 291 } 292 293 static void 294 rdsv3_ib_snd_cq_comp_handler(struct ib_cq *cq, void *context) 295 { 296 struct rdsv3_connection *conn = context; 297 struct rdsv3_ib_connection *ic = conn->c_transport_data; 298 299 RDSV3_DPRINTF4("rdsv3_ib_snd_cq_comp_handler", 300 "Enter(conn: %p ic: %p cq: %p)", conn, ic, cq); 301 302 rdsv3_af_thr_fire(ic->i_snd_soft_cq); 303 } 304 305 void 306 rdsv3_ib_snd_tasklet_fn(void *data) 307 { 308 struct rdsv3_ib_connection *ic = (struct rdsv3_ib_connection *)data; 309 struct rdsv3_connection *conn = ic->conn; 310 struct rdsv3_ib_ack_state ack_state = { 0, }; 311 ibt_wc_t wc; 312 uint_t polled; 313 314 RDSV3_DPRINTF4("rdsv3_ib_snd_tasklet_fn", 315 "Enter(conn: %p ic: %p)", conn, ic); 316 317 /* 318 * Poll in a loop before and after enabling the next event 319 */ 320 while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_snd_cq), &wc, 1, &polled) == 321 IBT_SUCCESS) { 322 RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn", 323 "wc_id 0x%llx type %d status %u byte_len %u imm_data %u\n", 324 (unsigned long long)wc.wc_id, wc.wc_type, wc.wc_status, 325 wc.wc_bytes_xfer, ntohl(wc.wc_immed_data)); 326 327 ASSERT(wc.wc_id & RDSV3_IB_SEND_OP); 328 rdsv3_ib_send_cqe_handler(ic, &wc); 329 } 330 (void) ibt_enable_cq_notify(RDSV3_CQ2CQHDL(ic->i_snd_cq), 331 IBT_NEXT_COMPLETION); 332 if (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_snd_cq), &wc, 1, &polled) == 333 IBT_SUCCESS) { 334 ASSERT(wc.wc_id & RDSV3_IB_SEND_OP); 335 rdsv3_ib_send_cqe_handler(ic, &wc); 336 } 337 } 338 339 static void 340 rdsv3_ib_cq_comp_handler(struct ib_cq *cq, void *context) 341 { 342 struct rdsv3_connection *conn = context; 343 struct rdsv3_ib_connection *ic = conn->c_transport_data; 344 345 RDSV3_DPRINTF4("rdsv3_ib_cq_comp_handler", 346 "Enter(conn: %p cq: %p)", conn, cq); 347 348 rdsv3_ib_stats_inc(s_ib_evt_handler_call); 349 350 rdsv3_af_thr_fire(ic->i_soft_cq); 351 } 352 353 void 354 rdsv3_ib_refill_fn(void *data) 355 { 356 struct rdsv3_connection *conn = (struct rdsv3_connection *)data; 357 358 (void) rdsv3_ib_recv_refill(conn, 0); 359 } 360 361 void 362 rdsv3_ib_tasklet_fn(void *data) 363 { 364 struct rdsv3_ib_connection *ic = (struct rdsv3_ib_connection *)data; 365 struct rdsv3_connection *conn = ic->conn; 366 struct rdsv3_ib_ack_state ack_state = { 0, }; 367 ibt_wc_t wc; 368 uint_t polled; 369 370 RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn", 371 "Enter(conn: %p ic: %p)", conn, ic); 372 373 rdsv3_ib_stats_inc(s_ib_tasklet_call); 374 375 /* 376 * Poll in a loop before and after enabling the next event 377 */ 378 while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_cq), &wc, 1, &polled) == 379 IBT_SUCCESS) { 380 RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn", 381 "wc_id 0x%llx type %d status %u byte_len %u imm_data %u\n", 382 (unsigned long long)wc.wc_id, wc.wc_type, wc.wc_status, 383 wc.wc_bytes_xfer, ntohl(wc.wc_immed_data)); 384 385 if (wc.wc_id & RDSV3_IB_SEND_OP) { 386 rdsv3_ib_send_cqe_handler(ic, &wc); 387 } else { 388 rdsv3_ib_recv_cqe_handler(ic, &wc, &ack_state); 389 } 390 } 391 (void) ibt_enable_cq_notify(RDSV3_CQ2CQHDL(ic->i_cq), 392 IBT_NEXT_SOLICITED); 393 394 if (ack_state.ack_next_valid) { 395 rdsv3_ib_set_ack(ic, ack_state.ack_next, 396 ack_state.ack_required); 397 } 398 if (ack_state.ack_recv_valid && ack_state.ack_recv > ic->i_ack_recv) { 399 rdsv3_send_drop_acked(conn, ack_state.ack_recv, NULL); 400 ic->i_ack_recv = ack_state.ack_recv; 401 } 402 if (rdsv3_conn_up(conn)) { 403 if (!test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags)) 404 (void) rdsv3_send_xmit(ic->conn); 405 rdsv3_ib_attempt_ack(ic); 406 } 407 } 408 409 static void 410 rdsv3_ib_qp_event_handler(struct ib_event *event, void *data) 411 { 412 struct rdsv3_connection *conn = data; 413 struct rdsv3_ib_connection *ic = conn->c_transport_data; 414 415 RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "conn %p ic %p event %u", 416 conn, ic, event->event); 417 418 switch (event->event) { 419 case IB_EVENT_COMM_EST: 420 (void) rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); 421 break; 422 default: 423 if (conn) { 424 RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", 425 "RDS/IB: Fatal QP Event %u - " 426 "connection %u.%u.%u.%u ->%u.%u.%u.%u " 427 "...reconnecting", 428 event->event, NIPQUAD(conn->c_laddr), 429 NIPQUAD(conn->c_faddr)); 430 rdsv3_conn_drop(conn); 431 } else { 432 RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", 433 "RDS/IB: Fatal QP Event %u - connection" 434 "...reconnecting", event->event); 435 } 436 break; 437 } 438 439 RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "Return conn: %p event: %p", 440 conn, event); 441 } 442 443 extern int rdsv3_ib_alloc_hdrs(ib_device_t *dev, 444 struct rdsv3_ib_connection *ic); 445 extern void rdsv3_ib_free_hdrs(ib_device_t *dev, 446 struct rdsv3_ib_connection *ic); 447 448 /* 449 * This needs to be very careful to not leave IS_ERR pointers around for 450 * cleanup to trip over. 451 */ 452 static int 453 rdsv3_ib_setup_qp(struct rdsv3_connection *conn) 454 { 455 struct rdsv3_ib_connection *ic = conn->c_transport_data; 456 struct ib_device *dev = ic->i_cm_id->device; 457 struct ib_qp_init_attr attr; 458 struct rdsv3_ib_device *rds_ibdev; 459 ibt_send_wr_t *wrp; 460 ibt_wr_ds_t *sgl; 461 int ret, i; 462 463 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "Enter conn: %p", conn); 464 465 /* 466 * rdsv3_ib_add_one creates a rdsv3_ib_device object per IB device, 467 * and allocates a protection domain, memory range and FMR pool 468 * for each. If that fails for any reason, it will not register 469 * the rds_ibdev at all. 470 */ 471 rds_ibdev = ib_get_client_data(dev, &rdsv3_ib_client); 472 if (!rds_ibdev) { 473 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 474 "RDS/IB: No client_data for device %s", dev->name); 475 return (-EOPNOTSUPP); 476 } 477 ic->rds_ibdev = rds_ibdev; 478 479 if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) 480 rdsv3_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); 481 if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1) 482 rdsv3_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1); 483 484 /* Protection domain and memory range */ 485 ic->i_pd = rds_ibdev->pd; 486 487 /* 488 * IB_CQ_VECTOR_LEAST_ATTACHED and/or the corresponding feature is 489 * not implmeneted in Hermon yet, but we can pass it to ib_create_cq() 490 * anyway. 491 */ 492 ic->i_cq = ib_create_cq(dev, rdsv3_ib_cq_comp_handler, 493 rdsv3_ib_cq_event_handler, conn, 494 ic->i_recv_ring.w_nr + ic->i_send_ring.w_nr + 1, 495 (intptr_t)rdsv3_af_grp_get_sched(ic->rds_ibdev->aft_hcagp)); 496 if (IS_ERR(ic->i_cq)) { 497 ret = PTR_ERR(ic->i_cq); 498 ic->i_cq = NULL; 499 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 500 "ib_create_cq failed: %d", ret); 501 goto out; 502 } 503 if (rdsv3_enable_snd_cq) { 504 ic->i_snd_cq = ib_create_cq(dev, rdsv3_ib_snd_cq_comp_handler, 505 rdsv3_ib_cq_event_handler, conn, ic->i_send_ring.w_nr + 1, 506 (intptr_t)rdsv3_af_grp_get_sched(ic->rds_ibdev->aft_hcagp)); 507 if (IS_ERR(ic->i_snd_cq)) { 508 ret = PTR_ERR(ic->i_snd_cq); 509 (void) ib_destroy_cq(ic->i_cq); 510 ic->i_cq = NULL; 511 ic->i_snd_cq = NULL; 512 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 513 "ib_create_cq send cq failed: %d", ret); 514 goto out; 515 } 516 } 517 518 /* XXX negotiate max send/recv with remote? */ 519 (void) memset(&attr, 0, sizeof (attr)); 520 attr.event_handler = rdsv3_ib_qp_event_handler; 521 attr.qp_context = conn; 522 /* + 1 to allow for the single ack message */ 523 attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1; 524 attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1; 525 attr.cap.max_send_sge = rds_ibdev->max_sge; 526 attr.cap.max_recv_sge = RDSV3_IB_RECV_SGE; 527 attr.sq_sig_type = IB_SIGNAL_REQ_WR; 528 attr.qp_type = IB_QPT_RC; 529 if (rdsv3_enable_snd_cq) { 530 attr.send_cq = ic->i_snd_cq; 531 } else { 532 attr.send_cq = ic->i_cq; 533 } 534 attr.recv_cq = ic->i_cq; 535 536 /* 537 * XXX this can fail if max_*_wr is too large? Are we supposed 538 * to back off until we get a value that the hardware can support? 539 */ 540 ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); 541 if (ret) { 542 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 543 "rdma_create_qp failed: %d", ret); 544 goto out; 545 } 546 547 ret = rdsv3_ib_alloc_hdrs(dev, ic); 548 if (ret != 0) { 549 ret = -ENOMEM; 550 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 551 "rdsv3_ib_alloc_hdrs failed: %d", ret); 552 goto out; 553 } 554 555 ic->i_sends = kmem_alloc(ic->i_send_ring.w_nr * 556 sizeof (struct rdsv3_ib_send_work), KM_NOSLEEP); 557 if (ic->i_sends == NULL) { 558 ret = -ENOMEM; 559 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 560 "send allocation failed: %d", ret); 561 goto out; 562 } 563 (void) memset(ic->i_sends, 0, ic->i_send_ring.w_nr * 564 sizeof (struct rdsv3_ib_send_work)); 565 566 ic->i_send_wrs = 567 kmem_alloc(RDSV3_IB_SEND_WRS * (sizeof (ibt_send_wr_t) + 568 RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)), KM_NOSLEEP); 569 if (ic->i_send_wrs == NULL) { 570 ret = -ENOMEM; 571 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 572 "Send WR allocation failed: %d", ret); 573 goto out; 574 } 575 sgl = (ibt_wr_ds_t *)((uint8_t *)ic->i_send_wrs + 576 (RDSV3_IB_SEND_WRS * sizeof (ibt_send_wr_t))); 577 for (i = 0; i < RDSV3_IB_SEND_WRS; i++) { 578 wrp = &ic->i_send_wrs[i]; 579 wrp->wr_sgl = &sgl[i * RDSV3_IB_MAX_SGE]; 580 } 581 582 ic->i_recvs = kmem_alloc(ic->i_recv_ring.w_nr * 583 sizeof (struct rdsv3_ib_recv_work), KM_NOSLEEP); 584 if (ic->i_recvs == NULL) { 585 ret = -ENOMEM; 586 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 587 "recv allocation failed: %d", ret); 588 goto out; 589 } 590 (void) memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr * 591 sizeof (struct rdsv3_ib_recv_work)); 592 593 ic->i_recv_wrs = 594 kmem_alloc(ic->i_recv_ring.w_nr * sizeof (ibt_recv_wr_t), 595 KM_NOSLEEP); 596 if (ic->i_recv_wrs == NULL) { 597 ret = -ENOMEM; 598 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 599 "Recv WR allocation failed: %d", ret); 600 goto out; 601 } 602 603 rdsv3_ib_recv_init_ack(ic); 604 605 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "conn %p pd %p mr %p cq %p", 606 conn, ic->i_pd, ic->i_mr, ic->i_cq); 607 608 out: 609 return (ret); 610 } 611 612 static uint32_t 613 rdsv3_ib_protocol_compatible(struct rdma_cm_event *event) 614 { 615 const struct rdsv3_ib_connect_private *dp = 616 event->param.conn.private_data; 617 uint16_t common; 618 uint32_t version = 0; 619 620 RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Enter event: %p", 621 event); 622 623 /* 624 * rdma_cm private data is odd - when there is any private data in the 625 * request, we will be given a pretty large buffer without telling us 626 * the 627 * original size. The only way to tell the difference is by looking at 628 * the contents, which are initialized to zero. 629 * If the protocol version fields aren't set, 630 * this is a connection attempt 631 * from an older version. This could could be 3.0 or 2.0 - 632 * we can't tell. 633 * We really should have changed this for OFED 1.3 :-( 634 */ 635 636 /* Be paranoid. RDS always has privdata */ 637 if (!event->param.conn.private_data_len) { 638 RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", 639 "RDS incoming connection has no private data, rejecting"); 640 return (0); 641 } 642 643 /* Even if len is crap *now* I still want to check it. -ASG */ 644 if (event->param.conn.private_data_len < sizeof (*dp) || 645 dp->dp_protocol_major == 0) 646 return (RDS_PROTOCOL_3_0); 647 648 common = ntohs(dp->dp_protocol_minor_mask) & 649 RDSV3_IB_SUPPORTED_PROTOCOLS; 650 if (dp->dp_protocol_major == 3 && common) { 651 version = RDS_PROTOCOL_3_0; 652 while ((common >>= 1) != 0) 653 version++; 654 } else { 655 RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", 656 "RDS: Connection from %u.%u.%u.%u using " 657 "incompatible protocol version %u.%u\n", 658 NIPQUAD(dp->dp_saddr), 659 dp->dp_protocol_major, 660 dp->dp_protocol_minor); 661 } 662 663 RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Return event: %p", 664 event); 665 666 return (version); 667 } 668 669 int 670 rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id, 671 struct rdma_cm_event *event) 672 { 673 uint64_be_t lguid = cm_id->route.path_rec->sgid.global.interface_id; 674 uint64_be_t fguid = cm_id->route.path_rec->dgid.global.interface_id; 675 const struct rdsv3_ib_connect_private *dp = 676 event->param.conn.private_data; 677 struct rdsv3_ib_connect_private dp_rep; 678 struct rdsv3_connection *conn = NULL; 679 struct rdsv3_ib_connection *ic = NULL; 680 struct rdma_conn_param conn_param; 681 uint32_t version; 682 int err, destroy = 1; 683 boolean_t conn_created = B_FALSE; 684 685 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 686 "Enter cm_id: %p event: %p", cm_id, event); 687 688 /* Check whether the remote protocol version matches ours. */ 689 version = rdsv3_ib_protocol_compatible(event); 690 if (!version) { 691 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 692 "version mismatch"); 693 goto out; 694 } 695 696 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 697 "saddr %u.%u.%u.%u daddr %u.%u.%u.%u RDSv%d.%d lguid 0x%llx fguid " 698 "0x%llx", NIPQUAD(dp->dp_saddr), NIPQUAD(dp->dp_daddr), 699 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), 700 (unsigned long long)ntohll(lguid), 701 (unsigned long long)ntohll(fguid)); 702 703 conn = rdsv3_conn_create(dp->dp_daddr, dp->dp_saddr, 704 &rdsv3_ib_transport, KM_NOSLEEP); 705 if (IS_ERR(conn)) { 706 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 707 "rdsv3_conn_create failed (%ld)", PTR_ERR(conn)); 708 conn = NULL; 709 goto out; 710 } 711 712 /* 713 * The connection request may occur while the 714 * previous connection exist, e.g. in case of failover. 715 * But as connections may be initiated simultaneously 716 * by both hosts, we have a random backoff mechanism - 717 * see the comment above rdsv3_queue_reconnect() 718 */ 719 mutex_enter(&conn->c_cm_lock); 720 if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, 721 RDSV3_CONN_CONNECTING)) { 722 if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { 723 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 724 "incoming connect when connected: %p", 725 conn); 726 rdsv3_conn_drop(conn); 727 rdsv3_ib_stats_inc(s_ib_listen_closed_stale); 728 mutex_exit(&conn->c_cm_lock); 729 goto out; 730 } else if (rdsv3_conn_state(conn) == RDSV3_CONN_CONNECTING) { 731 /* Wait and see - our connect may still be succeeding */ 732 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 733 "peer-to-peer connection request: %p, " 734 "lguid: 0x%llx fguid: 0x%llx", 735 conn, lguid, fguid); 736 rdsv3_ib_stats_inc(s_ib_connect_raced); 737 } 738 mutex_exit(&conn->c_cm_lock); 739 goto out; 740 } 741 742 ic = conn->c_transport_data; 743 744 rdsv3_ib_set_protocol(conn, version); 745 rdsv3_ib_set_flow_control(conn, ntohl(dp->dp_credit)); 746 747 /* 748 * If the peer gave us the last packet it saw, process this as if 749 * we had received a regular ACK. 750 */ 751 if (dp->dp_ack_seq) 752 rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL); 753 754 ASSERT(!cm_id->context); 755 ASSERT(!ic->i_cm_id); 756 757 if (ic->i_cm_id != NULL) 758 RDSV3_PANIC(); 759 760 ic->i_cm_id = cm_id; 761 cm_id->context = conn; 762 763 /* 764 * We got halfway through setting up the ib_connection, if we 765 * fail now, we have to take the long route out of this mess. 766 */ 767 destroy = 0; 768 769 err = rdsv3_ib_setup_qp(conn); 770 if (err) { 771 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 772 "rdsv3_ib_setup_qp failed (%d)", err); 773 mutex_exit(&conn->c_cm_lock); 774 rdsv3_conn_drop(conn); 775 goto out; 776 } 777 778 rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, 779 event->param.conn.responder_resources, 780 event->param.conn.initiator_depth); 781 782 /* rdma_accept() calls rdma_reject() internally if it fails */ 783 err = rdma_accept(cm_id, &conn_param); 784 mutex_exit(&conn->c_cm_lock); 785 if (err) { 786 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 787 "rdma_accept failed (%d)", err); 788 rdsv3_conn_drop(conn); 789 goto out; 790 } 791 792 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 793 "Return cm_id: %p event: %p", cm_id, event); 794 795 return (0); 796 797 out: 798 (void) rdma_reject(cm_id, NULL, 0); 799 return (destroy); 800 } 801 802 803 int 804 rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) 805 { 806 struct rdsv3_connection *conn = cm_id->context; 807 struct rdsv3_ib_connection *ic = conn->c_transport_data; 808 struct rdma_conn_param conn_param; 809 struct rdsv3_ib_connect_private dp; 810 int ret; 811 812 RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", "Enter: cm_id: %p", 813 cm_id); 814 815 /* 816 * If the peer doesn't do protocol negotiation, we must 817 * default to RDSv3.0 818 */ 819 rdsv3_ib_set_protocol(conn, RDS_PROTOCOL_3_0); 820 ic->i_flowctl = 821 rdsv3_ib_sysctl_flow_control; /* advertise flow control */ 822 823 ret = rdsv3_ib_setup_qp(conn); 824 if (ret) { 825 RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", 826 "rdsv3_ib_setup_qp failed (%d)", ret); 827 rdsv3_conn_drop(conn); 828 goto out; 829 } 830 831 rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp, 832 RDS_PROTOCOL_VERSION, UINT_MAX, UINT_MAX); 833 834 ret = rdma_connect(cm_id, &conn_param); 835 if (ret) { 836 RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", 837 "rdma_connect failed (%d)", ret); 838 rdsv3_conn_drop(conn); 839 } 840 841 RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", 842 "Return: cm_id: %p", cm_id); 843 844 out: 845 /* 846 * Beware - returning non-zero tells the rdma_cm to destroy 847 * the cm_id. We should certainly not do it as long as we still 848 * "own" the cm_id. 849 */ 850 if (ret) { 851 if (ic->i_cm_id == cm_id) 852 ret = 0; 853 } 854 return (ret); 855 } 856 857 int 858 rdsv3_ib_conn_connect(struct rdsv3_connection *conn) 859 { 860 struct rdsv3_ib_connection *ic = conn->c_transport_data; 861 struct sockaddr_in src, dest; 862 ipaddr_t laddr, faddr; 863 int ret; 864 865 RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Enter: conn: %p", conn); 866 867 /* 868 * XXX I wonder what affect the port space has 869 */ 870 /* delegate cm event handler to rdma_transport */ 871 ic->i_cm_id = rdma_create_id(rdsv3_rdma_cm_event_handler, conn, 872 RDMA_PS_TCP); 873 if (IS_ERR(ic->i_cm_id)) { 874 ret = PTR_ERR(ic->i_cm_id); 875 ic->i_cm_id = NULL; 876 RDSV3_DPRINTF2("rdsv3_ib_conn_connect", 877 "rdma_create_id() failed: %d", ret); 878 goto out; 879 } 880 881 RDSV3_DPRINTF3("rdsv3_ib_conn_connect", 882 "created cm id %p for conn %p", ic->i_cm_id, conn); 883 884 /* The ipaddr should be in the network order */ 885 laddr = conn->c_laddr; 886 faddr = conn->c_faddr; 887 ret = rdsv3_sc_path_lookup(&laddr, &faddr); 888 if (ret == 0) { 889 RDSV3_DPRINTF2(LABEL, "Path not found (0x%x 0x%x)", 890 ntohl(laddr), ntohl(faddr)); 891 } 892 893 src.sin_family = AF_INET; 894 src.sin_addr.s_addr = (uint32_t)laddr; 895 src.sin_port = (uint16_t)htons(0); 896 897 dest.sin_family = AF_INET; 898 dest.sin_addr.s_addr = (uint32_t)faddr; 899 dest.sin_port = (uint16_t)htons(RDSV3_PORT); 900 901 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, 902 (struct sockaddr *)&dest, 903 RDSV3_RDMA_RESOLVE_TIMEOUT_MS); 904 if (ret) { 905 RDSV3_DPRINTF2("rdsv3_ib_conn_connect", 906 "addr resolve failed for cm id %p: %d", ic->i_cm_id, ret); 907 rdma_destroy_id(ic->i_cm_id); 908 ic->i_cm_id = NULL; 909 } 910 911 RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Return: conn: %p", conn); 912 913 out: 914 return (ret); 915 } 916 917 /* 918 * This is so careful about only cleaning up resources that were built up 919 * so that it can be called at any point during startup. In fact it 920 * can be called multiple times for a given connection. 921 */ 922 void 923 rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn) 924 { 925 struct rdsv3_ib_connection *ic = conn->c_transport_data; 926 int err = 0; 927 928 RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", 929 "cm %p pd %p cq %p qp %p", ic->i_cm_id, 930 ic->i_pd, ic->i_cq, ic->i_cm_id ? ic->i_cm_id->qp : NULL); 931 932 if (ic->i_cm_id) { 933 struct ib_device *dev = ic->i_cm_id->device; 934 935 RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", 936 "disconnecting cm %p", ic->i_cm_id); 937 err = rdma_disconnect(ic->i_cm_id); 938 if (err) { 939 /* 940 * Actually this may happen quite frequently, when 941 * an outgoing connect raced with an incoming connect. 942 */ 943 RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", 944 "failed to disconnect, cm: %p err %d", 945 ic->i_cm_id, err); 946 } 947 948 if (ic->i_cm_id->qp) { 949 (void) ibt_flush_qp( 950 ib_get_ibt_channel_hdl(ic->i_cm_id)); 951 /* 952 * Don't wait for the send ring to be empty -- there 953 * may be completed non-signaled entries sitting on 954 * there. We unmap these below. 955 */ 956 rdsv3_wait_event(&ic->i_recv_ring.w_empty_wait, 957 rdsv3_ib_ring_empty(&ic->i_recv_ring)); 958 /* 959 * Note that Linux original code calls 960 * rdma_destroy_qp() after rdsv3_ib_recv_clear_ring(ic). 961 */ 962 rdma_destroy_qp(ic->i_cm_id); 963 } 964 965 if (rdsv3_enable_snd_cq) { 966 if (ic->i_snd_soft_cq) { 967 rdsv3_af_thr_destroy(ic->i_snd_soft_cq); 968 ic->i_snd_soft_cq = NULL; 969 } 970 if (ic->i_snd_cq) 971 (void) ib_destroy_cq(ic->i_snd_cq); 972 } 973 if (ic->i_soft_cq) { 974 rdsv3_af_thr_destroy(ic->i_soft_cq); 975 ic->i_soft_cq = NULL; 976 } 977 if (ic->i_refill_rq) { 978 rdsv3_af_thr_destroy(ic->i_refill_rq); 979 ic->i_refill_rq = NULL; 980 } 981 if (ic->i_cq) 982 (void) ib_destroy_cq(ic->i_cq); 983 984 if (ic->i_mr) 985 rdsv3_ib_free_hdrs(dev, ic); 986 987 if (ic->i_sends) 988 rdsv3_ib_send_clear_ring(ic); 989 if (ic->i_recvs) 990 rdsv3_ib_recv_clear_ring(ic); 991 992 rdma_destroy_id(ic->i_cm_id); 993 994 /* 995 * Move connection back to the nodev list. 996 */ 997 if (ic->i_on_dev_list) 998 rdsv3_ib_remove_conn(ic->rds_ibdev, conn); 999 1000 ic->i_cm_id = NULL; 1001 ic->i_pd = NULL; 1002 ic->i_mr = NULL; 1003 ic->i_cq = NULL; 1004 ic->i_snd_cq = NULL; 1005 ic->i_send_hdrs = NULL; 1006 ic->i_recv_hdrs = NULL; 1007 ic->i_ack = NULL; 1008 } 1009 ASSERT(!ic->i_on_dev_list); 1010 1011 /* Clear pending transmit */ 1012 if (ic->i_rm) { 1013 rdsv3_message_put(ic->i_rm); 1014 ic->i_rm = NULL; 1015 } 1016 1017 /* Clear the ACK state */ 1018 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); 1019 ic->i_ack_next = 0; 1020 ic->i_ack_recv = 0; 1021 1022 /* Clear flow control state */ 1023 ic->i_flowctl = 0; 1024 ic->i_credits = 0; 1025 1026 rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr); 1027 rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr); 1028 1029 if (ic->i_ibinc) { 1030 rdsv3_inc_put(&ic->i_ibinc->ii_inc); 1031 ic->i_ibinc = NULL; 1032 } 1033 1034 if (ic->i_sends) { 1035 kmem_free(ic->i_sends, 1036 ic->i_send_ring.w_nr * sizeof (struct rdsv3_ib_send_work)); 1037 ic->i_sends = NULL; 1038 } 1039 if (ic->i_send_wrs) { 1040 kmem_free(ic->i_send_wrs, RDSV3_IB_SEND_WRS * 1041 (sizeof (ibt_send_wr_t) + 1042 RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t))); 1043 ic->i_send_wrs = NULL; 1044 } 1045 if (ic->i_recvs) { 1046 kmem_free(ic->i_recvs, 1047 ic->i_recv_ring.w_nr * sizeof (struct rdsv3_ib_recv_work)); 1048 ic->i_recvs = NULL; 1049 } 1050 if (ic->i_recv_wrs) { 1051 kmem_free(ic->i_recv_wrs, ic->i_recv_ring.w_nr * 1052 (sizeof (ibt_recv_wr_t))); 1053 ic->i_recv_wrs = NULL; 1054 } 1055 1056 RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", "Return conn: %p", conn); 1057 } 1058 1059 /* 1060 * the connection can be allocated from either rdsv3_conn_create_outgoing() 1061 * or rdsv3_conn_create(), so ddi_taskq_create() can be called with the 1062 * same string. This can print the kstat warning on the console. To prevent 1063 * it, this counter value is used. 1064 * Note that requests from rdsv3_conn_create_outgoing() refers to the cached 1065 * value with the mutex lock before it allocates the connection, so that 1066 * the warning cannot be produced in the case. (only between 1067 * rdsv3_conn_create() and rdsv3_conn_create_outgoing(). 1068 */ 1069 static int conn_cnt; 1070 1071 /* ARGSUSED */ 1072 int 1073 rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp) 1074 { 1075 struct rdsv3_ib_connection *ic; 1076 1077 RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn: %p", conn); 1078 1079 /* XXX too lazy? */ 1080 ic = kmem_zalloc(sizeof (struct rdsv3_ib_connection), gfp); 1081 if (!ic) 1082 return (-ENOMEM); 1083 1084 list_link_init(&ic->ib_node); 1085 1086 mutex_init(&ic->i_recv_mutex, NULL, MUTEX_DRIVER, NULL); 1087 mutex_init(&ic->i_ack_lock, NULL, MUTEX_DRIVER, NULL); 1088 1089 /* 1090 * rdsv3_ib_conn_shutdown() waits for these to be emptied so they 1091 * must be initialized before it can be called. 1092 */ 1093 rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr); 1094 rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr); 1095 1096 ic->conn = conn; 1097 conn->c_transport_data = ic; 1098 1099 mutex_enter(&ib_nodev_conns_lock); 1100 list_insert_tail(&ib_nodev_conns, ic); 1101 mutex_exit(&ib_nodev_conns_lock); 1102 1103 RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn %p conn ic %p", 1104 conn, conn->c_transport_data); 1105 return (0); 1106 } 1107 1108 /* 1109 * Free a connection. Connection must be shut down and not set for reconnect. 1110 */ 1111 void 1112 rdsv3_ib_conn_free(void *arg) 1113 { 1114 struct rdsv3_ib_connection *ic = arg; 1115 kmutex_t *lock_ptr; 1116 1117 RDSV3_DPRINTF2("rdsv3_ib_conn_free", "ic %p\n", ic); 1118 1119 #ifndef __lock_lint 1120 /* 1121 * Conn is either on a dev's list or on the nodev list. 1122 * A race with shutdown() or connect() would cause problems 1123 * (since rds_ibdev would change) but that should never happen. 1124 */ 1125 lock_ptr = ic->i_on_dev_list ? 1126 &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock; 1127 1128 mutex_enter(lock_ptr); 1129 list_remove_node(&ic->ib_node); 1130 mutex_exit(lock_ptr); 1131 #endif 1132 kmem_free(ic, sizeof (*ic)); 1133 } 1134 1135 /* 1136 * An error occurred on the connection 1137 */ 1138 void 1139 __rdsv3_ib_conn_error(struct rdsv3_connection *conn) 1140 { 1141 rdsv3_conn_drop(conn); 1142 } 1143