1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * The rpcib plugin. Implements the interface for RDMATF's 31 * interaction with IBTF. 32 */ 33 34 #include <sys/param.h> 35 #include <sys/types.h> 36 #include <sys/user.h> 37 #include <sys/systm.h> 38 #include <sys/sysmacros.h> 39 #include <sys/proc.h> 40 #include <sys/socket.h> 41 #include <sys/file.h> 42 #include <sys/stream.h> 43 #include <sys/strsubr.h> 44 #include <sys/stropts.h> 45 #include <sys/errno.h> 46 #include <sys/kmem.h> 47 #include <sys/debug.h> 48 #include <sys/systm.h> 49 #include <sys/pathname.h> 50 #include <sys/kstat.h> 51 #include <sys/t_lock.h> 52 #include <sys/ddi.h> 53 #include <sys/cmn_err.h> 54 #include <sys/time.h> 55 #include <sys/isa_defs.h> 56 #include <sys/callb.h> 57 #include <sys/sunddi.h> 58 #include <sys/sunndi.h> 59 60 #include <sys/ib/ibtl/ibti.h> 61 #include <rpc/rpc.h> 62 #include <rpc/ib.h> 63 64 #include <sys/modctl.h> 65 66 #include <sys/pathname.h> 67 #include <sys/kstr.h> 68 #include <sys/sockio.h> 69 #include <sys/vnode.h> 70 #include <sys/tiuser.h> 71 #include <net/if.h> 72 #include <sys/cred.h> 73 74 75 extern char *inet_ntop(int, const void *, char *, int); 76 77 78 /* 79 * Prototype declarations for driver ops 80 */ 81 82 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t); 83 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t, 84 void *, void **); 85 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t); 86 87 88 /* rpcib cb_ops */ 89 static struct cb_ops rpcib_cbops = { 90 nulldev, /* open */ 91 nulldev, /* close */ 92 nodev, /* strategy */ 93 nodev, /* print */ 94 nodev, /* dump */ 95 nodev, /* read */ 96 nodev, /* write */ 97 nodev, /* ioctl */ 98 nodev, /* devmap */ 99 nodev, /* mmap */ 100 nodev, /* segmap */ 101 nochpoll, /* poll */ 102 ddi_prop_op, /* prop_op */ 103 NULL, /* stream */ 104 D_MP, /* cb_flag */ 105 CB_REV, /* rev */ 106 nodev, /* int (*cb_aread)() */ 107 nodev /* int (*cb_awrite)() */ 108 }; 109 110 /* 111 * Device options 112 */ 113 static struct dev_ops rpcib_ops = { 114 DEVO_REV, /* devo_rev, */ 115 0, /* refcnt */ 116 rpcib_getinfo, /* info */ 117 nulldev, /* identify */ 118 nulldev, /* probe */ 119 rpcib_attach, /* attach */ 120 rpcib_detach, /* detach */ 121 nodev, /* reset */ 122 &rpcib_cbops, /* driver ops - devctl interfaces */ 123 NULL, /* bus operations */ 124 NULL /* power */ 125 }; 126 127 /* 128 * Module linkage information. 129 */ 130 131 static struct modldrv rib_modldrv = { 132 &mod_driverops, /* Driver module */ 133 "RPCIB plugin driver, ver %I%", /* Driver name and version */ 134 &rpcib_ops, /* Driver ops */ 135 }; 136 137 static struct modlinkage rib_modlinkage = { 138 MODREV_1, 139 (void *)&rib_modldrv, 140 NULL 141 }; 142 143 /* 144 * rib_stat: private data pointer used when registering 145 * with the IBTF. It is returned to the consumer 146 * in all callbacks. 147 */ 148 static rpcib_state_t *rib_stat = NULL; 149 150 #define RNR_RETRIES 2 151 #define MAX_PORTS 2 152 153 int preposted_rbufs = 16; 154 int send_threshold = 1; 155 156 /* 157 * State of the plugin. 158 * ACCEPT = accepting new connections and requests. 159 * NO_ACCEPT = not accepting new connection and requests. 160 * This should eventually move to rpcib_state_t structure, since this 161 * will tell in which state the plugin is for a particular type of service 162 * like NFS, NLM or v4 Callback deamon. The plugin might be in accept 163 * state for one and in no_accept state for the other. 164 */ 165 int plugin_state; 166 kmutex_t plugin_state_lock; 167 168 169 /* 170 * RPCIB RDMATF operations 171 */ 172 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle); 173 static rdma_stat rib_disconnect(CONN *conn); 174 static void rib_listen(struct rdma_svc_data *rd); 175 static void rib_listen_stop(struct rdma_svc_data *rd); 176 static rdma_stat rib_registermem(CONN *conn, caddr_t buf, uint_t buflen, 177 struct mrc *buf_handle); 178 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf, 179 struct mrc buf_handle); 180 static rdma_stat rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen, 181 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle); 182 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf, 183 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle); 184 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, 185 caddr_t buf, int len, int cpu); 186 187 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf); 188 189 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf); 190 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *); 191 192 static void rib_rbuf_free(CONN *conn, int ptype, void *buf); 193 194 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid); 195 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid); 196 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid); 197 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl); 198 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid); 199 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait); 200 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait); 201 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **); 202 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **); 203 static rdma_stat rib_conn_release(CONN *conn); 204 static rdma_stat rib_getinfo(rdma_info_t *info); 205 static rdma_stat rib_register_ats(rib_hca_t *); 206 static void rib_deregister_ats(); 207 static void rib_stop_services(rib_hca_t *); 208 209 /* 210 * RPCIB addressing operations 211 */ 212 char ** get_ip_addrs(int *count); 213 int get_interfaces(TIUSER *tiptr, int *num); 214 int find_addrs(TIUSER *tiptr, char **addrs, int num_ifs); 215 int get_ibd_ipaddr(rpcib_ibd_insts_t *); 216 rpcib_ats_t *get_ibd_entry(ib_gid_t *, ib_pkey_t, rpcib_ibd_insts_t *); 217 void rib_get_ibd_insts(rpcib_ibd_insts_t *); 218 219 220 /* 221 * RDMA operations the RPCIB module exports 222 */ 223 static rdmaops_t rib_ops = { 224 rib_reachable, 225 rib_conn_get, 226 rib_conn_release, 227 rib_listen, 228 rib_listen_stop, 229 rib_registermem, 230 rib_deregistermem, 231 rib_registermemsync, 232 rib_deregistermemsync, 233 rib_syncmem, 234 rib_reg_buf_alloc, 235 rib_reg_buf_free, 236 rib_send, 237 rib_send_resp, 238 rib_post_resp, 239 rib_post_recv, 240 rib_recv, 241 rib_read, 242 rib_write, 243 rib_getinfo 244 }; 245 246 /* 247 * RDMATF RPCIB plugin details 248 */ 249 static rdma_mod_t rib_mod = { 250 "ibtf", /* api name */ 251 RDMATF_VERS_1, 252 0, 253 &rib_ops, /* rdma op vector for ibtf */ 254 }; 255 256 static rdma_stat open_hcas(rpcib_state_t *); 257 static rdma_stat rib_qp_init(rib_qp_t *, int); 258 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *); 259 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *); 260 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *); 261 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *); 262 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num); 263 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t, 264 ibt_mr_hdl_t *, ibt_mr_desc_t *); 265 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *); 266 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *, 267 rib_qp_t **); 268 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t, 269 rib_qp_t **); 270 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *); 271 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *); 272 static int rib_free_sendwait(struct send_wid *); 273 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid); 274 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd); 275 static void rdma_done_rem_list(rib_qp_t *); 276 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid); 277 278 static void rib_async_handler(void *, 279 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *); 280 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *); 281 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *); 282 static int rib_free_svc_recv(struct svc_recv *); 283 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t); 284 static void rib_free_wid(struct recv_wid *); 285 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *); 286 static void rib_detach_hca(rib_hca_t *); 287 static rdma_stat rib_chk_srv_ats(rib_hca_t *, struct netbuf *, int, 288 ibt_path_info_t *); 289 290 /* 291 * Registration with IBTF as a consumer 292 */ 293 static struct ibt_clnt_modinfo_s rib_modinfo = { 294 IBTI_V1, 295 IBT_GENERIC, 296 rib_async_handler, /* async event handler */ 297 NULL, /* Memory Region Handler */ 298 "nfs/ib" 299 }; 300 301 /* 302 * Global strucuture 303 */ 304 305 typedef struct rpcib_s { 306 dev_info_t *rpcib_dip; 307 kmutex_t rpcib_mutex; 308 } rpcib_t; 309 310 rpcib_t rpcib; 311 312 /* 313 * /etc/system controlled variable to control 314 * debugging in rpcib kernel module. 315 * Set it to values greater that 1 to control 316 * the amount of debugging messages required. 317 */ 318 int rib_debug = 0; 319 320 static int ats_running = 0; 321 int 322 _init(void) 323 { 324 int error; 325 326 error = mod_install((struct modlinkage *)&rib_modlinkage); 327 if (error != 0) { 328 /* 329 * Could not load module 330 */ 331 return (error); 332 } 333 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL); 334 335 return (0); 336 } 337 338 int 339 _fini() 340 { 341 int status; 342 343 if ((status = rdma_unregister_mod(&rib_mod)) != RDMA_SUCCESS) { 344 return (EBUSY); 345 } 346 347 rib_deregister_ats(); 348 349 /* 350 * Remove module 351 */ 352 if ((status = mod_remove(&rib_modlinkage)) != 0) { 353 (void) rdma_register_mod(&rib_mod); 354 return (status); 355 } 356 mutex_destroy(&plugin_state_lock); 357 return (0); 358 } 359 360 int 361 _info(struct modinfo *modinfop) 362 { 363 return (mod_info(&rib_modlinkage, modinfop)); 364 } 365 366 367 /* 368 * rpcib_getinfo() 369 * Given the device number, return the devinfo pointer or the 370 * instance number. 371 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach. 372 */ 373 374 /*ARGSUSED*/ 375 static int 376 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 377 { 378 int ret = DDI_SUCCESS; 379 380 switch (cmd) { 381 case DDI_INFO_DEVT2DEVINFO: 382 if (rpcib.rpcib_dip != NULL) 383 *result = rpcib.rpcib_dip; 384 else { 385 *result = NULL; 386 ret = DDI_FAILURE; 387 } 388 break; 389 390 case DDI_INFO_DEVT2INSTANCE: 391 *result = NULL; 392 break; 393 394 default: 395 ret = DDI_FAILURE; 396 } 397 return (ret); 398 } 399 400 static int 401 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 402 { 403 ibt_status_t ibt_status; 404 rdma_stat r_status; 405 406 switch (cmd) { 407 case DDI_ATTACH: 408 break; 409 case DDI_RESUME: 410 return (DDI_SUCCESS); 411 default: 412 return (DDI_FAILURE); 413 } 414 415 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL); 416 417 mutex_enter(&rpcib.rpcib_mutex); 418 if (rpcib.rpcib_dip != NULL) { 419 mutex_exit(&rpcib.rpcib_mutex); 420 return (DDI_FAILURE); 421 } 422 rpcib.rpcib_dip = dip; 423 mutex_exit(&rpcib.rpcib_mutex); 424 /* 425 * Create the "rpcib" minor-node. 426 */ 427 if (ddi_create_minor_node(dip, 428 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { 429 /* Error message, no cmn_err as they print on console */ 430 return (DDI_FAILURE); 431 } 432 433 if (rib_stat == NULL) { 434 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); 435 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); 436 } 437 438 rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids); 439 if (rib_stat->hca_count < 1) { 440 mutex_destroy(&rib_stat->open_hca_lock); 441 kmem_free(rib_stat, sizeof (*rib_stat)); 442 rib_stat = NULL; 443 return (DDI_FAILURE); 444 } 445 446 ibt_status = ibt_attach(&rib_modinfo, dip, 447 (void *)rib_stat, &rib_stat->ibt_clnt_hdl); 448 if (ibt_status != IBT_SUCCESS) { 449 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); 450 mutex_destroy(&rib_stat->open_hca_lock); 451 kmem_free(rib_stat, sizeof (*rib_stat)); 452 rib_stat = NULL; 453 return (DDI_FAILURE); 454 } 455 456 mutex_enter(&rib_stat->open_hca_lock); 457 if (open_hcas(rib_stat) != RDMA_SUCCESS) { 458 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); 459 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 460 mutex_exit(&rib_stat->open_hca_lock); 461 mutex_destroy(&rib_stat->open_hca_lock); 462 kmem_free(rib_stat, sizeof (*rib_stat)); 463 rib_stat = NULL; 464 return (DDI_FAILURE); 465 } 466 mutex_exit(&rib_stat->open_hca_lock); 467 468 /* 469 * Register with rdmatf 470 */ 471 rib_mod.rdma_count = rib_stat->hca_count; 472 r_status = rdma_register_mod(&rib_mod); 473 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { 474 rib_detach_hca(rib_stat->hca); 475 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); 476 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 477 mutex_destroy(&rib_stat->open_hca_lock); 478 kmem_free(rib_stat, sizeof (*rib_stat)); 479 rib_stat = NULL; 480 return (DDI_FAILURE); 481 } 482 483 484 return (DDI_SUCCESS); 485 } 486 487 /*ARGSUSED*/ 488 static int 489 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 490 { 491 switch (cmd) { 492 493 case DDI_DETACH: 494 break; 495 496 case DDI_SUSPEND: 497 default: 498 return (DDI_FAILURE); 499 } 500 501 /* 502 * Detach the hca and free resources 503 */ 504 mutex_enter(&plugin_state_lock); 505 plugin_state = NO_ACCEPT; 506 mutex_exit(&plugin_state_lock); 507 rib_detach_hca(rib_stat->hca); 508 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); 509 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 510 511 mutex_enter(&rpcib.rpcib_mutex); 512 rpcib.rpcib_dip = NULL; 513 mutex_exit(&rpcib.rpcib_mutex); 514 515 mutex_destroy(&rpcib.rpcib_mutex); 516 return (DDI_SUCCESS); 517 } 518 519 520 static void 521 rib_deregister_ats() 522 { 523 rib_hca_t *hca; 524 rib_service_t *srv_list, *to_remove; 525 ibt_status_t ibt_status; 526 527 /* 528 * deregister the Address Translation Service. 529 */ 530 hca = rib_stat->hca; 531 rw_enter(&hca->service_list_lock, RW_WRITER); 532 srv_list = hca->ats_list; 533 while (srv_list != NULL) { 534 to_remove = srv_list; 535 srv_list = to_remove->srv_next; 536 537 ibt_status = ibt_deregister_ar(hca->ibt_clnt_hdl, 538 &to_remove->srv_ar); 539 if (ibt_status != IBT_SUCCESS) { 540 #ifdef DEBUG 541 if (rib_debug) { 542 cmn_err(CE_WARN, "_fini: " 543 "ibt_deregister_ar FAILED" 544 " status: %d", ibt_status); 545 } 546 #endif 547 } else { 548 mutex_enter(&rib_stat->open_hca_lock); 549 ats_running = 0; 550 mutex_exit(&rib_stat->open_hca_lock); 551 #ifdef DEBUG 552 if (rib_debug) { 553 554 cmn_err(CE_NOTE, "_fini: " 555 "Successfully unregistered" 556 " ATS service: %s", 557 to_remove->srv_name); 558 } 559 #endif 560 } 561 kmem_free(to_remove, sizeof (rib_service_t)); 562 } 563 hca->ats_list = NULL; 564 rw_exit(&hca->service_list_lock); 565 } 566 567 static void rib_rbufpool_free(rib_hca_t *, int); 568 static void rib_rbufpool_deregister(rib_hca_t *, int); 569 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype); 570 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); 571 static rdma_stat rib_rem_replylist(rib_qp_t *); 572 static int rib_remreply(rib_qp_t *, struct reply *); 573 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); 574 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); 575 576 /* 577 * One CQ pair per HCA 578 */ 579 static rdma_stat 580 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, 581 rib_cq_t **cqp, rpcib_state_t *ribstat) 582 { 583 rib_cq_t *cq; 584 ibt_cq_attr_t cq_attr; 585 uint32_t real_size; 586 ibt_status_t status; 587 rdma_stat error = RDMA_SUCCESS; 588 589 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); 590 cq->rib_hca = hca; 591 cq_attr.cq_size = cq_size; 592 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 593 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, 594 &real_size); 595 if (status != IBT_SUCCESS) { 596 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed," 597 " status=%d", status); 598 error = RDMA_FAILED; 599 goto fail; 600 } 601 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat); 602 603 /* 604 * Enable CQ callbacks. CQ Callbacks are single shot 605 * (e.g. you have to call ibt_enable_cq_notify() 606 * after each callback to get another one). 607 */ 608 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION); 609 if (status != IBT_SUCCESS) { 610 cmn_err(CE_WARN, "rib_create_cq: " 611 "enable_cq_notify failed, status %d", status); 612 error = RDMA_FAILED; 613 goto fail; 614 } 615 *cqp = cq; 616 617 return (error); 618 fail: 619 if (cq->rib_cq_hdl) 620 (void) ibt_free_cq(cq->rib_cq_hdl); 621 if (cq) 622 kmem_free(cq, sizeof (rib_cq_t)); 623 return (error); 624 } 625 626 static rdma_stat 627 open_hcas(rpcib_state_t *ribstat) 628 { 629 rib_hca_t *hca; 630 ibt_status_t ibt_status; 631 rdma_stat status; 632 ibt_hca_portinfo_t *pinfop; 633 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; 634 uint_t size, cq_size; 635 int i; 636 637 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); 638 if (ribstat->hcas == NULL) 639 ribstat->hcas = kmem_zalloc(ribstat->hca_count * 640 sizeof (rib_hca_t), KM_SLEEP); 641 642 /* 643 * Open a hca and setup for RDMA 644 */ 645 for (i = 0; i < ribstat->hca_count; i++) { 646 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, 647 ribstat->hca_guids[i], 648 &ribstat->hcas[i].hca_hdl); 649 if (ibt_status != IBT_SUCCESS) { 650 cmn_err(CE_WARN, "open_hcas: ibt_open_hca (%d) " 651 "returned %d", i, ibt_status); 652 continue; 653 } 654 ribstat->hcas[i].hca_guid = ribstat->hca_guids[i]; 655 hca = &(ribstat->hcas[i]); 656 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; 657 hca->state = HCA_INITED; 658 659 /* 660 * query HCA info 661 */ 662 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs); 663 if (ibt_status != IBT_SUCCESS) { 664 cmn_err(CE_WARN, "open_hcas: ibt_query_hca " 665 "returned %d (hca_guid 0x%llx)", 666 ibt_status, (longlong_t)ribstat->hca_guids[i]); 667 goto fail1; 668 } 669 670 /* 671 * One PD (Protection Domain) per HCA. 672 * A qp is allowed to access a memory region 673 * only when it's in the same PD as that of 674 * the memory region. 675 */ 676 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl); 677 if (ibt_status != IBT_SUCCESS) { 678 cmn_err(CE_WARN, "open_hcas: ibt_alloc_pd " 679 "returned %d (hca_guid 0x%llx)", 680 ibt_status, (longlong_t)ribstat->hca_guids[i]); 681 goto fail1; 682 } 683 684 /* 685 * query HCA ports 686 */ 687 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 688 0, &pinfop, &hca->hca_nports, &size); 689 if (ibt_status != IBT_SUCCESS) { 690 cmn_err(CE_WARN, "open_hcas: " 691 "ibt_query_hca_ports returned %d " 692 "(hca_guid 0x%llx)", 693 ibt_status, (longlong_t)hca->hca_guid); 694 goto fail2; 695 } 696 hca->hca_ports = pinfop; 697 hca->hca_pinfosz = size; 698 pinfop = NULL; 699 700 cq_size = DEF_CQ_SIZE; /* default cq size */ 701 /* 702 * Create 2 pairs of cq's (1 pair for client 703 * and the other pair for server) on this hca. 704 * If number of qp's gets too large, then several 705 * cq's will be needed. 706 */ 707 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, 708 &hca->svc_rcq, ribstat); 709 if (status != RDMA_SUCCESS) { 710 goto fail3; 711 } 712 713 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, 714 &hca->svc_scq, ribstat); 715 if (status != RDMA_SUCCESS) { 716 goto fail3; 717 } 718 719 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, 720 &hca->clnt_rcq, ribstat); 721 if (status != RDMA_SUCCESS) { 722 goto fail3; 723 } 724 725 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, 726 &hca->clnt_scq, ribstat); 727 if (status != RDMA_SUCCESS) { 728 goto fail3; 729 } 730 731 /* 732 * Create buffer pools. 733 * Note rib_rbuf_create also allocates memory windows. 734 */ 735 hca->recv_pool = rib_rbufpool_create(hca, 736 RECV_BUFFER, MAX_BUFS); 737 if (hca->recv_pool == NULL) { 738 cmn_err(CE_WARN, "open_hcas: recv buf pool failed\n"); 739 goto fail3; 740 } 741 742 hca->send_pool = rib_rbufpool_create(hca, 743 SEND_BUFFER, MAX_BUFS); 744 if (hca->send_pool == NULL) { 745 cmn_err(CE_WARN, "open_hcas: send buf pool failed\n"); 746 rib_rbufpool_destroy(hca, RECV_BUFFER); 747 goto fail3; 748 } 749 750 /* 751 * Initialize the registered service list and 752 * the lock 753 */ 754 hca->service_list = NULL; 755 rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock); 756 757 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 758 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); 759 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, 760 hca->iblock); 761 rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, 762 hca->iblock); 763 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); 764 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); 765 hca->inuse = TRUE; 766 /* 767 * XXX One hca only. Add multi-hca functionality if needed 768 * later. 769 */ 770 ribstat->hca = hca; 771 ribstat->nhca_inited++; 772 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 773 break; 774 775 fail3: 776 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 777 fail2: 778 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 779 fail1: 780 (void) ibt_close_hca(hca->hca_hdl); 781 782 } 783 if (ribstat->hca != NULL) 784 return (RDMA_SUCCESS); 785 else 786 return (RDMA_FAILED); 787 } 788 789 /* 790 * Callback routines 791 */ 792 793 /* 794 * SCQ handlers 795 */ 796 /* ARGSUSED */ 797 static void 798 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 799 { 800 ibt_status_t ibt_status; 801 ibt_wc_t wc; 802 int i; 803 804 /* 805 * Re-enable cq notify here to avoid missing any 806 * completion queue notification. 807 */ 808 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 809 810 ibt_status = IBT_SUCCESS; 811 while (ibt_status != IBT_CQ_EMPTY) { 812 bzero(&wc, sizeof (wc)); 813 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 814 if (ibt_status != IBT_SUCCESS) 815 return; 816 817 /* 818 * Got a send completion 819 */ 820 if (wc.wc_id != NULL) { /* XXX can it be otherwise ???? */ 821 struct send_wid *wd = (struct send_wid *)wc.wc_id; 822 CONN *conn = qptoc(wd->qp); 823 824 mutex_enter(&wd->sendwait_lock); 825 switch (wc.wc_status) { 826 case IBT_WC_SUCCESS: 827 wd->status = RDMA_SUCCESS; 828 break; 829 case IBT_WC_WR_FLUSHED_ERR: 830 wd->status = RDMA_FAILED; 831 break; 832 default: 833 /* 834 * RC Send Q Error Code Local state Remote State 835 * ==================== =========== ============ 836 * IBT_WC_BAD_RESPONSE_ERR ERROR None 837 * IBT_WC_LOCAL_LEN_ERR ERROR None 838 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None 839 * IBT_WC_LOCAL_PROTECT_ERR ERROR None 840 * IBT_WC_MEM_WIN_BIND_ERR ERROR None 841 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR 842 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR 843 * IBT_WC_REMOTE_OP_ERR ERROR ERROR 844 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None 845 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None 846 * IBT_WC_WR_FLUSHED_ERR None None 847 */ 848 #ifdef DEBUG 849 if (rib_debug > 1) { 850 if (wc.wc_status != IBT_WC_SUCCESS) { 851 cmn_err(CE_NOTE, "rib_clnt_scq_handler: " 852 "WR completed in error, wc.wc_status:%d, " 853 "wc_id:%llx\n", wc.wc_status, (longlong_t)wc.wc_id); 854 } 855 } 856 #endif 857 /* 858 * Channel in error state. Set connection to 859 * ERROR and cleanup will happen either from 860 * conn_release or from rib_conn_get 861 */ 862 wd->status = RDMA_FAILED; 863 mutex_enter(&conn->c_lock); 864 if (conn->c_state != C_DISCONN_PEND) 865 conn->c_state = C_ERROR; 866 mutex_exit(&conn->c_lock); 867 break; 868 } 869 if (wd->cv_sig == 1) { 870 /* 871 * Notify poster 872 */ 873 cv_signal(&wd->wait_cv); 874 mutex_exit(&wd->sendwait_lock); 875 } else { 876 /* 877 * Poster not waiting for notification. 878 * Free the send buffers and send_wid 879 */ 880 for (i = 0; i < wd->nsbufs; i++) { 881 rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER, 882 (void *)wd->sbufaddr[i]); 883 } 884 mutex_exit(&wd->sendwait_lock); 885 (void) rib_free_sendwait(wd); 886 } 887 } 888 } 889 } 890 891 /* ARGSUSED */ 892 static void 893 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 894 { 895 ibt_status_t ibt_status; 896 ibt_wc_t wc; 897 int i; 898 899 /* 900 * Re-enable cq notify here to avoid missing any 901 * completion queue notification. 902 */ 903 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 904 905 ibt_status = IBT_SUCCESS; 906 while (ibt_status != IBT_CQ_EMPTY) { 907 bzero(&wc, sizeof (wc)); 908 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 909 if (ibt_status != IBT_SUCCESS) 910 return; 911 912 /* 913 * Got a send completion 914 */ 915 #ifdef DEBUG 916 if (rib_debug > 1 && wc.wc_status != IBT_WC_SUCCESS) { 917 cmn_err(CE_NOTE, "rib_svc_scq_handler: WR completed in error " 918 "wc.wc_status:%d, wc_id:%llX", 919 wc.wc_status, (longlong_t)wc.wc_id); 920 } 921 #endif 922 if (wc.wc_id != NULL) { /* XXX NULL possible ???? */ 923 struct send_wid *wd = (struct send_wid *)wc.wc_id; 924 925 mutex_enter(&wd->sendwait_lock); 926 if (wd->cv_sig == 1) { 927 /* 928 * Update completion status and notify poster 929 */ 930 if (wc.wc_status == IBT_WC_SUCCESS) 931 wd->status = RDMA_SUCCESS; 932 else 933 wd->status = RDMA_FAILED; 934 cv_signal(&wd->wait_cv); 935 mutex_exit(&wd->sendwait_lock); 936 } else { 937 /* 938 * Poster not waiting for notification. 939 * Free the send buffers and send_wid 940 */ 941 for (i = 0; i < wd->nsbufs; i++) { 942 rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER, 943 (void *)wd->sbufaddr[i]); 944 } 945 mutex_exit(&wd->sendwait_lock); 946 (void) rib_free_sendwait(wd); 947 } 948 } 949 } 950 } 951 952 /* 953 * RCQ handler 954 */ 955 /* ARGSUSED */ 956 static void 957 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 958 { 959 rib_qp_t *qp; 960 ibt_status_t ibt_status; 961 ibt_wc_t wc; 962 struct recv_wid *rwid; 963 964 /* 965 * Re-enable cq notify here to avoid missing any 966 * completion queue notification. 967 */ 968 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 969 970 ibt_status = IBT_SUCCESS; 971 while (ibt_status != IBT_CQ_EMPTY) { 972 bzero(&wc, sizeof (wc)); 973 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 974 if (ibt_status != IBT_SUCCESS) 975 return; 976 977 rwid = (struct recv_wid *)wc.wc_id; 978 qp = rwid->qp; 979 if (wc.wc_status == IBT_WC_SUCCESS) { 980 XDR inxdrs, *xdrs; 981 uint_t xid, vers, op, find_xid = 0; 982 struct reply *r; 983 CONN *conn = qptoc(qp); 984 985 xdrs = &inxdrs; 986 xdrmem_create(xdrs, (caddr_t)rwid->addr, 987 wc.wc_bytes_xfer, XDR_DECODE); 988 /* 989 * Treat xid as opaque (xid is the first entity 990 * in the rpc rdma message). 991 */ 992 xid = *(uint32_t *)rwid->addr; 993 /* Skip xid and set the xdr position accordingly. */ 994 XDR_SETPOS(xdrs, sizeof (uint32_t)); 995 (void) xdr_u_int(xdrs, &vers); 996 (void) xdr_u_int(xdrs, &op); 997 XDR_DESTROY(xdrs); 998 if (vers != RPCRDMA_VERS) { 999 /* 1000 * Invalid RPC/RDMA version. Cannot interoperate. 1001 * Set connection to ERROR state and bail out. 1002 */ 1003 mutex_enter(&conn->c_lock); 1004 if (conn->c_state != C_DISCONN_PEND) 1005 conn->c_state = C_ERROR; 1006 mutex_exit(&conn->c_lock); 1007 rib_rbuf_free(conn, RECV_BUFFER, (void *)rwid->addr); 1008 rib_free_wid(rwid); 1009 continue; 1010 } 1011 1012 mutex_enter(&qp->replylist_lock); 1013 for (r = qp->replylist; r != NULL; r = r->next) { 1014 if (r->xid == xid) { 1015 find_xid = 1; 1016 switch (op) { 1017 case RDMA_MSG: 1018 case RDMA_NOMSG: 1019 case RDMA_MSGP: 1020 r->status = RDMA_SUCCESS; 1021 r->vaddr_cq = rwid->addr; 1022 r->bytes_xfer = wc.wc_bytes_xfer; 1023 cv_signal(&r->wait_cv); 1024 break; 1025 default: 1026 rib_rbuf_free(qptoc(qp), RECV_BUFFER, 1027 (void *)rwid->addr); 1028 break; 1029 } 1030 break; 1031 } 1032 } 1033 mutex_exit(&qp->replylist_lock); 1034 if (find_xid == 0) { 1035 /* RPC caller not waiting for reply */ 1036 #ifdef DEBUG 1037 if (rib_debug) { 1038 cmn_err(CE_NOTE, "rib_clnt_rcq_handler: " 1039 "NO matching xid %u!\n", xid); 1040 } 1041 #endif 1042 rib_rbuf_free(qptoc(qp), RECV_BUFFER, 1043 (void *)rwid->addr); 1044 } 1045 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) { 1046 CONN *conn = qptoc(qp); 1047 1048 /* 1049 * Connection being flushed. Just free 1050 * the posted buffer 1051 */ 1052 rib_rbuf_free(conn, RECV_BUFFER, (void *)rwid->addr); 1053 } else { 1054 CONN *conn = qptoc(qp); 1055 /* 1056 * RC Recv Q Error Code Local state Remote State 1057 * ==================== =========== ============ 1058 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd 1059 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd 1060 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd 1061 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd 1062 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd 1063 * IBT_WC_WR_FLUSHED_ERR None None 1064 */ 1065 /* 1066 * Channel in error state. Set connection 1067 * in ERROR state. 1068 */ 1069 mutex_enter(&conn->c_lock); 1070 if (conn->c_state != C_DISCONN_PEND) 1071 conn->c_state = C_ERROR; 1072 mutex_exit(&conn->c_lock); 1073 rib_rbuf_free(conn, RECV_BUFFER, (void *)rwid->addr); 1074 } 1075 rib_free_wid(rwid); 1076 } 1077 } 1078 1079 /* Server side */ 1080 /* ARGSUSED */ 1081 static void 1082 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1083 { 1084 struct recv_data *rd; 1085 rib_qp_t *qp; 1086 ibt_status_t ibt_status; 1087 ibt_wc_t wc; 1088 struct svc_recv *s_recvp; 1089 CONN *conn; 1090 mblk_t *mp; 1091 1092 /* 1093 * Re-enable cq notify here to avoid missing any 1094 * completion queue notification. 1095 */ 1096 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1097 1098 ibt_status = IBT_SUCCESS; 1099 while (ibt_status != IBT_CQ_EMPTY) { 1100 bzero(&wc, sizeof (wc)); 1101 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1102 if (ibt_status != IBT_SUCCESS) 1103 return; 1104 1105 s_recvp = (struct svc_recv *)wc.wc_id; 1106 qp = s_recvp->qp; 1107 conn = qptoc(qp); 1108 mutex_enter(&qp->posted_rbufs_lock); 1109 qp->n_posted_rbufs--; 1110 if (qp->n_posted_rbufs == 0) 1111 cv_signal(&qp->posted_rbufs_cv); 1112 mutex_exit(&qp->posted_rbufs_lock); 1113 1114 if (wc.wc_status == IBT_WC_SUCCESS) { 1115 XDR inxdrs, *xdrs; 1116 uint_t xid, vers, op; 1117 1118 xdrs = &inxdrs; 1119 /* s_recvp->vaddr stores data */ 1120 xdrmem_create(xdrs, (caddr_t)s_recvp->vaddr, 1121 wc.wc_bytes_xfer, XDR_DECODE); 1122 1123 /* 1124 * Treat xid as opaque (xid is the first entity 1125 * in the rpc rdma message). 1126 */ 1127 xid = *(uint32_t *)s_recvp->vaddr; 1128 /* Skip xid and set the xdr position accordingly. */ 1129 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1130 if (!xdr_u_int(xdrs, &vers) || 1131 !xdr_u_int(xdrs, &op)) { 1132 rib_rbuf_free(conn, RECV_BUFFER, 1133 (void *)s_recvp->vaddr); 1134 XDR_DESTROY(xdrs); 1135 #ifdef DEBUG 1136 cmn_err(CE_NOTE, "rib_svc_rcq_handler: " 1137 "xdr_u_int failed for qp %p, wc_id=%llx", 1138 (void *)qp, (longlong_t)wc.wc_id); 1139 #endif 1140 (void) rib_free_svc_recv(s_recvp); 1141 continue; 1142 } 1143 XDR_DESTROY(xdrs); 1144 1145 if (vers != RPCRDMA_VERS) { 1146 /* 1147 * Invalid RPC/RDMA version. Drop rpc rdma message. 1148 */ 1149 rib_rbuf_free(conn, RECV_BUFFER, 1150 (void *)s_recvp->vaddr); 1151 (void) rib_free_svc_recv(s_recvp); 1152 continue; 1153 } 1154 /* 1155 * Is this for RDMA_DONE? 1156 */ 1157 if (op == RDMA_DONE) { 1158 rib_rbuf_free(conn, RECV_BUFFER, 1159 (void *)s_recvp->vaddr); 1160 /* 1161 * Wake up the thread waiting on 1162 * a RDMA_DONE for xid 1163 */ 1164 mutex_enter(&qp->rdlist_lock); 1165 rdma_done_notify(qp, xid); 1166 mutex_exit(&qp->rdlist_lock); 1167 (void) rib_free_svc_recv(s_recvp); 1168 continue; 1169 } 1170 1171 mutex_enter(&plugin_state_lock); 1172 if (plugin_state == ACCEPT) { 1173 while ((mp = allocb(sizeof (*rd), BPRI_LO)) == NULL) 1174 (void) strwaitbuf(sizeof (*rd), BPRI_LO); 1175 /* 1176 * Plugin is in accept state, hence the master 1177 * transport queue for this is still accepting 1178 * requests. Hence we can call svc_queuereq to 1179 * queue this recieved msg. 1180 */ 1181 rd = (struct recv_data *)mp->b_rptr; 1182 rd->conn = conn; 1183 rd->rpcmsg.addr = (caddr_t)s_recvp->vaddr; 1184 rd->rpcmsg.type = RECV_BUFFER; 1185 rd->rpcmsg.len = wc.wc_bytes_xfer; 1186 rd->status = wc.wc_status; 1187 mutex_enter(&conn->c_lock); 1188 conn->c_ref++; 1189 mutex_exit(&conn->c_lock); 1190 mp->b_wptr += sizeof (*rd); 1191 svc_queuereq((queue_t *)rib_stat->q, mp); 1192 mutex_exit(&plugin_state_lock); 1193 } else { 1194 /* 1195 * The master transport for this is going 1196 * away and the queue is not accepting anymore 1197 * requests for krpc, so don't do anything, just 1198 * free the msg. 1199 */ 1200 mutex_exit(&plugin_state_lock); 1201 rib_rbuf_free(conn, RECV_BUFFER, 1202 (void *)s_recvp->vaddr); 1203 } 1204 } else { 1205 rib_rbuf_free(conn, RECV_BUFFER, 1206 (void *)s_recvp->vaddr); 1207 } 1208 (void) rib_free_svc_recv(s_recvp); 1209 } 1210 } 1211 1212 /* 1213 * Handles DR event of IBT_HCA_DETACH_EVENT. 1214 */ 1215 /* ARGSUSED */ 1216 static void 1217 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 1218 ibt_async_code_t code, ibt_async_event_t *event) 1219 { 1220 1221 switch (code) { 1222 case IBT_HCA_ATTACH_EVENT: 1223 /* ignore */ 1224 break; 1225 case IBT_HCA_DETACH_EVENT: 1226 { 1227 ASSERT(rib_stat->hca->hca_hdl == hca_hdl); 1228 rib_detach_hca(rib_stat->hca); 1229 #ifdef DEBUG 1230 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); 1231 #endif 1232 break; 1233 } 1234 #ifdef DEBUG 1235 case IBT_EVENT_PATH_MIGRATED: 1236 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PATH_MIGRATED\n"); 1237 break; 1238 case IBT_EVENT_SQD: 1239 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n"); 1240 break; 1241 case IBT_EVENT_COM_EST: 1242 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n"); 1243 break; 1244 case IBT_ERROR_CATASTROPHIC_CHAN: 1245 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CATASTROPHIC_CHAN\n"); 1246 break; 1247 case IBT_ERROR_INVALID_REQUEST_CHAN: 1248 cmn_err(CE_NOTE, "rib_async_handler(): " 1249 "IBT_ERROR_INVALID_REQUEST_CHAN\n"); 1250 break; 1251 case IBT_ERROR_ACCESS_VIOLATION_CHAN: 1252 cmn_err(CE_NOTE, "rib_async_handler(): " 1253 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n"); 1254 break; 1255 case IBT_ERROR_PATH_MIGRATE_REQ: 1256 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PATH_MIGRATE_REQ\n"); 1257 break; 1258 case IBT_ERROR_CQ: 1259 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n"); 1260 break; 1261 case IBT_ERROR_PORT_DOWN: 1262 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); 1263 break; 1264 case IBT_EVENT_PORT_UP: 1265 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); 1266 break; 1267 case IBT_ASYNC_OPAQUE1: 1268 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); 1269 break; 1270 case IBT_ASYNC_OPAQUE2: 1271 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n"); 1272 break; 1273 case IBT_ASYNC_OPAQUE3: 1274 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n"); 1275 break; 1276 case IBT_ASYNC_OPAQUE4: 1277 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n"); 1278 break; 1279 #endif 1280 default: 1281 break; 1282 } 1283 } 1284 1285 /* 1286 * Client's reachable function. 1287 */ 1288 static rdma_stat 1289 rib_reachable(int addr_type, struct netbuf *raddr, void **handle) 1290 { 1291 rib_hca_t *hca; 1292 rdma_stat status; 1293 1294 /* 1295 * First check if a hca is still attached 1296 */ 1297 *handle = NULL; 1298 rw_enter(&rib_stat->hca->state_lock, RW_READER); 1299 if (rib_stat->hca->state != HCA_INITED) { 1300 rw_exit(&rib_stat->hca->state_lock); 1301 return (RDMA_FAILED); 1302 } 1303 status = rib_ping_srv(addr_type, raddr, &hca); 1304 rw_exit(&rib_stat->hca->state_lock); 1305 1306 if (status == RDMA_SUCCESS) { 1307 *handle = (void *)hca; 1308 /* 1309 * Register the Address translation service 1310 */ 1311 mutex_enter(&rib_stat->open_hca_lock); 1312 if (ats_running == 0) { 1313 if (rib_register_ats(rib_stat->hca) 1314 == RDMA_SUCCESS) { 1315 ats_running = 1; 1316 mutex_exit(&rib_stat->open_hca_lock); 1317 return (RDMA_SUCCESS); 1318 } else { 1319 mutex_exit(&rib_stat->open_hca_lock); 1320 return (RDMA_FAILED); 1321 } 1322 } else { 1323 mutex_exit(&rib_stat->open_hca_lock); 1324 return (RDMA_SUCCESS); 1325 } 1326 } else { 1327 *handle = NULL; 1328 if (rib_debug > 2) 1329 cmn_err(CE_WARN, "rib_reachable(): ping_srv failed.\n"); 1330 return (RDMA_FAILED); 1331 } 1332 } 1333 1334 /* Client side qp creation */ 1335 static rdma_stat 1336 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) 1337 { 1338 rib_qp_t *kqp = NULL; 1339 CONN *conn; 1340 1341 ASSERT(qp != NULL); 1342 *qp = NULL; 1343 1344 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1345 conn = qptoc(kqp); 1346 kqp->hca = hca; 1347 kqp->rdmaconn.c_rdmamod = &rib_mod; 1348 kqp->rdmaconn.c_private = (caddr_t)kqp; 1349 1350 kqp->mode = RIB_CLIENT; 1351 kqp->chan_flags = IBT_BLOCKING; 1352 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); 1353 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); 1354 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; 1355 1356 /* 1357 * Initialize 1358 */ 1359 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1360 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1361 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1362 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); 1363 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1364 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1365 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1366 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1367 1368 *qp = kqp; 1369 return (RDMA_SUCCESS); 1370 } 1371 1372 /* Server side qp creation */ 1373 static rdma_stat 1374 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) 1375 { 1376 rib_qp_t *kqp = NULL; 1377 ibt_chan_sizes_t chan_sizes; 1378 ibt_rc_chan_alloc_args_t qp_attr; 1379 ibt_status_t ibt_status; 1380 1381 ASSERT(qp != NULL); 1382 *qp = NULL; 1383 1384 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1385 kqp->hca = hca; 1386 kqp->port_num = port; 1387 kqp->rdmaconn.c_rdmamod = &rib_mod; 1388 kqp->rdmaconn.c_private = (caddr_t)kqp; 1389 1390 /* 1391 * Create the qp handle 1392 */ 1393 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1394 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; 1395 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; 1396 qp_attr.rc_pd = hca->pd_hdl; 1397 qp_attr.rc_hca_port_num = port; 1398 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1399 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1400 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1401 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1402 qp_attr.rc_clone_chan = NULL; 1403 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1404 qp_attr.rc_flags = IBT_WR_SIGNALED; 1405 1406 rw_enter(&hca->state_lock, RW_READER); 1407 if (hca->state != HCA_DETACHED) { 1408 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1409 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl, 1410 &chan_sizes); 1411 } else { 1412 rw_exit(&hca->state_lock); 1413 goto fail; 1414 } 1415 rw_exit(&hca->state_lock); 1416 1417 if (ibt_status != IBT_SUCCESS) { 1418 cmn_err(CE_WARN, "rib_svc_create_chan: " 1419 "ibt_alloc_rc_channel failed, ibt_status=%d.", 1420 ibt_status); 1421 goto fail; 1422 } 1423 1424 kqp->mode = RIB_SERVER; 1425 kqp->chan_flags = IBT_BLOCKING; 1426 kqp->q = q; /* server ONLY */ 1427 1428 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1429 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1430 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1431 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1432 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1433 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1434 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1435 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1436 /* 1437 * Set the private data area to qp to be used in callbacks 1438 */ 1439 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); 1440 kqp->rdmaconn.c_state = C_CONNECTED; 1441 *qp = kqp; 1442 return (RDMA_SUCCESS); 1443 fail: 1444 if (kqp) 1445 kmem_free(kqp, sizeof (rib_qp_t)); 1446 1447 return (RDMA_FAILED); 1448 } 1449 1450 void 1451 rib_dump_pathrec(ibt_path_info_t *path_rec) 1452 { 1453 ib_pkey_t pkey; 1454 1455 if (rib_debug > 1) { 1456 cmn_err(CE_NOTE, "Path Record:\n"); 1457 1458 cmn_err(CE_NOTE, "Source HCA GUID = %llx\n", 1459 (longlong_t)path_rec->pi_hca_guid); 1460 cmn_err(CE_NOTE, "Dest Service ID = %llx\n", 1461 (longlong_t)path_rec->pi_sid); 1462 cmn_err(CE_NOTE, "Port Num = %02d\n", 1463 path_rec->pi_prim_cep_path.cep_hca_port_num); 1464 cmn_err(CE_NOTE, "P_Key Index = %04d\n", 1465 path_rec->pi_prim_cep_path.cep_pkey_ix); 1466 1467 (void) ibt_index2pkey_byguid(path_rec->pi_hca_guid, 1468 path_rec->pi_prim_cep_path.cep_hca_port_num, 1469 path_rec->pi_prim_cep_path.cep_pkey_ix, &pkey); 1470 cmn_err(CE_NOTE, "P_Key = 0x%x\n", pkey); 1471 1472 1473 cmn_err(CE_NOTE, "SGID: = %llx:%llx\n", 1474 (longlong_t) 1475 path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_prefix, 1476 (longlong_t) 1477 path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_guid); 1478 1479 cmn_err(CE_NOTE, "DGID: = %llx:%llx\n", 1480 (longlong_t) 1481 path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_prefix, 1482 (longlong_t) 1483 path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_guid); 1484 1485 cmn_err(CE_NOTE, "Path Rate = %02x\n", 1486 path_rec->pi_prim_cep_path.cep_adds_vect.av_srate); 1487 cmn_err(CE_NOTE, "SL = %02x\n", 1488 path_rec->pi_prim_cep_path.cep_adds_vect.av_srvl); 1489 cmn_err(CE_NOTE, "Prim Packet LT = %02x\n", 1490 path_rec->pi_prim_pkt_lt); 1491 cmn_err(CE_NOTE, "Path MTU = %02x\n", 1492 path_rec->pi_path_mtu); 1493 } 1494 } 1495 1496 /* ARGSUSED */ 1497 ibt_cm_status_t 1498 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event, 1499 ibt_cm_return_args_t *ret_args, void *priv_data, 1500 ibt_priv_data_len_t len) 1501 { 1502 rpcib_state_t *ribstat; 1503 rib_hca_t *hca; 1504 1505 ribstat = (rpcib_state_t *)clnt_hdl; 1506 hca = (rib_hca_t *)ribstat->hca; 1507 1508 switch (event->cm_type) { 1509 1510 /* got a connection close event */ 1511 case IBT_CM_EVENT_CONN_CLOSED: 1512 { 1513 CONN *conn; 1514 rib_qp_t *qp; 1515 1516 /* check reason why connection was closed */ 1517 switch (event->cm_event.closed) { 1518 case IBT_CM_CLOSED_DREP_RCVD: 1519 case IBT_CM_CLOSED_DREQ_TIMEOUT: 1520 case IBT_CM_CLOSED_DUP: 1521 case IBT_CM_CLOSED_ABORT: 1522 case IBT_CM_CLOSED_ALREADY: 1523 /* 1524 * These cases indicate the local end initiated 1525 * the closing of the channel. Nothing to do here. 1526 */ 1527 break; 1528 default: 1529 /* 1530 * Reason for CONN_CLOSED event must be one of 1531 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 1532 * or IBT_CM_CLOSED_STALE. These indicate cases were 1533 * the remote end is closing the channel. In these 1534 * cases free the channel and transition to error 1535 * state 1536 */ 1537 qp = ibt_get_chan_private(event->cm_channel); 1538 conn = qptoc(qp); 1539 mutex_enter(&conn->c_lock); 1540 if (conn->c_state == C_DISCONN_PEND) { 1541 mutex_exit(&conn->c_lock); 1542 break; 1543 } 1544 1545 conn->c_state = C_ERROR; 1546 1547 /* 1548 * Free the rc_channel. Channel has already 1549 * transitioned to ERROR state and WRs have been 1550 * FLUSHED_ERR already. 1551 */ 1552 (void) ibt_free_channel(qp->qp_hdl); 1553 qp->qp_hdl = NULL; 1554 1555 /* 1556 * Free the conn if c_ref is down to 0 already 1557 */ 1558 if (conn->c_ref == 0) { 1559 /* 1560 * Remove from list and free conn 1561 */ 1562 conn->c_state = C_DISCONN_PEND; 1563 mutex_exit(&conn->c_lock); 1564 (void) rib_disconnect_channel(conn, 1565 &hca->cl_conn_list); 1566 } else { 1567 mutex_exit(&conn->c_lock); 1568 } 1569 #ifdef DEBUG 1570 if (rib_debug) 1571 cmn_err(CE_NOTE, "rib_clnt_cm_handler: " 1572 "(CONN_CLOSED) channel disconnected"); 1573 #endif 1574 break; 1575 } 1576 break; 1577 } 1578 default: 1579 break; 1580 } 1581 return (IBT_CM_ACCEPT); 1582 } 1583 1584 1585 /* Check if server has done ATS registration */ 1586 rdma_stat 1587 rib_chk_srv_ats(rib_hca_t *hca, struct netbuf *raddr, 1588 int addr_type, ibt_path_info_t *path) 1589 { 1590 struct sockaddr_in *sin4; 1591 struct sockaddr_in6 *sin6; 1592 ibt_path_attr_t path_attr; 1593 ibt_status_t ibt_status; 1594 ib_pkey_t pkey; 1595 ibt_ar_t ar_query, ar_result; 1596 rib_service_t *ats; 1597 ib_gid_t sgid; 1598 ibt_path_info_t paths[MAX_PORTS]; 1599 uint8_t npaths, i; 1600 1601 (void) bzero(&path_attr, sizeof (ibt_path_attr_t)); 1602 (void) bzero(path, sizeof (ibt_path_info_t)); 1603 1604 /* 1605 * Construct svc name 1606 */ 1607 path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP); 1608 switch (addr_type) { 1609 case AF_INET: 1610 sin4 = (struct sockaddr_in *)raddr->buf; 1611 (void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname, 1612 IB_SVC_NAME_LEN); 1613 break; 1614 1615 case AF_INET6: 1616 sin6 = (struct sockaddr_in6 *)raddr->buf; 1617 (void) inet_ntop(AF_INET6, &sin6->sin6_addr, 1618 path_attr.pa_sname, IB_SVC_NAME_LEN); 1619 break; 1620 1621 default: 1622 kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN); 1623 return (RDMA_INVAL); 1624 } 1625 (void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN); 1626 1627 /* 1628 * Attempt a path to the server on an ATS-registered port. 1629 * Try all ATS-registered ports until one succeeds. 1630 * The first one that succeeds will be used to connect 1631 * to the server. If none of them succeed, return RDMA_FAILED. 1632 */ 1633 rw_enter(&hca->state_lock, RW_READER); 1634 if (hca->state != HCA_DETACHED) { 1635 rw_enter(&hca->service_list_lock, RW_READER); 1636 for (ats = hca->ats_list; ats != NULL; ats = ats->srv_next) { 1637 path_attr.pa_hca_guid = hca->hca_guid; 1638 path_attr.pa_hca_port_num = ats->srv_port; 1639 ibt_status = ibt_get_paths(hca->ibt_clnt_hdl, 1640 IBT_PATH_MULTI_SVC_DEST, &path_attr, 2, paths, &npaths); 1641 if (ibt_status == IBT_SUCCESS || 1642 ibt_status == IBT_INSUFF_DATA) { 1643 for (i = 0; i < npaths; i++) { 1644 if (paths[i].pi_hca_guid) { 1645 /* 1646 * do ibt_query_ar() 1647 */ 1648 sgid = 1649 paths[i].pi_prim_cep_path.cep_adds_vect.av_sgid; 1650 1651 (void) ibt_index2pkey_byguid(paths[i].pi_hca_guid, 1652 paths[i].pi_prim_cep_path.cep_hca_port_num, 1653 paths[i].pi_prim_cep_path.cep_pkey_ix, &pkey); 1654 1655 bzero(&ar_query, sizeof (ar_query)); 1656 bzero(&ar_result, sizeof (ar_result)); 1657 ar_query.ar_gid = 1658 paths[i].pi_prim_cep_path.cep_adds_vect.av_dgid; 1659 ar_query.ar_pkey = pkey; 1660 ibt_status = ibt_query_ar(&sgid, &ar_query, 1661 &ar_result); 1662 if (ibt_status == IBT_SUCCESS) { 1663 #ifdef DEBUG 1664 if (rib_debug > 1) 1665 rib_dump_pathrec(&paths[i]); 1666 #endif 1667 bcopy(&paths[i], path, 1668 sizeof (ibt_path_info_t)); 1669 rw_exit(&hca->service_list_lock); 1670 kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN); 1671 rw_exit(&hca->state_lock); 1672 return (RDMA_SUCCESS); 1673 } 1674 #ifdef DEBUG 1675 if (rib_debug) { 1676 cmn_err(CE_NOTE, "rib_chk_srv_ats: " 1677 "ibt_query_ar FAILED, return\n"); 1678 } 1679 #endif 1680 } 1681 } 1682 } 1683 } 1684 rw_exit(&hca->service_list_lock); 1685 } 1686 kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN); 1687 rw_exit(&hca->state_lock); 1688 return (RDMA_FAILED); 1689 } 1690 1691 1692 /* 1693 * Connect to the server. 1694 */ 1695 rdma_stat 1696 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path) 1697 { 1698 ibt_chan_open_args_t chan_args; /* channel args */ 1699 ibt_chan_sizes_t chan_sizes; 1700 ibt_rc_chan_alloc_args_t qp_attr; 1701 ibt_status_t ibt_status; 1702 ibt_rc_returns_t ret_args; /* conn reject info */ 1703 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */ 1704 1705 (void) bzero(&chan_args, sizeof (chan_args)); 1706 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1707 1708 qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num; 1709 /* Alloc a RC channel */ 1710 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; 1711 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; 1712 qp_attr.rc_pd = hca->pd_hdl; 1713 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1714 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1715 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1716 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1717 qp_attr.rc_clone_chan = NULL; 1718 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1719 qp_attr.rc_flags = IBT_WR_SIGNALED; 1720 1721 chan_args.oc_path = path; 1722 chan_args.oc_cm_handler = rib_clnt_cm_handler; 1723 chan_args.oc_cm_clnt_private = (void *)rib_stat; 1724 chan_args.oc_rdma_ra_out = 1; 1725 chan_args.oc_rdma_ra_in = 1; 1726 chan_args.oc_path_retry_cnt = 2; 1727 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; 1728 1729 refresh: 1730 rw_enter(&hca->state_lock, RW_READER); 1731 if (hca->state != HCA_DETACHED) { 1732 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1733 IBT_ACHAN_NO_FLAGS, &qp_attr, &qp->qp_hdl, 1734 &chan_sizes); 1735 } else { 1736 rw_exit(&hca->state_lock); 1737 return (RDMA_FAILED); 1738 } 1739 rw_exit(&hca->state_lock); 1740 1741 if (ibt_status != IBT_SUCCESS) { 1742 #ifdef DEBUG 1743 cmn_err(CE_WARN, "rib_conn_to_srv: alloc_rc_channel " 1744 "failed, ibt_status=%d.", ibt_status); 1745 #endif 1746 return (RDMA_FAILED); 1747 } 1748 1749 /* Connect to the Server */ 1750 (void) bzero(&ret_args, sizeof (ret_args)); 1751 mutex_enter(&qp->cb_lock); 1752 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS, 1753 IBT_BLOCKING, &chan_args, &ret_args); 1754 if (ibt_status != IBT_SUCCESS) { 1755 #ifdef DEBUG 1756 if (rib_debug) 1757 cmn_err(CE_WARN, "rib_conn_to_srv: open_rc_channel" 1758 " failed for qp %p, status=%d, " 1759 "ret_args.rc_status=%d\n", 1760 (void *)qp, ibt_status, ret_args.rc_status); 1761 #endif 1762 (void) ibt_free_channel(qp->qp_hdl); 1763 qp->qp_hdl = NULL; 1764 mutex_exit(&qp->cb_lock); 1765 if (refresh-- && ibt_status == IBT_CM_FAILURE && 1766 ret_args.rc_status == IBT_CM_CONN_STALE) { 1767 /* 1768 * Got IBT_CM_CONN_STALE probably because of stale 1769 * data on the passive end of a channel that existed 1770 * prior to reboot. Retry establishing a channel 1771 * REFRESH_ATTEMPTS times, during which time the 1772 * stale conditions on the server might clear up. 1773 */ 1774 goto refresh; 1775 } 1776 return (RDMA_FAILED); 1777 } 1778 mutex_exit(&qp->cb_lock); 1779 /* 1780 * Set the private data area to qp to be used in callbacks 1781 */ 1782 ibt_set_chan_private(qp->qp_hdl, (void *)qp); 1783 return (RDMA_SUCCESS); 1784 } 1785 1786 rdma_stat 1787 rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca) 1788 { 1789 struct sockaddr_in *sin4; 1790 struct sockaddr_in6 *sin6; 1791 ibt_path_attr_t path_attr; 1792 ibt_path_info_t path; 1793 ibt_status_t ibt_status; 1794 1795 ASSERT(raddr->buf != NULL); 1796 1797 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1798 bzero(&path, sizeof (ibt_path_info_t)); 1799 1800 /* 1801 * Conctruct svc name 1802 */ 1803 path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP); 1804 switch (addr_type) { 1805 case AF_INET: 1806 sin4 = (struct sockaddr_in *)raddr->buf; 1807 (void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname, 1808 IB_SVC_NAME_LEN); 1809 break; 1810 1811 case AF_INET6: 1812 sin6 = (struct sockaddr_in6 *)raddr->buf; 1813 (void) inet_ntop(AF_INET6, &sin6->sin6_addr, 1814 path_attr.pa_sname, IB_SVC_NAME_LEN); 1815 break; 1816 1817 default: 1818 #ifdef DEBUG 1819 if (rib_debug) { 1820 cmn_err(CE_WARN, "rib_ping_srv: Address not recognized\n"); 1821 } 1822 #endif 1823 kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN); 1824 return (RDMA_INVAL); 1825 } 1826 (void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN); 1827 1828 ibt_status = ibt_get_paths(rib_stat->ibt_clnt_hdl, 1829 IBT_PATH_NO_FLAGS, &path_attr, 1, &path, NULL); 1830 kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN); 1831 if (ibt_status != IBT_SUCCESS) { 1832 if (rib_debug > 1) { 1833 cmn_err(CE_WARN, "rib_ping_srv: ibt_get_paths FAILED!" 1834 " status=%d\n", ibt_status); 1835 } 1836 } else if (path.pi_hca_guid) { 1837 ASSERT(path.pi_hca_guid == rib_stat->hca->hca_guid); 1838 *hca = rib_stat->hca; 1839 return (RDMA_SUCCESS); 1840 } 1841 return (RDMA_FAILED); 1842 } 1843 1844 /* 1845 * Close channel, remove from connection list and 1846 * free up resources allocated for that channel. 1847 */ 1848 rdma_stat 1849 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list) 1850 { 1851 rib_qp_t *qp = ctoqp(conn); 1852 rib_hca_t *hca; 1853 1854 /* 1855 * c_ref == 0 and connection is in C_DISCONN_PEND 1856 */ 1857 hca = qp->hca; 1858 if (conn_list != NULL) 1859 (void) rib_rm_conn(conn, conn_list); 1860 if (qp->qp_hdl != NULL) { 1861 /* 1862 * If the channel has not been establised, 1863 * ibt_flush_channel is called to flush outstanding WRs 1864 * on the Qs. Otherwise, ibt_close_rc_channel() is 1865 * called. The channel is then freed. 1866 */ 1867 if (conn_list != NULL) 1868 (void) ibt_close_rc_channel(qp->qp_hdl, 1869 IBT_BLOCKING, NULL, 0, NULL, NULL, 0); 1870 else 1871 (void) ibt_flush_channel(qp->qp_hdl); 1872 1873 mutex_enter(&qp->posted_rbufs_lock); 1874 while (qp->n_posted_rbufs) 1875 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock); 1876 mutex_exit(&qp->posted_rbufs_lock); 1877 (void) ibt_free_channel(qp->qp_hdl); 1878 qp->qp_hdl = NULL; 1879 } 1880 ASSERT(qp->rdlist == NULL); 1881 if (qp->replylist != NULL) { 1882 (void) rib_rem_replylist(qp); 1883 } 1884 1885 cv_destroy(&qp->cb_conn_cv); 1886 cv_destroy(&qp->posted_rbufs_cv); 1887 mutex_destroy(&qp->cb_lock); 1888 1889 mutex_destroy(&qp->replylist_lock); 1890 mutex_destroy(&qp->posted_rbufs_lock); 1891 mutex_destroy(&qp->rdlist_lock); 1892 1893 cv_destroy(&conn->c_cv); 1894 mutex_destroy(&conn->c_lock); 1895 1896 if (conn->c_raddr.buf != NULL) { 1897 kmem_free(conn->c_raddr.buf, conn->c_raddr.len); 1898 } 1899 if (conn->c_laddr.buf != NULL) { 1900 kmem_free(conn->c_laddr.buf, conn->c_laddr.len); 1901 } 1902 kmem_free(qp, sizeof (rib_qp_t)); 1903 1904 /* 1905 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL, 1906 * then the hca is no longer being used. 1907 */ 1908 if (conn_list != NULL) { 1909 rw_enter(&hca->state_lock, RW_READER); 1910 if (hca->state == HCA_DETACHED) { 1911 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 1912 if (hca->srv_conn_list.conn_hd == NULL) { 1913 rw_enter(&hca->cl_conn_list.conn_lock, 1914 RW_READER); 1915 if (hca->cl_conn_list.conn_hd == NULL) { 1916 mutex_enter(&hca->inuse_lock); 1917 hca->inuse = FALSE; 1918 cv_signal(&hca->cb_cv); 1919 mutex_exit(&hca->inuse_lock); 1920 } 1921 rw_exit(&hca->cl_conn_list.conn_lock); 1922 } 1923 rw_exit(&hca->srv_conn_list.conn_lock); 1924 } 1925 rw_exit(&hca->state_lock); 1926 } 1927 return (RDMA_SUCCESS); 1928 } 1929 1930 /* 1931 * Wait for send completion notification. Only on receiving a 1932 * notification be it a successful or error completion, free the 1933 * send_wid. 1934 */ 1935 static rdma_stat 1936 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 1937 { 1938 clock_t timout, cv_wait_ret; 1939 rdma_stat error = RDMA_SUCCESS; 1940 int i; 1941 1942 /* 1943 * Wait for send to complete 1944 */ 1945 ASSERT(wd != NULL); 1946 mutex_enter(&wd->sendwait_lock); 1947 if (wd->status == (uint_t)SEND_WAIT) { 1948 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 1949 ddi_get_lbolt(); 1950 if (qp->mode == RIB_SERVER) { 1951 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, 1952 &wd->sendwait_lock, timout)) > 0 && 1953 wd->status == (uint_t)SEND_WAIT) 1954 ; 1955 switch (cv_wait_ret) { 1956 case -1: /* timeout */ 1957 #ifdef DEBUG 1958 if (rib_debug > 2) 1959 cmn_err(CE_WARN, "rib_sendwait: " 1960 "timed out qp %p\n", (void *)qp); 1961 #endif 1962 wd->cv_sig = 0; /* no signal needed */ 1963 error = RDMA_TIMEDOUT; 1964 break; 1965 default: /* got send completion */ 1966 break; 1967 } 1968 } else { 1969 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, 1970 &wd->sendwait_lock, timout)) > 0 && 1971 wd->status == (uint_t)SEND_WAIT) 1972 ; 1973 switch (cv_wait_ret) { 1974 case -1: /* timeout */ 1975 #ifdef DEBUG 1976 if (rib_debug > 2) 1977 cmn_err(CE_WARN, "rib_sendwait: " 1978 "timed out qp %p\n", (void *)qp); 1979 #endif 1980 wd->cv_sig = 0; /* no signal needed */ 1981 error = RDMA_TIMEDOUT; 1982 break; 1983 case 0: /* interrupted */ 1984 #ifdef DEBUG 1985 if (rib_debug > 2) 1986 cmn_err(CE_NOTE, "rib_sendwait:" 1987 " interrupted on qp %p\n", 1988 (void *)qp); 1989 #endif 1990 wd->cv_sig = 0; /* no signal needed */ 1991 error = RDMA_INTR; 1992 break; 1993 default: /* got send completion */ 1994 break; 1995 } 1996 } 1997 } 1998 1999 if (wd->status != (uint_t)SEND_WAIT) { 2000 /* got send completion */ 2001 if (wd->status != RDMA_SUCCESS) { 2002 error = wd->status; 2003 if (wd->status != RDMA_CONNLOST) 2004 error = RDMA_FAILED; 2005 } 2006 for (i = 0; i < wd->nsbufs; i++) { 2007 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2008 (void *)wd->sbufaddr[i]); 2009 } 2010 mutex_exit(&wd->sendwait_lock); 2011 (void) rib_free_sendwait(wd); 2012 } else { 2013 mutex_exit(&wd->sendwait_lock); 2014 } 2015 2016 return (error); 2017 } 2018 2019 static struct send_wid * 2020 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) 2021 { 2022 struct send_wid *wd; 2023 2024 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); 2025 wd->xid = xid; 2026 wd->cv_sig = cv_sig; 2027 wd->qp = qp; 2028 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); 2029 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); 2030 wd->status = (uint_t)SEND_WAIT; 2031 2032 return (wd); 2033 } 2034 2035 static int 2036 rib_free_sendwait(struct send_wid *wdesc) 2037 { 2038 cv_destroy(&wdesc->wait_cv); 2039 mutex_destroy(&wdesc->sendwait_lock); 2040 kmem_free(wdesc, sizeof (*wdesc)); 2041 2042 return (0); 2043 } 2044 2045 static rdma_stat 2046 rib_rem_rep(rib_qp_t *qp, struct reply *rep) 2047 { 2048 mutex_enter(&qp->replylist_lock); 2049 if (rep != NULL) { 2050 (void) rib_remreply(qp, rep); 2051 mutex_exit(&qp->replylist_lock); 2052 return (RDMA_SUCCESS); 2053 } 2054 mutex_exit(&qp->replylist_lock); 2055 return (RDMA_FAILED); 2056 } 2057 2058 /* 2059 * Send buffers are freed here only in case of error in posting 2060 * on QP. If the post succeeded, the send buffers are freed upon 2061 * send completion in rib_sendwait() or in the scq_handler. 2062 */ 2063 rdma_stat 2064 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2065 int send_sig, int cv_sig) 2066 { 2067 struct send_wid *wdesc; 2068 struct clist *clp; 2069 ibt_status_t ibt_status = IBT_SUCCESS; 2070 rdma_stat ret = RDMA_SUCCESS; 2071 ibt_send_wr_t tx_wr; 2072 int i, nds; 2073 ibt_wr_ds_t sgl[DSEG_MAX]; 2074 uint_t total_msg_size; 2075 rib_qp_t *qp = ctoqp(conn); 2076 2077 ASSERT(cl != NULL); 2078 2079 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2080 2081 nds = 0; 2082 total_msg_size = 0; 2083 clp = cl; 2084 while (clp != NULL) { 2085 if (nds >= DSEG_MAX) { 2086 cmn_err(CE_WARN, "rib_send_and_wait: DSEG_MAX" 2087 " too small!"); 2088 return (RDMA_FAILED); 2089 } 2090 sgl[nds].ds_va = clp->c_saddr; 2091 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2092 sgl[nds].ds_len = clp->c_len; 2093 total_msg_size += clp->c_len; 2094 clp = clp->c_next; 2095 nds++; 2096 } 2097 2098 if (send_sig) { 2099 /* Set SEND_SIGNAL flag. */ 2100 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2101 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2102 } else { 2103 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2104 wdesc = rib_init_sendwait(msgid, 0, qp); 2105 } 2106 wdesc->nsbufs = nds; 2107 for (i = 0; i < nds; i++) { 2108 wdesc->sbufaddr[i] = sgl[i].ds_va; 2109 } 2110 2111 tx_wr.wr_id = (ibt_wrid_t)wdesc; 2112 tx_wr.wr_opcode = IBT_WRC_SEND; 2113 tx_wr.wr_trans = IBT_RC_SRV; 2114 tx_wr.wr_nds = nds; 2115 tx_wr.wr_sgl = sgl; 2116 2117 mutex_enter(&conn->c_lock); 2118 if (conn->c_state & C_CONNECTED) { 2119 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2120 } 2121 if (((conn->c_state & C_CONNECTED) == 0) || 2122 ibt_status != IBT_SUCCESS) { 2123 mutex_exit(&conn->c_lock); 2124 for (i = 0; i < nds; i++) { 2125 rib_rbuf_free(conn, SEND_BUFFER, 2126 (void *)wdesc->sbufaddr[i]); 2127 } 2128 (void) rib_free_sendwait(wdesc); 2129 #ifdef DEBUG 2130 if (rib_debug && ibt_status != IBT_SUCCESS) 2131 cmn_err(CE_WARN, "rib_send_and_wait: ibt_post_send " 2132 "failed! wr_id %llx on qpn %p, status=%d!", 2133 (longlong_t)tx_wr.wr_id, (void *)qp, 2134 ibt_status); 2135 #endif 2136 return (RDMA_FAILED); 2137 } 2138 mutex_exit(&conn->c_lock); 2139 2140 if (send_sig) { 2141 if (cv_sig) { 2142 /* 2143 * cv_wait for send to complete. 2144 * We can fail due to a timeout or signal or 2145 * unsuccessful send. 2146 */ 2147 ret = rib_sendwait(qp, wdesc); 2148 #ifdef DEBUG 2149 if (rib_debug > 2) 2150 if (ret != 0) { 2151 cmn_err(CE_WARN, "rib_send_and_wait: rib_sendwait " 2152 "FAILED, rdma stat=%d, wr_id %llx, qp %p!", 2153 ret, (longlong_t)tx_wr.wr_id, (void *)qp); 2154 } 2155 #endif 2156 return (ret); 2157 } 2158 } 2159 2160 return (RDMA_SUCCESS); 2161 } 2162 2163 rdma_stat 2164 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2165 { 2166 rdma_stat ret; 2167 2168 /* send-wait & cv_signal */ 2169 ret = rib_send_and_wait(conn, cl, msgid, 1, 1); 2170 2171 return (ret); 2172 } 2173 2174 /* 2175 * Server interface (svc_rdma_ksend). 2176 * Send RPC reply and wait for RDMA_DONE. 2177 */ 2178 rdma_stat 2179 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2180 { 2181 rdma_stat ret = RDMA_SUCCESS; 2182 struct rdma_done_list *rd; 2183 clock_t timout, cv_wait_ret; 2184 rib_qp_t *qp = ctoqp(conn); 2185 2186 mutex_enter(&qp->rdlist_lock); 2187 rd = rdma_done_add(qp, msgid); 2188 2189 /* No cv_signal (whether send-wait or no-send-wait) */ 2190 ret = rib_send_and_wait(conn, cl, msgid, 1, 0); 2191 if (ret != RDMA_SUCCESS) { 2192 #ifdef DEBUG 2193 cmn_err(CE_WARN, "rib_send_resp: send_and_wait " 2194 "failed, msgid %u, qp %p", msgid, (void *)qp); 2195 #endif 2196 rdma_done_rm(qp, rd); 2197 goto done; 2198 } 2199 2200 /* 2201 * Wait for RDMA_DONE from remote end 2202 */ 2203 timout = drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt(); 2204 cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock, 2205 timout); 2206 rdma_done_rm(qp, rd); 2207 if (cv_wait_ret < 0) { 2208 #ifdef DEBUG 2209 if (rib_debug > 1) { 2210 cmn_err(CE_WARN, "rib_send_resp: RDMA_DONE not" 2211 " recv'd for qp %p, xid:%u\n", 2212 (void *)qp, msgid); 2213 } 2214 #endif 2215 ret = RDMA_TIMEDOUT; 2216 goto done; 2217 } 2218 2219 done: 2220 mutex_exit(&qp->rdlist_lock); 2221 return (ret); 2222 } 2223 2224 static struct recv_wid * 2225 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2226 { 2227 struct recv_wid *rwid; 2228 2229 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2230 rwid->xid = msgid; 2231 rwid->addr = sgl->ds_va; 2232 rwid->qp = qp; 2233 2234 return (rwid); 2235 } 2236 2237 static void 2238 rib_free_wid(struct recv_wid *rwid) 2239 { 2240 kmem_free(rwid, sizeof (struct recv_wid)); 2241 } 2242 2243 rdma_stat 2244 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2245 { 2246 rib_qp_t *qp = ctoqp(conn); 2247 struct clist *clp = cl; 2248 struct reply *rep; 2249 struct recv_wid *rwid; 2250 int nds; 2251 ibt_wr_ds_t sgl[DSEG_MAX]; 2252 ibt_recv_wr_t recv_wr; 2253 rdma_stat ret; 2254 ibt_status_t ibt_status; 2255 2256 /* 2257 * rdma_clnt_postrecv uses RECV_BUFFER. 2258 */ 2259 2260 nds = 0; 2261 while (cl != NULL) { 2262 if (nds >= DSEG_MAX) { 2263 cmn_err(CE_WARN, "rib_clnt_post: DSEG_MAX too small!"); 2264 ret = RDMA_FAILED; 2265 goto done; 2266 } 2267 sgl[nds].ds_va = cl->c_saddr; 2268 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2269 sgl[nds].ds_len = cl->c_len; 2270 cl = cl->c_next; 2271 nds++; 2272 } 2273 2274 if (nds != 1) { 2275 cmn_err(CE_WARN, "rib_clnt_post: nds!=1\n"); 2276 ret = RDMA_FAILED; 2277 goto done; 2278 } 2279 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2280 recv_wr.wr_nds = nds; 2281 recv_wr.wr_sgl = sgl; 2282 2283 rwid = rib_create_wid(qp, &sgl[0], msgid); 2284 if (rwid) { 2285 recv_wr.wr_id = (ibt_wrid_t)rwid; 2286 } else { 2287 cmn_err(CE_WARN, "rib_clnt_post: out of memory"); 2288 ret = RDMA_NORESOURCE; 2289 goto done; 2290 } 2291 rep = rib_addreplylist(qp, msgid); 2292 if (!rep) { 2293 cmn_err(CE_WARN, "rib_clnt_post: out of memory"); 2294 rib_free_wid(rwid); 2295 ret = RDMA_NORESOURCE; 2296 goto done; 2297 } 2298 2299 mutex_enter(&conn->c_lock); 2300 if (conn->c_state & C_CONNECTED) { 2301 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2302 } 2303 if (((conn->c_state & C_CONNECTED) == 0) || 2304 ibt_status != IBT_SUCCESS) { 2305 mutex_exit(&conn->c_lock); 2306 #ifdef DEBUG 2307 cmn_err(CE_WARN, "rib_clnt_post: QPN %p failed in " 2308 "ibt_post_recv(), msgid=%d, status=%d", 2309 (void *)qp, msgid, ibt_status); 2310 #endif 2311 rib_free_wid(rwid); 2312 (void) rib_rem_rep(qp, rep); 2313 ret = RDMA_FAILED; 2314 goto done; 2315 } 2316 mutex_exit(&conn->c_lock); 2317 return (RDMA_SUCCESS); 2318 2319 done: 2320 while (clp != NULL) { 2321 rib_rbuf_free(conn, RECV_BUFFER, (void *)clp->c_saddr); 2322 clp = clp->c_next; 2323 } 2324 return (ret); 2325 } 2326 2327 rdma_stat 2328 rib_svc_post(CONN* conn, struct clist *cl) 2329 { 2330 rib_qp_t *qp = ctoqp(conn); 2331 struct svc_recv *s_recvp; 2332 int nds; 2333 ibt_wr_ds_t sgl[DSEG_MAX]; 2334 ibt_recv_wr_t recv_wr; 2335 ibt_status_t ibt_status; 2336 2337 nds = 0; 2338 while (cl != NULL) { 2339 if (nds >= DSEG_MAX) { 2340 cmn_err(CE_WARN, "rib_svc_post: DSEG_MAX too small!"); 2341 return (RDMA_FAILED); 2342 } 2343 sgl[nds].ds_va = cl->c_saddr; 2344 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2345 sgl[nds].ds_len = cl->c_len; 2346 cl = cl->c_next; 2347 nds++; 2348 } 2349 2350 if (nds != 1) { 2351 cmn_err(CE_WARN, "rib_svc_post: nds!=1\n"); 2352 rib_rbuf_free(conn, RECV_BUFFER, (caddr_t)sgl[0].ds_va); 2353 return (RDMA_FAILED); 2354 } 2355 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2356 recv_wr.wr_nds = nds; 2357 recv_wr.wr_sgl = sgl; 2358 2359 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2360 recv_wr.wr_id = (ibt_wrid_t)s_recvp; /* Use s_recvp's addr as wr id */ 2361 mutex_enter(&conn->c_lock); 2362 if (conn->c_state & C_CONNECTED) { 2363 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2364 } 2365 if (((conn->c_state & C_CONNECTED) == 0) || 2366 ibt_status != IBT_SUCCESS) { 2367 mutex_exit(&conn->c_lock); 2368 #ifdef DEBUG 2369 cmn_err(CE_WARN, "rib_svc_post: QP %p failed in " 2370 "ibt_post_recv(), status=%d", 2371 (void *)qp, ibt_status); 2372 #endif 2373 rib_rbuf_free(conn, RECV_BUFFER, (caddr_t)sgl[0].ds_va); 2374 (void) rib_free_svc_recv(s_recvp); 2375 return (RDMA_FAILED); 2376 } 2377 mutex_exit(&conn->c_lock); 2378 2379 return (RDMA_SUCCESS); 2380 } 2381 2382 /* Client */ 2383 rdma_stat 2384 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2385 { 2386 2387 return (rib_clnt_post(conn, cl, msgid)); 2388 } 2389 2390 /* Server */ 2391 rdma_stat 2392 rib_post_recv(CONN *conn, struct clist *cl) 2393 { 2394 rib_qp_t *qp = ctoqp(conn); 2395 2396 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2397 mutex_enter(&qp->posted_rbufs_lock); 2398 qp->n_posted_rbufs++; 2399 mutex_exit(&qp->posted_rbufs_lock); 2400 return (RDMA_SUCCESS); 2401 } 2402 return (RDMA_FAILED); 2403 } 2404 2405 /* 2406 * Client side only interface to "recv" the rpc reply buf 2407 * posted earlier by rib_post_resp(conn, cl, msgid). 2408 */ 2409 rdma_stat 2410 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2411 { 2412 struct reply *rep = NULL; 2413 clock_t timout, cv_wait_ret; 2414 rdma_stat ret = RDMA_SUCCESS; 2415 rib_qp_t *qp = ctoqp(conn); 2416 2417 /* 2418 * Find the reply structure for this msgid 2419 */ 2420 mutex_enter(&qp->replylist_lock); 2421 2422 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2423 if (rep->xid == msgid) 2424 break; 2425 } 2426 if (rep != NULL) { 2427 /* 2428 * If message not yet received, wait. 2429 */ 2430 if (rep->status == (uint_t)REPLY_WAIT) { 2431 timout = ddi_get_lbolt() + 2432 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2433 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2434 &qp->replylist_lock, timout)) > 0 && 2435 rep->status == (uint_t)REPLY_WAIT); 2436 2437 switch (cv_wait_ret) { 2438 case -1: /* timeout */ 2439 ret = RDMA_TIMEDOUT; 2440 break; 2441 case 0: 2442 ret = RDMA_INTR; 2443 break; 2444 default: 2445 break; 2446 } 2447 } 2448 2449 if (rep->status == RDMA_SUCCESS) { 2450 struct clist *cl = NULL; 2451 2452 /* 2453 * Got message successfully 2454 */ 2455 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2456 (caddr_t)rep->vaddr_cq, NULL, NULL); 2457 *clp = cl; 2458 } else { 2459 if (rep->status != (uint_t)REPLY_WAIT) { 2460 /* 2461 * Got error in reply message. Free 2462 * recv buffer here. 2463 */ 2464 ret = rep->status; 2465 rib_rbuf_free(conn, RECV_BUFFER, 2466 (caddr_t)rep->vaddr_cq); 2467 } 2468 } 2469 (void) rib_remreply(qp, rep); 2470 } else { 2471 /* 2472 * No matching reply structure found for given msgid on the 2473 * reply wait list. 2474 */ 2475 ret = RDMA_INVAL; 2476 #ifdef DEBUG 2477 cmn_err(CE_WARN, "rib_recv: no matching reply for " 2478 "xid %u, qp %p\n", msgid, (void *)qp); 2479 #endif 2480 } 2481 2482 /* 2483 * Done. 2484 */ 2485 mutex_exit(&qp->replylist_lock); 2486 return (ret); 2487 } 2488 2489 /* 2490 * RDMA write a buffer to the remote address. 2491 */ 2492 rdma_stat 2493 rib_write(CONN *conn, struct clist *cl, int wait) 2494 { 2495 ibt_send_wr_t tx_wr; 2496 int nds; 2497 int cv_sig; 2498 ibt_wr_ds_t sgl[DSEG_MAX]; 2499 struct send_wid *wdesc; 2500 ibt_status_t ibt_status; 2501 rdma_stat ret = RDMA_SUCCESS; 2502 rib_qp_t *qp = ctoqp(conn); 2503 2504 if (cl == NULL) { 2505 cmn_err(CE_WARN, "rib_write: NULL clist\n"); 2506 return (RDMA_FAILED); 2507 } 2508 2509 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2510 /* 2511 * Remote address is at the head chunk item in list. 2512 */ 2513 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_daddr; 2514 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_dmemhandle.mrc_rmr; /* rkey */ 2515 2516 nds = 0; 2517 while (cl != NULL) { 2518 if (nds >= DSEG_MAX) { 2519 cmn_err(CE_WARN, "rib_write: DSEG_MAX too small!"); 2520 return (RDMA_FAILED); 2521 } 2522 sgl[nds].ds_va = cl->c_saddr; 2523 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2524 sgl[nds].ds_len = cl->c_len; 2525 cl = cl->c_next; 2526 nds++; 2527 } 2528 2529 if (wait) { 2530 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2531 cv_sig = 1; 2532 } else { 2533 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2534 cv_sig = 0; 2535 } 2536 2537 wdesc = rib_init_sendwait(0, cv_sig, qp); 2538 tx_wr.wr_id = (ibt_wrid_t)wdesc; 2539 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2540 tx_wr.wr_trans = IBT_RC_SRV; 2541 tx_wr.wr_nds = nds; 2542 tx_wr.wr_sgl = sgl; 2543 2544 mutex_enter(&conn->c_lock); 2545 if (conn->c_state & C_CONNECTED) { 2546 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2547 } 2548 if (((conn->c_state & C_CONNECTED) == 0) || 2549 ibt_status != IBT_SUCCESS) { 2550 mutex_exit(&conn->c_lock); 2551 (void) rib_free_sendwait(wdesc); 2552 return (RDMA_FAILED); 2553 } 2554 mutex_exit(&conn->c_lock); 2555 2556 /* 2557 * Wait for send to complete 2558 */ 2559 if (wait) { 2560 ret = rib_sendwait(qp, wdesc); 2561 if (ret != 0) { 2562 return (ret); 2563 } 2564 } 2565 return (RDMA_SUCCESS); 2566 } 2567 2568 /* 2569 * RDMA Read a buffer from the remote address. 2570 */ 2571 rdma_stat 2572 rib_read(CONN *conn, struct clist *cl, int wait) 2573 { 2574 ibt_send_wr_t rx_wr; 2575 int nds; 2576 int cv_sig; 2577 ibt_wr_ds_t sgl[DSEG_MAX]; /* is 2 sufficient? */ 2578 struct send_wid *wdesc; 2579 ibt_status_t ibt_status = IBT_SUCCESS; 2580 rdma_stat ret = RDMA_SUCCESS; 2581 rib_qp_t *qp = ctoqp(conn); 2582 2583 if (cl == NULL) { 2584 cmn_err(CE_WARN, "rib_read: NULL clist\n"); 2585 return (RDMA_FAILED); 2586 } 2587 2588 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2589 /* 2590 * Remote address is at the head chunk item in list. 2591 */ 2592 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_saddr; 2593 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; /* rkey */ 2594 2595 nds = 0; 2596 while (cl != NULL) { 2597 if (nds >= DSEG_MAX) { 2598 cmn_err(CE_WARN, "rib_read: DSEG_MAX too small!"); 2599 return (RDMA_FAILED); 2600 } 2601 sgl[nds].ds_va = cl->c_daddr; 2602 sgl[nds].ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2603 sgl[nds].ds_len = cl->c_len; 2604 cl = cl->c_next; 2605 nds++; 2606 } 2607 2608 if (wait) { 2609 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2610 cv_sig = 1; 2611 } else { 2612 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2613 cv_sig = 0; 2614 } 2615 2616 wdesc = rib_init_sendwait(0, cv_sig, qp); 2617 rx_wr.wr_id = (ibt_wrid_t)wdesc; 2618 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2619 rx_wr.wr_trans = IBT_RC_SRV; 2620 rx_wr.wr_nds = nds; 2621 rx_wr.wr_sgl = sgl; 2622 2623 mutex_enter(&conn->c_lock); 2624 if (conn->c_state & C_CONNECTED) { 2625 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2626 } 2627 if (((conn->c_state & C_CONNECTED) == 0) || 2628 ibt_status != IBT_SUCCESS) { 2629 mutex_exit(&conn->c_lock); 2630 #ifdef DEBUG 2631 if (rib_debug && ibt_status != IBT_SUCCESS) 2632 cmn_err(CE_WARN, "rib_read: FAILED post_sending RDMAR" 2633 " wr_id %llx on qp %p, status=%d", 2634 (longlong_t)rx_wr.wr_id, (void *)qp, 2635 ibt_status); 2636 #endif 2637 (void) rib_free_sendwait(wdesc); 2638 return (RDMA_FAILED); 2639 } 2640 mutex_exit(&conn->c_lock); 2641 2642 /* 2643 * Wait for send to complete 2644 */ 2645 if (wait) { 2646 ret = rib_sendwait(qp, wdesc); 2647 if (ret != 0) { 2648 return (ret); 2649 } 2650 } 2651 2652 return (RDMA_SUCCESS); 2653 } 2654 2655 int 2656 is_for_ipv4(ibt_ar_t *result) 2657 { 2658 int i, size = sizeof (struct in_addr); 2659 uint8_t zero = 0; 2660 2661 for (i = 0; i < (ATS_AR_DATA_LEN - size); i++) 2662 zero |= result->ar_data[i]; 2663 return (zero == 0); 2664 } 2665 2666 /* 2667 * rib_srv_cm_handler() 2668 * Connection Manager callback to handle RC connection requests. 2669 */ 2670 /* ARGSUSED */ 2671 static ibt_cm_status_t 2672 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2673 ibt_cm_return_args_t *ret_args, void *priv_data, 2674 ibt_priv_data_len_t len) 2675 { 2676 queue_t *q; 2677 rib_qp_t *qp; 2678 rpcib_state_t *ribstat; 2679 rib_hca_t *hca; 2680 rdma_stat status = RDMA_SUCCESS; 2681 int i; 2682 struct clist cl; 2683 rdma_buf_t rdbuf; 2684 void *buf = NULL; 2685 ibt_cm_req_rcv_t cm_req_rcv; 2686 CONN *conn; 2687 ibt_status_t ibt_status; 2688 ibt_ar_t ar_query, ar_result; 2689 ib_gid_t sgid; 2690 2691 2692 ASSERT(any != NULL); 2693 ASSERT(event != NULL); 2694 2695 ribstat = (rpcib_state_t *)any; 2696 hca = (rib_hca_t *)ribstat->hca; 2697 ASSERT(hca != NULL); 2698 2699 /* got a connection request */ 2700 switch (event->cm_type) { 2701 case IBT_CM_EVENT_REQ_RCV: 2702 /* 2703 * If the plugin is in the NO_ACCEPT state, bail out. 2704 */ 2705 mutex_enter(&plugin_state_lock); 2706 if (plugin_state == NO_ACCEPT) { 2707 mutex_exit(&plugin_state_lock); 2708 return (IBT_CM_REJECT); 2709 } 2710 mutex_exit(&plugin_state_lock); 2711 2712 /* 2713 * Need to send a MRA MAD to CM so that it does not 2714 * timeout on us. 2715 */ 2716 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2717 event->cm_event.req.req_timeout * 8, NULL, 0); 2718 2719 mutex_enter(&rib_stat->open_hca_lock); 2720 q = rib_stat->q; 2721 mutex_exit(&rib_stat->open_hca_lock); 2722 status = rib_svc_create_chan(hca, (caddr_t)q, 2723 event->cm_event.req.req_prim_hca_port, &qp); 2724 if (status) { 2725 #ifdef DEBUG 2726 cmn_err(CE_WARN, "rib_srv_cm_handler: " 2727 "create_channel failed %d", status); 2728 #endif 2729 return (IBT_CM_REJECT); 2730 } 2731 cm_req_rcv = event->cm_event.req; 2732 2733 #ifdef DEBUG 2734 if (rib_debug > 2) { 2735 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2736 "server recv'ed IBT_CM_EVENT_REQ_RCV\n"); 2737 cmn_err(CE_NOTE, "\t\t SID:%llx\n", 2738 (longlong_t)cm_req_rcv.req_service_id); 2739 cmn_err(CE_NOTE, "\t\t Local Port:%d\n", 2740 cm_req_rcv.req_prim_hca_port); 2741 cmn_err(CE_NOTE, 2742 "\t\t Remote GID:(prefix:%llx,guid:%llx)\n", 2743 (longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_prefix, 2744 (longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_guid); 2745 cmn_err(CE_NOTE, "\t\t Local GID:(prefix:%llx,guid:%llx)\n", 2746 (longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_prefix, 2747 (longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_guid); 2748 cmn_err(CE_NOTE, "\t\t Remote QPN:%u\n", 2749 cm_req_rcv.req_remote_qpn); 2750 cmn_err(CE_NOTE, "\t\t Remote Q_Key:%x\n", 2751 cm_req_rcv.req_remote_qkey); 2752 cmn_err(CE_NOTE, "\t\t Local QP %p (qp_hdl=%p)\n", 2753 (void *)qp, (void *)qp->qp_hdl); 2754 } 2755 2756 if (rib_debug > 2) { 2757 ibt_rc_chan_query_attr_t chan_attrs; 2758 2759 if (ibt_query_rc_channel(qp->qp_hdl, &chan_attrs) 2760 == IBT_SUCCESS) { 2761 cmn_err(CE_NOTE, "rib_svc_cm_handler: qp %p in " 2762 "CEP state %d\n", (void *)qp, chan_attrs.rc_state); 2763 } 2764 } 2765 #endif 2766 2767 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2768 ret_args->cm_ret.rep.cm_rdma_ra_out = 1; 2769 ret_args->cm_ret.rep.cm_rdma_ra_in = 1; 2770 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2771 2772 /* 2773 * Pre-posts RECV buffers 2774 */ 2775 conn = qptoc(qp); 2776 for (i = 0; i < preposted_rbufs; i++) { 2777 bzero(&rdbuf, sizeof (rdbuf)); 2778 rdbuf.type = RECV_BUFFER; 2779 buf = rib_rbuf_alloc(conn, &rdbuf); 2780 if (buf == NULL) { 2781 cmn_err(CE_WARN, "rib_svc_cm_handler: " 2782 "No RECV_BUFFER buf!\n"); 2783 (void) rib_disconnect_channel(conn, NULL); 2784 return (IBT_CM_REJECT); 2785 } 2786 2787 bzero(&cl, sizeof (cl)); 2788 cl.c_saddr = (uint64)rdbuf.addr; 2789 cl.c_len = rdbuf.len; 2790 cl.c_smemhandle.mrc_lmr = rdbuf.handle.mrc_lmr; /* lkey */ 2791 cl.c_next = NULL; 2792 status = rib_post_recv(conn, &cl); 2793 if (status != RDMA_SUCCESS) { 2794 cmn_err(CE_WARN, "rib_srv_cm_handler: failed " 2795 "posting RPC_REQ buf to qp %p!", (void *)qp); 2796 (void) rib_disconnect_channel(conn, NULL); 2797 return (IBT_CM_REJECT); 2798 } 2799 } 2800 (void) rib_add_connlist(conn, &hca->srv_conn_list); 2801 2802 /* 2803 * Get the address translation service record from ATS 2804 */ 2805 rw_enter(&hca->state_lock, RW_READER); 2806 if (hca->state == HCA_DETACHED) { 2807 rw_exit(&hca->state_lock); 2808 return (IBT_CM_REJECT); 2809 } 2810 rw_exit(&hca->state_lock); 2811 2812 for (i = 0; i < hca->hca_nports; i++) { 2813 ibt_status = ibt_get_port_state(hca->hca_hdl, i+1, 2814 &sgid, NULL); 2815 if (ibt_status != IBT_SUCCESS) { 2816 if (rib_debug) { 2817 cmn_err(CE_WARN, "rib_srv_cm_handler: " 2818 "ibt_get_port_state FAILED!" 2819 "status = %d\n", ibt_status); 2820 } 2821 } else { 2822 /* 2823 * do ibt_query_ar() 2824 */ 2825 bzero(&ar_query, sizeof (ar_query)); 2826 bzero(&ar_result, sizeof (ar_result)); 2827 ar_query.ar_gid = cm_req_rcv.req_prim_addr.av_dgid; 2828 ar_query.ar_pkey = event->cm_event.req.req_pkey; 2829 ibt_status = ibt_query_ar(&sgid, &ar_query, 2830 &ar_result); 2831 if (ibt_status != IBT_SUCCESS) { 2832 if (rib_debug) { 2833 cmn_err(CE_WARN, "rib_srv_cm_handler: " 2834 "ibt_query_ar FAILED!" 2835 "status = %d\n", ibt_status); 2836 } 2837 } else { 2838 conn = qptoc(qp); 2839 2840 if (is_for_ipv4(&ar_result)) { 2841 struct sockaddr_in *s; 2842 int sin_size = sizeof (struct sockaddr_in); 2843 int in_size = sizeof (struct in_addr); 2844 uint8_t *start_pos; 2845 2846 conn->c_raddr.maxlen = 2847 conn->c_raddr.len = sin_size; 2848 conn->c_raddr.buf = kmem_zalloc(sin_size, 2849 KM_SLEEP); 2850 s = (struct sockaddr_in *)conn->c_raddr.buf; 2851 s->sin_family = AF_INET; 2852 /* 2853 * For IPv4, the IP addr is stored in 2854 * the last four bytes of ar_data. 2855 */ 2856 start_pos = ar_result.ar_data + 2857 ATS_AR_DATA_LEN - in_size; 2858 bcopy(start_pos, &s->sin_addr, in_size); 2859 if (rib_debug > 1) { 2860 char print_addr[INET_ADDRSTRLEN]; 2861 2862 bzero(print_addr, INET_ADDRSTRLEN); 2863 (void) inet_ntop(AF_INET, &s->sin_addr, 2864 print_addr, INET_ADDRSTRLEN); 2865 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2866 "remote clnt_addr: %s\n", print_addr); 2867 } 2868 } else { 2869 struct sockaddr_in6 *s6; 2870 int sin6_size = sizeof (struct sockaddr_in6); 2871 2872 conn->c_raddr.maxlen = 2873 conn->c_raddr.len = sin6_size; 2874 conn->c_raddr.buf = kmem_zalloc(sin6_size, 2875 KM_SLEEP); 2876 2877 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 2878 s6->sin6_family = AF_INET6; 2879 /* sin6_addr is stored in ar_data */ 2880 bcopy(ar_result.ar_data, &s6->sin6_addr, 2881 sizeof (struct in6_addr)); 2882 if (rib_debug > 1) { 2883 char print_addr[INET6_ADDRSTRLEN]; 2884 2885 bzero(print_addr, INET6_ADDRSTRLEN); 2886 (void) inet_ntop(AF_INET6, &s6->sin6_addr, 2887 print_addr, INET6_ADDRSTRLEN); 2888 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2889 "remote clnt_addr: %s\n", print_addr); 2890 } 2891 } 2892 return (IBT_CM_ACCEPT); 2893 } 2894 } 2895 } 2896 if (rib_debug > 1) { 2897 cmn_err(CE_WARN, "rib_srv_cm_handler: " 2898 "address record query failed!"); 2899 } 2900 break; 2901 2902 case IBT_CM_EVENT_CONN_CLOSED: 2903 { 2904 CONN *conn; 2905 rib_qp_t *qp; 2906 2907 switch (event->cm_event.closed) { 2908 case IBT_CM_CLOSED_DREP_RCVD: 2909 case IBT_CM_CLOSED_DREQ_TIMEOUT: 2910 case IBT_CM_CLOSED_DUP: 2911 case IBT_CM_CLOSED_ABORT: 2912 case IBT_CM_CLOSED_ALREADY: 2913 /* 2914 * These cases indicate the local end initiated 2915 * the closing of the channel. Nothing to do here. 2916 */ 2917 break; 2918 default: 2919 /* 2920 * Reason for CONN_CLOSED event must be one of 2921 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 2922 * or IBT_CM_CLOSED_STALE. These indicate cases were 2923 * the remote end is closing the channel. In these 2924 * cases free the channel and transition to error 2925 * state 2926 */ 2927 qp = ibt_get_chan_private(event->cm_channel); 2928 conn = qptoc(qp); 2929 mutex_enter(&conn->c_lock); 2930 if (conn->c_state == C_DISCONN_PEND) { 2931 mutex_exit(&conn->c_lock); 2932 break; 2933 } 2934 conn->c_state = C_ERROR; 2935 2936 /* 2937 * Free the rc_channel. Channel has already 2938 * transitioned to ERROR state and WRs have been 2939 * FLUSHED_ERR already. 2940 */ 2941 (void) ibt_free_channel(qp->qp_hdl); 2942 qp->qp_hdl = NULL; 2943 2944 /* 2945 * Free the conn if c_ref goes down to 0 2946 */ 2947 if (conn->c_ref == 0) { 2948 /* 2949 * Remove from list and free conn 2950 */ 2951 conn->c_state = C_DISCONN_PEND; 2952 mutex_exit(&conn->c_lock); 2953 (void) rib_disconnect_channel(conn, 2954 &hca->srv_conn_list); 2955 } else { 2956 mutex_exit(&conn->c_lock); 2957 } 2958 #ifdef DEBUG 2959 if (rib_debug) 2960 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2961 " (CONN_CLOSED) channel disconnected"); 2962 #endif 2963 break; 2964 } 2965 break; 2966 } 2967 case IBT_CM_EVENT_CONN_EST: 2968 /* 2969 * RTU received, hence connection established. 2970 */ 2971 if (rib_debug > 1) 2972 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2973 "(CONN_EST) channel established"); 2974 break; 2975 2976 default: 2977 if (rib_debug > 2) { 2978 /* Let CM handle the following events. */ 2979 if (event->cm_type == IBT_CM_EVENT_REP_RCV) { 2980 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2981 "server recv'ed IBT_CM_EVENT_REP_RCV\n"); 2982 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) { 2983 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2984 "server recv'ed IBT_CM_EVENT_LAP_RCV\n"); 2985 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) { 2986 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2987 "server recv'ed IBT_CM_EVENT_MRA_RCV\n"); 2988 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) { 2989 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2990 "server recv'ed IBT_CM_EVENT_APR_RCV\n"); 2991 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) { 2992 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2993 "server recv'ed IBT_CM_EVENT_FAILURE\n"); 2994 } 2995 } 2996 return (IBT_CM_REJECT); 2997 } 2998 2999 /* accept all other CM messages (i.e. let the CM handle them) */ 3000 return (IBT_CM_ACCEPT); 3001 } 3002 3003 static rdma_stat 3004 rib_register_ats(rib_hca_t *hca) 3005 { 3006 ibt_hca_portinfo_t *port_infop; 3007 uint_t port_size; 3008 uint_t pki, i, num_ports, nbinds; 3009 ibt_status_t ibt_status; 3010 rib_service_t *new_service, *temp_srv; 3011 rpcib_ats_t *atsp; 3012 rpcib_ibd_insts_t ibds; 3013 ib_pkey_t pkey; 3014 ibt_ar_t ar; /* address record */ 3015 3016 /* 3017 * Query all ports for the given HCA 3018 */ 3019 rw_enter(&hca->state_lock, RW_READER); 3020 if (hca->state != HCA_DETACHED) { 3021 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop, 3022 &num_ports, &port_size); 3023 rw_exit(&hca->state_lock); 3024 } else { 3025 rw_exit(&hca->state_lock); 3026 return (RDMA_FAILED); 3027 } 3028 if (ibt_status != IBT_SUCCESS) { 3029 #ifdef DEBUG 3030 if (rib_debug) { 3031 cmn_err(CE_NOTE, "rib_register_ats: FAILED in " 3032 "ibt_query_hca_ports, status = %d\n", ibt_status); 3033 } 3034 #endif 3035 return (RDMA_FAILED); 3036 } 3037 3038 #ifdef DEBUG 3039 if (rib_debug > 1) { 3040 cmn_err(CE_NOTE, "rib_register_ats: Ports detected " 3041 "%d\n", num_ports); 3042 3043 for (i = 0; i < num_ports; i++) { 3044 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) { 3045 cmn_err(CE_WARN, "rib_register_ats " 3046 "Port #: %d INACTIVE\n", i+1); 3047 } else if (port_infop[i].p_linkstate == 3048 IBT_PORT_ACTIVE) { 3049 cmn_err(CE_NOTE, "rib_register_ats " 3050 "Port #: %d ACTIVE\n", i+1); 3051 } 3052 } 3053 } 3054 #endif 3055 3056 ibds.rib_ibd_alloc = N_IBD_INSTANCES; 3057 ibds.rib_ibd_cnt = 0; 3058 ibds.rib_ats = (rpcib_ats_t *)kmem_zalloc(ibds.rib_ibd_alloc * 3059 sizeof (rpcib_ats_t), KM_SLEEP); 3060 rib_get_ibd_insts(&ibds); 3061 3062 if (ibds.rib_ibd_cnt == 0) { 3063 kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc * 3064 sizeof (rpcib_ats_t)); 3065 ibt_free_portinfo(port_infop, port_size); 3066 return (RDMA_FAILED); 3067 } 3068 3069 /* 3070 * Get the IP addresses of active ports and 3071 * register them with ATS. IPv4 addresses 3072 * have precedence over IPv6 addresses. 3073 */ 3074 if (get_ibd_ipaddr(&ibds) != 0) { 3075 #ifdef DEBUG 3076 if (rib_debug > 1) { 3077 cmn_err(CE_WARN, "rib_register_ats: " 3078 "get_ibd_ipaddr failed"); 3079 } 3080 #endif 3081 kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc * 3082 sizeof (rpcib_ats_t)); 3083 ibt_free_portinfo(port_infop, port_size); 3084 return (RDMA_FAILED); 3085 } 3086 3087 /* 3088 * Start ATS registration for active ports on this HCA. 3089 */ 3090 rw_enter(&hca->service_list_lock, RW_WRITER); 3091 nbinds = 0; 3092 new_service = NULL; 3093 for (i = 0; i < num_ports; i++) { 3094 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) 3095 continue; 3096 3097 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) { 3098 pkey = port_infop[i].p_pkey_tbl[pki]; 3099 if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) { 3100 ar.ar_gid = port_infop[i].p_sgid_tbl[0]; 3101 ar.ar_pkey = pkey; 3102 atsp = get_ibd_entry(&ar.ar_gid, pkey, &ibds); 3103 if (atsp == NULL) 3104 continue; 3105 /* 3106 * store the sin[6]_addr in ar_data 3107 */ 3108 (void) bzero(ar.ar_data, ATS_AR_DATA_LEN); 3109 if (atsp->ras_inet_type == AF_INET) { 3110 uint8_t *start_pos; 3111 3112 /* 3113 * The ipv4 addr goes into the last 3114 * four bytes of ar_data. 3115 */ 3116 start_pos = ar.ar_data + ATS_AR_DATA_LEN - 3117 sizeof (struct in_addr); 3118 bcopy(&atsp->ras_sin.sin_addr, start_pos, 3119 sizeof (struct in_addr)); 3120 } else if (atsp->ras_inet_type == AF_INET6) { 3121 bcopy(&atsp->ras_sin6.sin6_addr, ar.ar_data, 3122 sizeof (struct in6_addr)); 3123 } else 3124 continue; 3125 3126 ibt_status = ibt_register_ar(hca->ibt_clnt_hdl, &ar); 3127 if (ibt_status == IBT_SUCCESS) { 3128 #ifdef DEBUG 3129 if (rib_debug > 1) { 3130 cmn_err(CE_WARN, "rib_register_ats: " 3131 "ibt_register_ar OK on port %d", i+1); 3132 } 3133 #endif 3134 /* 3135 * Allocate and prepare a service entry 3136 */ 3137 new_service = kmem_zalloc(sizeof (rib_service_t), 3138 KM_SLEEP); 3139 new_service->srv_port = i + 1; 3140 new_service->srv_ar = ar; 3141 new_service->srv_next = NULL; 3142 3143 /* 3144 * Add to the service list for this HCA 3145 */ 3146 new_service->srv_next = hca->ats_list; 3147 hca->ats_list = new_service; 3148 new_service = NULL; 3149 nbinds ++; 3150 } else { 3151 #ifdef DEBUG 3152 if (rib_debug > 1) { 3153 cmn_err(CE_WARN, "rib_register_ats: " 3154 "ibt_register_ar FAILED on port %d", i+1); 3155 } 3156 #endif 3157 } 3158 } 3159 } 3160 } 3161 3162 #ifdef DEBUG 3163 if (rib_debug > 1) { 3164 for (temp_srv = hca->ats_list; temp_srv != NULL; 3165 temp_srv = temp_srv->srv_next) { 3166 cmn_err(CE_NOTE, "Service: ATS, active on" 3167 " port: %d\n", temp_srv->srv_port); 3168 } 3169 } 3170 #endif 3171 3172 rw_exit(&hca->service_list_lock); 3173 kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc * sizeof (rpcib_ats_t)); 3174 ibt_free_portinfo(port_infop, port_size); 3175 3176 if (nbinds == 0) { 3177 #ifdef DEBUG 3178 if (rib_debug > 1) { 3179 cmn_err(CE_WARN, "rib_register_ats FAILED!\n"); 3180 } 3181 #endif 3182 return (RDMA_FAILED); 3183 } 3184 return (RDMA_SUCCESS); 3185 } 3186 3187 static rdma_stat 3188 rib_register_service(rib_hca_t *hca, int service_type) 3189 { 3190 ibt_srv_desc_t sdesc; 3191 ibt_srv_bind_t sbind; 3192 ibt_hca_portinfo_t *port_infop; 3193 ib_svc_id_t srv_id; 3194 ibt_srv_hdl_t srv_hdl; 3195 uint_t port_size; 3196 uint_t pki, i, j, num_ports, nbinds; 3197 ibt_status_t ibt_status; 3198 char **addrs; 3199 int addr_count; 3200 rib_service_t *new_service, *temp_srv; 3201 ib_pkey_t pkey; 3202 3203 /* 3204 * Query all ports for the given HCA 3205 */ 3206 rw_enter(&hca->state_lock, RW_READER); 3207 if (hca->state != HCA_DETACHED) { 3208 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop, 3209 &num_ports, &port_size); 3210 rw_exit(&hca->state_lock); 3211 } else { 3212 rw_exit(&hca->state_lock); 3213 return (RDMA_FAILED); 3214 } 3215 if (ibt_status != IBT_SUCCESS) { 3216 #ifdef DEBUG 3217 cmn_err(CE_NOTE, "rib_register_service: FAILED in " 3218 "ibt_query_hca_ports, status = %d\n", ibt_status); 3219 #endif 3220 return (RDMA_FAILED); 3221 } 3222 3223 #ifdef DEBUG 3224 if (rib_debug > 1) { 3225 cmn_err(CE_NOTE, "rib_register_service: Ports detected " 3226 "%d\n", num_ports); 3227 3228 for (i = 0; i < num_ports; i++) { 3229 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) { 3230 cmn_err(CE_WARN, "rib_register_service " 3231 "Port #: %d INACTIVE\n", i+1); 3232 } else if (port_infop[i].p_linkstate == 3233 IBT_PORT_ACTIVE) { 3234 cmn_err(CE_NOTE, "rib_register_service " 3235 "Port #: %d ACTIVE\n", i+1); 3236 } 3237 } 3238 } 3239 #endif 3240 /* 3241 * Get all the IP addresses on this system to register the 3242 * given "service type" on all DNS recognized IP addrs. 3243 * Each service type such as NFS will have all the systems 3244 * IP addresses as its different names. For now the only 3245 * type of service we support in RPCIB is NFS. 3246 */ 3247 addrs = get_ip_addrs(&addr_count); 3248 if (addrs == NULL) { 3249 #ifdef DEBUG 3250 if (rib_debug) { 3251 cmn_err(CE_WARN, "rib_register_service: " 3252 "get_ip_addrs failed\n"); 3253 } 3254 #endif 3255 ibt_free_portinfo(port_infop, port_size); 3256 return (RDMA_FAILED); 3257 } 3258 3259 #ifdef DEBUG 3260 if (rib_debug > 1) { 3261 for (i = 0; i < addr_count; i++) 3262 cmn_err(CE_NOTE, "addr %d: %s\n", i, addrs[i]); 3263 } 3264 #endif 3265 3266 rw_enter(&hca->service_list_lock, RW_WRITER); 3267 /* 3268 * Start registering and binding service to active 3269 * on active ports on this HCA. 3270 */ 3271 nbinds = 0; 3272 new_service = NULL; 3273 3274 /* 3275 * We use IP addresses as the service names for 3276 * service registration. Register each of them 3277 * with CM to obtain a svc_id and svc_hdl. We do not 3278 * register the service with machine's loopback address. 3279 */ 3280 for (j = 1; j < addr_count; j++) { 3281 (void) bzero(&srv_id, sizeof (ib_svc_id_t)); 3282 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t)); 3283 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t)); 3284 3285 sdesc.sd_handler = rib_srv_cm_handler; 3286 sdesc.sd_flags = 0; 3287 3288 ibt_status = ibt_register_service(hca->ibt_clnt_hdl, 3289 &sdesc, 0, 1, &srv_hdl, &srv_id); 3290 if (ibt_status != IBT_SUCCESS) { 3291 #ifdef DEBUG 3292 if (rib_debug) { 3293 cmn_err(CE_WARN, "rib_register_service: " 3294 "ibt_register_service FAILED, status " 3295 "= %d\n", ibt_status); 3296 } 3297 #endif 3298 /* 3299 * No need to go on, since we failed to obtain 3300 * a srv_id and srv_hdl. Move on to the next 3301 * IP addr as a service name. 3302 */ 3303 continue; 3304 } 3305 for (i = 0; i < num_ports; i++) { 3306 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) 3307 continue; 3308 3309 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) { 3310 pkey = port_infop[i].p_pkey_tbl[pki]; 3311 if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) { 3312 3313 /* 3314 * Allocate and prepare a service entry 3315 */ 3316 new_service = kmem_zalloc(1 * sizeof (rib_service_t), 3317 KM_SLEEP); 3318 new_service->srv_type = service_type; 3319 new_service->srv_port = i + 1; 3320 new_service->srv_id = srv_id; 3321 new_service->srv_hdl = srv_hdl; 3322 new_service->srv_sbind_hdl = kmem_zalloc(1 * 3323 sizeof (ibt_sbind_hdl_t), KM_SLEEP); 3324 3325 new_service->srv_name = kmem_zalloc(IB_SVC_NAME_LEN, 3326 KM_SLEEP); 3327 (void) bcopy(addrs[j], new_service->srv_name, 3328 IB_SVC_NAME_LEN); 3329 (void) strlcat(new_service->srv_name, "::NFS", 3330 IB_SVC_NAME_LEN); 3331 new_service->srv_next = NULL; 3332 3333 /* 3334 * Bind the service, specified by the IP address, 3335 * to the port/pkey using the srv_hdl returned 3336 * from ibt_register_service(). 3337 */ 3338 (void) bzero(&sbind, sizeof (ibt_srv_bind_t)); 3339 sbind.sb_pkey = pkey; 3340 sbind.sb_lease = 0xFFFFFFFF; 3341 sbind.sb_key[0] = NFS_SEC_KEY0; 3342 sbind.sb_key[1] = NFS_SEC_KEY1; 3343 sbind.sb_name = new_service->srv_name; 3344 3345 #ifdef DEBUG 3346 if (rib_debug > 1) { 3347 cmn_err(CE_NOTE, "rib_register_service: " 3348 "binding service using name: %s\n", 3349 sbind.sb_name); 3350 } 3351 #endif 3352 ibt_status = ibt_bind_service(srv_hdl, 3353 port_infop[i].p_sgid_tbl[0], &sbind, rib_stat, 3354 new_service->srv_sbind_hdl); 3355 if (ibt_status != IBT_SUCCESS) { 3356 #ifdef DEBUG 3357 if (rib_debug) { 3358 cmn_err(CE_WARN, "rib_register_service: FAILED" 3359 " in ibt_bind_service, status = %d\n", 3360 ibt_status); 3361 } 3362 #endif 3363 kmem_free(new_service->srv_sbind_hdl, 3364 sizeof (ibt_sbind_hdl_t)); 3365 kmem_free(new_service->srv_name, 3366 IB_SVC_NAME_LEN); 3367 kmem_free(new_service, 3368 sizeof (rib_service_t)); 3369 new_service = NULL; 3370 continue; 3371 } 3372 #ifdef DEBUG 3373 if (rib_debug > 1) { 3374 if (ibt_status == IBT_SUCCESS) 3375 cmn_err(CE_NOTE, "rib_regstr_service: " 3376 "Serv: %s REGISTERED on port: %d", 3377 sbind.sb_name, i+1); 3378 } 3379 #endif 3380 /* 3381 * Add to the service list for this HCA 3382 */ 3383 new_service->srv_next = hca->service_list; 3384 hca->service_list = new_service; 3385 new_service = NULL; 3386 nbinds ++; 3387 } 3388 } 3389 } 3390 } 3391 rw_exit(&hca->service_list_lock); 3392 3393 #ifdef DEBUG 3394 if (rib_debug > 1) { 3395 /* 3396 * Change this print to a more generic one, as rpcib 3397 * is supposed to handle multiple service types. 3398 */ 3399 for (temp_srv = hca->service_list; temp_srv != NULL; 3400 temp_srv = temp_srv->srv_next) { 3401 cmn_err(CE_NOTE, "NFS-IB, active on port:" 3402 " %d\n" 3403 "Using name: %s", temp_srv->srv_port, 3404 temp_srv->srv_name); 3405 } 3406 } 3407 #endif 3408 3409 ibt_free_portinfo(port_infop, port_size); 3410 for (i = 0; i < addr_count; i++) { 3411 if (addrs[i]) 3412 kmem_free(addrs[i], IB_SVC_NAME_LEN); 3413 } 3414 kmem_free(addrs, addr_count * sizeof (char *)); 3415 3416 if (nbinds == 0) { 3417 #ifdef DEBUG 3418 if (rib_debug) { 3419 cmn_err(CE_WARN, "rib_register_service: " 3420 "bind_service FAILED!\n"); 3421 } 3422 #endif 3423 return (RDMA_FAILED); 3424 } else { 3425 /* 3426 * Put this plugin into accept state, since atleast 3427 * one registration was successful. 3428 */ 3429 mutex_enter(&plugin_state_lock); 3430 plugin_state = ACCEPT; 3431 mutex_exit(&plugin_state_lock); 3432 return (RDMA_SUCCESS); 3433 } 3434 } 3435 3436 void 3437 rib_listen(struct rdma_svc_data *rd) 3438 { 3439 rdma_stat status = RDMA_SUCCESS; 3440 3441 rd->active = 0; 3442 rd->err_code = RDMA_FAILED; 3443 3444 /* 3445 * First check if a hca is still attached 3446 */ 3447 rw_enter(&rib_stat->hca->state_lock, RW_READER); 3448 if (rib_stat->hca->state != HCA_INITED) { 3449 rw_exit(&rib_stat->hca->state_lock); 3450 return; 3451 } 3452 rw_exit(&rib_stat->hca->state_lock); 3453 3454 rib_stat->q = &rd->q; 3455 /* 3456 * Register the Address translation service 3457 */ 3458 mutex_enter(&rib_stat->open_hca_lock); 3459 if (ats_running == 0) { 3460 if (rib_register_ats(rib_stat->hca) != RDMA_SUCCESS) { 3461 #ifdef DEBUG 3462 if (rib_debug) { 3463 cmn_err(CE_WARN, 3464 "rib_listen(): ats registration failed!"); 3465 } 3466 #endif 3467 mutex_exit(&rib_stat->open_hca_lock); 3468 return; 3469 } else { 3470 ats_running = 1; 3471 } 3472 } 3473 mutex_exit(&rib_stat->open_hca_lock); 3474 3475 /* 3476 * Right now the only service type is NFS. Hence force feed this 3477 * value. Ideally to communicate the service type it should be 3478 * passed down in rdma_svc_data. 3479 */ 3480 rib_stat->service_type = NFS; 3481 status = rib_register_service(rib_stat->hca, NFS); 3482 if (status != RDMA_SUCCESS) { 3483 rd->err_code = status; 3484 return; 3485 } 3486 /* 3487 * Service active on an HCA, check rd->err_code for more 3488 * explainable errors. 3489 */ 3490 rd->active = 1; 3491 rd->err_code = status; 3492 } 3493 3494 /* XXXX */ 3495 /* ARGSUSED */ 3496 static void 3497 rib_listen_stop(struct rdma_svc_data *svcdata) 3498 { 3499 rib_hca_t *hca; 3500 3501 /* 3502 * KRPC called the RDMATF to stop the listeners, this means 3503 * stop sending incomming or recieved requests to KRPC master 3504 * transport handle for RDMA-IB. This is also means that the 3505 * master transport handle, responsible for us, is going away. 3506 */ 3507 mutex_enter(&plugin_state_lock); 3508 plugin_state = NO_ACCEPT; 3509 if (svcdata != NULL) 3510 svcdata->active = 0; 3511 mutex_exit(&plugin_state_lock); 3512 3513 /* 3514 * First check if a hca is still attached 3515 */ 3516 hca = rib_stat->hca; 3517 rw_enter(&hca->state_lock, RW_READER); 3518 if (hca->state != HCA_INITED) { 3519 rw_exit(&hca->state_lock); 3520 return; 3521 } 3522 rib_stop_services(hca); 3523 rw_exit(&hca->state_lock); 3524 } 3525 3526 /* 3527 * Traverse the HCA's service list to unbind and deregister services. 3528 * Instead of unbinding the service for a service handle by 3529 * calling ibt_unbind_service() for each port/pkey, we unbind 3530 * all the services for the service handle by making only one 3531 * call to ibt_unbind_all_services(). Then, we deregister the 3532 * service for the service handle. 3533 * 3534 * When traversing the entries in service_list, we compare the 3535 * srv_hdl of the current entry with that of the next. If they 3536 * are different or if the next entry is NULL, the current entry 3537 * marks the last binding of the service handle. In this case, 3538 * call ibt_unbind_all_services() and deregister the service for 3539 * the service handle. If they are the same, the current and the 3540 * next entries are bound to the same service handle. In this 3541 * case, move on to the next entry. 3542 */ 3543 static void 3544 rib_stop_services(rib_hca_t *hca) 3545 { 3546 rib_service_t *srv_list, *to_remove; 3547 ibt_status_t ibt_status; 3548 3549 /* 3550 * unbind and deregister the services for this service type. 3551 * Right now there is only one service type. In future it will 3552 * be passed down to this function. 3553 */ 3554 rw_enter(&hca->service_list_lock, RW_WRITER); 3555 srv_list = hca->service_list; 3556 while (srv_list != NULL) { 3557 to_remove = srv_list; 3558 srv_list = to_remove->srv_next; 3559 if (srv_list == NULL || bcmp(to_remove->srv_hdl, 3560 srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) { 3561 3562 ibt_status = ibt_unbind_all_services(to_remove->srv_hdl); 3563 if (ibt_status != IBT_SUCCESS) { 3564 cmn_err(CE_WARN, "rib_listen_stop: " 3565 "ibt_unbind_all_services FAILED" 3566 " status: %d\n", ibt_status); 3567 } 3568 3569 ibt_status = 3570 ibt_deregister_service(hca->ibt_clnt_hdl, 3571 to_remove->srv_hdl); 3572 if (ibt_status != IBT_SUCCESS) { 3573 cmn_err(CE_WARN, "rib_listen_stop: " 3574 "ibt_deregister_service FAILED" 3575 " status: %d\n", ibt_status); 3576 } 3577 3578 #ifdef DEBUG 3579 if (rib_debug > 1) { 3580 if (ibt_status == IBT_SUCCESS) 3581 cmn_err(CE_NOTE, "rib_listen_stop: " 3582 "Successfully stopped and" 3583 " UNREGISTERED service: %s\n", 3584 to_remove->srv_name); 3585 } 3586 #endif 3587 } 3588 kmem_free(to_remove->srv_name, IB_SVC_NAME_LEN); 3589 kmem_free(to_remove->srv_sbind_hdl, 3590 sizeof (ibt_sbind_hdl_t)); 3591 3592 kmem_free(to_remove, sizeof (rib_service_t)); 3593 } 3594 hca->service_list = NULL; 3595 rw_exit(&hca->service_list_lock); 3596 } 3597 3598 static struct svc_recv * 3599 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) 3600 { 3601 struct svc_recv *recvp; 3602 3603 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); 3604 recvp->vaddr = sgl->ds_va; 3605 recvp->qp = qp; 3606 recvp->bytes_xfer = 0; 3607 return (recvp); 3608 } 3609 3610 static int 3611 rib_free_svc_recv(struct svc_recv *recvp) 3612 { 3613 kmem_free(recvp, sizeof (*recvp)); 3614 3615 return (0); 3616 } 3617 3618 static struct reply * 3619 rib_addreplylist(rib_qp_t *qp, uint32_t msgid) 3620 { 3621 struct reply *rep; 3622 3623 3624 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); 3625 if (rep == NULL) { 3626 mutex_exit(&qp->replylist_lock); 3627 cmn_err(CE_WARN, "rib_addreplylist: no memory\n"); 3628 return (NULL); 3629 } 3630 rep->xid = msgid; 3631 rep->vaddr_cq = NULL; 3632 rep->bytes_xfer = 0; 3633 rep->status = (uint_t)REPLY_WAIT; 3634 rep->prev = NULL; 3635 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); 3636 3637 mutex_enter(&qp->replylist_lock); 3638 if (qp->replylist) { 3639 rep->next = qp->replylist; 3640 qp->replylist->prev = rep; 3641 } 3642 qp->rep_list_size++; 3643 if (rib_debug > 1) 3644 cmn_err(CE_NOTE, "rib_addreplylist: qp:%p, rep_list_size:%d\n", 3645 (void *)qp, qp->rep_list_size); 3646 qp->replylist = rep; 3647 mutex_exit(&qp->replylist_lock); 3648 3649 return (rep); 3650 } 3651 3652 static rdma_stat 3653 rib_rem_replylist(rib_qp_t *qp) 3654 { 3655 struct reply *r, *n; 3656 3657 mutex_enter(&qp->replylist_lock); 3658 for (r = qp->replylist; r != NULL; r = n) { 3659 n = r->next; 3660 (void) rib_remreply(qp, r); 3661 } 3662 mutex_exit(&qp->replylist_lock); 3663 3664 return (RDMA_SUCCESS); 3665 } 3666 3667 static int 3668 rib_remreply(rib_qp_t *qp, struct reply *rep) 3669 { 3670 3671 ASSERT(MUTEX_HELD(&qp->replylist_lock)); 3672 if (rep->prev) { 3673 rep->prev->next = rep->next; 3674 } 3675 if (rep->next) { 3676 rep->next->prev = rep->prev; 3677 } 3678 if (qp->replylist == rep) 3679 qp->replylist = rep->next; 3680 3681 cv_destroy(&rep->wait_cv); 3682 qp->rep_list_size--; 3683 if (rib_debug > 1) 3684 cmn_err(CE_NOTE, "rib_remreply: qp:%p, rep_list_size:%d\n", 3685 (void *)qp, qp->rep_list_size); 3686 3687 kmem_free(rep, sizeof (*rep)); 3688 3689 return (0); 3690 } 3691 3692 rdma_stat 3693 rib_registermem(CONN *conn, caddr_t buf, uint_t buflen, 3694 struct mrc *buf_handle) 3695 { 3696 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3697 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3698 rdma_stat status; 3699 rib_hca_t *hca = (ctoqp(conn))->hca; 3700 3701 /* 3702 * Note: ALL buffer pools use the same memory type RDMARW. 3703 */ 3704 status = rib_reg_mem(hca, buf, buflen, 0, &mr_hdl, &mr_desc); 3705 if (status == RDMA_SUCCESS) { 3706 buf_handle->mrc_linfo = (uint64_t)mr_hdl; 3707 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3708 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3709 } else { 3710 buf_handle->mrc_linfo = NULL; 3711 buf_handle->mrc_lmr = 0; 3712 buf_handle->mrc_rmr = 0; 3713 } 3714 return (status); 3715 } 3716 3717 static rdma_stat 3718 rib_reg_mem(rib_hca_t *hca, caddr_t buf, uint_t size, ibt_mr_flags_t spec, 3719 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3720 { 3721 ibt_mr_attr_t mem_attr; 3722 ibt_status_t ibt_status; 3723 3724 mem_attr.mr_vaddr = (uint64_t)buf; 3725 mem_attr.mr_len = (ib_msglen_t)size; 3726 mem_attr.mr_as = NULL; 3727 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3728 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3729 IBT_MR_ENABLE_WINDOW_BIND | spec; 3730 3731 rw_enter(&hca->state_lock, RW_READER); 3732 if (hca->state == HCA_INITED) { 3733 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3734 &mem_attr, mr_hdlp, mr_descp); 3735 rw_exit(&hca->state_lock); 3736 } else { 3737 rw_exit(&hca->state_lock); 3738 return (RDMA_FAILED); 3739 } 3740 3741 if (ibt_status != IBT_SUCCESS) { 3742 cmn_err(CE_WARN, "rib_reg_mem: ibt_register_mr " 3743 "(spec:%d) failed for addr %llX, status %d", 3744 spec, (longlong_t)mem_attr.mr_vaddr, ibt_status); 3745 return (RDMA_FAILED); 3746 } 3747 return (RDMA_SUCCESS); 3748 } 3749 3750 rdma_stat 3751 rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen, 3752 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle) 3753 { 3754 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3755 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3756 rdma_stat status; 3757 rib_hca_t *hca = (ctoqp(conn))->hca; 3758 3759 /* 3760 * Non-coherent memory registration. 3761 */ 3762 status = rib_reg_mem(hca, buf, buflen, IBT_MR_NONCOHERENT, &mr_hdl, 3763 &mr_desc); 3764 if (status == RDMA_SUCCESS) { 3765 buf_handle->mrc_linfo = (uint64_t)mr_hdl; 3766 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3767 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3768 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3769 } else { 3770 buf_handle->mrc_linfo = NULL; 3771 buf_handle->mrc_lmr = 0; 3772 buf_handle->mrc_rmr = 0; 3773 } 3774 return (status); 3775 } 3776 3777 /* ARGSUSED */ 3778 rdma_stat 3779 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3780 { 3781 rib_hca_t *hca = (ctoqp(conn))->hca; 3782 3783 /* 3784 * Allow memory deregistration even if HCA is 3785 * getting detached. Need all outstanding 3786 * memory registrations to be deregistered 3787 * before HCA_DETACH_EVENT can be accepted. 3788 */ 3789 (void) ibt_deregister_mr(hca->hca_hdl, 3790 (ibt_mr_hdl_t)buf_handle.mrc_linfo); 3791 return (RDMA_SUCCESS); 3792 } 3793 3794 /* ARGSUSED */ 3795 rdma_stat 3796 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3797 RIB_SYNCMEM_HANDLE sync_handle) 3798 { 3799 (void) rib_deregistermem(conn, buf, buf_handle); 3800 3801 return (RDMA_SUCCESS); 3802 } 3803 3804 /* ARGSUSED */ 3805 rdma_stat 3806 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3807 int len, int cpu) 3808 { 3809 ibt_status_t status; 3810 rib_hca_t *hca = (ctoqp(conn))->hca; 3811 ibt_mr_sync_t mr_segment; 3812 3813 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3814 mr_segment.ms_vaddr = (ib_vaddr_t)buf; 3815 mr_segment.ms_len = (ib_memlen_t)len; 3816 if (cpu) { 3817 /* make incoming data visible to memory */ 3818 mr_segment.ms_flags = IBT_SYNC_WRITE; 3819 } else { 3820 /* make memory changes visible to IO */ 3821 mr_segment.ms_flags = IBT_SYNC_READ; 3822 } 3823 rw_enter(&hca->state_lock, RW_READER); 3824 if (hca->state == HCA_INITED) { 3825 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3826 rw_exit(&hca->state_lock); 3827 } else { 3828 rw_exit(&hca->state_lock); 3829 return (RDMA_FAILED); 3830 } 3831 3832 if (status == IBT_SUCCESS) 3833 return (RDMA_SUCCESS); 3834 else { 3835 #ifdef DEBUG 3836 cmn_err(CE_WARN, "rib_syncmem: ibt_sync_mr failed with %d\n", 3837 status); 3838 #endif 3839 return (RDMA_FAILED); 3840 } 3841 } 3842 3843 /* 3844 * XXXX ???? 3845 */ 3846 static rdma_stat 3847 rib_getinfo(rdma_info_t *info) 3848 { 3849 /* 3850 * XXXX Hack! 3851 */ 3852 info->addrlen = 16; 3853 info->mts = 1000000; 3854 info->mtu = 1000000; 3855 3856 return (RDMA_SUCCESS); 3857 } 3858 3859 rib_bufpool_t * 3860 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3861 { 3862 rib_bufpool_t *rbp = NULL; 3863 bufpool_t *bp = NULL; 3864 caddr_t buf; 3865 ibt_mr_attr_t mem_attr; 3866 ibt_status_t ibt_status; 3867 int i, j; 3868 3869 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3870 3871 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3872 num * sizeof (void *), KM_SLEEP); 3873 3874 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3875 bp->numelems = num; 3876 3877 switch (ptype) { 3878 case SEND_BUFFER: 3879 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3880 /* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */ 3881 bp->rsize = RPC_MSG_SZ; 3882 break; 3883 case RECV_BUFFER: 3884 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3885 /* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */ 3886 bp->rsize = RPC_BUF_SIZE; 3887 break; 3888 default: 3889 goto fail; 3890 } 3891 3892 /* 3893 * Register the pool. 3894 */ 3895 bp->bufsize = num * bp->rsize; 3896 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3897 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3898 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3899 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3900 sizeof (ibt_mr_desc_t), KM_SLEEP); 3901 3902 rw_enter(&hca->state_lock, RW_READER); 3903 if (hca->state != HCA_INITED) { 3904 rw_exit(&hca->state_lock); 3905 goto fail; 3906 } 3907 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3908 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3909 mem_attr.mr_vaddr = (uint64_t)buf; 3910 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3911 mem_attr.mr_as = NULL; 3912 ibt_status = ibt_register_mr(hca->hca_hdl, 3913 hca->pd_hdl, &mem_attr, &rbp->mr_hdl[i], 3914 &rbp->mr_desc[i]); 3915 if (ibt_status != IBT_SUCCESS) { 3916 for (j = 0; j < i; j++) { 3917 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[j]); 3918 } 3919 rw_exit(&hca->state_lock); 3920 goto fail; 3921 } 3922 } 3923 rw_exit(&hca->state_lock); 3924 3925 buf = (caddr_t)bp->buf; 3926 for (i = 0; i < num; i++, buf += bp->rsize) { 3927 bp->buflist[i] = (void *)buf; 3928 } 3929 bp->buffree = num - 1; /* no. of free buffers */ 3930 rbp->bpool = bp; 3931 3932 return (rbp); 3933 fail: 3934 if (bp) { 3935 if (bp->buf) 3936 kmem_free(bp->buf, bp->bufsize); 3937 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3938 } 3939 if (rbp) { 3940 if (rbp->mr_hdl) 3941 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3942 if (rbp->mr_desc) 3943 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3944 kmem_free(rbp, sizeof (rib_bufpool_t)); 3945 } 3946 return (NULL); 3947 } 3948 3949 static void 3950 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3951 { 3952 int i; 3953 rib_bufpool_t *rbp = NULL; 3954 bufpool_t *bp; 3955 3956 /* 3957 * Obtain pool address based on type of pool 3958 */ 3959 switch (ptype) { 3960 case SEND_BUFFER: 3961 rbp = hca->send_pool; 3962 break; 3963 case RECV_BUFFER: 3964 rbp = hca->recv_pool; 3965 break; 3966 default: 3967 return; 3968 } 3969 if (rbp == NULL) 3970 return; 3971 3972 bp = rbp->bpool; 3973 3974 /* 3975 * Deregister the pool memory and free it. 3976 */ 3977 for (i = 0; i < bp->numelems; i++) { 3978 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3979 } 3980 } 3981 3982 static void 3983 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3984 { 3985 3986 rib_bufpool_t *rbp = NULL; 3987 bufpool_t *bp; 3988 3989 /* 3990 * Obtain pool address based on type of pool 3991 */ 3992 switch (ptype) { 3993 case SEND_BUFFER: 3994 rbp = hca->send_pool; 3995 break; 3996 case RECV_BUFFER: 3997 rbp = hca->recv_pool; 3998 break; 3999 default: 4000 return; 4001 } 4002 if (rbp == NULL) 4003 return; 4004 4005 bp = rbp->bpool; 4006 4007 /* 4008 * Free the pool memory. 4009 */ 4010 if (rbp->mr_hdl) 4011 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 4012 4013 if (rbp->mr_desc) 4014 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 4015 4016 if (bp->buf) 4017 kmem_free(bp->buf, bp->bufsize); 4018 mutex_destroy(&bp->buflock); 4019 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 4020 kmem_free(rbp, sizeof (rib_bufpool_t)); 4021 } 4022 4023 void 4024 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 4025 { 4026 /* 4027 * Deregister the pool memory and free it. 4028 */ 4029 rib_rbufpool_deregister(hca, ptype); 4030 rib_rbufpool_free(hca, ptype); 4031 } 4032 4033 /* 4034 * Fetch a buffer from the pool of type specified in rdbuf->type. 4035 */ 4036 static rdma_stat 4037 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4038 { 4039 4040 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 4041 if (rdbuf->addr) { 4042 switch (rdbuf->type) { 4043 case SEND_BUFFER: 4044 rdbuf->len = RPC_MSG_SZ; /* 1K */ 4045 break; 4046 case RECV_BUFFER: 4047 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 4048 break; 4049 default: 4050 rdbuf->len = 0; 4051 } 4052 return (RDMA_SUCCESS); 4053 } else 4054 return (RDMA_FAILED); 4055 } 4056 4057 4058 /* 4059 * Fetch a buffer of specified type. 4060 * Note that rdbuf->handle is mw's rkey. 4061 */ 4062 static void * 4063 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4064 { 4065 rib_qp_t *qp = ctoqp(conn); 4066 rib_hca_t *hca = qp->hca; 4067 rdma_btype ptype = rdbuf->type; 4068 void *buf; 4069 rib_bufpool_t *rbp = NULL; 4070 bufpool_t *bp; 4071 int i; 4072 4073 /* 4074 * Obtain pool address based on type of pool 4075 */ 4076 switch (ptype) { 4077 case SEND_BUFFER: 4078 rbp = hca->send_pool; 4079 break; 4080 case RECV_BUFFER: 4081 rbp = hca->recv_pool; 4082 break; 4083 default: 4084 return (NULL); 4085 } 4086 if (rbp == NULL) 4087 return (NULL); 4088 4089 bp = rbp->bpool; 4090 4091 mutex_enter(&bp->buflock); 4092 if (bp->buffree < 0) { 4093 cmn_err(CE_WARN, "rib_rbuf_alloc: No free buffers!"); 4094 mutex_exit(&bp->buflock); 4095 return (NULL); 4096 } 4097 4098 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */ 4099 buf = bp->buflist[bp->buffree]; 4100 rdbuf->addr = buf; 4101 rdbuf->len = bp->rsize; 4102 for (i = bp->numelems - 1; i >= 0; i--) { 4103 if ((ib_vaddr_t)buf == rbp->mr_desc[i].md_vaddr) { 4104 rdbuf->handle.mrc_rmr = (uint32_t)rbp->mr_desc[i].md_rkey; 4105 rdbuf->handle.mrc_linfo = (uint64_t)rbp->mr_hdl[i]; 4106 rdbuf->handle.mrc_lmr = (uint32_t)rbp->mr_desc[i].md_lkey; 4107 bp->buffree--; 4108 if (rib_debug > 1) 4109 cmn_err(CE_NOTE, "rib_rbuf_alloc: %d free bufs " 4110 "(type %d)\n", bp->buffree+1, ptype); 4111 4112 mutex_exit(&bp->buflock); 4113 4114 return (buf); 4115 } 4116 } 4117 cmn_err(CE_WARN, "rib_rbuf_alloc: NO matching buf %p of " 4118 "type %d found!", buf, ptype); 4119 mutex_exit(&bp->buflock); 4120 4121 return (NULL); 4122 } 4123 4124 static void 4125 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 4126 { 4127 4128 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 4129 } 4130 4131 static void 4132 rib_rbuf_free(CONN *conn, int ptype, void *buf) 4133 { 4134 rib_qp_t *qp = ctoqp(conn); 4135 rib_hca_t *hca = qp->hca; 4136 rib_bufpool_t *rbp = NULL; 4137 bufpool_t *bp; 4138 4139 /* 4140 * Obtain pool address based on type of pool 4141 */ 4142 switch (ptype) { 4143 case SEND_BUFFER: 4144 rbp = hca->send_pool; 4145 break; 4146 case RECV_BUFFER: 4147 rbp = hca->recv_pool; 4148 break; 4149 default: 4150 return; 4151 } 4152 if (rbp == NULL) 4153 return; 4154 4155 bp = rbp->bpool; 4156 4157 mutex_enter(&bp->buflock); 4158 if (++bp->buffree >= bp->numelems) { 4159 /* 4160 * Should never happen 4161 */ 4162 cmn_err(CE_WARN, "rib_rbuf_free: One (type %d) " 4163 "too many frees!", ptype); 4164 bp->buffree--; 4165 } else { 4166 bp->buflist[bp->buffree] = buf; 4167 if (rib_debug > 1) 4168 cmn_err(CE_NOTE, "rib_rbuf_free: %d free bufs " 4169 "(type %d)\n", bp->buffree+1, ptype); 4170 } 4171 mutex_exit(&bp->buflock); 4172 } 4173 4174 static rdma_stat 4175 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 4176 { 4177 rw_enter(&connlist->conn_lock, RW_WRITER); 4178 if (connlist->conn_hd) { 4179 cn->c_next = connlist->conn_hd; 4180 connlist->conn_hd->c_prev = cn; 4181 } 4182 connlist->conn_hd = cn; 4183 rw_exit(&connlist->conn_lock); 4184 4185 return (RDMA_SUCCESS); 4186 } 4187 4188 static rdma_stat 4189 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) 4190 { 4191 rw_enter(&connlist->conn_lock, RW_WRITER); 4192 if (cn->c_prev) { 4193 cn->c_prev->c_next = cn->c_next; 4194 } 4195 if (cn->c_next) { 4196 cn->c_next->c_prev = cn->c_prev; 4197 } 4198 if (connlist->conn_hd == cn) 4199 connlist->conn_hd = cn->c_next; 4200 rw_exit(&connlist->conn_lock); 4201 4202 return (RDMA_SUCCESS); 4203 } 4204 4205 /* 4206 * Connection management. 4207 * IBTF does not support recycling of channels. So connections are only 4208 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR or 4209 * C_DISCONN_PEND state. No C_IDLE state. 4210 * C_CONN_PEND state: Connection establishment in progress to the server. 4211 * C_CONNECTED state: A connection when created is in C_CONNECTED state. 4212 * It has an RC channel associated with it. ibt_post_send/recv are allowed 4213 * only in this state. 4214 * C_ERROR state: A connection transitions to this state when WRs on the 4215 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event 4216 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA. 4217 * C_DISCONN_PEND state: When a connection is in C_ERROR state and when 4218 * c_ref drops to 0 (this indicates that RPC has no more references to this 4219 * connection), the connection should be destroyed. A connection transitions 4220 * into this state when it is being destroyed. 4221 */ 4222 static rdma_stat 4223 rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn) 4224 { 4225 CONN *cn; 4226 int status = RDMA_SUCCESS; 4227 rib_hca_t *hca = (rib_hca_t *)handle; 4228 rib_qp_t *qp; 4229 clock_t cv_stat, timout; 4230 ibt_path_info_t path; 4231 4232 again: 4233 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4234 cn = hca->cl_conn_list.conn_hd; 4235 while (cn != NULL) { 4236 /* 4237 * First, clear up any connection in the ERROR state 4238 */ 4239 mutex_enter(&cn->c_lock); 4240 if (cn->c_state == C_ERROR) { 4241 if (cn->c_ref == 0) { 4242 /* 4243 * Remove connection from list and destroy it. 4244 */ 4245 cn->c_state = C_DISCONN_PEND; 4246 mutex_exit(&cn->c_lock); 4247 rw_exit(&hca->cl_conn_list.conn_lock); 4248 (void) rib_disconnect_channel(cn, 4249 &hca->cl_conn_list); 4250 goto again; 4251 } 4252 mutex_exit(&cn->c_lock); 4253 cn = cn->c_next; 4254 continue; 4255 } else if (cn->c_state == C_DISCONN_PEND) { 4256 mutex_exit(&cn->c_lock); 4257 cn = cn->c_next; 4258 continue; 4259 } 4260 if ((cn->c_raddr.len == svcaddr->len) && 4261 bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) { 4262 /* 4263 * Our connection. Give up conn list lock 4264 * as we are done traversing the list. 4265 */ 4266 rw_exit(&hca->cl_conn_list.conn_lock); 4267 if (cn->c_state == C_CONNECTED) { 4268 cn->c_ref++; /* sharing a conn */ 4269 mutex_exit(&cn->c_lock); 4270 *conn = cn; 4271 return (status); 4272 } 4273 if (cn->c_state == C_CONN_PEND) { 4274 /* 4275 * Hold a reference to this conn before 4276 * we give up the lock. 4277 */ 4278 cn->c_ref++; 4279 timout = ddi_get_lbolt() + 4280 drv_usectohz(CONN_WAIT_TIME * 1000000); 4281 while ((cv_stat = cv_timedwait_sig(&cn->c_cv, 4282 &cn->c_lock, timout)) > 0 && 4283 cn->c_state == C_CONN_PEND) 4284 ; 4285 if (cv_stat == 0) { 4286 cn->c_ref--; 4287 mutex_exit(&cn->c_lock); 4288 return (RDMA_INTR); 4289 } 4290 if (cv_stat < 0) { 4291 cn->c_ref--; 4292 mutex_exit(&cn->c_lock); 4293 return (RDMA_TIMEDOUT); 4294 } 4295 if (cn->c_state == C_CONNECTED) { 4296 *conn = cn; 4297 mutex_exit(&cn->c_lock); 4298 return (status); 4299 } else { 4300 cn->c_ref--; 4301 mutex_exit(&cn->c_lock); 4302 return (RDMA_TIMEDOUT); 4303 } 4304 } 4305 } 4306 mutex_exit(&cn->c_lock); 4307 cn = cn->c_next; 4308 } 4309 rw_exit(&hca->cl_conn_list.conn_lock); 4310 4311 status = rib_chk_srv_ats(hca, svcaddr, addr_type, &path); 4312 if (status != RDMA_SUCCESS) { 4313 #ifdef DEBUG 4314 if (rib_debug) { 4315 cmn_err(CE_WARN, "rib_conn_get: " 4316 "No server ATS record!"); 4317 } 4318 #endif 4319 return (RDMA_FAILED); 4320 } 4321 4322 /* 4323 * Channel to server doesn't exist yet, create one. 4324 */ 4325 if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) { 4326 return (RDMA_FAILED); 4327 } 4328 cn = qptoc(qp); 4329 cn->c_state = C_CONN_PEND; 4330 cn->c_ref = 1; 4331 4332 /* 4333 * Add to conn list. 4334 * We had given up the READER lock. In the time since then, 4335 * another thread might have created the connection we are 4336 * trying here. But for now, that is quiet alright - there 4337 * might be two connections between a pair of hosts instead 4338 * of one. If we really want to close that window, 4339 * then need to check the list after acquiring the 4340 * WRITER lock. 4341 */ 4342 (void) rib_add_connlist(cn, &hca->cl_conn_list); 4343 status = rib_conn_to_srv(hca, qp, &path); 4344 mutex_enter(&cn->c_lock); 4345 if (status == RDMA_SUCCESS) { 4346 cn->c_state = C_CONNECTED; 4347 *conn = cn; 4348 } else { 4349 cn->c_state = C_ERROR; 4350 cn->c_ref--; 4351 #ifdef DEBUG 4352 if (rib_debug) { 4353 cmn_err(CE_WARN, "rib_conn_get: FAILED creating" 4354 " a channel!"); 4355 } 4356 #endif 4357 } 4358 cv_broadcast(&cn->c_cv); 4359 mutex_exit(&cn->c_lock); 4360 return (status); 4361 } 4362 4363 static rdma_stat 4364 rib_conn_release(CONN *conn) 4365 { 4366 rib_qp_t *qp = ctoqp(conn); 4367 4368 mutex_enter(&conn->c_lock); 4369 conn->c_ref--; 4370 4371 /* 4372 * If a conn is C_ERROR, close the channel. 4373 * If it's CONNECTED, keep it that way. 4374 */ 4375 if (conn->c_ref == 0 && (conn->c_state & C_ERROR)) { 4376 conn->c_state = C_DISCONN_PEND; 4377 mutex_exit(&conn->c_lock); 4378 if (qp->mode == RIB_SERVER) 4379 (void) rib_disconnect_channel(conn, 4380 &qp->hca->srv_conn_list); 4381 else 4382 (void) rib_disconnect_channel(conn, 4383 &qp->hca->cl_conn_list); 4384 return (RDMA_SUCCESS); 4385 } 4386 mutex_exit(&conn->c_lock); 4387 return (RDMA_SUCCESS); 4388 } 4389 4390 /* 4391 * Add at front of list 4392 */ 4393 static struct rdma_done_list * 4394 rdma_done_add(rib_qp_t *qp, uint32_t xid) 4395 { 4396 struct rdma_done_list *rd; 4397 4398 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4399 4400 rd = kmem_alloc(sizeof (*rd), KM_SLEEP); 4401 rd->xid = xid; 4402 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL); 4403 4404 rd->prev = NULL; 4405 rd->next = qp->rdlist; 4406 if (qp->rdlist != NULL) 4407 qp->rdlist->prev = rd; 4408 qp->rdlist = rd; 4409 4410 return (rd); 4411 } 4412 4413 static void 4414 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd) 4415 { 4416 struct rdma_done_list *r; 4417 4418 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4419 4420 r = rd->next; 4421 if (r != NULL) { 4422 r->prev = rd->prev; 4423 } 4424 4425 r = rd->prev; 4426 if (r != NULL) { 4427 r->next = rd->next; 4428 } else { 4429 qp->rdlist = rd->next; 4430 } 4431 4432 cv_destroy(&rd->rdma_done_cv); 4433 kmem_free(rd, sizeof (*rd)); 4434 } 4435 4436 static void 4437 rdma_done_rem_list(rib_qp_t *qp) 4438 { 4439 struct rdma_done_list *r, *n; 4440 4441 mutex_enter(&qp->rdlist_lock); 4442 for (r = qp->rdlist; r != NULL; r = n) { 4443 n = r->next; 4444 rdma_done_rm(qp, r); 4445 } 4446 mutex_exit(&qp->rdlist_lock); 4447 } 4448 4449 static void 4450 rdma_done_notify(rib_qp_t *qp, uint32_t xid) 4451 { 4452 struct rdma_done_list *r = qp->rdlist; 4453 4454 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4455 4456 while (r) { 4457 if (r->xid == xid) { 4458 cv_signal(&r->rdma_done_cv); 4459 return; 4460 } else { 4461 r = r->next; 4462 } 4463 } 4464 if (rib_debug > 1) { 4465 cmn_err(CE_WARN, "rdma_done_notify: " 4466 "No matching xid for %u, qp %p\n", xid, (void *)qp); 4467 } 4468 } 4469 4470 rpcib_ats_t * 4471 get_ibd_entry(ib_gid_t *gid, ib_pkey_t pkey, rpcib_ibd_insts_t *ibds) 4472 { 4473 rpcib_ats_t *atsp; 4474 int i; 4475 4476 for (i = 0, atsp = ibds->rib_ats; i < ibds->rib_ibd_cnt; i++, atsp++) { 4477 if (atsp->ras_port_gid.gid_prefix == gid->gid_prefix && 4478 atsp->ras_port_gid.gid_guid == gid->gid_guid && 4479 atsp->ras_pkey == pkey) { 4480 return (atsp); 4481 } 4482 } 4483 return (NULL); 4484 } 4485 4486 int 4487 rib_get_ibd_insts_cb(dev_info_t *dip, void *arg) 4488 { 4489 rpcib_ibd_insts_t *ibds = (rpcib_ibd_insts_t *)arg; 4490 rpcib_ats_t *atsp; 4491 ib_pkey_t pkey; 4492 uint8_t port; 4493 ib_guid_t hca_guid; 4494 ib_gid_t port_gid; 4495 4496 if ((i_ddi_node_state(dip) >= DS_ATTACHED) && 4497 (strcmp(ddi_node_name(dip), "ibport") == 0) && 4498 (strstr(ddi_get_name_addr(dip), "ipib") != NULL)) { 4499 4500 if (ibds->rib_ibd_cnt >= ibds->rib_ibd_alloc) { 4501 rpcib_ats_t *tmp; 4502 4503 tmp = (rpcib_ats_t *)kmem_zalloc((ibds->rib_ibd_alloc + 4504 N_IBD_INSTANCES) * sizeof (rpcib_ats_t), KM_SLEEP); 4505 bcopy(ibds->rib_ats, tmp, 4506 ibds->rib_ibd_alloc * sizeof (rpcib_ats_t)); 4507 kmem_free(ibds->rib_ats, 4508 ibds->rib_ibd_alloc * sizeof (rpcib_ats_t)); 4509 ibds->rib_ats = tmp; 4510 ibds->rib_ibd_alloc += N_IBD_INSTANCES; 4511 } 4512 if (((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, 4513 dip, 0, "hca-guid", 0)) == 0) || 4514 ((port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 4515 0, "port-number", 0)) == 0) || 4516 (ibt_get_port_state_byguid(hca_guid, port, 4517 &port_gid, NULL) != IBT_SUCCESS) || 4518 ((pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 4519 "port-pkey", IB_PKEY_INVALID_LIMITED)) <= 4520 IB_PKEY_INVALID_FULL)) { 4521 return (DDI_WALK_CONTINUE); 4522 } 4523 atsp = &ibds->rib_ats[ibds->rib_ibd_cnt]; 4524 atsp->ras_inst = ddi_get_instance(dip); 4525 atsp->ras_pkey = pkey; 4526 atsp->ras_port_gid = port_gid; 4527 ibds->rib_ibd_cnt++; 4528 } 4529 return (DDI_WALK_CONTINUE); 4530 } 4531 4532 void 4533 rib_get_ibd_insts(rpcib_ibd_insts_t *ibds) 4534 { 4535 ddi_walk_devs(ddi_root_node(), rib_get_ibd_insts_cb, ibds); 4536 } 4537 4538 /* 4539 * Return ibd interfaces and ibd instances. 4540 */ 4541 int 4542 get_ibd_ipaddr(rpcib_ibd_insts_t *ibds) 4543 { 4544 TIUSER *tiptr, *tiptr6; 4545 vnode_t *kvp, *kvp6; 4546 vnode_t *vp = NULL, *vp6 = NULL; 4547 struct strioctl iocb; 4548 struct lifreq lif_req; 4549 int k, ip_cnt; 4550 rpcib_ats_t *atsp; 4551 4552 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, 4553 &kvp) == 0) { 4554 if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE, 4555 &tiptr, CRED()) == 0) { 4556 vp = tiptr->fp->f_vnode; 4557 } else { 4558 VN_RELE(kvp); 4559 } 4560 } 4561 4562 if (lookupname("/dev/udp6", UIO_SYSSPACE, FOLLOW, NULLVPP, 4563 &kvp6) == 0) { 4564 if (t_kopen((file_t *)NULL, kvp6->v_rdev, FREAD|FWRITE, 4565 &tiptr6, CRED()) == 0) { 4566 vp6 = tiptr6->fp->f_vnode; 4567 } else { 4568 VN_RELE(kvp6); 4569 } 4570 } 4571 4572 if (vp == NULL && vp6 == NULL) 4573 return (-1); 4574 4575 /* Get ibd ip's */ 4576 ip_cnt = 0; 4577 for (k = 0, atsp = ibds->rib_ats; k < ibds->rib_ibd_cnt; k++, atsp++) { 4578 /* IPv4 */ 4579 if (vp != NULL) { 4580 (void) bzero((void *)&lif_req, sizeof (struct lifreq)); 4581 (void) snprintf(lif_req.lifr_name, 4582 sizeof (lif_req.lifr_name), "%s%d", 4583 IBD_NAME, atsp->ras_inst); 4584 4585 (void) bzero((void *)&iocb, sizeof (struct strioctl)); 4586 iocb.ic_cmd = SIOCGLIFADDR; 4587 iocb.ic_timout = 0; 4588 iocb.ic_len = sizeof (struct lifreq); 4589 iocb.ic_dp = (caddr_t)&lif_req; 4590 if (kstr_ioctl(vp, I_STR, (intptr_t)&iocb) == 0) { 4591 atsp->ras_inet_type = AF_INET; 4592 bcopy(&lif_req.lifr_addr, &atsp->ras_sin, 4593 sizeof (struct sockaddr_in)); 4594 ip_cnt++; 4595 continue; 4596 } 4597 } 4598 /* Try IPv6 */ 4599 if (vp6 != NULL) { 4600 (void) bzero((void *)&lif_req, sizeof (struct lifreq)); 4601 (void) snprintf(lif_req.lifr_name, 4602 sizeof (lif_req.lifr_name), "%s%d", 4603 IBD_NAME, atsp->ras_inst); 4604 4605 (void) bzero((void *)&iocb, sizeof (struct strioctl)); 4606 iocb.ic_cmd = SIOCGLIFADDR; 4607 iocb.ic_timout = 0; 4608 iocb.ic_len = sizeof (struct lifreq); 4609 iocb.ic_dp = (caddr_t)&lif_req; 4610 if (kstr_ioctl(vp6, I_STR, (intptr_t)&iocb) == 0) { 4611 4612 atsp->ras_inet_type = AF_INET6; 4613 bcopy(&lif_req.lifr_addr, &atsp->ras_sin6, 4614 sizeof (struct sockaddr_in6)); 4615 ip_cnt++; 4616 } 4617 } 4618 } 4619 4620 if (vp6 != NULL) { 4621 (void) t_kclose(tiptr6, 0); 4622 VN_RELE(kvp6); 4623 } 4624 if (vp != NULL) { 4625 (void) t_kclose(tiptr, 0); 4626 VN_RELE(kvp); 4627 } 4628 4629 if (ip_cnt == 0) 4630 return (-1); 4631 else 4632 return (0); 4633 } 4634 4635 char ** 4636 get_ip_addrs(int *count) 4637 { 4638 TIUSER *tiptr; 4639 vnode_t *kvp; 4640 int num_of_ifs; 4641 char **addresses; 4642 int return_code; 4643 4644 /* 4645 * Open a device for doing down stream kernel ioctls 4646 */ 4647 return_code = lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, 4648 NULLVPP, &kvp); 4649 if (return_code != 0) { 4650 cmn_err(CE_NOTE, "get_Ip_addrs: lookupname failed\n"); 4651 *count = -1; 4652 return (NULL); 4653 } 4654 4655 return_code = t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE, 4656 &tiptr, CRED()); 4657 if (return_code != 0) { 4658 cmn_err(CE_NOTE, "get_Ip_addrs: t_kopen failed\n"); 4659 VN_RELE(kvp); 4660 *count = -1; 4661 return (NULL); 4662 } 4663 4664 /* 4665 * Perform the first ioctl to get the number of interfaces 4666 */ 4667 return_code = get_interfaces(tiptr, &num_of_ifs); 4668 if (return_code != 0 || num_of_ifs == 0) { 4669 cmn_err(CE_NOTE, "get_Ip_addrs: get_interfaces failed\n"); 4670 (void) t_kclose(tiptr, 0); 4671 VN_RELE(kvp); 4672 *count = -1; 4673 return (NULL); 4674 } 4675 4676 /* 4677 * Perform the second ioctl to get the address on each interface 4678 * found. 4679 */ 4680 addresses = kmem_zalloc(num_of_ifs * sizeof (char *), KM_SLEEP); 4681 return_code = find_addrs(tiptr, addresses, num_of_ifs); 4682 if (return_code <= 0) { 4683 cmn_err(CE_NOTE, "get_Ip_addrs: find_addrs failed\n"); 4684 (void) t_kclose(tiptr, 0); 4685 kmem_free(addresses, num_of_ifs * sizeof (char *)); 4686 VN_RELE(kvp); 4687 *count = -1; 4688 return (NULL); 4689 } 4690 4691 *count = return_code; 4692 VN_RELE(kvp); 4693 (void) t_kclose(tiptr, 0); 4694 return (addresses); 4695 } 4696 4697 int 4698 get_interfaces(TIUSER *tiptr, int *num) 4699 { 4700 struct lifnum if_buf; 4701 struct strioctl iocb; 4702 vnode_t *vp; 4703 int return_code; 4704 4705 /* 4706 * Prep the number of interfaces request buffer for ioctl 4707 */ 4708 (void) bzero((void *)&if_buf, sizeof (struct lifnum)); 4709 if_buf.lifn_family = AF_UNSPEC; 4710 if_buf.lifn_flags = 0; 4711 4712 /* 4713 * Prep the kernel ioctl buffer and send it down stream 4714 */ 4715 (void) bzero((void *)&iocb, sizeof (struct strioctl)); 4716 iocb.ic_cmd = SIOCGLIFNUM; 4717 iocb.ic_timout = 0; 4718 iocb.ic_len = sizeof (if_buf); 4719 iocb.ic_dp = (caddr_t)&if_buf; 4720 4721 vp = tiptr->fp->f_vnode; 4722 return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb); 4723 if (return_code != 0) { 4724 cmn_err(CE_NOTE, "get_interfaces: kstr_ioctl failed\n"); 4725 *num = -1; 4726 return (-1); 4727 } 4728 4729 *num = if_buf.lifn_count; 4730 #ifdef DEBUG 4731 if (rib_debug > 1) 4732 cmn_err(CE_NOTE, "Number of interfaces detected: %d\n", 4733 if_buf.lifn_count); 4734 #endif 4735 return (0); 4736 } 4737 4738 int 4739 find_addrs(TIUSER *tiptr, char **addrs, int num_ifs) 4740 { 4741 struct lifconf lifc; 4742 struct lifreq *if_data_buf; 4743 struct strioctl iocb; 4744 caddr_t request_buffer; 4745 struct sockaddr_in *sin4; 4746 struct sockaddr_in6 *sin6; 4747 vnode_t *vp; 4748 int i, count, return_code; 4749 4750 /* 4751 * Prep the buffer for requesting all interface's info 4752 */ 4753 (void) bzero((void *)&lifc, sizeof (struct lifconf)); 4754 lifc.lifc_family = AF_UNSPEC; 4755 lifc.lifc_flags = 0; 4756 lifc.lifc_len = num_ifs * sizeof (struct lifreq); 4757 4758 request_buffer = kmem_zalloc(num_ifs * sizeof (struct lifreq), 4759 KM_SLEEP); 4760 4761 lifc.lifc_buf = request_buffer; 4762 4763 /* 4764 * Prep the kernel ioctl buffer and send it down stream 4765 */ 4766 (void) bzero((void *)&iocb, sizeof (struct strioctl)); 4767 iocb.ic_cmd = SIOCGLIFCONF; 4768 iocb.ic_timout = 0; 4769 iocb.ic_len = sizeof (struct lifconf); 4770 iocb.ic_dp = (caddr_t)&lifc; 4771 4772 vp = tiptr->fp->f_vnode; 4773 return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb); 4774 if (return_code != 0) { 4775 cmn_err(CE_NOTE, "find_addrs: kstr_ioctl failed\n"); 4776 kmem_free(request_buffer, num_ifs * sizeof (struct lifreq)); 4777 return (-1); 4778 } 4779 4780 /* 4781 * Extract addresses and fill them in the requested array 4782 * IB_SVC_NAME_LEN is defined to be 64 so it covers both IPv4 & 4783 * IPv6. Here count is the number of IP addresses collected. 4784 */ 4785 if_data_buf = lifc.lifc_req; 4786 count = 0; 4787 for (i = lifc.lifc_len / sizeof (struct lifreq); i > 0; i--, 4788 if_data_buf++) { 4789 if (if_data_buf->lifr_addr.ss_family == AF_INET) { 4790 sin4 = (struct sockaddr_in *)&if_data_buf->lifr_addr; 4791 addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP); 4792 (void) inet_ntop(AF_INET, &sin4->sin_addr, 4793 addrs[count], IB_SVC_NAME_LEN); 4794 count ++; 4795 } 4796 4797 if (if_data_buf->lifr_addr.ss_family == AF_INET6) { 4798 sin6 = (struct sockaddr_in6 *)&if_data_buf->lifr_addr; 4799 addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP); 4800 (void) inet_ntop(AF_INET6, &sin6->sin6_addr, 4801 addrs[count], IB_SVC_NAME_LEN); 4802 count ++; 4803 } 4804 } 4805 4806 kmem_free(request_buffer, num_ifs * sizeof (struct lifreq)); 4807 return (count); 4808 } 4809 4810 /* 4811 * Goes through all connections and closes the channel 4812 * This will cause all the WRs on those channels to be 4813 * flushed. 4814 */ 4815 static void 4816 rib_close_channels(rib_conn_list_t *connlist) 4817 { 4818 CONN *conn; 4819 rib_qp_t *qp; 4820 4821 rw_enter(&connlist->conn_lock, RW_READER); 4822 conn = connlist->conn_hd; 4823 while (conn != NULL) { 4824 mutex_enter(&conn->c_lock); 4825 qp = ctoqp(conn); 4826 if (conn->c_state & C_CONNECTED) { 4827 /* 4828 * Live connection in CONNECTED state. 4829 * Call ibt_close_rc_channel in nonblocking mode 4830 * with no callbacks. 4831 */ 4832 conn->c_state = C_ERROR; 4833 (void) ibt_close_rc_channel(qp->qp_hdl, 4834 IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0); 4835 (void) ibt_free_channel(qp->qp_hdl); 4836 qp->qp_hdl = NULL; 4837 } else { 4838 if (conn->c_state == C_ERROR && 4839 qp->qp_hdl != NULL) { 4840 /* 4841 * Connection in ERROR state but 4842 * channel is not yet freed. 4843 */ 4844 (void) ibt_close_rc_channel(qp->qp_hdl, 4845 IBT_NOCALLBACKS, NULL, 0, NULL, 4846 NULL, 0); 4847 (void) ibt_free_channel(qp->qp_hdl); 4848 qp->qp_hdl = NULL; 4849 } 4850 } 4851 mutex_exit(&conn->c_lock); 4852 conn = conn->c_next; 4853 } 4854 rw_exit(&connlist->conn_lock); 4855 } 4856 4857 /* 4858 * Frees up all connections that are no longer being referenced 4859 */ 4860 static void 4861 rib_purge_connlist(rib_conn_list_t *connlist) 4862 { 4863 CONN *conn; 4864 4865 top: 4866 rw_enter(&connlist->conn_lock, RW_READER); 4867 conn = connlist->conn_hd; 4868 while (conn != NULL) { 4869 mutex_enter(&conn->c_lock); 4870 4871 /* 4872 * At this point connection is either in ERROR 4873 * or DISCONN_PEND state. If in DISCONN_PEND state 4874 * then some other thread is culling that connection. 4875 * If not and if c_ref is 0, then destroy the connection. 4876 */ 4877 if (conn->c_ref == 0 && 4878 conn->c_state != C_DISCONN_PEND) { 4879 /* 4880 * Cull the connection 4881 */ 4882 conn->c_state = C_DISCONN_PEND; 4883 mutex_exit(&conn->c_lock); 4884 rw_exit(&connlist->conn_lock); 4885 (void) rib_disconnect_channel(conn, connlist); 4886 goto top; 4887 } else { 4888 /* 4889 * conn disconnect already scheduled or will 4890 * happen from conn_release when c_ref drops to 0. 4891 */ 4892 mutex_exit(&conn->c_lock); 4893 } 4894 conn = conn->c_next; 4895 } 4896 rw_exit(&connlist->conn_lock); 4897 4898 /* 4899 * At this point, only connections with c_ref != 0 are on the list 4900 */ 4901 } 4902 4903 /* 4904 * Cleans and closes up all uses of the HCA 4905 */ 4906 static void 4907 rib_detach_hca(rib_hca_t *hca) 4908 { 4909 4910 /* 4911 * Stop all services on the HCA 4912 * Go through cl_conn_list and close all rc_channels 4913 * Go through svr_conn_list and close all rc_channels 4914 * Free connections whose c_ref has dropped to 0 4915 * Destroy all CQs 4916 * Deregister and released all buffer pool memory after all 4917 * connections are destroyed 4918 * Free the protection domain 4919 * ibt_close_hca() 4920 */ 4921 rw_enter(&hca->state_lock, RW_WRITER); 4922 if (hca->state == HCA_DETACHED) { 4923 rw_exit(&hca->state_lock); 4924 return; 4925 } 4926 4927 hca->state = HCA_DETACHED; 4928 rib_stat->nhca_inited--; 4929 4930 rib_stop_services(hca); 4931 rib_deregister_ats(); 4932 rib_close_channels(&hca->cl_conn_list); 4933 rib_close_channels(&hca->srv_conn_list); 4934 rw_exit(&hca->state_lock); 4935 4936 rib_purge_connlist(&hca->cl_conn_list); 4937 rib_purge_connlist(&hca->srv_conn_list); 4938 4939 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl); 4940 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl); 4941 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl); 4942 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl); 4943 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t)); 4944 kmem_free(hca->clnt_scq, sizeof (rib_cq_t)); 4945 kmem_free(hca->svc_rcq, sizeof (rib_cq_t)); 4946 kmem_free(hca->svc_scq, sizeof (rib_cq_t)); 4947 4948 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 4949 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4950 if (hca->srv_conn_list.conn_hd == NULL && 4951 hca->cl_conn_list.conn_hd == NULL) { 4952 /* 4953 * conn_lists are NULL, so destroy 4954 * buffers, close hca and be done. 4955 */ 4956 rib_rbufpool_destroy(hca, RECV_BUFFER); 4957 rib_rbufpool_destroy(hca, SEND_BUFFER); 4958 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 4959 (void) ibt_close_hca(hca->hca_hdl); 4960 hca->hca_hdl = NULL; 4961 } 4962 rw_exit(&hca->cl_conn_list.conn_lock); 4963 rw_exit(&hca->srv_conn_list.conn_lock); 4964 4965 if (hca->hca_hdl != NULL) { 4966 mutex_enter(&hca->inuse_lock); 4967 while (hca->inuse) 4968 cv_wait(&hca->cb_cv, &hca->inuse_lock); 4969 mutex_exit(&hca->inuse_lock); 4970 /* 4971 * conn_lists are now NULL, so destroy 4972 * buffers, close hca and be done. 4973 */ 4974 rib_rbufpool_destroy(hca, RECV_BUFFER); 4975 rib_rbufpool_destroy(hca, SEND_BUFFER); 4976 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 4977 (void) ibt_close_hca(hca->hca_hdl); 4978 hca->hca_hdl = NULL; 4979 } 4980 } 4981