/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * The rpcib plugin. Implements the interface for RDMATF's
 * interaction with IBTF.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/file.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/pathname.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/isa_defs.h>
#include <sys/callb.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>

#include <sys/ib/ibtl/ibti.h>
#include <rpc/rpc.h>
#include <rpc/ib.h>

#include <sys/modctl.h>

#include <sys/kstr.h>
#include <sys/sockio.h>
#include <sys/vnode.h>
#include <sys/tiuser.h>
#include <net/if.h>
#include <sys/cred.h>


extern char *inet_ntop(int, const void *, char *, int);


/*
 * Prototype declarations for driver ops
 */
static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);


/* rpcib cb_ops */
static struct cb_ops rpcib_cbops = {
	nulldev,		/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	nodev,			/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* prop_op */
	NULL,			/* stream */
	D_MP,			/* cb_flag */
	CB_REV,			/* rev */
	nodev,			/* int (*cb_aread)() */
	nodev			/* int (*cb_awrite)() */
};

/*
 * Device options
 */
static struct dev_ops rpcib_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	rpcib_getinfo,		/* info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	rpcib_attach,		/* attach */
	rpcib_detach,		/* detach */
	nodev,			/* reset */
	&rpcib_cbops,		/* driver ops - devctl interfaces */
	NULL,			/* bus operations */
	NULL			/* power */
};

/*
 * Module linkage information.
 */

static struct modldrv rib_modldrv = {
	&mod_driverops,			/* Driver module */
	"RPCIB plugin driver, ver %I%",	/* Driver name and version */
	&rpcib_ops,			/* Driver ops */
};

static struct modlinkage rib_modlinkage = {
	MODREV_1,
	(void *)&rib_modldrv,
	NULL
};

/*
 * rib_stat: private data pointer used when registering
 *	with the IBTF. It is returned to the consumer
 *	in all callbacks.
 */
static rpcib_state_t *rib_stat = NULL;

#define	RNR_RETRIES	2
#define	MAX_PORTS	2

int preposted_rbufs = 16;
int send_threshold = 1;

/*
 * State of the plugin.
 * ACCEPT = accepting new connections and requests.
 * NO_ACCEPT = not accepting new connections and requests.
 * This should eventually move to the rpcib_state_t structure, since it
 * indicates which state the plugin is in for a particular type of service,
 * such as NFS, NLM, or the v4 callback daemon. The plugin might be in the
 * accept state for one and in the no_accept state for another.
 */
int plugin_state;
kmutex_t plugin_state_lock;


/*
 * RPCIB RDMATF operations
 */
static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
static rdma_stat rib_disconnect(CONN *conn);
static void rib_listen(struct rdma_svc_data *rd);
static void rib_listen_stop(struct rdma_svc_data *rd);
static rdma_stat rib_registermem(CONN *conn, caddr_t buf, uint_t buflen,
	struct mrc *buf_handle);
static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
	struct mrc buf_handle);
static rdma_stat rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen,
	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle);
static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle);
static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
	caddr_t buf, int len, int cpu);

static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);

static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);

static void rib_rbuf_free(CONN *conn, int ptype, void *buf);

static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **);
static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
static rdma_stat rib_conn_release(CONN *conn);
static rdma_stat rib_getinfo(rdma_info_t *info);
static rdma_stat rib_register_ats(rib_hca_t *);
static void rib_deregister_ats();
static void rib_stop_services(rib_hca_t *);

/*
 * RPCIB addressing operations
 */
char **get_ip_addrs(int *count);
int get_interfaces(TIUSER *tiptr, int *num);
int find_addrs(TIUSER *tiptr, char **addrs, int num_ifs);
int get_ibd_ipaddr(rpcib_ibd_insts_t *);
rpcib_ats_t *get_ibd_entry(ib_gid_t *, ib_pkey_t, rpcib_ibd_insts_t *);
void rib_get_ibd_insts(rpcib_ibd_insts_t *);
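
/*
 * Note: the op vector below is a positional initializer, so the entries
 * must remain in the same order as the members of rdmaops_t (defined in
 * the RDMATF headers) for the dispatch to line up correctly.
 */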

/*
 * RDMA operations the RPCIB module exports
 */
static rdmaops_t rib_ops = {
	rib_reachable,
	rib_conn_get,
	rib_conn_release,
	rib_listen,
	rib_listen_stop,
	rib_registermem,
	rib_deregistermem,
	rib_registermemsync,
	rib_deregistermemsync,
	rib_syncmem,
	rib_reg_buf_alloc,
	rib_reg_buf_free,
	rib_send,
	rib_send_resp,
	rib_post_resp,
	rib_post_recv,
	rib_recv,
	rib_read,
	rib_write,
	rib_getinfo
};

/*
 * RDMATF RPCIB plugin details
 */
static rdma_mod_t rib_mod = {
	"ibtf",			/* api name */
	RDMATF_VERS_1,
	0,
	&rib_ops,		/* rdma op vector for ibtf */
};

static rdma_stat open_hcas(rpcib_state_t *);
static rdma_stat rib_qp_init(rib_qp_t *, int);
static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
	ibt_mr_hdl_t *, ibt_mr_desc_t *);
static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *);
static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
	rib_qp_t **);
static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
	rib_qp_t **);
static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
static int rib_free_sendwait(struct send_wid *);
static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
static void rdma_done_rem_list(rib_qp_t *);
static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);

static void rib_async_handler(void *,
	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
static int rib_free_svc_recv(struct svc_recv *);
static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
static void rib_free_wid(struct recv_wid *);
static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
static void rib_detach_hca(rib_hca_t *);
static rdma_stat rib_chk_srv_ats(rib_hca_t *, struct netbuf *, int,
	ibt_path_info_t *);

/*
 * Registration with IBTF as a consumer
 */
static struct ibt_clnt_modinfo_s rib_modinfo = {
	IBTI_V2,
	IBT_GENERIC,
	rib_async_handler,	/* async event handler */
	NULL,			/* Memory Region Handler */
	"nfs/ib"
};

/*
 * Global structure
 */
typedef struct rpcib_s {
	dev_info_t	*rpcib_dip;
	kmutex_t	rpcib_mutex;
} rpcib_t;

rpcib_t rpcib;

/*
 * /etc/system controlled variable to control
 * debugging in the rpcib kernel module.
 * Set it to a value greater than 1 for more
 * verbose debug messages.
 */
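/*
 * For example, a line like the following in /etc/system (standard
 * module:symbol syntax) turns on verbose debug output at the next boot:
 *
 *	set rpcib:rib_debug = 2
 */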
int rib_debug = 0;

static int ats_running = 0;

int
_init(void)
{
	int error;

	error = mod_install((struct modlinkage *)&rib_modlinkage);
	if (error != 0) {
		/*
		 * Could not load module
		 */
		return (error);
	}
	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);

	return (0);
}

int
_fini()
{
	int status;

	if ((status = rdma_unregister_mod(&rib_mod)) != RDMA_SUCCESS) {
		return (EBUSY);
	}

	rib_deregister_ats();

	/*
	 * Remove module
	 */
	if ((status = mod_remove(&rib_modlinkage)) != 0) {
		(void) rdma_register_mod(&rib_mod);
		return (status);
	}
	mutex_destroy(&plugin_state_lock);
	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&rib_modlinkage, modinfop));
}


/*
 * rpcib_getinfo()
 * Given the device number, return the devinfo pointer or the
 * instance number.
 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
 */
/*ARGSUSED*/
static int
rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int ret = DDI_SUCCESS;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if (rpcib.rpcib_dip != NULL)
			*result = rpcib.rpcib_dip;
		else {
			*result = NULL;
			ret = DDI_FAILURE;
		}
		break;

	case DDI_INFO_DEVT2INSTANCE:
		*result = NULL;
		break;

	default:
		ret = DDI_FAILURE;
	}
	return (ret);
}

static int
rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	ibt_status_t	ibt_status;
	rdma_stat	r_status;

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);

	mutex_enter(&rpcib.rpcib_mutex);
	if (rpcib.rpcib_dip != NULL) {
		mutex_exit(&rpcib.rpcib_mutex);
		return (DDI_FAILURE);
	}
	rpcib.rpcib_dip = dip;
	mutex_exit(&rpcib.rpcib_mutex);
	/*
	 * Create the "rpcib" minor-node.
	 */
	if (ddi_create_minor_node(dip,
	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
		/* No cmn_err error message here, as it prints on console */
		return (DDI_FAILURE);
	}

	if (rib_stat == NULL) {
		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
	}

	rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids);
	if (rib_stat->hca_count < 1) {
		mutex_destroy(&rib_stat->open_hca_lock);
		kmem_free(rib_stat, sizeof (*rib_stat));
		rib_stat = NULL;
		return (DDI_FAILURE);
	}

	ibt_status = ibt_attach(&rib_modinfo, dip,
	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
	if (ibt_status != IBT_SUCCESS) {
		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
		mutex_destroy(&rib_stat->open_hca_lock);
		kmem_free(rib_stat, sizeof (*rib_stat));
		rib_stat = NULL;
		return (DDI_FAILURE);
	}

	mutex_enter(&rib_stat->open_hca_lock);
	if (open_hcas(rib_stat) != RDMA_SUCCESS) {
		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
		(void) ibt_detach(rib_stat->ibt_clnt_hdl);
		mutex_exit(&rib_stat->open_hca_lock);
		mutex_destroy(&rib_stat->open_hca_lock);
		kmem_free(rib_stat, sizeof (*rib_stat));
		rib_stat = NULL;
		return (DDI_FAILURE);
	}
	mutex_exit(&rib_stat->open_hca_lock);

	/*
	 * Register with rdmatf
	 */
	rib_mod.rdma_count = rib_stat->hca_count;
	r_status = rdma_register_mod(&rib_mod);
	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
		rib_detach_hca(rib_stat->hca);
		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
		(void) ibt_detach(rib_stat->ibt_clnt_hdl);
		mutex_destroy(&rib_stat->open_hca_lock);
		kmem_free(rib_stat, sizeof (*rib_stat));
		rib_stat = NULL;
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	switch (cmd) {

	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
	default:
		return (DDI_FAILURE);
	}

	/*
	 * Detach the hca and free resources
	 */
	mutex_enter(&plugin_state_lock);
	plugin_state = NO_ACCEPT;
	mutex_exit(&plugin_state_lock);
	rib_detach_hca(rib_stat->hca);
	ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
	(void) ibt_detach(rib_stat->ibt_clnt_hdl);

	mutex_enter(&rpcib.rpcib_mutex);
	rpcib.rpcib_dip = NULL;
	mutex_exit(&rpcib.rpcib_mutex);

	mutex_destroy(&rpcib.rpcib_mutex);
	return (DDI_SUCCESS);
}


static void
rib_deregister_ats()
{
	rib_hca_t		*hca;
	rib_service_t		*srv_list, *to_remove;
	ibt_status_t		ibt_status;

	/*
	 * deregister the Address Translation Service.
	 */
	hca = rib_stat->hca;
	rw_enter(&hca->service_list_lock, RW_WRITER);
	srv_list = hca->ats_list;
	while (srv_list != NULL) {
		to_remove = srv_list;
		srv_list = to_remove->srv_next;

		ibt_status = ibt_deregister_ar(hca->ibt_clnt_hdl,
		    &to_remove->srv_ar);
		if (ibt_status != IBT_SUCCESS) {
#ifdef DEBUG
			if (rib_debug) {
				cmn_err(CE_WARN, "_fini: "
				    "ibt_deregister_ar FAILED"
				    " status: %d", ibt_status);
			}
#endif
		} else {
			mutex_enter(&rib_stat->open_hca_lock);
			ats_running = 0;
			mutex_exit(&rib_stat->open_hca_lock);
#ifdef DEBUG
			if (rib_debug) {
				cmn_err(CE_NOTE, "_fini: "
				    "Successfully unregistered"
				    " ATS service: %s",
				    to_remove->srv_name);
			}
#endif
		}
		kmem_free(to_remove, sizeof (rib_service_t));
	}
	hca->ats_list = NULL;
	rw_exit(&hca->service_list_lock);
}

static void rib_rbufpool_free(rib_hca_t *, int);
static void rib_rbufpool_deregister(rib_hca_t *, int);
static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
static rdma_stat rib_rem_replylist(rib_qp_t *);
static int rib_remreply(rib_qp_t *, struct reply *);
static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);

/*
 * One CQ pair per HCA
 */
static rdma_stat
rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
	rib_cq_t **cqp, rpcib_state_t *ribstat)
{
	rib_cq_t	*cq;
	ibt_cq_attr_t	cq_attr;
	uint32_t	real_size;
	ibt_status_t	status;
	rdma_stat	error = RDMA_SUCCESS;

	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
	cq->rib_hca = hca;
	cq_attr.cq_size = cq_size;
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
	    &real_size);
	if (status != IBT_SUCCESS) {
		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
		    " status=%d", status);
		error = RDMA_FAILED;
		goto fail;
	}
	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat);

	/*
	 * Enable CQ callbacks. CQ callbacks are single-shot
	 * (i.e. you have to call ibt_enable_cq_notify()
	 * after each callback in order to get another one).
	 */
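	/*
	 * This is why each CQ handler below (e.g. rib_clnt_scq_handler)
	 * re-arms the CQ with ibt_enable_cq_notify() on entry and then
	 * drains it with ibt_poll_cq() until IBT_CQ_EMPTY.
	 */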
	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
	if (status != IBT_SUCCESS) {
		cmn_err(CE_WARN, "rib_create_cq: "
		    "enable_cq_notify failed, status %d", status);
		error = RDMA_FAILED;
		goto fail;
	}
	*cqp = cq;

	return (error);
fail:
	if (cq->rib_cq_hdl)
		(void) ibt_free_cq(cq->rib_cq_hdl);
	if (cq)
		kmem_free(cq, sizeof (rib_cq_t));
	return (error);
}

static rdma_stat
open_hcas(rpcib_state_t *ribstat)
{
	rib_hca_t		*hca;
	ibt_status_t		ibt_status;
	rdma_stat		status;
	ibt_hca_portinfo_t	*pinfop;
	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
	uint_t			size, cq_size;
	int			i;

	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
	if (ribstat->hcas == NULL)
		ribstat->hcas = kmem_zalloc(ribstat->hca_count *
		    sizeof (rib_hca_t), KM_SLEEP);

	/*
	 * Open a hca and setup for RDMA
	 */
	for (i = 0; i < ribstat->hca_count; i++) {
		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
		    ribstat->hca_guids[i],
		    &ribstat->hcas[i].hca_hdl);
		if (ibt_status != IBT_SUCCESS) {
			cmn_err(CE_WARN, "open_hcas: ibt_open_hca (%d) "
			    "returned %d", i, ibt_status);
			continue;
		}
		ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
		hca = &(ribstat->hcas[i]);
		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
		hca->state = HCA_INITED;

		/*
		 * query HCA info
		 */
		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
		if (ibt_status != IBT_SUCCESS) {
			cmn_err(CE_WARN, "open_hcas: ibt_query_hca "
			    "returned %d (hca_guid 0x%llx)",
			    ibt_status, (longlong_t)ribstat->hca_guids[i]);
			goto fail1;
		}

		/*
		 * One PD (Protection Domain) per HCA.
		 * A qp is allowed to access a memory region
		 * only when it's in the same PD as that of
		 * the memory region.
		 */
		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
		if (ibt_status != IBT_SUCCESS) {
			cmn_err(CE_WARN, "open_hcas: ibt_alloc_pd "
			    "returned %d (hca_guid 0x%llx)",
			    ibt_status, (longlong_t)ribstat->hca_guids[i]);
			goto fail1;
		}

		/*
		 * query HCA ports
		 */
		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
		    0, &pinfop, &hca->hca_nports, &size);
		if (ibt_status != IBT_SUCCESS) {
			cmn_err(CE_WARN, "open_hcas: "
			    "ibt_query_hca_ports returned %d "
			    "(hca_guid 0x%llx)",
			    ibt_status, (longlong_t)hca->hca_guid);
			goto fail2;
		}
		hca->hca_ports = pinfop;
		hca->hca_pinfosz = size;
		pinfop = NULL;

		cq_size = DEF_CQ_SIZE;	/* default cq size */
		/*
		 * Create 2 pairs of cq's (1 pair for client
		 * and the other pair for server) on this hca.
		 * If the number of qp's gets too large, then several
		 * cq's will be needed.
		 */
		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
		    &hca->svc_rcq, ribstat);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
		    &hca->svc_scq, ribstat);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
		    &hca->clnt_rcq, ribstat);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
		    &hca->clnt_scq, ribstat);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		/*
		 * Create buffer pools.
		 * Note rib_rbufpool_create also allocates memory windows.
		 */
		hca->recv_pool = rib_rbufpool_create(hca,
		    RECV_BUFFER, MAX_BUFS);
		if (hca->recv_pool == NULL) {
			cmn_err(CE_WARN, "open_hcas: recv buf pool failed\n");
			goto fail3;
		}

		hca->send_pool = rib_rbufpool_create(hca,
		    SEND_BUFFER, MAX_BUFS);
		if (hca->send_pool == NULL) {
			cmn_err(CE_WARN, "open_hcas: send buf pool failed\n");
			rib_rbufpool_destroy(hca, RECV_BUFFER);
			goto fail3;
		}

		/*
		 * Initialize the registered service list and
		 * the lock
		 */
		hca->service_list = NULL;
		rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);

		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
		    hca->iblock);
		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
		    hca->iblock);
		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
		hca->inuse = TRUE;
		/*
		 * XXX One hca only. Add multi-hca functionality if needed
		 * later.
		 */
		ribstat->hca = hca;
		ribstat->nhca_inited++;
		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
		break;

fail3:
		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
fail2:
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
fail1:
		(void) ibt_close_hca(hca->hca_hdl);

	}
	if (ribstat->hca != NULL)
		return (RDMA_SUCCESS);
	else
		return (RDMA_FAILED);
}

/*
 * Callback routines
 */

/*
 * SCQ handlers
 */
/* ARGSUSED */
static void
rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	int		i;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		/*
		 * Got a send completion
		 */
		if (wc.wc_id != NULL) {	/* XXX can it be otherwise ???? */
			struct send_wid *wd =
			    (struct send_wid *)(uintptr_t)wc.wc_id;
			CONN	*conn = qptoc(wd->qp);

			mutex_enter(&wd->sendwait_lock);
			switch (wc.wc_status) {
			case IBT_WC_SUCCESS:
				wd->status = RDMA_SUCCESS;
				break;
			case IBT_WC_WR_FLUSHED_ERR:
				wd->status = RDMA_FAILED;
				break;
			default:
/*
 *    RC Send Q Error Code		Local state	Remote State
 *    ====================		===========	============
 *    IBT_WC_BAD_RESPONSE_ERR		ERROR		None
 *    IBT_WC_LOCAL_LEN_ERR		ERROR		None
 *    IBT_WC_LOCAL_CHAN_OP_ERR		ERROR		None
 *    IBT_WC_LOCAL_PROTECT_ERR		ERROR		None
 *    IBT_WC_MEM_WIN_BIND_ERR		ERROR		None
 *    IBT_WC_REMOTE_INVALID_REQ_ERR	ERROR		ERROR
 *    IBT_WC_REMOTE_ACCESS_ERR		ERROR		ERROR
 *    IBT_WC_REMOTE_OP_ERR		ERROR		ERROR
 *    IBT_WC_RNR_NAK_TIMEOUT_ERR	ERROR		None
 *    IBT_WC_TRANS_TIMEOUT_ERR		ERROR		None
 *    IBT_WC_WR_FLUSHED_ERR		None		None
 */
#ifdef DEBUG
	if (rib_debug > 1) {
		if (wc.wc_status != IBT_WC_SUCCESS) {
			cmn_err(CE_NOTE, "rib_clnt_scq_handler: "
			    "WR completed in error, wc.wc_status:%d, "
			    "wc_id:%llx\n", wc.wc_status,
			    (longlong_t)wc.wc_id);
		}
	}
#endif
				/*
				 * Channel in error state. Set connection to
				 * ERROR and cleanup will happen either from
				 * conn_release or from rib_conn_get
				 */
				wd->status = RDMA_FAILED;
				mutex_enter(&conn->c_lock);
				if (conn->c_state != C_DISCONN_PEND)
					conn->c_state = C_ERROR;
				mutex_exit(&conn->c_lock);
				break;
			}
			if (wd->cv_sig == 1) {
				/*
				 * Notify poster
				 */
				cv_signal(&wd->wait_cv);
				mutex_exit(&wd->sendwait_lock);
			} else {
				/*
				 * Poster not waiting for notification.
				 * Free the send buffers and send_wid
				 */
				for (i = 0; i < wd->nsbufs; i++) {
					rib_rbuf_free(qptoc(wd->qp),
					    SEND_BUFFER,
					    (void *)(uintptr_t)wd->sbufaddr[i]);
				}
				mutex_exit(&wd->sendwait_lock);
				(void) rib_free_sendwait(wd);
			}
		}
	}
}

/* ARGSUSED */
static void
rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	int		i;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		/*
		 * Got a send completion
		 */
#ifdef DEBUG
		if (rib_debug > 1 && wc.wc_status != IBT_WC_SUCCESS) {
			cmn_err(CE_NOTE, "rib_svc_scq_handler: "
			    "WR completed in error "
			    "wc.wc_status:%d, wc_id:%llX",
			    wc.wc_status, (longlong_t)wc.wc_id);
		}
#endif
		if (wc.wc_id != NULL) {	/* XXX NULL possible ???? */
			struct send_wid *wd =
			    (struct send_wid *)(uintptr_t)wc.wc_id;

			mutex_enter(&wd->sendwait_lock);
			if (wd->cv_sig == 1) {
				/*
				 * Update completion status and notify poster
				 */
				if (wc.wc_status == IBT_WC_SUCCESS)
					wd->status = RDMA_SUCCESS;
				else
					wd->status = RDMA_FAILED;
				cv_signal(&wd->wait_cv);
				mutex_exit(&wd->sendwait_lock);
			} else {
				/*
				 * Poster not waiting for notification.
				 * Free the send buffers and send_wid
				 */
				for (i = 0; i < wd->nsbufs; i++) {
					rib_rbuf_free(qptoc(wd->qp),
					    SEND_BUFFER,
					    (void *)(uintptr_t)wd->sbufaddr[i]);
				}
				mutex_exit(&wd->sendwait_lock);
				(void) rib_free_sendwait(wd);
			}
		}
	}
}

/*
 * RCQ handler
 */
/* ARGSUSED */
static void
rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	rib_qp_t	*qp;
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct recv_wid	*rwid;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
		qp = rwid->qp;
		if (wc.wc_status == IBT_WC_SUCCESS) {
			XDR	inxdrs, *xdrs;
			uint_t	xid, vers, op, find_xid = 0;
			struct reply	*r;
			CONN *conn = qptoc(qp);

			xdrs = &inxdrs;
			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
			    wc.wc_bytes_xfer, XDR_DECODE);
			/*
			 * Treat xid as opaque (xid is the first entity
			 * in the rpc rdma message).
			 */
			xid = *(uint32_t *)(uintptr_t)rwid->addr;
			/* Skip xid and set the xdr position accordingly. */
			XDR_SETPOS(xdrs, sizeof (uint32_t));
			(void) xdr_u_int(xdrs, &vers);
			(void) xdr_u_int(xdrs, &op);
			XDR_DESTROY(xdrs);
			if (vers != RPCRDMA_VERS) {
				/*
				 * Invalid RPC/RDMA version.
				 * Cannot interoperate. Set connection to
				 * ERROR state and bail out.
				 */
				mutex_enter(&conn->c_lock);
				if (conn->c_state != C_DISCONN_PEND)
					conn->c_state = C_ERROR;
				mutex_exit(&conn->c_lock);
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)rwid->addr);
				rib_free_wid(rwid);
				continue;
			}

			mutex_enter(&qp->replylist_lock);
			for (r = qp->replylist; r != NULL; r = r->next) {
				if (r->xid == xid) {
					find_xid = 1;
					switch (op) {
					case RDMA_MSG:
					case RDMA_NOMSG:
					case RDMA_MSGP:
						r->status = RDMA_SUCCESS;
						r->vaddr_cq = rwid->addr;
						r->bytes_xfer =
						    wc.wc_bytes_xfer;
						cv_signal(&r->wait_cv);
						break;
					default:
						rib_rbuf_free(qptoc(qp),
						    RECV_BUFFER,
						    (void *)(uintptr_t)
						    rwid->addr);
						break;
					}
					break;
				}
			}
			mutex_exit(&qp->replylist_lock);
			if (find_xid == 0) {
				/* RPC caller not waiting for reply */
#ifdef DEBUG
				if (rib_debug) {
					cmn_err(CE_NOTE,
					    "rib_clnt_rcq_handler: "
					    "NO matching xid %u!\n", xid);
				}
#endif
				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
				    (void *)(uintptr_t)rwid->addr);
			}
		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
			CONN *conn = qptoc(qp);

			/*
			 * Connection being flushed. Just free
			 * the posted buffer
			 */
			rib_rbuf_free(conn, RECV_BUFFER,
			    (void *)(uintptr_t)rwid->addr);
		} else {
			CONN *conn = qptoc(qp);
/*
 *  RC Recv Q Error Code		Local state	Remote State
 *  ====================		===========	============
 *  IBT_WC_LOCAL_ACCESS_ERR		ERROR		ERROR when NAK recvd
 *  IBT_WC_LOCAL_LEN_ERR		ERROR		ERROR when NAK recvd
 *  IBT_WC_LOCAL_PROTECT_ERR		ERROR		ERROR when NAK recvd
 *  IBT_WC_LOCAL_CHAN_OP_ERR		ERROR		ERROR when NAK recvd
 *  IBT_WC_REMOTE_INVALID_REQ_ERR	ERROR		ERROR when NAK recvd
 *  IBT_WC_WR_FLUSHED_ERR		None		None
 */
			/*
			 * Channel in error state. Set connection
			 * in ERROR state.
			 */
			mutex_enter(&conn->c_lock);
			if (conn->c_state != C_DISCONN_PEND)
				conn->c_state = C_ERROR;
			mutex_exit(&conn->c_lock);
			rib_rbuf_free(conn, RECV_BUFFER,
			    (void *)(uintptr_t)rwid->addr);
		}
		rib_free_wid(rwid);
	}
}

/* Server side */
/* ARGSUSED */
static void
rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	struct recv_data *rd;
	rib_qp_t	*qp;
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct svc_recv	*s_recvp;
	CONN		*conn;
	mblk_t		*mp;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
		qp = s_recvp->qp;
		conn = qptoc(qp);
		mutex_enter(&qp->posted_rbufs_lock);
		qp->n_posted_rbufs--;
		if (qp->n_posted_rbufs == 0)
			cv_signal(&qp->posted_rbufs_cv);
		mutex_exit(&qp->posted_rbufs_lock);

		if (wc.wc_status == IBT_WC_SUCCESS) {
			XDR	inxdrs, *xdrs;
			uint_t	xid, vers, op;

			xdrs = &inxdrs;
			/* s_recvp->vaddr stores data */
			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
			    wc.wc_bytes_xfer, XDR_DECODE);

			/*
			 * Treat xid as opaque (xid is the first entity
			 * in the rpc rdma message).
			 */
			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
			/* Skip xid and set the xdr position accordingly. */
			XDR_SETPOS(xdrs, sizeof (uint32_t));
			if (!xdr_u_int(xdrs, &vers) ||
			    !xdr_u_int(xdrs, &op)) {
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				XDR_DESTROY(xdrs);
#ifdef DEBUG
				cmn_err(CE_NOTE, "rib_svc_rcq_handler: "
				    "xdr_u_int failed for qp %p, wc_id=%llx",
				    (void *)qp, (longlong_t)wc.wc_id);
#endif
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}
			XDR_DESTROY(xdrs);

			if (vers != RPCRDMA_VERS) {
				/*
				 * Invalid RPC/RDMA version.
				 * Drop rpc rdma message.
				 */
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}
			/*
			 * Is this for RDMA_DONE?
			 */
			if (op == RDMA_DONE) {
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				/*
				 * Wake up the thread waiting on
				 * a RDMA_DONE for xid
				 */
				mutex_enter(&qp->rdlist_lock);
				rdma_done_notify(qp, xid);
				mutex_exit(&qp->rdlist_lock);
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}

			mutex_enter(&plugin_state_lock);
			if (plugin_state == ACCEPT) {
				while ((mp = allocb(sizeof (*rd),
				    BPRI_LO)) == NULL)
					(void) strwaitbuf(
					    sizeof (*rd), BPRI_LO);
				/*
				 * Plugin is in accept state, hence the master
				 * transport queue for this is still accepting
				 * requests. Hence we can call svc_queuereq to
				 * queue this received msg.
				 */
				rd = (struct recv_data *)mp->b_rptr;
				rd->conn = conn;
				rd->rpcmsg.addr =
				    (caddr_t)(uintptr_t)s_recvp->vaddr;
				rd->rpcmsg.type = RECV_BUFFER;
				rd->rpcmsg.len = wc.wc_bytes_xfer;
				rd->status = wc.wc_status;
				mutex_enter(&conn->c_lock);
				conn->c_ref++;
				mutex_exit(&conn->c_lock);
				mp->b_wptr += sizeof (*rd);
				svc_queuereq((queue_t *)rib_stat->q, mp);
				mutex_exit(&plugin_state_lock);
			} else {
				/*
				 * The master transport for this is going
				 * away and the queue is not accepting anymore
				 * requests for krpc, so don't do anything, just
				 * free the msg.
				 */
				mutex_exit(&plugin_state_lock);
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
			}
		} else {
			rib_rbuf_free(conn, RECV_BUFFER,
			    (void *)(uintptr_t)s_recvp->vaddr);
		}
		(void) rib_free_svc_recv(s_recvp);
	}
}
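
/*
 * Note: each receive completion above drops qp->n_posted_rbufs and
 * signals posted_rbufs_cv when the count drains to zero;
 * rib_disconnect_channel() relies on that signal to wait out any
 * receives still posted on a channel before freeing it.
 */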

/*
 * Handles DR event of IBT_HCA_DETACH_EVENT.
 */
/* ARGSUSED */
static void
rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
	ibt_async_code_t code, ibt_async_event_t *event)
{
	switch (code) {
	case IBT_HCA_ATTACH_EVENT:
		/* ignore */
		break;
	case IBT_HCA_DETACH_EVENT:
	{
		ASSERT(rib_stat->hca->hca_hdl == hca_hdl);
		rib_detach_hca(rib_stat->hca);
#ifdef DEBUG
		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
#endif
		break;
	}
#ifdef DEBUG
	case IBT_EVENT_PATH_MIGRATED:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_EVENT_PATH_MIGRATED\n");
		break;
	case IBT_EVENT_SQD:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
		break;
	case IBT_EVENT_COM_EST:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
		break;
	case IBT_ERROR_CATASTROPHIC_CHAN:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
		break;
	case IBT_ERROR_INVALID_REQUEST_CHAN:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
		break;
	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
		break;
	case IBT_ERROR_PATH_MIGRATE_REQ:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
		break;
	case IBT_ERROR_CQ:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
		break;
	case IBT_ERROR_PORT_DOWN:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
		break;
	case IBT_EVENT_PORT_UP:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
		break;
	case IBT_ASYNC_OPAQUE1:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
		break;
	case IBT_ASYNC_OPAQUE2:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
		break;
	case IBT_ASYNC_OPAQUE3:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
		break;
	case IBT_ASYNC_OPAQUE4:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
		break;
#endif
	default:
		break;
	}
}
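
/*
 * Note: only IBT_HCA_DETACH_EVENT is acted upon above; the remaining
 * cases are compiled in purely for DEBUG tracing, and all other async
 * events are ignored.
 */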

/*
 * Client's reachable function.
 */
static rdma_stat
rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
{
	rib_hca_t	*hca;
	rdma_stat	status;

	/*
	 * First check if a hca is still attached
	 */
	*handle = NULL;
	rw_enter(&rib_stat->hca->state_lock, RW_READER);
	if (rib_stat->hca->state != HCA_INITED) {
		rw_exit(&rib_stat->hca->state_lock);
		return (RDMA_FAILED);
	}
	status = rib_ping_srv(addr_type, raddr, &hca);
	rw_exit(&rib_stat->hca->state_lock);

	if (status == RDMA_SUCCESS) {
		*handle = (void *)hca;
		/*
		 * Register the Address translation service
		 */
		mutex_enter(&rib_stat->open_hca_lock);
		if (ats_running == 0) {
			if (rib_register_ats(rib_stat->hca) == RDMA_SUCCESS) {
				ats_running = 1;
				mutex_exit(&rib_stat->open_hca_lock);
				return (RDMA_SUCCESS);
			} else {
				mutex_exit(&rib_stat->open_hca_lock);
				return (RDMA_FAILED);
			}
		} else {
			mutex_exit(&rib_stat->open_hca_lock);
			return (RDMA_SUCCESS);
		}
	} else {
		*handle = NULL;
		if (rib_debug > 2)
			cmn_err(CE_WARN, "rib_reachable(): ping_srv failed.\n");
		return (RDMA_FAILED);
	}
}

/* Client side qp creation */
static rdma_stat
rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
{
	rib_qp_t	*kqp = NULL;
	CONN		*conn;

	ASSERT(qp != NULL);
	*qp = NULL;

	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
	conn = qptoc(kqp);
	kqp->hca = hca;
	kqp->rdmaconn.c_rdmamod = &rib_mod;
	kqp->rdmaconn.c_private = (caddr_t)kqp;

	kqp->mode = RIB_CLIENT;
	kqp->chan_flags = IBT_BLOCKING;
	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;

	/*
	 * Initialize
	 */
	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);

	*qp = kqp;
	return (RDMA_SUCCESS);
}

/* Server side qp creation */
static rdma_stat
rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
{
	rib_qp_t	*kqp = NULL;
	ibt_chan_sizes_t	chan_sizes;
	ibt_rc_chan_alloc_args_t	qp_attr;
	ibt_status_t	ibt_status;

	ASSERT(qp != NULL);
	*qp = NULL;

	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
	kqp->hca = hca;
	kqp->port_num = port;
	kqp->rdmaconn.c_rdmamod = &rib_mod;
	kqp->rdmaconn.c_private = (caddr_t)kqp;

	/*
	 * Create the qp handle
	 */
	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
	qp_attr.rc_pd = hca->pd_hdl;
	qp_attr.rc_hca_port_num = port;
	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
	qp_attr.rc_clone_chan = NULL;
	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
	qp_attr.rc_flags = IBT_WR_SIGNALED;

	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
		    &chan_sizes);
	} else {
		rw_exit(&hca->state_lock);
		goto fail;
	}
	rw_exit(&hca->state_lock);

	if (ibt_status != IBT_SUCCESS) {
		cmn_err(CE_WARN, "rib_svc_create_chan: "
		    "ibt_alloc_rc_channel failed, ibt_status=%d.",
		    ibt_status);
		goto fail;
	}

	kqp->mode = RIB_SERVER;
	kqp->chan_flags = IBT_BLOCKING;
	kqp->q = q;	/* server ONLY */

	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
	/*
	 * Set the private data area to qp to be used in callbacks
	 */
	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
	kqp->rdmaconn.c_state = C_CONNECTED;
	*qp = kqp;
	return (RDMA_SUCCESS);
fail:
	if (kqp)
		kmem_free(kqp, sizeof (rib_qp_t));

	return (RDMA_FAILED);
}

void
rib_dump_pathrec(ibt_path_info_t *path_rec)
{
	ib_pkey_t	pkey;

	if (rib_debug > 1) {
		cmn_err(CE_NOTE, "Path Record:\n");

		cmn_err(CE_NOTE, "Source HCA GUID = %llx\n",
		    (longlong_t)path_rec->pi_hca_guid);
		cmn_err(CE_NOTE, "Dest Service ID = %llx\n",
		    (longlong_t)path_rec->pi_sid);
		cmn_err(CE_NOTE, "Port Num = %02d\n",
		    path_rec->pi_prim_cep_path.cep_hca_port_num);
		cmn_err(CE_NOTE, "P_Key Index = %04d\n",
		    path_rec->pi_prim_cep_path.cep_pkey_ix);

		(void) ibt_index2pkey_byguid(path_rec->pi_hca_guid,
		    path_rec->pi_prim_cep_path.cep_hca_port_num,
		    path_rec->pi_prim_cep_path.cep_pkey_ix, &pkey);
		cmn_err(CE_NOTE, "P_Key = 0x%x\n", pkey);

		cmn_err(CE_NOTE, "SGID: = %llx:%llx\n",
		    (longlong_t)
		    path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_prefix,
		    (longlong_t)
		    path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_guid);

		cmn_err(CE_NOTE, "DGID: = %llx:%llx\n",
		    (longlong_t)
		    path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_prefix,
		    (longlong_t)
		    path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_guid);

		cmn_err(CE_NOTE, "Path Rate = %02x\n",
		    path_rec->pi_prim_cep_path.cep_adds_vect.av_srate);
		cmn_err(CE_NOTE, "SL = %02x\n",
		    path_rec->pi_prim_cep_path.cep_adds_vect.av_srvl);
		cmn_err(CE_NOTE, "Prim Packet LT = %02x\n",
		    path_rec->pi_prim_pkt_lt);
		cmn_err(CE_NOTE, "Path MTU = %02x\n",
		    path_rec->pi_path_mtu);
	}
}

/* ARGSUSED */
ibt_cm_status_t
rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
    ibt_cm_return_args_t *ret_args, void *priv_data,
    ibt_priv_data_len_t len)
{
	rpcib_state_t	*ribstat;
	rib_hca_t	*hca;

	ribstat = (rpcib_state_t *)clnt_hdl;
	hca = (rib_hca_t *)ribstat->hca;

	switch (event->cm_type) {

	/* got a connection close event */
	case IBT_CM_EVENT_CONN_CLOSED:
	{
		CONN	*conn;
		rib_qp_t	*qp;

		/* check reason why connection was closed */
		switch (event->cm_event.closed) {
		case IBT_CM_CLOSED_DREP_RCVD:
		case IBT_CM_CLOSED_DREQ_TIMEOUT:
		case IBT_CM_CLOSED_DUP:
		case IBT_CM_CLOSED_ABORT:
		case IBT_CM_CLOSED_ALREADY:
			/*
			 * These cases indicate the local end initiated
			 * the closing of the channel. Nothing to do here.
			 */
			break;
		default:
			/*
			 * Reason for CONN_CLOSED event must be one of
			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
			 * or IBT_CM_CLOSED_STALE. These indicate cases where
			 * the remote end is closing the channel. In these
			 * cases free the channel and transition to error
			 * state
			 */
			qp = ibt_get_chan_private(event->cm_channel);
			conn = qptoc(qp);
			mutex_enter(&conn->c_lock);
			if (conn->c_state == C_DISCONN_PEND) {
				mutex_exit(&conn->c_lock);
				break;
			}

			conn->c_state = C_ERROR;

			/*
			 * Free the rc_channel. Channel has already
			 * transitioned to ERROR state and WRs have been
			 * FLUSHED_ERR already.
			 */
			(void) ibt_free_channel(qp->qp_hdl);
			qp->qp_hdl = NULL;

			/*
			 * Free the conn if c_ref is down to 0 already
			 */
			if (conn->c_ref == 0) {
				/*
				 * Remove from list and free conn
				 */
				conn->c_state = C_DISCONN_PEND;
				mutex_exit(&conn->c_lock);
				(void) rib_disconnect_channel(conn,
				    &hca->cl_conn_list);
			} else {
				mutex_exit(&conn->c_lock);
			}
#ifdef DEBUG
			if (rib_debug)
				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
				    "(CONN_CLOSED) channel disconnected");
#endif
			break;
		}
		break;
	}
	default:
		break;
	}
	return (IBT_CM_ACCEPT);
}


/* Check if server has done ATS registration */
rdma_stat
rib_chk_srv_ats(rib_hca_t *hca, struct netbuf *raddr,
    int addr_type, ibt_path_info_t *path)
{
	struct sockaddr_in	*sin4;
	struct sockaddr_in6	*sin6;
	ibt_path_attr_t		path_attr;
	ibt_status_t		ibt_status;
	ib_pkey_t		pkey;
	ibt_ar_t		ar_query, ar_result;
	rib_service_t		*ats;
	ib_gid_t		sgid;
	ibt_path_info_t		paths[MAX_PORTS];
	uint8_t			npaths, i;

	(void) bzero(&path_attr, sizeof (ibt_path_attr_t));
	(void) bzero(path, sizeof (ibt_path_info_t));

	/*
	 * Construct svc name
	 */
	path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
	switch (addr_type) {
	case AF_INET:
		sin4 = (struct sockaddr_in *)raddr->buf;
		(void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname,
		    IB_SVC_NAME_LEN);
		break;

	case AF_INET6:
		sin6 = (struct sockaddr_in6 *)raddr->buf;
		(void) inet_ntop(AF_INET6, &sin6->sin6_addr,
		    path_attr.pa_sname, IB_SVC_NAME_LEN);
		break;

	default:
		kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
		return (RDMA_INVAL);
	}
	(void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN);

	/*
	 * Attempt a path to the server on an ATS-registered port.
	 * Try all ATS-registered ports until one succeeds.
	 * The first one that succeeds will be used to connect
	 * to the server. If none of them succeed, return RDMA_FAILED.
	 */
	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
		rw_enter(&hca->service_list_lock, RW_READER);
		for (ats = hca->ats_list; ats != NULL; ats = ats->srv_next) {
			path_attr.pa_hca_guid = hca->hca_guid;
			path_attr.pa_hca_port_num = ats->srv_port;
			ibt_status = ibt_get_paths(hca->ibt_clnt_hdl,
			    IBT_PATH_MULTI_SVC_DEST, &path_attr, 2,
			    paths, &npaths);
			if (ibt_status == IBT_SUCCESS ||
			    ibt_status == IBT_INSUFF_DATA) {
				for (i = 0; i < npaths; i++) {
					if (paths[i].pi_hca_guid) {
						/*
						 * do ibt_query_ar()
						 */
						sgid =
			paths[i].pi_prim_cep_path.cep_adds_vect.av_sgid;

						(void) ibt_index2pkey_byguid(
						    paths[i].pi_hca_guid,
			paths[i].pi_prim_cep_path.cep_hca_port_num,
			paths[i].pi_prim_cep_path.cep_pkey_ix, &pkey);

						bzero(&ar_query,
						    sizeof (ar_query));
						bzero(&ar_result,
						    sizeof (ar_result));
						ar_query.ar_gid =
			paths[i].pi_prim_cep_path.cep_adds_vect.av_dgid;
						ar_query.ar_pkey = pkey;
						ibt_status = ibt_query_ar(&sgid,
						    &ar_query, &ar_result);
						if (ibt_status == IBT_SUCCESS) {
#ifdef DEBUG
							if (rib_debug > 1)
								rib_dump_pathrec(
								    &paths[i]);
#endif
							bcopy(&paths[i], path,
							    sizeof (ibt_path_info_t));
							rw_exit(
							    &hca->service_list_lock);
							kmem_free(
							    path_attr.pa_sname,
							    IB_SVC_NAME_LEN);
							rw_exit(&hca->state_lock);
							return (RDMA_SUCCESS);
						}
#ifdef DEBUG
						if (rib_debug) {
							cmn_err(CE_NOTE,
							    "rib_chk_srv_ats: "
							    "ibt_query_ar "
							    "FAILED, return\n");
						}
#endif
					}
				}
			}
		}
		rw_exit(&hca->service_list_lock);
	}
	kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
	rw_exit(&hca->state_lock);
	return (RDMA_FAILED);
}
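
/*
 * The ATS lookup above keys off the same IP-derived service name that
 * rib_ping_srv() constructs; for an IPv4 server at 192.168.10.1, for
 * instance, pa_sname ends up as the string "192.168.10.1::NFS".
 */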

/*
 * Connect to the server.
 */
rdma_stat
rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path)
{
	ibt_chan_open_args_t	chan_args;	/* channel args */
	ibt_chan_sizes_t	chan_sizes;
	ibt_rc_chan_alloc_args_t	qp_attr;
	ibt_status_t		ibt_status;
	ibt_rc_returns_t	ret_args;	/* conn reject info */
	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */

	(void) bzero(&chan_args, sizeof (chan_args));
	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));

	qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num;
	/* Alloc a RC channel */
	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
	qp_attr.rc_pd = hca->pd_hdl;
	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
	qp_attr.rc_clone_chan = NULL;
	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
	qp_attr.rc_flags = IBT_WR_SIGNALED;

	chan_args.oc_path = path;
	chan_args.oc_cm_handler = rib_clnt_cm_handler;
	chan_args.oc_cm_clnt_private = (void *)rib_stat;
	chan_args.oc_rdma_ra_out = 1;
	chan_args.oc_rdma_ra_in = 1;
	chan_args.oc_path_retry_cnt = 2;
	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;

refresh:
	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
		    IBT_ACHAN_NO_FLAGS, &qp_attr, &qp->qp_hdl,
		    &chan_sizes);
	} else {
		rw_exit(&hca->state_lock);
		return (RDMA_FAILED);
	}
	rw_exit(&hca->state_lock);

	if (ibt_status != IBT_SUCCESS) {
#ifdef DEBUG
		cmn_err(CE_WARN, "rib_conn_to_srv: alloc_rc_channel "
		    "failed, ibt_status=%d.", ibt_status);
#endif
		return (RDMA_FAILED);
	}

	/* Connect to the Server */
	(void) bzero(&ret_args, sizeof (ret_args));
	mutex_enter(&qp->cb_lock);
	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
	    IBT_BLOCKING, &chan_args, &ret_args);
	if (ibt_status != IBT_SUCCESS) {
#ifdef DEBUG
		if (rib_debug)
			cmn_err(CE_WARN, "rib_conn_to_srv: open_rc_channel"
			    " failed for qp %p, status=%d, "
			    "ret_args.rc_status=%d\n",
			    (void *)qp, ibt_status, ret_args.rc_status);
#endif
		(void) ibt_free_channel(qp->qp_hdl);
		qp->qp_hdl = NULL;
		mutex_exit(&qp->cb_lock);
		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
		    ret_args.rc_status == IBT_CM_CONN_STALE) {
			/*
			 * Got IBT_CM_CONN_STALE probably because of stale
			 * data on the passive end of a channel that existed
			 * prior to reboot. Retry establishing a channel
			 * REFRESH_ATTEMPTS times, during which time the
			 * stale conditions on the server might clear up.
			 */
			goto refresh;
		}
		return (RDMA_FAILED);
	}
	mutex_exit(&qp->cb_lock);
	/*
	 * Set the private data area to qp to be used in callbacks
	 */
	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
	return (RDMA_SUCCESS);
}

rdma_stat
rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca)
{
	struct sockaddr_in	*sin4;
	struct sockaddr_in6	*sin6;
	ibt_path_attr_t		path_attr;
	ibt_path_info_t		path;
	ibt_status_t		ibt_status;

	ASSERT(raddr->buf != NULL);

	bzero(&path_attr, sizeof (ibt_path_attr_t));
	bzero(&path, sizeof (ibt_path_info_t));

	/*
	 * Construct svc name
	 */
	path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
	switch (addr_type) {
	case AF_INET:
		sin4 = (struct sockaddr_in *)raddr->buf;
		(void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname,
		    IB_SVC_NAME_LEN);
		break;

	case AF_INET6:
		sin6 = (struct sockaddr_in6 *)raddr->buf;
		(void) inet_ntop(AF_INET6, &sin6->sin6_addr,
		    path_attr.pa_sname, IB_SVC_NAME_LEN);
		break;

	default:
#ifdef DEBUG
		if (rib_debug) {
			cmn_err(CE_WARN,
			    "rib_ping_srv: Address not recognized\n");
		}
#endif
		kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
		return (RDMA_INVAL);
	}
	(void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN);

	ibt_status = ibt_get_paths(rib_stat->ibt_clnt_hdl,
	    IBT_PATH_NO_FLAGS, &path_attr, 1, &path, NULL);
	kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
	if (ibt_status != IBT_SUCCESS) {
		if (rib_debug > 1) {
			cmn_err(CE_WARN, "rib_ping_srv: ibt_get_paths FAILED!"
			    " status=%d\n", ibt_status);
		}
	} else if (path.pi_hca_guid) {
		ASSERT(path.pi_hca_guid == rib_stat->hca->hca_guid);
		*hca = rib_stat->hca;
		return (RDMA_SUCCESS);
	}
	return (RDMA_FAILED);
}

/*
 * Close channel, remove from connection list and
 * free up resources allocated for that channel.
 */
rdma_stat
rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
{
	rib_qp_t	*qp = ctoqp(conn);
	rib_hca_t	*hca;

	/*
	 * c_ref == 0 and connection is in C_DISCONN_PEND
	 */
	hca = qp->hca;
	if (conn_list != NULL)
		(void) rib_rm_conn(conn, conn_list);
	if (qp->qp_hdl != NULL) {
		/*
		 * If the channel has not been established,
		 * ibt_flush_channel is called to flush outstanding WRs
		 * on the Qs. Otherwise, ibt_close_rc_channel() is
		 * called. The channel is then freed.
		 */
		if (conn_list != NULL)
			(void) ibt_close_rc_channel(qp->qp_hdl,
			    IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
		else
			(void) ibt_flush_channel(qp->qp_hdl);

		mutex_enter(&qp->posted_rbufs_lock);
		while (qp->n_posted_rbufs)
			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
		mutex_exit(&qp->posted_rbufs_lock);
		(void) ibt_free_channel(qp->qp_hdl);
		qp->qp_hdl = NULL;
	}
	ASSERT(qp->rdlist == NULL);
	if (qp->replylist != NULL) {
		(void) rib_rem_replylist(qp);
	}

	cv_destroy(&qp->cb_conn_cv);
	cv_destroy(&qp->posted_rbufs_cv);
	mutex_destroy(&qp->cb_lock);

	mutex_destroy(&qp->replylist_lock);
	mutex_destroy(&qp->posted_rbufs_lock);
	mutex_destroy(&qp->rdlist_lock);

	cv_destroy(&conn->c_cv);
	mutex_destroy(&conn->c_lock);

	if (conn->c_raddr.buf != NULL) {
		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
	}
	if (conn->c_laddr.buf != NULL) {
		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
	}
	kmem_free(qp, sizeof (rib_qp_t));

	/*
	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
	 * then the hca is no longer being used.
	 */
	if (conn_list != NULL) {
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
			if (hca->srv_conn_list.conn_hd == NULL) {
				rw_enter(&hca->cl_conn_list.conn_lock,
				    RW_READER);
				if (hca->cl_conn_list.conn_hd == NULL) {
					mutex_enter(&hca->inuse_lock);
					hca->inuse = FALSE;
					cv_signal(&hca->cb_cv);
					mutex_exit(&hca->inuse_lock);
				}
				rw_exit(&hca->cl_conn_list.conn_lock);
			}
			rw_exit(&hca->srv_conn_list.conn_lock);
		}
		rw_exit(&hca->state_lock);
	}
	return (RDMA_SUCCESS);
}
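
/*
 * Send completion wait path: a send_wid starts out in SEND_WAIT state
 * (set in rib_init_sendwait()) and is flipped to an rdma_stat value by
 * the send CQ handler; the cv loops in rib_sendwait() below test for
 * exactly that transition.
 */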

/*
 * Wait for send completion notification. Only on receiving a
 * notification, be it a successful or an error completion, free the
 * send_wid.
 */
static rdma_stat
rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
{
	clock_t timout, cv_wait_ret;
	rdma_stat error = RDMA_SUCCESS;
	int	i;

	/*
	 * Wait for send to complete
	 */
	ASSERT(wd != NULL);
	mutex_enter(&wd->sendwait_lock);
	if (wd->status == (uint_t)SEND_WAIT) {
		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
		    ddi_get_lbolt();
		if (qp->mode == RIB_SERVER) {
			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
			    &wd->sendwait_lock, timout)) > 0 &&
			    wd->status == (uint_t)SEND_WAIT)
				;
			switch (cv_wait_ret) {
			case -1:	/* timeout */
#ifdef DEBUG
				if (rib_debug > 2)
					cmn_err(CE_WARN, "rib_sendwait: "
					    "timed out qp %p\n", (void *)qp);
#endif
				wd->cv_sig = 0;	/* no signal needed */
				error = RDMA_TIMEDOUT;
				break;
			default:	/* got send completion */
				break;
			}
		} else {
			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
			    &wd->sendwait_lock, timout)) > 0 &&
			    wd->status == (uint_t)SEND_WAIT)
				;
			switch (cv_wait_ret) {
			case -1:	/* timeout */
#ifdef DEBUG
				if (rib_debug > 2)
					cmn_err(CE_WARN, "rib_sendwait: "
					    "timed out qp %p\n", (void *)qp);
#endif
				wd->cv_sig = 0;	/* no signal needed */
				error = RDMA_TIMEDOUT;
				break;
			case 0:		/* interrupted */
#ifdef DEBUG
				if (rib_debug > 2)
					cmn_err(CE_NOTE, "rib_sendwait:"
					    " interrupted on qp %p\n",
					    (void *)qp);
#endif
				wd->cv_sig = 0;	/* no signal needed */
				error = RDMA_INTR;
				break;
			default:	/* got send completion */
				break;
			}
		}
	}

	if (wd->status != (uint_t)SEND_WAIT) {
		/* got send completion */
		if (wd->status != RDMA_SUCCESS) {
			error = wd->status;
			if (wd->status != RDMA_CONNLOST)
				error = RDMA_FAILED;
		}
		for (i = 0; i < wd->nsbufs; i++) {
			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
			    (void *)(uintptr_t)wd->sbufaddr[i]);
		}
		mutex_exit(&wd->sendwait_lock);
		(void) rib_free_sendwait(wd);
	} else {
		mutex_exit(&wd->sendwait_lock);
	}

	return (error);
}

static struct send_wid *
rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
{
	struct send_wid	*wd;

	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
	wd->xid = xid;
	wd->cv_sig = cv_sig;
	wd->qp = qp;
	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
	wd->status = (uint_t)SEND_WAIT;

	return (wd);
}

static int
rib_free_sendwait(struct send_wid *wdesc)
{
	cv_destroy(&wdesc->wait_cv);
	mutex_destroy(&wdesc->sendwait_lock);
	kmem_free(wdesc, sizeof (*wdesc));

	return (0);
}

static rdma_stat
rib_rem_rep(rib_qp_t *qp, struct reply *rep)
{
	mutex_enter(&qp->replylist_lock);
	if (rep != NULL) {
		(void) rib_remreply(qp, rep);
		mutex_exit(&qp->replylist_lock);
		return (RDMA_SUCCESS);
	}
	mutex_exit(&qp->replylist_lock);
	return (RDMA_FAILED);
}
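
/*
 * rib_send_and_wait() below is the common send path. Its wrappers are
 * thin; e.g. a client send amounts to
 *
 *	ret = rib_send_and_wait(conn, cl, msgid, 1, 1);
 *
 * i.e. post with IBT_WR_SEND_SIGNAL and block in rib_sendwait() for
 * the completion, whereas rib_send_resp() passes cv_sig == 0 and waits
 * for the client's RDMA_DONE instead.
 */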
2065 */ 2066 rdma_stat 2067 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2068 int send_sig, int cv_sig) 2069 { 2070 struct send_wid *wdesc; 2071 struct clist *clp; 2072 ibt_status_t ibt_status = IBT_SUCCESS; 2073 rdma_stat ret = RDMA_SUCCESS; 2074 ibt_send_wr_t tx_wr; 2075 int i, nds; 2076 ibt_wr_ds_t sgl[DSEG_MAX]; 2077 uint_t total_msg_size; 2078 rib_qp_t *qp = ctoqp(conn); 2079 2080 ASSERT(cl != NULL); 2081 2082 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2083 2084 nds = 0; 2085 total_msg_size = 0; 2086 clp = cl; 2087 while (clp != NULL) { 2088 if (nds >= DSEG_MAX) { 2089 cmn_err(CE_WARN, "rib_send_and_wait: DSEG_MAX" 2090 " too small!"); 2091 return (RDMA_FAILED); 2092 } 2093 sgl[nds].ds_va = clp->c_saddr; 2094 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2095 sgl[nds].ds_len = clp->c_len; 2096 total_msg_size += clp->c_len; 2097 clp = clp->c_next; 2098 nds++; 2099 } 2100 2101 if (send_sig) { 2102 /* Set SEND_SIGNAL flag. */ 2103 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2104 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2105 } else { 2106 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2107 wdesc = rib_init_sendwait(msgid, 0, qp); 2108 } 2109 wdesc->nsbufs = nds; 2110 for (i = 0; i < nds; i++) { 2111 wdesc->sbufaddr[i] = sgl[i].ds_va; 2112 } 2113 2114 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2115 tx_wr.wr_opcode = IBT_WRC_SEND; 2116 tx_wr.wr_trans = IBT_RC_SRV; 2117 tx_wr.wr_nds = nds; 2118 tx_wr.wr_sgl = sgl; 2119 2120 mutex_enter(&conn->c_lock); 2121 if (conn->c_state & C_CONNECTED) { 2122 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2123 } 2124 if (((conn->c_state & C_CONNECTED) == 0) || 2125 ibt_status != IBT_SUCCESS) { 2126 mutex_exit(&conn->c_lock); 2127 for (i = 0; i < nds; i++) { 2128 rib_rbuf_free(conn, SEND_BUFFER, 2129 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2130 } 2131 (void) rib_free_sendwait(wdesc); 2132 #ifdef DEBUG 2133 if (rib_debug && ibt_status != IBT_SUCCESS) 2134 cmn_err(CE_WARN, "rib_send_and_wait: ibt_post_send " 2135 "failed! wr_id %llx on qpn %p, status=%d!", 2136 (longlong_t)tx_wr.wr_id, (void *)qp, 2137 ibt_status); 2138 #endif 2139 return (RDMA_FAILED); 2140 } 2141 mutex_exit(&conn->c_lock); 2142 2143 if (send_sig) { 2144 if (cv_sig) { 2145 /* 2146 * cv_wait for send to complete. 2147 * We can fail due to a timeout or signal or 2148 * unsuccessful send. 2149 */ 2150 ret = rib_sendwait(qp, wdesc); 2151 #ifdef DEBUG 2152 if (rib_debug > 2) 2153 if (ret != 0) { 2154 cmn_err(CE_WARN, "rib_send_and_wait: rib_sendwait " 2155 "FAILED, rdma stat=%d, wr_id %llx, qp %p!", 2156 ret, (longlong_t)tx_wr.wr_id, (void *)qp); 2157 } 2158 #endif 2159 return (ret); 2160 } 2161 } 2162 2163 return (RDMA_SUCCESS); 2164 } 2165 2166 rdma_stat 2167 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2168 { 2169 rdma_stat ret; 2170 2171 /* send-wait & cv_signal */ 2172 ret = rib_send_and_wait(conn, cl, msgid, 1, 1); 2173 2174 return (ret); 2175 } 2176 2177 /* 2178 * Server interface (svc_rdma_ksend). 2179 * Send RPC reply and wait for RDMA_DONE. 
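 */

/*
 * Hypothetical server-side caller sketch for rib_send_resp(); in the
 * real system the KRPC layer (svc_rdma_ksend) is the consumer.  msgid
 * is the RPC transaction id that the client echoes back in its
 * RDMA_DONE message; the call blocks until that message arrives or
 * REPLY_WAIT_TIME expires.
 */
static rdma_stat
example_send_reply(CONN *conn, struct clist *reply_cl, uint32_t xid)
{
    rdma_stat ret;

    ret = rib_send_resp(conn, reply_cl, xid);
    if (ret == RDMA_TIMEDOUT)
        cmn_err(CE_NOTE, "example_send_reply: no RDMA_DONE "
            "for xid %u", xid);
    return (ret);
}

/*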
2180 */ 2181 rdma_stat 2182 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2183 { 2184 rdma_stat ret = RDMA_SUCCESS; 2185 struct rdma_done_list *rd; 2186 clock_t timout, cv_wait_ret; 2187 rib_qp_t *qp = ctoqp(conn); 2188 2189 mutex_enter(&qp->rdlist_lock); 2190 rd = rdma_done_add(qp, msgid); 2191 2192 /* No cv_signal (whether send-wait or no-send-wait) */ 2193 ret = rib_send_and_wait(conn, cl, msgid, 1, 0); 2194 if (ret != RDMA_SUCCESS) { 2195 #ifdef DEBUG 2196 cmn_err(CE_WARN, "rib_send_resp: send_and_wait " 2197 "failed, msgid %u, qp %p", msgid, (void *)qp); 2198 #endif 2199 rdma_done_rm(qp, rd); 2200 goto done; 2201 } 2202 2203 /* 2204 * Wait for RDMA_DONE from remote end 2205 */ 2206 timout = drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt(); 2207 cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock, 2208 timout); 2209 rdma_done_rm(qp, rd); 2210 if (cv_wait_ret < 0) { 2211 #ifdef DEBUG 2212 if (rib_debug > 1) { 2213 cmn_err(CE_WARN, "rib_send_resp: RDMA_DONE not" 2214 " recv'd for qp %p, xid:%u\n", 2215 (void *)qp, msgid); 2216 } 2217 #endif 2218 ret = RDMA_TIMEDOUT; 2219 goto done; 2220 } 2221 2222 done: 2223 mutex_exit(&qp->rdlist_lock); 2224 return (ret); 2225 } 2226 2227 static struct recv_wid * 2228 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2229 { 2230 struct recv_wid *rwid; 2231 2232 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2233 rwid->xid = msgid; 2234 rwid->addr = sgl->ds_va; 2235 rwid->qp = qp; 2236 2237 return (rwid); 2238 } 2239 2240 static void 2241 rib_free_wid(struct recv_wid *rwid) 2242 { 2243 kmem_free(rwid, sizeof (struct recv_wid)); 2244 } 2245 2246 rdma_stat 2247 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2248 { 2249 rib_qp_t *qp = ctoqp(conn); 2250 struct clist *clp = cl; 2251 struct reply *rep; 2252 struct recv_wid *rwid; 2253 int nds; 2254 ibt_wr_ds_t sgl[DSEG_MAX]; 2255 ibt_recv_wr_t recv_wr; 2256 rdma_stat ret; 2257 ibt_status_t ibt_status; 2258 2259 /* 2260 * rdma_clnt_postrecv uses RECV_BUFFER. 
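 * The clist is expected to carry exactly one RECV_BUFFER chunk:
 * a posted reply buffer maps to a single receive work-request SGL
 * entry, which the nds != 1 check below enforces.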
2261 */ 2262 2263 nds = 0; 2264 while (cl != NULL) { 2265 if (nds >= DSEG_MAX) { 2266 cmn_err(CE_WARN, "rib_clnt_post: DSEG_MAX too small!"); 2267 ret = RDMA_FAILED; 2268 goto done; 2269 } 2270 sgl[nds].ds_va = cl->c_saddr; 2271 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2272 sgl[nds].ds_len = cl->c_len; 2273 cl = cl->c_next; 2274 nds++; 2275 } 2276 2277 if (nds != 1) { 2278 cmn_err(CE_WARN, "rib_clnt_post: nds!=1\n"); 2279 ret = RDMA_FAILED; 2280 goto done; 2281 } 2282 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2283 recv_wr.wr_nds = nds; 2284 recv_wr.wr_sgl = sgl; 2285 2286 rwid = rib_create_wid(qp, &sgl[0], msgid); 2287 if (rwid) { 2288 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; 2289 } else { 2290 cmn_err(CE_WARN, "rib_clnt_post: out of memory"); 2291 ret = RDMA_NORESOURCE; 2292 goto done; 2293 } 2294 rep = rib_addreplylist(qp, msgid); 2295 if (!rep) { 2296 cmn_err(CE_WARN, "rib_clnt_post: out of memory"); 2297 rib_free_wid(rwid); 2298 ret = RDMA_NORESOURCE; 2299 goto done; 2300 } 2301 2302 mutex_enter(&conn->c_lock); 2303 if (conn->c_state & C_CONNECTED) { 2304 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2305 } 2306 if (((conn->c_state & C_CONNECTED) == 0) || 2307 ibt_status != IBT_SUCCESS) { 2308 mutex_exit(&conn->c_lock); 2309 #ifdef DEBUG 2310 cmn_err(CE_WARN, "rib_clnt_post: QPN %p failed in " 2311 "ibt_post_recv(), msgid=%d, status=%d", 2312 (void *)qp, msgid, ibt_status); 2313 #endif 2314 rib_free_wid(rwid); 2315 (void) rib_rem_rep(qp, rep); 2316 ret = RDMA_FAILED; 2317 goto done; 2318 } 2319 mutex_exit(&conn->c_lock); 2320 return (RDMA_SUCCESS); 2321 2322 done: 2323 while (clp != NULL) { 2324 rib_rbuf_free(conn, RECV_BUFFER, (void *)(uintptr_t)clp->c_saddr); 2325 clp = clp->c_next; 2326 } 2327 return (ret); 2328 } 2329 2330 rdma_stat 2331 rib_svc_post(CONN* conn, struct clist *cl) 2332 { 2333 rib_qp_t *qp = ctoqp(conn); 2334 struct svc_recv *s_recvp; 2335 int nds; 2336 ibt_wr_ds_t sgl[DSEG_MAX]; 2337 ibt_recv_wr_t recv_wr; 2338 ibt_status_t ibt_status; 2339 2340 nds = 0; 2341 while (cl != NULL) { 2342 if (nds >= DSEG_MAX) { 2343 cmn_err(CE_WARN, "rib_svc_post: DSEG_MAX too small!"); 2344 return (RDMA_FAILED); 2345 } 2346 sgl[nds].ds_va = cl->c_saddr; 2347 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2348 sgl[nds].ds_len = cl->c_len; 2349 cl = cl->c_next; 2350 nds++; 2351 } 2352 2353 if (nds != 1) { 2354 cmn_err(CE_WARN, "rib_svc_post: nds!=1\n"); 2355 rib_rbuf_free(conn, RECV_BUFFER, (caddr_t)(uintptr_t)sgl[0].ds_va); 2356 return (RDMA_FAILED); 2357 } 2358 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2359 recv_wr.wr_nds = nds; 2360 recv_wr.wr_sgl = sgl; 2361 2362 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2363 /* Use s_recvp's addr as wr id */ 2364 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; 2365 mutex_enter(&conn->c_lock); 2366 if (conn->c_state & C_CONNECTED) { 2367 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2368 } 2369 if (((conn->c_state & C_CONNECTED) == 0) || 2370 ibt_status != IBT_SUCCESS) { 2371 mutex_exit(&conn->c_lock); 2372 #ifdef DEBUG 2373 cmn_err(CE_WARN, "rib_svc_post: QP %p failed in " 2374 "ibt_post_recv(), status=%d", 2375 (void *)qp, ibt_status); 2376 #endif 2377 rib_rbuf_free(conn, RECV_BUFFER, 2378 (caddr_t)(uintptr_t)sgl[0].ds_va); 2379 (void) rib_free_svc_recv(s_recvp); 2380 return (RDMA_FAILED); 2381 } 2382 mutex_exit(&conn->c_lock); 2383 2384 return (RDMA_SUCCESS); 2385 } 2386 2387 /* Client */ 2388 rdma_stat 2389 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2390 { 2391 2392 return 
(rib_clnt_post(conn, cl, msgid)); 2393 } 2394 2395 /* Server */ 2396 rdma_stat 2397 rib_post_recv(CONN *conn, struct clist *cl) 2398 { 2399 rib_qp_t *qp = ctoqp(conn); 2400 2401 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2402 mutex_enter(&qp->posted_rbufs_lock); 2403 qp->n_posted_rbufs++; 2404 mutex_exit(&qp->posted_rbufs_lock); 2405 return (RDMA_SUCCESS); 2406 } 2407 return (RDMA_FAILED); 2408 } 2409 2410 /* 2411 * Client side only interface to "recv" the rpc reply buf 2412 * posted earlier by rib_post_resp(conn, cl, msgid). 2413 */ 2414 rdma_stat 2415 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2416 { 2417 struct reply *rep = NULL; 2418 clock_t timout, cv_wait_ret; 2419 rdma_stat ret = RDMA_SUCCESS; 2420 rib_qp_t *qp = ctoqp(conn); 2421 2422 /* 2423 * Find the reply structure for this msgid 2424 */ 2425 mutex_enter(&qp->replylist_lock); 2426 2427 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2428 if (rep->xid == msgid) 2429 break; 2430 } 2431 if (rep != NULL) { 2432 /* 2433 * If message not yet received, wait. 2434 */ 2435 if (rep->status == (uint_t)REPLY_WAIT) { 2436 timout = ddi_get_lbolt() + 2437 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2438 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2439 &qp->replylist_lock, timout)) > 0 && 2440 rep->status == (uint_t)REPLY_WAIT); 2441 2442 switch (cv_wait_ret) { 2443 case -1: /* timeout */ 2444 ret = RDMA_TIMEDOUT; 2445 break; 2446 case 0: 2447 ret = RDMA_INTR; 2448 break; 2449 default: 2450 break; 2451 } 2452 } 2453 2454 if (rep->status == RDMA_SUCCESS) { 2455 struct clist *cl = NULL; 2456 2457 /* 2458 * Got message successfully 2459 */ 2460 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2461 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); 2462 *clp = cl; 2463 } else { 2464 if (rep->status != (uint_t)REPLY_WAIT) { 2465 /* 2466 * Got error in reply message. Free 2467 * recv buffer here. 2468 */ 2469 ret = rep->status; 2470 rib_rbuf_free(conn, RECV_BUFFER, 2471 (caddr_t)(uintptr_t)rep->vaddr_cq); 2472 } 2473 } 2474 (void) rib_remreply(qp, rep); 2475 } else { 2476 /* 2477 * No matching reply structure found for given msgid on the 2478 * reply wait list. 2479 */ 2480 ret = RDMA_INVAL; 2481 #ifdef DEBUG 2482 cmn_err(CE_WARN, "rib_recv: no matching reply for " 2483 "xid %u, qp %p\n", msgid, (void *)qp); 2484 #endif 2485 } 2486 2487 /* 2488 * Done. 2489 */ 2490 mutex_exit(&qp->replylist_lock); 2491 return (ret); 2492 } 2493 2494 /* 2495 * RDMA write a buffer to the remote address. 2496 */ 2497 rdma_stat 2498 rib_write(CONN *conn, struct clist *cl, int wait) 2499 { 2500 ibt_send_wr_t tx_wr; 2501 int nds; 2502 int cv_sig; 2503 ibt_wr_ds_t sgl[DSEG_MAX]; 2504 struct send_wid *wdesc; 2505 ibt_status_t ibt_status; 2506 rdma_stat ret = RDMA_SUCCESS; 2507 rib_qp_t *qp = ctoqp(conn); 2508 2509 if (cl == NULL) { 2510 cmn_err(CE_WARN, "rib_write: NULL clist\n"); 2511 return (RDMA_FAILED); 2512 } 2513 2514 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2515 /* 2516 * Remote address is at the head chunk item in list. 
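 * Only the head chunk supplies c_daddr and c_dmemhandle (the rkey);
 * every chunk, head included, contributes a local SGL entry built
 * from c_saddr/c_smemhandle in the loop below.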
2517 */ 2518 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_daddr; 2519 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_dmemhandle.mrc_rmr; /* rkey */ 2520 2521 nds = 0; 2522 while (cl != NULL) { 2523 if (nds >= DSEG_MAX) { 2524 cmn_err(CE_WARN, "rib_write: DSEG_MAX too small!"); 2525 return (RDMA_FAILED); 2526 } 2527 sgl[nds].ds_va = cl->c_saddr; 2528 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2529 sgl[nds].ds_len = cl->c_len; 2530 cl = cl->c_next; 2531 nds++; 2532 } 2533 2534 if (wait) { 2535 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2536 cv_sig = 1; 2537 } else { 2538 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2539 cv_sig = 0; 2540 } 2541 2542 wdesc = rib_init_sendwait(0, cv_sig, qp); 2543 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2544 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2545 tx_wr.wr_trans = IBT_RC_SRV; 2546 tx_wr.wr_nds = nds; 2547 tx_wr.wr_sgl = sgl; 2548 2549 mutex_enter(&conn->c_lock); 2550 if (conn->c_state & C_CONNECTED) { 2551 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2552 } 2553 if (((conn->c_state & C_CONNECTED) == 0) || 2554 ibt_status != IBT_SUCCESS) { 2555 mutex_exit(&conn->c_lock); 2556 (void) rib_free_sendwait(wdesc); 2557 return (RDMA_FAILED); 2558 } 2559 mutex_exit(&conn->c_lock); 2560 2561 /* 2562 * Wait for send to complete 2563 */ 2564 if (wait) { 2565 ret = rib_sendwait(qp, wdesc); 2566 if (ret != 0) { 2567 return (ret); 2568 } 2569 } 2570 return (RDMA_SUCCESS); 2571 } 2572 2573 /* 2574 * RDMA Read a buffer from the remote address. 2575 */ 2576 rdma_stat 2577 rib_read(CONN *conn, struct clist *cl, int wait) 2578 { 2579 ibt_send_wr_t rx_wr; 2580 int nds; 2581 int cv_sig; 2582 ibt_wr_ds_t sgl[DSEG_MAX]; /* is 2 sufficient? */ 2583 struct send_wid *wdesc; 2584 ibt_status_t ibt_status = IBT_SUCCESS; 2585 rdma_stat ret = RDMA_SUCCESS; 2586 rib_qp_t *qp = ctoqp(conn); 2587 2588 if (cl == NULL) { 2589 cmn_err(CE_WARN, "rib_read: NULL clist\n"); 2590 return (RDMA_FAILED); 2591 } 2592 2593 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2594 /* 2595 * Remote address is at the head chunk item in list. 
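 * Note the direction reversal relative to rib_write(): here the
 * remote source is the head chunk's c_saddr/c_smemhandle (rkey),
 * while the local sink buffers come from each chunk's
 * c_daddr/c_dmemhandle (lkey).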
2596 */ 2597 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_saddr; 2598 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; /* rkey */ 2599 2600 nds = 0; 2601 while (cl != NULL) { 2602 if (nds >= DSEG_MAX) { 2603 cmn_err(CE_WARN, "rib_read: DSEG_MAX too small!"); 2604 return (RDMA_FAILED); 2605 } 2606 sgl[nds].ds_va = cl->c_daddr; 2607 sgl[nds].ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2608 sgl[nds].ds_len = cl->c_len; 2609 cl = cl->c_next; 2610 nds++; 2611 } 2612 2613 if (wait) { 2614 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2615 cv_sig = 1; 2616 } else { 2617 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2618 cv_sig = 0; 2619 } 2620 2621 wdesc = rib_init_sendwait(0, cv_sig, qp); 2622 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2623 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2624 rx_wr.wr_trans = IBT_RC_SRV; 2625 rx_wr.wr_nds = nds; 2626 rx_wr.wr_sgl = sgl; 2627 2628 mutex_enter(&conn->c_lock); 2629 if (conn->c_state & C_CONNECTED) { 2630 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2631 } 2632 if (((conn->c_state & C_CONNECTED) == 0) || 2633 ibt_status != IBT_SUCCESS) { 2634 mutex_exit(&conn->c_lock); 2635 #ifdef DEBUG 2636 if (rib_debug && ibt_status != IBT_SUCCESS) 2637 cmn_err(CE_WARN, "rib_read: FAILED post_sending RDMAR" 2638 " wr_id %llx on qp %p, status=%d", 2639 (longlong_t)rx_wr.wr_id, (void *)qp, 2640 ibt_status); 2641 #endif 2642 (void) rib_free_sendwait(wdesc); 2643 return (RDMA_FAILED); 2644 } 2645 mutex_exit(&conn->c_lock); 2646 2647 /* 2648 * Wait for send to complete 2649 */ 2650 if (wait) { 2651 ret = rib_sendwait(qp, wdesc); 2652 if (ret != 0) { 2653 return (ret); 2654 } 2655 } 2656 2657 return (RDMA_SUCCESS); 2658 } 2659 2660 int 2661 is_for_ipv4(ibt_ar_t *result) 2662 { 2663 int i, size = sizeof (struct in_addr); 2664 uint8_t zero = 0; 2665 2666 for (i = 0; i < (ATS_AR_DATA_LEN - size); i++) 2667 zero |= result->ar_data[i]; 2668 return (zero == 0); 2669 } 2670 2671 /* 2672 * rib_srv_cm_handler() 2673 * Connection Manager callback to handle RC connection requests. 2674 */ 2675 /* ARGSUSED */ 2676 static ibt_cm_status_t 2677 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2678 ibt_cm_return_args_t *ret_args, void *priv_data, 2679 ibt_priv_data_len_t len) 2680 { 2681 queue_t *q; 2682 rib_qp_t *qp; 2683 rpcib_state_t *ribstat; 2684 rib_hca_t *hca; 2685 rdma_stat status = RDMA_SUCCESS; 2686 int i; 2687 struct clist cl; 2688 rdma_buf_t rdbuf; 2689 void *buf = NULL; 2690 ibt_cm_req_rcv_t cm_req_rcv; 2691 CONN *conn; 2692 ibt_status_t ibt_status; 2693 ibt_ar_t ar_query, ar_result; 2694 ib_gid_t sgid; 2695 2696 2697 ASSERT(any != NULL); 2698 ASSERT(event != NULL); 2699 2700 ribstat = (rpcib_state_t *)any; 2701 hca = (rib_hca_t *)ribstat->hca; 2702 ASSERT(hca != NULL); 2703 2704 /* got a connection request */ 2705 switch (event->cm_type) { 2706 case IBT_CM_EVENT_REQ_RCV: 2707 /* 2708 * If the plugin is in the NO_ACCEPT state, bail out. 2709 */ 2710 mutex_enter(&plugin_state_lock); 2711 if (plugin_state == NO_ACCEPT) { 2712 mutex_exit(&plugin_state_lock); 2713 return (IBT_CM_REJECT); 2714 } 2715 mutex_exit(&plugin_state_lock); 2716 2717 /* 2718 * Need to send a MRA MAD to CM so that it does not 2719 * timeout on us. 
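 * The ibt_cm_delay() call below asks the CM for more time (a
 * multiple of the client's request timeout) so that channel creation
 * and receive-buffer pre-posting can finish before the CM gives up.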
2720 */ 2721 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2722 event->cm_event.req.req_timeout * 8, NULL, 0); 2723 2724 mutex_enter(&rib_stat->open_hca_lock); 2725 q = rib_stat->q; 2726 mutex_exit(&rib_stat->open_hca_lock); 2727 status = rib_svc_create_chan(hca, (caddr_t)q, 2728 event->cm_event.req.req_prim_hca_port, &qp); 2729 if (status) { 2730 #ifdef DEBUG 2731 cmn_err(CE_WARN, "rib_srv_cm_handler: " 2732 "create_channel failed %d", status); 2733 #endif 2734 return (IBT_CM_REJECT); 2735 } 2736 cm_req_rcv = event->cm_event.req; 2737 2738 #ifdef DEBUG 2739 if (rib_debug > 2) { 2740 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2741 "server recv'ed IBT_CM_EVENT_REQ_RCV\n"); 2742 cmn_err(CE_NOTE, "\t\t SID:%llx\n", 2743 (longlong_t)cm_req_rcv.req_service_id); 2744 cmn_err(CE_NOTE, "\t\t Local Port:%d\n", 2745 cm_req_rcv.req_prim_hca_port); 2746 cmn_err(CE_NOTE, 2747 "\t\t Remote GID:(prefix:%llx,guid:%llx)\n", 2748 (longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_prefix, 2749 (longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_guid); 2750 cmn_err(CE_NOTE, "\t\t Local GID:(prefix:%llx,guid:%llx)\n", 2751 (longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_prefix, 2752 (longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_guid); 2753 cmn_err(CE_NOTE, "\t\t Remote QPN:%u\n", 2754 cm_req_rcv.req_remote_qpn); 2755 cmn_err(CE_NOTE, "\t\t Remote Q_Key:%x\n", 2756 cm_req_rcv.req_remote_qkey); 2757 cmn_err(CE_NOTE, "\t\t Local QP %p (qp_hdl=%p)\n", 2758 (void *)qp, (void *)qp->qp_hdl); 2759 } 2760 2761 if (rib_debug > 2) { 2762 ibt_rc_chan_query_attr_t chan_attrs; 2763 2764 if (ibt_query_rc_channel(qp->qp_hdl, &chan_attrs) 2765 == IBT_SUCCESS) { 2766 cmn_err(CE_NOTE, "rib_svc_cm_handler: qp %p in " 2767 "CEP state %d\n", (void *)qp, chan_attrs.rc_state); 2768 } 2769 } 2770 #endif 2771 2772 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2773 ret_args->cm_ret.rep.cm_rdma_ra_out = 1; 2774 ret_args->cm_ret.rep.cm_rdma_ra_in = 1; 2775 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2776 2777 /* 2778 * Pre-posts RECV buffers 2779 */ 2780 conn = qptoc(qp); 2781 for (i = 0; i < preposted_rbufs; i++) { 2782 bzero(&rdbuf, sizeof (rdbuf)); 2783 rdbuf.type = RECV_BUFFER; 2784 buf = rib_rbuf_alloc(conn, &rdbuf); 2785 if (buf == NULL) { 2786 cmn_err(CE_WARN, "rib_svc_cm_handler: " 2787 "No RECV_BUFFER buf!\n"); 2788 (void) rib_disconnect_channel(conn, NULL); 2789 return (IBT_CM_REJECT); 2790 } 2791 2792 bzero(&cl, sizeof (cl)); 2793 cl.c_saddr = (uintptr_t)rdbuf.addr; 2794 cl.c_len = rdbuf.len; 2795 cl.c_smemhandle.mrc_lmr = rdbuf.handle.mrc_lmr; /* lkey */ 2796 cl.c_next = NULL; 2797 status = rib_post_recv(conn, &cl); 2798 if (status != RDMA_SUCCESS) { 2799 cmn_err(CE_WARN, "rib_srv_cm_handler: failed " 2800 "posting RPC_REQ buf to qp %p!", (void *)qp); 2801 (void) rib_disconnect_channel(conn, NULL); 2802 return (IBT_CM_REJECT); 2803 } 2804 } 2805 (void) rib_add_connlist(conn, &hca->srv_conn_list); 2806 2807 /* 2808 * Get the address translation service record from ATS 2809 */ 2810 rw_enter(&hca->state_lock, RW_READER); 2811 if (hca->state == HCA_DETACHED) { 2812 rw_exit(&hca->state_lock); 2813 return (IBT_CM_REJECT); 2814 } 2815 rw_exit(&hca->state_lock); 2816 2817 for (i = 0; i < hca->hca_nports; i++) { 2818 ibt_status = ibt_get_port_state(hca->hca_hdl, i+1, 2819 &sgid, NULL); 2820 if (ibt_status != IBT_SUCCESS) { 2821 if (rib_debug) { 2822 cmn_err(CE_WARN, "rib_srv_cm_handler: " 2823 "ibt_get_port_state FAILED!" 
2824 "status = %d\n", ibt_status); 2825 } 2826 } else { 2827 /* 2828 * do ibt_query_ar() 2829 */ 2830 bzero(&ar_query, sizeof (ar_query)); 2831 bzero(&ar_result, sizeof (ar_result)); 2832 ar_query.ar_gid = cm_req_rcv.req_prim_addr.av_dgid; 2833 ar_query.ar_pkey = event->cm_event.req.req_pkey; 2834 ibt_status = ibt_query_ar(&sgid, &ar_query, 2835 &ar_result); 2836 if (ibt_status != IBT_SUCCESS) { 2837 if (rib_debug) { 2838 cmn_err(CE_WARN, "rib_srv_cm_handler: " 2839 "ibt_query_ar FAILED!" 2840 "status = %d\n", ibt_status); 2841 } 2842 } else { 2843 conn = qptoc(qp); 2844 2845 if (is_for_ipv4(&ar_result)) { 2846 struct sockaddr_in *s; 2847 int sin_size = sizeof (struct sockaddr_in); 2848 int in_size = sizeof (struct in_addr); 2849 uint8_t *start_pos; 2850 2851 conn->c_raddr.maxlen = 2852 conn->c_raddr.len = sin_size; 2853 conn->c_raddr.buf = kmem_zalloc(sin_size, 2854 KM_SLEEP); 2855 s = (struct sockaddr_in *)conn->c_raddr.buf; 2856 s->sin_family = AF_INET; 2857 /* 2858 * For IPv4, the IP addr is stored in 2859 * the last four bytes of ar_data. 2860 */ 2861 start_pos = ar_result.ar_data + 2862 ATS_AR_DATA_LEN - in_size; 2863 bcopy(start_pos, &s->sin_addr, in_size); 2864 if (rib_debug > 1) { 2865 char print_addr[INET_ADDRSTRLEN]; 2866 2867 bzero(print_addr, INET_ADDRSTRLEN); 2868 (void) inet_ntop(AF_INET, &s->sin_addr, 2869 print_addr, INET_ADDRSTRLEN); 2870 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2871 "remote clnt_addr: %s\n", print_addr); 2872 } 2873 } else { 2874 struct sockaddr_in6 *s6; 2875 int sin6_size = sizeof (struct sockaddr_in6); 2876 2877 conn->c_raddr.maxlen = 2878 conn->c_raddr.len = sin6_size; 2879 conn->c_raddr.buf = kmem_zalloc(sin6_size, 2880 KM_SLEEP); 2881 2882 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 2883 s6->sin6_family = AF_INET6; 2884 /* sin6_addr is stored in ar_data */ 2885 bcopy(ar_result.ar_data, &s6->sin6_addr, 2886 sizeof (struct in6_addr)); 2887 if (rib_debug > 1) { 2888 char print_addr[INET6_ADDRSTRLEN]; 2889 2890 bzero(print_addr, INET6_ADDRSTRLEN); 2891 (void) inet_ntop(AF_INET6, &s6->sin6_addr, 2892 print_addr, INET6_ADDRSTRLEN); 2893 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2894 "remote clnt_addr: %s\n", print_addr); 2895 } 2896 } 2897 return (IBT_CM_ACCEPT); 2898 } 2899 } 2900 } 2901 if (rib_debug > 1) { 2902 cmn_err(CE_WARN, "rib_srv_cm_handler: " 2903 "address record query failed!"); 2904 } 2905 break; 2906 2907 case IBT_CM_EVENT_CONN_CLOSED: 2908 { 2909 CONN *conn; 2910 rib_qp_t *qp; 2911 2912 switch (event->cm_event.closed) { 2913 case IBT_CM_CLOSED_DREP_RCVD: 2914 case IBT_CM_CLOSED_DREQ_TIMEOUT: 2915 case IBT_CM_CLOSED_DUP: 2916 case IBT_CM_CLOSED_ABORT: 2917 case IBT_CM_CLOSED_ALREADY: 2918 /* 2919 * These cases indicate the local end initiated 2920 * the closing of the channel. Nothing to do here. 2921 */ 2922 break; 2923 default: 2924 /* 2925 * Reason for CONN_CLOSED event must be one of 2926 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 2927 * or IBT_CM_CLOSED_STALE. These indicate cases were 2928 * the remote end is closing the channel. In these 2929 * cases free the channel and transition to error 2930 * state 2931 */ 2932 qp = ibt_get_chan_private(event->cm_channel); 2933 conn = qptoc(qp); 2934 mutex_enter(&conn->c_lock); 2935 if (conn->c_state == C_DISCONN_PEND) { 2936 mutex_exit(&conn->c_lock); 2937 break; 2938 } 2939 conn->c_state = C_ERROR; 2940 2941 /* 2942 * Free the rc_channel. Channel has already 2943 * transitioned to ERROR state and WRs have been 2944 * FLUSHED_ERR already. 
2945 */ 2946 (void) ibt_free_channel(qp->qp_hdl); 2947 qp->qp_hdl = NULL; 2948 2949 /* 2950 * Free the conn if c_ref goes down to 0 2951 */ 2952 if (conn->c_ref == 0) { 2953 /* 2954 * Remove from list and free conn 2955 */ 2956 conn->c_state = C_DISCONN_PEND; 2957 mutex_exit(&conn->c_lock); 2958 (void) rib_disconnect_channel(conn, 2959 &hca->srv_conn_list); 2960 } else { 2961 mutex_exit(&conn->c_lock); 2962 } 2963 #ifdef DEBUG 2964 if (rib_debug) 2965 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2966 " (CONN_CLOSED) channel disconnected"); 2967 #endif 2968 break; 2969 } 2970 break; 2971 } 2972 case IBT_CM_EVENT_CONN_EST: 2973 /* 2974 * RTU received, hence connection established. 2975 */ 2976 if (rib_debug > 1) 2977 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2978 "(CONN_EST) channel established"); 2979 break; 2980 2981 default: 2982 if (rib_debug > 2) { 2983 /* Let CM handle the following events. */ 2984 if (event->cm_type == IBT_CM_EVENT_REP_RCV) { 2985 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2986 "server recv'ed IBT_CM_EVENT_REP_RCV\n"); 2987 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) { 2988 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2989 "server recv'ed IBT_CM_EVENT_LAP_RCV\n"); 2990 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) { 2991 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2992 "server recv'ed IBT_CM_EVENT_MRA_RCV\n"); 2993 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) { 2994 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2995 "server recv'ed IBT_CM_EVENT_APR_RCV\n"); 2996 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) { 2997 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2998 "server recv'ed IBT_CM_EVENT_FAILURE\n"); 2999 } 3000 } 3001 return (IBT_CM_REJECT); 3002 } 3003 3004 /* accept all other CM messages (i.e. let the CM handle them) */ 3005 return (IBT_CM_ACCEPT); 3006 } 3007 3008 static rdma_stat 3009 rib_register_ats(rib_hca_t *hca) 3010 { 3011 ibt_hca_portinfo_t *port_infop; 3012 uint_t port_size; 3013 uint_t pki, i, num_ports, nbinds; 3014 ibt_status_t ibt_status; 3015 rib_service_t *new_service, *temp_srv; 3016 rpcib_ats_t *atsp; 3017 rpcib_ibd_insts_t ibds; 3018 ib_pkey_t pkey; 3019 ibt_ar_t ar; /* address record */ 3020 3021 /* 3022 * Query all ports for the given HCA 3023 */ 3024 rw_enter(&hca->state_lock, RW_READER); 3025 if (hca->state != HCA_DETACHED) { 3026 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop, 3027 &num_ports, &port_size); 3028 rw_exit(&hca->state_lock); 3029 } else { 3030 rw_exit(&hca->state_lock); 3031 return (RDMA_FAILED); 3032 } 3033 if (ibt_status != IBT_SUCCESS) { 3034 #ifdef DEBUG 3035 if (rib_debug) { 3036 cmn_err(CE_NOTE, "rib_register_ats: FAILED in " 3037 "ibt_query_hca_ports, status = %d\n", ibt_status); 3038 } 3039 #endif 3040 return (RDMA_FAILED); 3041 } 3042 3043 #ifdef DEBUG 3044 if (rib_debug > 1) { 3045 cmn_err(CE_NOTE, "rib_register_ats: Ports detected " 3046 "%d\n", num_ports); 3047 3048 for (i = 0; i < num_ports; i++) { 3049 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) { 3050 cmn_err(CE_WARN, "rib_register_ats " 3051 "Port #: %d INACTIVE\n", i+1); 3052 } else if (port_infop[i].p_linkstate == 3053 IBT_PORT_ACTIVE) { 3054 cmn_err(CE_NOTE, "rib_register_ats " 3055 "Port #: %d ACTIVE\n", i+1); 3056 } 3057 } 3058 } 3059 #endif 3060 3061 ibds.rib_ibd_alloc = N_IBD_INSTANCES; 3062 ibds.rib_ibd_cnt = 0; 3063 ibds.rib_ats = (rpcib_ats_t *)kmem_zalloc(ibds.rib_ibd_alloc * 3064 sizeof (rpcib_ats_t), KM_SLEEP); 3065 rib_get_ibd_insts(&ibds); 3066 3067 if (ibds.rib_ibd_cnt == 0) { 3068 kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc * 
3069 sizeof (rpcib_ats_t)); 3070 ibt_free_portinfo(port_infop, port_size); 3071 return (RDMA_FAILED); 3072 } 3073 3074 /* 3075 * Get the IP addresses of active ports and 3076 * register them with ATS. IPv4 addresses 3077 * have precedence over IPv6 addresses. 3078 */ 3079 if (get_ibd_ipaddr(&ibds) != 0) { 3080 #ifdef DEBUG 3081 if (rib_debug > 1) { 3082 cmn_err(CE_WARN, "rib_register_ats: " 3083 "get_ibd_ipaddr failed"); 3084 } 3085 #endif 3086 kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc * 3087 sizeof (rpcib_ats_t)); 3088 ibt_free_portinfo(port_infop, port_size); 3089 return (RDMA_FAILED); 3090 } 3091 3092 /* 3093 * Start ATS registration for active ports on this HCA. 3094 */ 3095 rw_enter(&hca->service_list_lock, RW_WRITER); 3096 nbinds = 0; 3097 new_service = NULL; 3098 for (i = 0; i < num_ports; i++) { 3099 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) 3100 continue; 3101 3102 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) { 3103 pkey = port_infop[i].p_pkey_tbl[pki]; 3104 if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) { 3105 ar.ar_gid = port_infop[i].p_sgid_tbl[0]; 3106 ar.ar_pkey = pkey; 3107 atsp = get_ibd_entry(&ar.ar_gid, pkey, &ibds); 3108 if (atsp == NULL) 3109 continue; 3110 /* 3111 * store the sin[6]_addr in ar_data 3112 */ 3113 (void) bzero(ar.ar_data, ATS_AR_DATA_LEN); 3114 if (atsp->ras_inet_type == AF_INET) { 3115 uint8_t *start_pos; 3116 3117 /* 3118 * The ipv4 addr goes into the last 3119 * four bytes of ar_data. 3120 */ 3121 start_pos = ar.ar_data + ATS_AR_DATA_LEN - 3122 sizeof (struct in_addr); 3123 bcopy(&atsp->ras_sin.sin_addr, start_pos, 3124 sizeof (struct in_addr)); 3125 } else if (atsp->ras_inet_type == AF_INET6) { 3126 bcopy(&atsp->ras_sin6.sin6_addr, ar.ar_data, 3127 sizeof (struct in6_addr)); 3128 } else 3129 continue; 3130 3131 ibt_status = ibt_register_ar(hca->ibt_clnt_hdl, &ar); 3132 if (ibt_status == IBT_SUCCESS) { 3133 #ifdef DEBUG 3134 if (rib_debug > 1) { 3135 cmn_err(CE_WARN, "rib_register_ats: " 3136 "ibt_register_ar OK on port %d", i+1); 3137 } 3138 #endif 3139 /* 3140 * Allocate and prepare a service entry 3141 */ 3142 new_service = kmem_zalloc(sizeof (rib_service_t), 3143 KM_SLEEP); 3144 new_service->srv_port = i + 1; 3145 new_service->srv_ar = ar; 3146 new_service->srv_next = NULL; 3147 3148 /* 3149 * Add to the service list for this HCA 3150 */ 3151 new_service->srv_next = hca->ats_list; 3152 hca->ats_list = new_service; 3153 new_service = NULL; 3154 nbinds ++; 3155 } else { 3156 #ifdef DEBUG 3157 if (rib_debug > 1) { 3158 cmn_err(CE_WARN, "rib_register_ats: " 3159 "ibt_register_ar FAILED on port %d", i+1); 3160 } 3161 #endif 3162 } 3163 } 3164 } 3165 } 3166 3167 #ifdef DEBUG 3168 if (rib_debug > 1) { 3169 for (temp_srv = hca->ats_list; temp_srv != NULL; 3170 temp_srv = temp_srv->srv_next) { 3171 cmn_err(CE_NOTE, "Service: ATS, active on" 3172 " port: %d\n", temp_srv->srv_port); 3173 } 3174 } 3175 #endif 3176 3177 rw_exit(&hca->service_list_lock); 3178 kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc * sizeof (rpcib_ats_t)); 3179 ibt_free_portinfo(port_infop, port_size); 3180 3181 if (nbinds == 0) { 3182 #ifdef DEBUG 3183 if (rib_debug > 1) { 3184 cmn_err(CE_WARN, "rib_register_ats FAILED!\n"); 3185 } 3186 #endif 3187 return (RDMA_FAILED); 3188 } 3189 return (RDMA_SUCCESS); 3190 } 3191 3192 static rdma_stat 3193 rib_register_service(rib_hca_t *hca, int service_type) 3194 { 3195 ibt_srv_desc_t sdesc; 3196 ibt_srv_bind_t sbind; 3197 ibt_hca_portinfo_t *port_infop; 3198 ib_svc_id_t srv_id; 3199 ibt_srv_hdl_t srv_hdl; 3200 uint_t 
port_size;
3201     uint_t pki, i, j, num_ports, nbinds;
3202     ibt_status_t ibt_status;
3203     char **addrs;
3204     int addr_count;
3205     rib_service_t *new_service, *temp_srv;
3206     ib_pkey_t pkey;
3207
3208     /*
3209      * Query all ports for the given HCA
3210      */
3211     rw_enter(&hca->state_lock, RW_READER);
3212     if (hca->state != HCA_DETACHED) {
3213         ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3214             &num_ports, &port_size);
3215         rw_exit(&hca->state_lock);
3216     } else {
3217         rw_exit(&hca->state_lock);
3218         return (RDMA_FAILED);
3219     }
3220     if (ibt_status != IBT_SUCCESS) {
3221 #ifdef DEBUG
3222         cmn_err(CE_NOTE, "rib_register_service: FAILED in "
3223             "ibt_query_hca_ports, status = %d\n", ibt_status);
3224 #endif
3225         return (RDMA_FAILED);
3226     }
3227
3228 #ifdef DEBUG
3229     if (rib_debug > 1) {
3230         cmn_err(CE_NOTE, "rib_register_service: Ports detected "
3231             "%d\n", num_ports);
3232
3233         for (i = 0; i < num_ports; i++) {
3234             if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3235                 cmn_err(CE_WARN, "rib_register_service "
3236                     "Port #: %d INACTIVE\n", i+1);
3237             } else if (port_infop[i].p_linkstate ==
3238                 IBT_PORT_ACTIVE) {
3239                 cmn_err(CE_NOTE, "rib_register_service "
3240                     "Port #: %d ACTIVE\n", i+1);
3241             }
3242         }
3243     }
3244 #endif
3245     /*
3246      * Get all the IP addresses on this system to register the
3247      * given "service type" on all DNS-recognized IP addrs.
3248      * Each service type, such as NFS, will have all of the
3249      * system's IP addresses as its different names.  For now the
3250      * only type of service we support in RPCIB is NFS.
3251      */
3252     addrs = get_ip_addrs(&addr_count);
3253     if (addrs == NULL) {
3254 #ifdef DEBUG
3255         if (rib_debug) {
3256             cmn_err(CE_WARN, "rib_register_service: "
3257                 "get_ip_addrs failed\n");
3258         }
3259 #endif
3260         ibt_free_portinfo(port_infop, port_size);
3261         return (RDMA_FAILED);
3262     }
3263
3264 #ifdef DEBUG
3265     if (rib_debug > 1) {
3266         for (i = 0; i < addr_count; i++)
3267             cmn_err(CE_NOTE, "addr %d: %s\n", i, addrs[i]);
3268     }
3269 #endif
3270
3271     rw_enter(&hca->service_list_lock, RW_WRITER);
3272     /*
3273      * Start registering the service and binding it on the
3274      * active ports of this HCA.
3275      */
3276     nbinds = 0;
3277     new_service = NULL;
3278
3279     /*
3280      * We use IP addresses as the service names for
3281      * service registration.  Register each of them
3282      * with CM to obtain a svc_id and svc_hdl.  We do not
3283      * register the service with the machine's loopback address.
3284      */
3285     for (j = 1; j < addr_count; j++) {
3286         (void) bzero(&srv_id, sizeof (ib_svc_id_t));
3287         (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3288         (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3289
3290         sdesc.sd_handler = rib_srv_cm_handler;
3291         sdesc.sd_flags = 0;
3292
3293         ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3294             &sdesc, 0, 1, &srv_hdl, &srv_id);
3295         if (ibt_status != IBT_SUCCESS) {
3296 #ifdef DEBUG
3297             if (rib_debug) {
3298                 cmn_err(CE_WARN, "rib_register_service: "
3299                     "ibt_register_service FAILED, status "
3300                     "= %d\n", ibt_status);
3301             }
3302 #endif
3303             /*
3304              * No need to go on, since we failed to obtain
3305              * a srv_id and srv_hdl.  Move on to the next
3306              * IP addr as a service name.
3307              */
3308             continue;
3309         }
3310         for (i = 0; i < num_ports; i++) {
3311             if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3312                 continue;
3313
3314             for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3315                 pkey = port_infop[i].p_pkey_tbl[pki];
3316                 if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) {
3317
3318                     /*
3319                      * Allocate and prepare a service entry
3320                      */
3321                     new_service = kmem_zalloc(1 * sizeof (rib_service_t),
3322                         KM_SLEEP);
3323                     new_service->srv_type = service_type;
3324                     new_service->srv_port = i + 1;
3325                     new_service->srv_id = srv_id;
3326                     new_service->srv_hdl = srv_hdl;
3327                     new_service->srv_sbind_hdl = kmem_zalloc(1 *
3328                         sizeof (ibt_sbind_hdl_t), KM_SLEEP);
3329
3330                     new_service->srv_name = kmem_zalloc(IB_SVC_NAME_LEN,
3331                         KM_SLEEP);
3332                     (void) bcopy(addrs[j], new_service->srv_name,
3333                         IB_SVC_NAME_LEN);
3334                     (void) strlcat(new_service->srv_name, "::NFS",
3335                         IB_SVC_NAME_LEN);
3336                     new_service->srv_next = NULL;
3337
3338                     /*
3339                      * Bind the service, specified by the IP address,
3340                      * to the port/pkey using the srv_hdl returned
3341                      * from ibt_register_service().
3342                      */
3343                     (void) bzero(&sbind, sizeof (ibt_srv_bind_t));
3344                     sbind.sb_pkey = pkey;
3345                     sbind.sb_lease = 0xFFFFFFFF;
3346                     sbind.sb_key[0] = NFS_SEC_KEY0;
3347                     sbind.sb_key[1] = NFS_SEC_KEY1;
3348                     sbind.sb_name = new_service->srv_name;
3349
3350 #ifdef DEBUG
3351                     if (rib_debug > 1) {
3352                         cmn_err(CE_NOTE, "rib_register_service: "
3353                             "binding service using name: %s\n",
3354                             sbind.sb_name);
3355                     }
3356 #endif
3357                     ibt_status = ibt_bind_service(srv_hdl,
3358                         port_infop[i].p_sgid_tbl[0], &sbind, rib_stat,
3359                         new_service->srv_sbind_hdl);
3360                     if (ibt_status != IBT_SUCCESS) {
3361 #ifdef DEBUG
3362                         if (rib_debug) {
3363                             cmn_err(CE_WARN, "rib_register_service: FAILED"
3364                                 " in ibt_bind_service, status = %d\n",
3365                                 ibt_status);
3366                         }
3367 #endif
3368                         kmem_free(new_service->srv_sbind_hdl,
3369                             sizeof (ibt_sbind_hdl_t));
3370                         kmem_free(new_service->srv_name,
3371                             IB_SVC_NAME_LEN);
3372                         kmem_free(new_service,
3373                             sizeof (rib_service_t));
3374                         new_service = NULL;
3375                         continue;
3376                     }
3377 #ifdef DEBUG
3378                     if (rib_debug > 1) {
3379                         if (ibt_status == IBT_SUCCESS)
3380                             cmn_err(CE_NOTE, "rib_register_service: "
3381                                 "Serv: %s REGISTERED on port: %d",
3382                                 sbind.sb_name, i+1);
3383                     }
3384 #endif
3385                     /*
3386                      * Add to the service list for this HCA
3387                      */
3388                     new_service->srv_next = hca->service_list;
3389                     hca->service_list = new_service;
3390                     new_service = NULL;
3391                     nbinds++;
3392                 }
3393             }
3394         }
3395     }
3396     rw_exit(&hca->service_list_lock);
3397
3398 #ifdef DEBUG
3399     if (rib_debug > 1) {
3400         /*
3401          * Change this print to a more generic one, as rpcib
3402          * is supposed to handle multiple service types.
3403          */
3404         for (temp_srv = hca->service_list; temp_srv != NULL;
3405             temp_srv = temp_srv->srv_next) {
3406             cmn_err(CE_NOTE, "NFS-IB, active on port:"
3407                 " %d\n"
3408                 "Using name: %s", temp_srv->srv_port,
3409                 temp_srv->srv_name);
3410         }
3411     }
3412 #endif
3413
3414     ibt_free_portinfo(port_infop, port_size);
3415     for (i = 0; i < addr_count; i++) {
3416         if (addrs[i])
3417             kmem_free(addrs[i], IB_SVC_NAME_LEN);
3418     }
3419     kmem_free(addrs, addr_count * sizeof (char *));
3420
3421     if (nbinds == 0) {
3422 #ifdef DEBUG
3423         if (rib_debug) {
3424             cmn_err(CE_WARN, "rib_register_service: "
3425                 "bind_service FAILED!\n");
3426         }
3427 #endif
3428         return (RDMA_FAILED);
3429     } else {
3430         /*
3431          * Put this plugin into accept state, since at least
3432          * one registration was successful.
3433          */
3434         mutex_enter(&plugin_state_lock);
3435         plugin_state = ACCEPT;
3436         mutex_exit(&plugin_state_lock);
3437         return (RDMA_SUCCESS);
3438     }
3439 }
3440
3441 void
3442 rib_listen(struct rdma_svc_data *rd)
3443 {
3444     rdma_stat status = RDMA_SUCCESS;
3445
3446     rd->active = 0;
3447     rd->err_code = RDMA_FAILED;
3448
3449     /*
3450      * First check if an HCA is still attached
3451      */
3452     rw_enter(&rib_stat->hca->state_lock, RW_READER);
3453     if (rib_stat->hca->state != HCA_INITED) {
3454         rw_exit(&rib_stat->hca->state_lock);
3455         return;
3456     }
3457     rw_exit(&rib_stat->hca->state_lock);
3458
3459     rib_stat->q = &rd->q;
3460     /*
3461      * Register the Address translation service
3462      */
3463     mutex_enter(&rib_stat->open_hca_lock);
3464     if (ats_running == 0) {
3465         if (rib_register_ats(rib_stat->hca) != RDMA_SUCCESS) {
3466 #ifdef DEBUG
3467             if (rib_debug) {
3468                 cmn_err(CE_WARN,
3469                     "rib_listen(): ats registration failed!");
3470             }
3471 #endif
3472             mutex_exit(&rib_stat->open_hca_lock);
3473             return;
3474         } else {
3475             ats_running = 1;
3476         }
3477     }
3478     mutex_exit(&rib_stat->open_hca_lock);
3479
3480     /*
3481      * Right now the only service type is NFS, so the value is
3482      * hard-coded here.  Ideally the service type should be
3483      * passed down in rdma_svc_data.
3484      */
3485     rib_stat->service_type = NFS;
3486     status = rib_register_service(rib_stat->hca, NFS);
3487     if (status != RDMA_SUCCESS) {
3488         rd->err_code = status;
3489         return;
3490     }
3491     /*
3492      * The service is now active on an HCA; rd->err_code carries
3493      * the detailed status.
3494      */
3495     rd->active = 1;
3496     rd->err_code = status;
3497 }
3498
3499 /* XXXX */
3500 /* ARGSUSED */
3501 static void
3502 rib_listen_stop(struct rdma_svc_data *svcdata)
3503 {
3504     rib_hca_t *hca;
3505
3506     /*
3507      * KRPC called the RDMATF to stop the listeners.  This means we
3508      * stop passing incoming or received requests up to the KRPC
3509      * master transport handle for RDMA-IB.  It also means that the
3510      * master transport handle, responsible for us, is going away.
3511      */
3512     mutex_enter(&plugin_state_lock);
3513     plugin_state = NO_ACCEPT;
3514     if (svcdata != NULL)
3515         svcdata->active = 0;
3516     mutex_exit(&plugin_state_lock);
3517
3518     /*
3519      * First check if an HCA is still attached
3520      */
3521     hca = rib_stat->hca;
3522     rw_enter(&hca->state_lock, RW_READER);
3523     if (hca->state != HCA_INITED) {
3524         rw_exit(&hca->state_lock);
3525         return;
3526     }
3527     rib_stop_services(hca);
3528     rw_exit(&hca->state_lock);
3529 }
3530
3531 /*
3532  * Traverse the HCA's service list to unbind and deregister services.
3533  * Instead of unbinding the service for a service handle by
3534  * calling ibt_unbind_service() for each port/pkey, we unbind
3535  * all the services for the service handle by making only one
3536  * call to ibt_unbind_all_services().  Then, we deregister the
3537  * service for the service handle.
3538  *
3539  * When traversing the entries in service_list, we compare the
3540  * srv_hdl of the current entry with that of the next.  If they
3541  * are different, or if the next entry is NULL, the current entry
3542  * marks the last binding of the service handle.  In this case,
3543  * call ibt_unbind_all_services() and deregister the service for
3544  * the service handle.  If they are the same, the current and the
3545  * next entries are bound to the same service handle, so move on
3546  * to the next entry.  A reduced sketch of this idiom follows.
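 */

/*
 * The traversal idiom described above, reduced to a self-contained,
 * hypothetical sketch (ex_srv and example_teardown are not part of
 * this file).  It relies on bindings that share a handle being
 * adjacent in the list, which rib_register_service() guarantees by
 * prepending all bindings for one handle consecutively.
 */
struct ex_srv {
    void *handle;
    struct ex_srv *next;
};

static void
example_teardown(struct ex_srv *list, void (*teardown)(void *))
{
    struct ex_srv *cur;

    for (cur = list; cur != NULL; cur = cur->next) {
        /* last binding for this handle?  then tear it down once */
        if (cur->next == NULL || cur->next->handle != cur->handle)
            teardown(cur->handle);
    }
}

/*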
3547  */
3548 static void
3549 rib_stop_services(rib_hca_t *hca)
3550 {
3551     rib_service_t *srv_list, *to_remove;
3552     ibt_status_t ibt_status;
3553
3554     /*
3555      * Unbind and deregister the services for this service type.
3556      * Right now there is only one service type; in the future it
3557      * will be passed down to this function.
3558      */
3559     rw_enter(&hca->service_list_lock, RW_WRITER);
3560     srv_list = hca->service_list;
3561     while (srv_list != NULL) {
3562         to_remove = srv_list;
3563         srv_list = to_remove->srv_next;
3564         if (srv_list == NULL || bcmp(to_remove->srv_hdl,
3565             srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) {
3566
3567             ibt_status = ibt_unbind_all_services(to_remove->srv_hdl);
3568             if (ibt_status != IBT_SUCCESS) {
3569                 cmn_err(CE_WARN, "rib_listen_stop: "
3570                     "ibt_unbind_all_services FAILED"
3571                     " status: %d\n", ibt_status);
3572             }
3573
3574             ibt_status =
3575                 ibt_deregister_service(hca->ibt_clnt_hdl,
3576                 to_remove->srv_hdl);
3577             if (ibt_status != IBT_SUCCESS) {
3578                 cmn_err(CE_WARN, "rib_listen_stop: "
3579                     "ibt_deregister_service FAILED"
3580                     " status: %d\n", ibt_status);
3581             }
3582
3583 #ifdef DEBUG
3584             if (rib_debug > 1) {
3585                 if (ibt_status == IBT_SUCCESS)
3586                     cmn_err(CE_NOTE, "rib_listen_stop: "
3587                         "Successfully stopped and"
3588                         " UNREGISTERED service: %s\n",
3589                         to_remove->srv_name);
3590             }
3591 #endif
3592         }
3593         kmem_free(to_remove->srv_name, IB_SVC_NAME_LEN);
3594         kmem_free(to_remove->srv_sbind_hdl,
3595             sizeof (ibt_sbind_hdl_t));
3596
3597         kmem_free(to_remove, sizeof (rib_service_t));
3598     }
3599     hca->service_list = NULL;
3600     rw_exit(&hca->service_list_lock);
3601 }
3602
3603 static struct svc_recv *
3604 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3605 {
3606     struct svc_recv *recvp;
3607
3608     recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3609     recvp->vaddr = sgl->ds_va;
3610     recvp->qp = qp;
3611     recvp->bytes_xfer = 0;
3612     return (recvp);
3613 }
3614
3615 static int
3616 rib_free_svc_recv(struct svc_recv *recvp)
3617 {
3618     kmem_free(recvp, sizeof (*recvp));
3619
3620     return (0);
3621 }
3622
3623 static struct reply *
3624 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3625 {
3626     struct reply *rep;
3627
3628
3629     rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3630     if (rep == NULL) {
3631         /* replylist_lock is not held here, so do not drop it */
3632         cmn_err(CE_WARN, "rib_addreplylist: no memory\n");
3633         return (NULL);
3634     }
3635     rep->xid = msgid;
3636     rep->vaddr_cq = NULL;
3637     rep->bytes_xfer = 0;
3638     rep->status = (uint_t)REPLY_WAIT;
3639     rep->prev = NULL;
3640     cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3641
3642     mutex_enter(&qp->replylist_lock);
3643     if (qp->replylist) {
3644         rep->next = qp->replylist;
3645         qp->replylist->prev = rep;
3646     }
3647     qp->rep_list_size++;
3648     if (rib_debug > 1)
3649         cmn_err(CE_NOTE, "rib_addreplylist: qp:%p, rep_list_size:%d\n",
3650             (void *)qp, qp->rep_list_size);
3651     qp->replylist = rep;
3652     mutex_exit(&qp->replylist_lock);
3653
3654     return (rep);
3655 }
3656
3657 static rdma_stat
3658 rib_rem_replylist(rib_qp_t *qp)
3659 {
3660     struct reply *r, *n;
3661
3662     mutex_enter(&qp->replylist_lock);
3663     for (r = qp->replylist; r != NULL; r = n) {
3664         n = r->next;
3665         (void) rib_remreply(qp, r);
3666     }
3667     mutex_exit(&qp->replylist_lock);
3668
3669     return (RDMA_SUCCESS);
3670 }
3671
3672 static int
3673 rib_remreply(rib_qp_t *qp, struct reply *rep)
3674 {
3675
3676     ASSERT(MUTEX_HELD(&qp->replylist_lock));
3677     if (rep->prev) {
3678         rep->prev->next = rep->next;
3679     }
3680     if (rep->next) {
3681         rep->next->prev = rep->prev;
3682 } 3683 if (qp->replylist == rep) 3684 qp->replylist = rep->next; 3685 3686 cv_destroy(&rep->wait_cv); 3687 qp->rep_list_size--; 3688 if (rib_debug > 1) 3689 cmn_err(CE_NOTE, "rib_remreply: qp:%p, rep_list_size:%d\n", 3690 (void *)qp, qp->rep_list_size); 3691 3692 kmem_free(rep, sizeof (*rep)); 3693 3694 return (0); 3695 } 3696 3697 rdma_stat 3698 rib_registermem(CONN *conn, caddr_t buf, uint_t buflen, 3699 struct mrc *buf_handle) 3700 { 3701 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3702 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3703 rdma_stat status; 3704 rib_hca_t *hca = (ctoqp(conn))->hca; 3705 3706 /* 3707 * Note: ALL buffer pools use the same memory type RDMARW. 3708 */ 3709 status = rib_reg_mem(hca, buf, buflen, 0, &mr_hdl, &mr_desc); 3710 if (status == RDMA_SUCCESS) { 3711 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3712 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3713 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3714 } else { 3715 buf_handle->mrc_linfo = NULL; 3716 buf_handle->mrc_lmr = 0; 3717 buf_handle->mrc_rmr = 0; 3718 } 3719 return (status); 3720 } 3721 3722 static rdma_stat 3723 rib_reg_mem(rib_hca_t *hca, caddr_t buf, uint_t size, ibt_mr_flags_t spec, 3724 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3725 { 3726 ibt_mr_attr_t mem_attr; 3727 ibt_status_t ibt_status; 3728 3729 mem_attr.mr_vaddr = (uintptr_t)buf; 3730 mem_attr.mr_len = (ib_msglen_t)size; 3731 mem_attr.mr_as = NULL; 3732 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3733 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3734 IBT_MR_ENABLE_WINDOW_BIND | spec; 3735 3736 rw_enter(&hca->state_lock, RW_READER); 3737 if (hca->state == HCA_INITED) { 3738 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3739 &mem_attr, mr_hdlp, mr_descp); 3740 rw_exit(&hca->state_lock); 3741 } else { 3742 rw_exit(&hca->state_lock); 3743 return (RDMA_FAILED); 3744 } 3745 3746 if (ibt_status != IBT_SUCCESS) { 3747 cmn_err(CE_WARN, "rib_reg_mem: ibt_register_mr " 3748 "(spec:%d) failed for addr %llX, status %d", 3749 spec, (longlong_t)mem_attr.mr_vaddr, ibt_status); 3750 return (RDMA_FAILED); 3751 } 3752 return (RDMA_SUCCESS); 3753 } 3754 3755 rdma_stat 3756 rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen, 3757 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle) 3758 { 3759 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3760 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3761 rdma_stat status; 3762 rib_hca_t *hca = (ctoqp(conn))->hca; 3763 3764 /* 3765 * Non-coherent memory registration. 3766 */ 3767 status = rib_reg_mem(hca, buf, buflen, IBT_MR_NONCOHERENT, &mr_hdl, 3768 &mr_desc); 3769 if (status == RDMA_SUCCESS) { 3770 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3771 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3772 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3773 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3774 } else { 3775 buf_handle->mrc_linfo = NULL; 3776 buf_handle->mrc_lmr = 0; 3777 buf_handle->mrc_rmr = 0; 3778 } 3779 return (status); 3780 } 3781 3782 /* ARGSUSED */ 3783 rdma_stat 3784 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3785 { 3786 rib_hca_t *hca = (ctoqp(conn))->hca; 3787 3788 /* 3789 * Allow memory deregistration even if HCA is 3790 * getting detached. Need all outstanding 3791 * memory registrations to be deregistered 3792 * before HCA_DETACH_EVENT can be accepted. 
3793 */ 3794 (void) ibt_deregister_mr(hca->hca_hdl, 3795 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 3796 return (RDMA_SUCCESS); 3797 } 3798 3799 /* ARGSUSED */ 3800 rdma_stat 3801 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3802 RIB_SYNCMEM_HANDLE sync_handle) 3803 { 3804 (void) rib_deregistermem(conn, buf, buf_handle); 3805 3806 return (RDMA_SUCCESS); 3807 } 3808 3809 /* ARGSUSED */ 3810 rdma_stat 3811 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3812 int len, int cpu) 3813 { 3814 ibt_status_t status; 3815 rib_hca_t *hca = (ctoqp(conn))->hca; 3816 ibt_mr_sync_t mr_segment; 3817 3818 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3819 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 3820 mr_segment.ms_len = (ib_memlen_t)len; 3821 if (cpu) { 3822 /* make incoming data visible to memory */ 3823 mr_segment.ms_flags = IBT_SYNC_WRITE; 3824 } else { 3825 /* make memory changes visible to IO */ 3826 mr_segment.ms_flags = IBT_SYNC_READ; 3827 } 3828 rw_enter(&hca->state_lock, RW_READER); 3829 if (hca->state == HCA_INITED) { 3830 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3831 rw_exit(&hca->state_lock); 3832 } else { 3833 rw_exit(&hca->state_lock); 3834 return (RDMA_FAILED); 3835 } 3836 3837 if (status == IBT_SUCCESS) 3838 return (RDMA_SUCCESS); 3839 else { 3840 #ifdef DEBUG 3841 cmn_err(CE_WARN, "rib_syncmem: ibt_sync_mr failed with %d\n", 3842 status); 3843 #endif 3844 return (RDMA_FAILED); 3845 } 3846 } 3847 3848 /* 3849 * XXXX ???? 3850 */ 3851 static rdma_stat 3852 rib_getinfo(rdma_info_t *info) 3853 { 3854 /* 3855 * XXXX Hack! 3856 */ 3857 info->addrlen = 16; 3858 info->mts = 1000000; 3859 info->mtu = 1000000; 3860 3861 return (RDMA_SUCCESS); 3862 } 3863 3864 rib_bufpool_t * 3865 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3866 { 3867 rib_bufpool_t *rbp = NULL; 3868 bufpool_t *bp = NULL; 3869 caddr_t buf; 3870 ibt_mr_attr_t mem_attr; 3871 ibt_status_t ibt_status; 3872 int i, j; 3873 3874 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3875 3876 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3877 num * sizeof (void *), KM_SLEEP); 3878 3879 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3880 bp->numelems = num; 3881 3882 switch (ptype) { 3883 case SEND_BUFFER: 3884 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3885 /* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */ 3886 bp->rsize = RPC_MSG_SZ; 3887 break; 3888 case RECV_BUFFER: 3889 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3890 /* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */ 3891 bp->rsize = RPC_BUF_SIZE; 3892 break; 3893 default: 3894 goto fail; 3895 } 3896 3897 /* 3898 * Register the pool. 
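 * Each rsize-byte slot is registered with ibt_register_mr()
 * individually, so every buffer carries its own lkey/rkey in
 * mr_desc[]; rib_rbuf_alloc() later matches a buffer to its keys
 * by virtual address.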
3899 */ 3900 bp->bufsize = num * bp->rsize; 3901 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3902 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3903 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3904 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3905 sizeof (ibt_mr_desc_t), KM_SLEEP); 3906 3907 rw_enter(&hca->state_lock, RW_READER); 3908 if (hca->state != HCA_INITED) { 3909 rw_exit(&hca->state_lock); 3910 goto fail; 3911 } 3912 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3913 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3914 mem_attr.mr_vaddr = (uintptr_t)buf; 3915 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3916 mem_attr.mr_as = NULL; 3917 ibt_status = ibt_register_mr(hca->hca_hdl, 3918 hca->pd_hdl, &mem_attr, &rbp->mr_hdl[i], 3919 &rbp->mr_desc[i]); 3920 if (ibt_status != IBT_SUCCESS) { 3921 for (j = 0; j < i; j++) { 3922 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[j]); 3923 } 3924 rw_exit(&hca->state_lock); 3925 goto fail; 3926 } 3927 } 3928 rw_exit(&hca->state_lock); 3929 3930 buf = (caddr_t)bp->buf; 3931 for (i = 0; i < num; i++, buf += bp->rsize) { 3932 bp->buflist[i] = (void *)buf; 3933 } 3934 bp->buffree = num - 1; /* no. of free buffers */ 3935 rbp->bpool = bp; 3936 3937 return (rbp); 3938 fail: 3939 if (bp) { 3940 if (bp->buf) 3941 kmem_free(bp->buf, bp->bufsize); 3942 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3943 } 3944 if (rbp) { 3945 if (rbp->mr_hdl) 3946 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3947 if (rbp->mr_desc) 3948 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3949 kmem_free(rbp, sizeof (rib_bufpool_t)); 3950 } 3951 return (NULL); 3952 } 3953 3954 static void 3955 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3956 { 3957 int i; 3958 rib_bufpool_t *rbp = NULL; 3959 bufpool_t *bp; 3960 3961 /* 3962 * Obtain pool address based on type of pool 3963 */ 3964 switch (ptype) { 3965 case SEND_BUFFER: 3966 rbp = hca->send_pool; 3967 break; 3968 case RECV_BUFFER: 3969 rbp = hca->recv_pool; 3970 break; 3971 default: 3972 return; 3973 } 3974 if (rbp == NULL) 3975 return; 3976 3977 bp = rbp->bpool; 3978 3979 /* 3980 * Deregister the pool memory and free it. 3981 */ 3982 for (i = 0; i < bp->numelems; i++) { 3983 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3984 } 3985 } 3986 3987 static void 3988 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3989 { 3990 3991 rib_bufpool_t *rbp = NULL; 3992 bufpool_t *bp; 3993 3994 /* 3995 * Obtain pool address based on type of pool 3996 */ 3997 switch (ptype) { 3998 case SEND_BUFFER: 3999 rbp = hca->send_pool; 4000 break; 4001 case RECV_BUFFER: 4002 rbp = hca->recv_pool; 4003 break; 4004 default: 4005 return; 4006 } 4007 if (rbp == NULL) 4008 return; 4009 4010 bp = rbp->bpool; 4011 4012 /* 4013 * Free the pool memory. 4014 */ 4015 if (rbp->mr_hdl) 4016 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 4017 4018 if (rbp->mr_desc) 4019 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 4020 4021 if (bp->buf) 4022 kmem_free(bp->buf, bp->bufsize); 4023 mutex_destroy(&bp->buflock); 4024 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 4025 kmem_free(rbp, sizeof (rib_bufpool_t)); 4026 } 4027 4028 void 4029 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 4030 { 4031 /* 4032 * Deregister the pool memory and free it. 4033 */ 4034 rib_rbufpool_deregister(hca, ptype); 4035 rib_rbufpool_free(hca, ptype); 4036 } 4037 4038 /* 4039 * Fetch a buffer from the pool of type specified in rdbuf->type. 
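 */

/*
 * A short, hypothetical round trip through the registered buffer
 * pool (example_buf_round_trip is not part of this file).  The pool
 * hands back a pre-registered chunk in rdbuf->addr/len together with
 * its lkey/rkey in rdbuf->handle, so no per-I/O ibt_register_mr()
 * call is needed on the fast path.
 */
static rdma_stat
example_buf_round_trip(CONN *conn)
{
    rdma_buf_t rdbuf;

    bzero(&rdbuf, sizeof (rdbuf));
    rdbuf.type = SEND_BUFFER;
    if (rib_reg_buf_alloc(conn, &rdbuf) != RDMA_SUCCESS)
        return (RDMA_NORESOURCE);

    /* ... fill rdbuf.addr with up to rdbuf.len (RPC_MSG_SZ) bytes ... */

    rib_reg_buf_free(conn, &rdbuf);    /* return the chunk to the pool */
    return (RDMA_SUCCESS);
}

/*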
4040 */ 4041 static rdma_stat 4042 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4043 { 4044 4045 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 4046 if (rdbuf->addr) { 4047 switch (rdbuf->type) { 4048 case SEND_BUFFER: 4049 rdbuf->len = RPC_MSG_SZ; /* 1K */ 4050 break; 4051 case RECV_BUFFER: 4052 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 4053 break; 4054 default: 4055 rdbuf->len = 0; 4056 } 4057 return (RDMA_SUCCESS); 4058 } else 4059 return (RDMA_FAILED); 4060 } 4061 4062 4063 /* 4064 * Fetch a buffer of specified type. 4065 * Note that rdbuf->handle is mw's rkey. 4066 */ 4067 static void * 4068 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4069 { 4070 rib_qp_t *qp = ctoqp(conn); 4071 rib_hca_t *hca = qp->hca; 4072 rdma_btype ptype = rdbuf->type; 4073 void *buf; 4074 rib_bufpool_t *rbp = NULL; 4075 bufpool_t *bp; 4076 int i; 4077 4078 /* 4079 * Obtain pool address based on type of pool 4080 */ 4081 switch (ptype) { 4082 case SEND_BUFFER: 4083 rbp = hca->send_pool; 4084 break; 4085 case RECV_BUFFER: 4086 rbp = hca->recv_pool; 4087 break; 4088 default: 4089 return (NULL); 4090 } 4091 if (rbp == NULL) 4092 return (NULL); 4093 4094 bp = rbp->bpool; 4095 4096 mutex_enter(&bp->buflock); 4097 if (bp->buffree < 0) { 4098 cmn_err(CE_WARN, "rib_rbuf_alloc: No free buffers!"); 4099 mutex_exit(&bp->buflock); 4100 return (NULL); 4101 } 4102 4103 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */ 4104 buf = bp->buflist[bp->buffree]; 4105 rdbuf->addr = buf; 4106 rdbuf->len = bp->rsize; 4107 for (i = bp->numelems - 1; i >= 0; i--) { 4108 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 4109 rdbuf->handle.mrc_rmr = (uint32_t)rbp->mr_desc[i].md_rkey; 4110 rdbuf->handle.mrc_linfo = (uintptr_t)rbp->mr_hdl[i]; 4111 rdbuf->handle.mrc_lmr = (uint32_t)rbp->mr_desc[i].md_lkey; 4112 bp->buffree--; 4113 if (rib_debug > 1) 4114 cmn_err(CE_NOTE, "rib_rbuf_alloc: %d free bufs " 4115 "(type %d)\n", bp->buffree+1, ptype); 4116 4117 mutex_exit(&bp->buflock); 4118 4119 return (buf); 4120 } 4121 } 4122 cmn_err(CE_WARN, "rib_rbuf_alloc: NO matching buf %p of " 4123 "type %d found!", buf, ptype); 4124 mutex_exit(&bp->buflock); 4125 4126 return (NULL); 4127 } 4128 4129 static void 4130 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 4131 { 4132 4133 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 4134 } 4135 4136 static void 4137 rib_rbuf_free(CONN *conn, int ptype, void *buf) 4138 { 4139 rib_qp_t *qp = ctoqp(conn); 4140 rib_hca_t *hca = qp->hca; 4141 rib_bufpool_t *rbp = NULL; 4142 bufpool_t *bp; 4143 4144 /* 4145 * Obtain pool address based on type of pool 4146 */ 4147 switch (ptype) { 4148 case SEND_BUFFER: 4149 rbp = hca->send_pool; 4150 break; 4151 case RECV_BUFFER: 4152 rbp = hca->recv_pool; 4153 break; 4154 default: 4155 return; 4156 } 4157 if (rbp == NULL) 4158 return; 4159 4160 bp = rbp->bpool; 4161 4162 mutex_enter(&bp->buflock); 4163 if (++bp->buffree >= bp->numelems) { 4164 /* 4165 * Should never happen 4166 */ 4167 cmn_err(CE_WARN, "rib_rbuf_free: One (type %d) " 4168 "too many frees!", ptype); 4169 bp->buffree--; 4170 } else { 4171 bp->buflist[bp->buffree] = buf; 4172 if (rib_debug > 1) 4173 cmn_err(CE_NOTE, "rib_rbuf_free: %d free bufs " 4174 "(type %d)\n", bp->buffree+1, ptype); 4175 } 4176 mutex_exit(&bp->buflock); 4177 } 4178 4179 static rdma_stat 4180 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 4181 { 4182 rw_enter(&connlist->conn_lock, RW_WRITER); 4183 if (connlist->conn_hd) { 4184 cn->c_next = connlist->conn_hd; 4185 connlist->conn_hd->c_prev = cn; 4186 } 4187 connlist->conn_hd = 
static rdma_stat
rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
{
    rw_enter(&connlist->conn_lock, RW_WRITER);
    if (connlist->conn_hd) {
        cn->c_next = connlist->conn_hd;
        connlist->conn_hd->c_prev = cn;
    }
    connlist->conn_hd = cn;
    rw_exit(&connlist->conn_lock);

    return (RDMA_SUCCESS);
}

static rdma_stat
rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
{
    rw_enter(&connlist->conn_lock, RW_WRITER);
    if (cn->c_prev) {
        cn->c_prev->c_next = cn->c_next;
    }
    if (cn->c_next) {
        cn->c_next->c_prev = cn->c_prev;
    }
    if (connlist->conn_hd == cn)
        connlist->conn_hd = cn->c_next;
    rw_exit(&connlist->conn_lock);

    return (RDMA_SUCCESS);
}

/*
 * Connection management.
 * IBTF does not support recycling of channels, so a connection is only
 * ever in one of four states - C_CONN_PEND, C_CONNECTED, C_ERROR, or
 * C_DISCONN_PEND; there is no C_IDLE state.
 * C_CONN_PEND state: Connection establishment to the server is in progress.
 * C_CONNECTED state: A connection is in the C_CONNECTED state when created.
 * It has an RC channel associated with it. ibt_post_send/recv are allowed
 * only in this state.
 * C_ERROR state: A connection transitions to this state when WRs on the
 * channel are completed in error, an IBT_CM_EVENT_CONN_CLOSED event
 * happens on the channel, or an IBT_HCA_DETACH_EVENT occurs on the HCA.
 * C_DISCONN_PEND state: When a connection is in the C_ERROR state and
 * c_ref drops to 0 (indicating that RPC holds no more references to this
 * connection), the connection should be destroyed. A connection transitions
 * into this state when it is being destroyed.
 */
static rdma_stat
rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn)
{
    CONN *cn;
    int status = RDMA_SUCCESS;
    rib_hca_t *hca = (rib_hca_t *)handle;
    rib_qp_t *qp;
    clock_t cv_stat, timeout;
    ibt_path_info_t path;

again:
    rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
    cn = hca->cl_conn_list.conn_hd;
    while (cn != NULL) {
        /*
         * First, clear up any connection in the ERROR state
         */
        mutex_enter(&cn->c_lock);
        if (cn->c_state == C_ERROR) {
            if (cn->c_ref == 0) {
                /*
                 * Remove connection from list and destroy it.
                 */
                cn->c_state = C_DISCONN_PEND;
                mutex_exit(&cn->c_lock);
                rw_exit(&hca->cl_conn_list.conn_lock);
                (void) rib_disconnect_channel(cn,
                    &hca->cl_conn_list);
                goto again;
            }
            mutex_exit(&cn->c_lock);
            cn = cn->c_next;
            continue;
        } else if (cn->c_state == C_DISCONN_PEND) {
            mutex_exit(&cn->c_lock);
            cn = cn->c_next;
            continue;
        }
        if ((cn->c_raddr.len == svcaddr->len) &&
            bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) {
            /*
             * Our connection. Give up the conn list lock
             * as we are done traversing the list.
             */
            rw_exit(&hca->cl_conn_list.conn_lock);
            if (cn->c_state == C_CONNECTED) {
                cn->c_ref++;    /* sharing a conn */
                mutex_exit(&cn->c_lock);
                *conn = cn;
                return (status);
            }
            if (cn->c_state == C_CONN_PEND) {
                /*
                 * Hold a reference to this conn before
                 * we give up the lock.
                 */
                cn->c_ref++;
                timeout = ddi_get_lbolt() +
                    drv_usectohz(CONN_WAIT_TIME * 1000000);
                while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
                    &cn->c_lock, timeout)) > 0 &&
                    cn->c_state == C_CONN_PEND)
                    ;
                if (cv_stat == 0) {
                    cn->c_ref--;
                    mutex_exit(&cn->c_lock);
                    return (RDMA_INTR);
                }
                if (cv_stat < 0) {
                    cn->c_ref--;
                    mutex_exit(&cn->c_lock);
                    return (RDMA_TIMEDOUT);
                }
                if (cn->c_state == C_CONNECTED) {
                    *conn = cn;
                    mutex_exit(&cn->c_lock);
                    return (status);
                } else {
                    cn->c_ref--;
                    mutex_exit(&cn->c_lock);
                    return (RDMA_TIMEDOUT);
                }
            }
        }
        mutex_exit(&cn->c_lock);
        cn = cn->c_next;
    }
    rw_exit(&hca->cl_conn_list.conn_lock);

    status = rib_chk_srv_ats(hca, svcaddr, addr_type, &path);
    if (status != RDMA_SUCCESS) {
#ifdef DEBUG
        if (rib_debug) {
            cmn_err(CE_WARN, "rib_conn_get: "
                "No server ATS record!");
        }
#endif
        return (RDMA_FAILED);
    }

    /*
     * Channel to server doesn't exist yet, create one.
     */
    if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) {
        return (RDMA_FAILED);
    }
    cn = qptoc(qp);
    cn->c_state = C_CONN_PEND;
    cn->c_ref = 1;

    /*
     * Add to the conn list.
     * We had given up the READER lock. In the time since then,
     * another thread might have created the connection we are
     * trying for here. For now, that is quite all right - there
     * might be two connections between a pair of hosts instead
     * of one. If we really want to close that window, we need
     * to check the list again after acquiring the WRITER lock.
     */
    (void) rib_add_connlist(cn, &hca->cl_conn_list);
    status = rib_conn_to_srv(hca, qp, &path);
    mutex_enter(&cn->c_lock);
    if (status == RDMA_SUCCESS) {
        cn->c_state = C_CONNECTED;
        *conn = cn;
    } else {
        cn->c_state = C_ERROR;
        cn->c_ref--;
#ifdef DEBUG
        if (rib_debug) {
            cmn_err(CE_WARN, "rib_conn_get: FAILED creating"
                " a channel!");
        }
#endif
    }
    cv_broadcast(&cn->c_cv);
    mutex_exit(&cn->c_lock);
    return (status);
}
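/*
 * Illustrative sketch only (not part of the original driver): how an
 * RDMATF consumer pairs rib_conn_get() with rib_conn_release() below.
 * Every successful rib_conn_get() takes a hold (c_ref) on the shared
 * connection, which must be dropped once the RPC exchange completes.
 * The saddr and hca variables stand in for hypothetical caller state.
 */
#ifdef notdef
    CONN *conn;

    if (rib_conn_get(saddr, AF_INET, (void *)hca, &conn) ==
        RDMA_SUCCESS) {
        /* ... rib_send()/rib_recv() on conn while the hold is live ... */
        (void) rib_conn_release(conn);  /* drops c_ref */
    }
#endif /* notdef */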
static rdma_stat
rib_conn_release(CONN *conn)
{
    rib_qp_t *qp = ctoqp(conn);

    mutex_enter(&conn->c_lock);
    conn->c_ref--;

    /*
     * If a conn is C_ERROR, close the channel.
     * If it's CONNECTED, keep it that way.
     */
    if (conn->c_ref == 0 && (conn->c_state & C_ERROR)) {
        conn->c_state = C_DISCONN_PEND;
        mutex_exit(&conn->c_lock);
        if (qp->mode == RIB_SERVER)
            (void) rib_disconnect_channel(conn,
                &qp->hca->srv_conn_list);
        else
            (void) rib_disconnect_channel(conn,
                &qp->hca->cl_conn_list);
        return (RDMA_SUCCESS);
    }
    mutex_exit(&conn->c_lock);
    return (RDMA_SUCCESS);
}

/*
 * Add at front of list
 */
static struct rdma_done_list *
rdma_done_add(rib_qp_t *qp, uint32_t xid)
{
    struct rdma_done_list *rd;

    ASSERT(MUTEX_HELD(&qp->rdlist_lock));

    rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
    rd->xid = xid;
    cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);

    rd->prev = NULL;
    rd->next = qp->rdlist;
    if (qp->rdlist != NULL)
        qp->rdlist->prev = rd;
    qp->rdlist = rd;

    return (rd);
}

static void
rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
{
    struct rdma_done_list *r;

    ASSERT(MUTEX_HELD(&qp->rdlist_lock));

    r = rd->next;
    if (r != NULL) {
        r->prev = rd->prev;
    }

    r = rd->prev;
    if (r != NULL) {
        r->next = rd->next;
    } else {
        qp->rdlist = rd->next;
    }

    cv_destroy(&rd->rdma_done_cv);
    kmem_free(rd, sizeof (*rd));
}

static void
rdma_done_rem_list(rib_qp_t *qp)
{
    struct rdma_done_list *r, *n;

    mutex_enter(&qp->rdlist_lock);
    for (r = qp->rdlist; r != NULL; r = n) {
        n = r->next;
        rdma_done_rm(qp, r);
    }
    mutex_exit(&qp->rdlist_lock);
}

static void
rdma_done_notify(rib_qp_t *qp, uint32_t xid)
{
    struct rdma_done_list *r = qp->rdlist;

    ASSERT(MUTEX_HELD(&qp->rdlist_lock));

    while (r) {
        if (r->xid == xid) {
            cv_signal(&r->rdma_done_cv);
            return;
        } else {
            r = r->next;
        }
    }
    if (rib_debug > 1) {
        cmn_err(CE_WARN, "rdma_done_notify: "
            "No matching xid for %u, qp %p\n", xid, (void *)qp);
    }
}
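/*
 * Illustrative sketch only (not part of the original driver): the
 * waiting side of the rdma_done list above. A thread expecting an
 * RDMA_DONE for a given xid registers itself, sleeps on rdma_done_cv
 * until rdma_done_notify() signals a matching xid, then removes its
 * entry; all of this runs under qp->rdlist_lock. The qp/xid values
 * are hypothetical caller state, and a real caller would likely use
 * cv_timedwait() and recheck its wakeup condition.
 */
#ifdef notdef
    struct rdma_done_list *rd;

    mutex_enter(&qp->rdlist_lock);
    rd = rdma_done_add(qp, xid);
    cv_wait(&rd->rdma_done_cv, &qp->rdlist_lock);
    rdma_done_rm(qp, rd);
    mutex_exit(&qp->rdlist_lock);
#endif /* notdef */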
"port-number", 0)) == 0) || 4521 (ibt_get_port_state_byguid(hca_guid, port, 4522 &port_gid, NULL) != IBT_SUCCESS) || 4523 ((pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 4524 "port-pkey", IB_PKEY_INVALID_LIMITED)) <= 4525 IB_PKEY_INVALID_FULL)) { 4526 return (DDI_WALK_CONTINUE); 4527 } 4528 atsp = &ibds->rib_ats[ibds->rib_ibd_cnt]; 4529 atsp->ras_inst = ddi_get_instance(dip); 4530 atsp->ras_pkey = pkey; 4531 atsp->ras_port_gid = port_gid; 4532 ibds->rib_ibd_cnt++; 4533 } 4534 return (DDI_WALK_CONTINUE); 4535 } 4536 4537 void 4538 rib_get_ibd_insts(rpcib_ibd_insts_t *ibds) 4539 { 4540 ddi_walk_devs(ddi_root_node(), rib_get_ibd_insts_cb, ibds); 4541 } 4542 4543 /* 4544 * Return ibd interfaces and ibd instances. 4545 */ 4546 int 4547 get_ibd_ipaddr(rpcib_ibd_insts_t *ibds) 4548 { 4549 TIUSER *tiptr, *tiptr6; 4550 vnode_t *kvp, *kvp6; 4551 vnode_t *vp = NULL, *vp6 = NULL; 4552 struct strioctl iocb; 4553 struct lifreq lif_req; 4554 int k, ip_cnt; 4555 rpcib_ats_t *atsp; 4556 4557 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, 4558 &kvp) == 0) { 4559 if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE, 4560 &tiptr, CRED()) == 0) { 4561 vp = tiptr->fp->f_vnode; 4562 } else { 4563 VN_RELE(kvp); 4564 } 4565 } 4566 4567 if (lookupname("/dev/udp6", UIO_SYSSPACE, FOLLOW, NULLVPP, 4568 &kvp6) == 0) { 4569 if (t_kopen((file_t *)NULL, kvp6->v_rdev, FREAD|FWRITE, 4570 &tiptr6, CRED()) == 0) { 4571 vp6 = tiptr6->fp->f_vnode; 4572 } else { 4573 VN_RELE(kvp6); 4574 } 4575 } 4576 4577 if (vp == NULL && vp6 == NULL) 4578 return (-1); 4579 4580 /* Get ibd ip's */ 4581 ip_cnt = 0; 4582 for (k = 0, atsp = ibds->rib_ats; k < ibds->rib_ibd_cnt; k++, atsp++) { 4583 /* IPv4 */ 4584 if (vp != NULL) { 4585 (void) bzero((void *)&lif_req, sizeof (struct lifreq)); 4586 (void) snprintf(lif_req.lifr_name, 4587 sizeof (lif_req.lifr_name), "%s%d", 4588 IBD_NAME, atsp->ras_inst); 4589 4590 (void) bzero((void *)&iocb, sizeof (struct strioctl)); 4591 iocb.ic_cmd = SIOCGLIFADDR; 4592 iocb.ic_timout = 0; 4593 iocb.ic_len = sizeof (struct lifreq); 4594 iocb.ic_dp = (caddr_t)&lif_req; 4595 if (kstr_ioctl(vp, I_STR, (intptr_t)&iocb) == 0) { 4596 atsp->ras_inet_type = AF_INET; 4597 bcopy(&lif_req.lifr_addr, &atsp->ras_sin, 4598 sizeof (struct sockaddr_in)); 4599 ip_cnt++; 4600 continue; 4601 } 4602 } 4603 /* Try IPv6 */ 4604 if (vp6 != NULL) { 4605 (void) bzero((void *)&lif_req, sizeof (struct lifreq)); 4606 (void) snprintf(lif_req.lifr_name, 4607 sizeof (lif_req.lifr_name), "%s%d", 4608 IBD_NAME, atsp->ras_inst); 4609 4610 (void) bzero((void *)&iocb, sizeof (struct strioctl)); 4611 iocb.ic_cmd = SIOCGLIFADDR; 4612 iocb.ic_timout = 0; 4613 iocb.ic_len = sizeof (struct lifreq); 4614 iocb.ic_dp = (caddr_t)&lif_req; 4615 if (kstr_ioctl(vp6, I_STR, (intptr_t)&iocb) == 0) { 4616 4617 atsp->ras_inet_type = AF_INET6; 4618 bcopy(&lif_req.lifr_addr, &atsp->ras_sin6, 4619 sizeof (struct sockaddr_in6)); 4620 ip_cnt++; 4621 } 4622 } 4623 } 4624 4625 if (vp6 != NULL) { 4626 (void) t_kclose(tiptr6, 0); 4627 VN_RELE(kvp6); 4628 } 4629 if (vp != NULL) { 4630 (void) t_kclose(tiptr, 0); 4631 VN_RELE(kvp); 4632 } 4633 4634 if (ip_cnt == 0) 4635 return (-1); 4636 else 4637 return (0); 4638 } 4639 4640 char ** 4641 get_ip_addrs(int *count) 4642 { 4643 TIUSER *tiptr; 4644 vnode_t *kvp; 4645 int num_of_ifs; 4646 char **addresses; 4647 int return_code; 4648 4649 /* 4650 * Open a device for doing down stream kernel ioctls 4651 */ 4652 return_code = lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, 4653 NULLVPP, &kvp); 4654 if (return_code != 0) { 
char **
get_ip_addrs(int *count)
{
    TIUSER *tiptr;
    vnode_t *kvp;
    int num_of_ifs;
    char **addresses;
    int return_code;

    /*
     * Open a device for doing downstream kernel ioctls
     */
    return_code = lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW,
        NULLVPP, &kvp);
    if (return_code != 0) {
        cmn_err(CE_NOTE, "get_ip_addrs: lookupname failed\n");
        *count = -1;
        return (NULL);
    }

    return_code = t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
        &tiptr, CRED());
    if (return_code != 0) {
        cmn_err(CE_NOTE, "get_ip_addrs: t_kopen failed\n");
        VN_RELE(kvp);
        *count = -1;
        return (NULL);
    }

    /*
     * Perform the first ioctl to get the number of interfaces
     */
    return_code = get_interfaces(tiptr, &num_of_ifs);
    if (return_code != 0 || num_of_ifs == 0) {
        cmn_err(CE_NOTE, "get_ip_addrs: get_interfaces failed\n");
        (void) t_kclose(tiptr, 0);
        VN_RELE(kvp);
        *count = -1;
        return (NULL);
    }

    /*
     * Perform the second ioctl to get the address on each
     * interface found.
     */
    addresses = kmem_zalloc(num_of_ifs * sizeof (char *), KM_SLEEP);
    return_code = find_addrs(tiptr, addresses, num_of_ifs);
    if (return_code <= 0) {
        cmn_err(CE_NOTE, "get_ip_addrs: find_addrs failed\n");
        (void) t_kclose(tiptr, 0);
        kmem_free(addresses, num_of_ifs * sizeof (char *));
        VN_RELE(kvp);
        *count = -1;
        return (NULL);
    }

    *count = return_code;
    VN_RELE(kvp);
    (void) t_kclose(tiptr, 0);
    return (addresses);
}

int
get_interfaces(TIUSER *tiptr, int *num)
{
    struct lifnum if_buf;
    struct strioctl iocb;
    vnode_t *vp;
    int return_code;

    /*
     * Prep the number-of-interfaces request buffer for the ioctl
     */
    (void) bzero((void *)&if_buf, sizeof (struct lifnum));
    if_buf.lifn_family = AF_UNSPEC;
    if_buf.lifn_flags = 0;

    /*
     * Prep the kernel ioctl buffer and send it downstream
     */
    (void) bzero((void *)&iocb, sizeof (struct strioctl));
    iocb.ic_cmd = SIOCGLIFNUM;
    iocb.ic_timout = 0;
    iocb.ic_len = sizeof (if_buf);
    iocb.ic_dp = (caddr_t)&if_buf;

    vp = tiptr->fp->f_vnode;
    return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
    if (return_code != 0) {
        cmn_err(CE_NOTE, "get_interfaces: kstr_ioctl failed\n");
        *num = -1;
        return (-1);
    }

    *num = if_buf.lifn_count;
#ifdef DEBUG
    if (rib_debug > 1)
        cmn_err(CE_NOTE, "Number of interfaces detected: %d\n",
            if_buf.lifn_count);
#endif
    return (0);
}
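/*
 * Illustrative sketch only (not part of the original driver): a
 * caller of get_ip_addrs() above owns the returned array and each
 * IB_SVC_NAME_LEN-sized string in it. The cleanup below assumes
 * every interface reported one address, so that count matches the
 * number of array slots allocated.
 */
#ifdef notdef
    char **addrs;
    int count, i;

    addrs = get_ip_addrs(&count);
    if (addrs != NULL) {
        for (i = 0; i < count; i++) {
            /* ... use addrs[i], e.g. as an ATS service name ... */
            kmem_free(addrs[i], IB_SVC_NAME_LEN);
        }
        kmem_free(addrs, count * sizeof (char *));
    }
#endif /* notdef */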
int
find_addrs(TIUSER *tiptr, char **addrs, int num_ifs)
{
    struct lifconf lifc;
    struct lifreq *if_data_buf;
    struct strioctl iocb;
    caddr_t request_buffer;
    struct sockaddr_in *sin4;
    struct sockaddr_in6 *sin6;
    vnode_t *vp;
    int i, count, return_code;

    /*
     * Prep the buffer for requesting all interfaces' info
     */
    (void) bzero((void *)&lifc, sizeof (struct lifconf));
    lifc.lifc_family = AF_UNSPEC;
    lifc.lifc_flags = 0;
    lifc.lifc_len = num_ifs * sizeof (struct lifreq);

    request_buffer = kmem_zalloc(num_ifs * sizeof (struct lifreq),
        KM_SLEEP);

    lifc.lifc_buf = request_buffer;

    /*
     * Prep the kernel ioctl buffer and send it downstream
     */
    (void) bzero((void *)&iocb, sizeof (struct strioctl));
    iocb.ic_cmd = SIOCGLIFCONF;
    iocb.ic_timout = 0;
    iocb.ic_len = sizeof (struct lifconf);
    iocb.ic_dp = (caddr_t)&lifc;

    vp = tiptr->fp->f_vnode;
    return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
    if (return_code != 0) {
        cmn_err(CE_NOTE, "find_addrs: kstr_ioctl failed\n");
        kmem_free(request_buffer, num_ifs * sizeof (struct lifreq));
        return (-1);
    }

    /*
     * Extract the addresses and fill them into the requested array.
     * IB_SVC_NAME_LEN is defined to be 64, so it covers both IPv4
     * and IPv6. Here count is the number of IP addresses collected.
     */
    if_data_buf = lifc.lifc_req;
    count = 0;
    for (i = lifc.lifc_len / sizeof (struct lifreq); i > 0; i--,
        if_data_buf++) {
        if (if_data_buf->lifr_addr.ss_family == AF_INET) {
            sin4 = (struct sockaddr_in *)&if_data_buf->lifr_addr;
            addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
            (void) inet_ntop(AF_INET, &sin4->sin_addr,
                addrs[count], IB_SVC_NAME_LEN);
            count++;
        }

        if (if_data_buf->lifr_addr.ss_family == AF_INET6) {
            sin6 = (struct sockaddr_in6 *)&if_data_buf->lifr_addr;
            addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
            (void) inet_ntop(AF_INET6, &sin6->sin6_addr,
                addrs[count], IB_SVC_NAME_LEN);
            count++;
        }
    }

    kmem_free(request_buffer, num_ifs * sizeof (struct lifreq));
    return (count);
}

/*
 * Goes through all connections and closes the channel.
 * This will cause all the WRs on those channels to be
 * flushed.
 */
static void
rib_close_channels(rib_conn_list_t *connlist)
{
    CONN *conn;
    rib_qp_t *qp;

    rw_enter(&connlist->conn_lock, RW_READER);
    conn = connlist->conn_hd;
    while (conn != NULL) {
        mutex_enter(&conn->c_lock);
        qp = ctoqp(conn);
        if (conn->c_state & C_CONNECTED) {
            /*
             * Live connection in CONNECTED state.
             * Call ibt_close_rc_channel in nonblocking mode
             * with no callbacks.
             */
            conn->c_state = C_ERROR;
            (void) ibt_close_rc_channel(qp->qp_hdl,
                IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
            (void) ibt_free_channel(qp->qp_hdl);
            qp->qp_hdl = NULL;
        } else {
            if (conn->c_state == C_ERROR &&
                qp->qp_hdl != NULL) {
                /*
                 * Connection in ERROR state but
                 * channel is not yet freed.
                 */
                (void) ibt_close_rc_channel(qp->qp_hdl,
                    IBT_NOCALLBACKS, NULL, 0, NULL,
                    NULL, 0);
                (void) ibt_free_channel(qp->qp_hdl);
                qp->qp_hdl = NULL;
            }
        }
        mutex_exit(&conn->c_lock);
        conn = conn->c_next;
    }
    rw_exit(&connlist->conn_lock);
}
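/*
 * Illustrative sketch only (not part of the original driver): the
 * expected teardown pairing of rib_close_channels() above with
 * rib_purge_connlist() below, mirroring what rib_detach_hca() does:
 * first force every channel into C_ERROR so outstanding WRs flush,
 * then reap the connections that no longer have any references.
 */
#ifdef notdef
    rib_close_channels(&hca->cl_conn_list);
    rib_close_channels(&hca->srv_conn_list);
    rib_purge_connlist(&hca->cl_conn_list);
    rib_purge_connlist(&hca->srv_conn_list);
#endif /* notdef */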
/*
 * Frees up all connections that are no longer being referenced
 */
static void
rib_purge_connlist(rib_conn_list_t *connlist)
{
    CONN *conn;

top:
    rw_enter(&connlist->conn_lock, RW_READER);
    conn = connlist->conn_hd;
    while (conn != NULL) {
        mutex_enter(&conn->c_lock);

        /*
         * At this point the connection is either in the ERROR
         * or the DISCONN_PEND state. If it is in DISCONN_PEND,
         * some other thread is culling that connection.
         * Otherwise, if c_ref is 0, destroy the connection.
         */
        if (conn->c_ref == 0 &&
            conn->c_state != C_DISCONN_PEND) {
            /*
             * Cull the connection
             */
            conn->c_state = C_DISCONN_PEND;
            mutex_exit(&conn->c_lock);
            rw_exit(&connlist->conn_lock);
            (void) rib_disconnect_channel(conn, connlist);
            goto top;
        } else {
            /*
             * Conn disconnect is already scheduled or will
             * happen from conn_release when c_ref drops to 0.
             */
            mutex_exit(&conn->c_lock);
        }
        conn = conn->c_next;
    }
    rw_exit(&connlist->conn_lock);

    /*
     * At this point, only connections with c_ref != 0 are on the list
     */
}

/*
 * Cleans up and closes all uses of the HCA
 */
static void
rib_detach_hca(rib_hca_t *hca)
{
    /*
     * Stop all services on the HCA
     * Go through cl_conn_list and close all rc_channels
     * Go through srv_conn_list and close all rc_channels
     * Free connections whose c_ref has dropped to 0
     * Destroy all CQs
     * Deregister and release all buffer pool memory after all
     * connections are destroyed
     * Free the protection domain
     * ibt_close_hca()
     */
    rw_enter(&hca->state_lock, RW_WRITER);
    if (hca->state == HCA_DETACHED) {
        rw_exit(&hca->state_lock);
        return;
    }

    hca->state = HCA_DETACHED;
    rib_stat->nhca_inited--;

    rib_stop_services(hca);
    rib_deregister_ats();
    rib_close_channels(&hca->cl_conn_list);
    rib_close_channels(&hca->srv_conn_list);
    rw_exit(&hca->state_lock);

    rib_purge_connlist(&hca->cl_conn_list);
    rib_purge_connlist(&hca->srv_conn_list);

    (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
    (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
    (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
    (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
    kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
    kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
    kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
    kmem_free(hca->svc_scq, sizeof (rib_cq_t));

    rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
    rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
    if (hca->srv_conn_list.conn_hd == NULL &&
        hca->cl_conn_list.conn_hd == NULL) {
        /*
         * Both conn lists are empty, so destroy the
         * buffers, close the hca and be done.
         */
        rib_rbufpool_destroy(hca, RECV_BUFFER);
        rib_rbufpool_destroy(hca, SEND_BUFFER);
        (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
        (void) ibt_close_hca(hca->hca_hdl);
        hca->hca_hdl = NULL;
    }
    rw_exit(&hca->cl_conn_list.conn_lock);
    rw_exit(&hca->srv_conn_list.conn_lock);

    if (hca->hca_hdl != NULL) {
        mutex_enter(&hca->inuse_lock);
        while (hca->inuse)
            cv_wait(&hca->cb_cv, &hca->inuse_lock);
        mutex_exit(&hca->inuse_lock);
        /*
         * The conn lists are now empty, so destroy the
         * buffers, close the hca and be done.
         */
        rib_rbufpool_destroy(hca, RECV_BUFFER);
        rib_rbufpool_destroy(hca, SEND_BUFFER);
        (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
        (void) ibt_close_hca(hca->hca_hdl);
        hca->hca_hdl = NULL;
    }
}