/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2007, The Ohio State University. All rights reserved.
 *
 * Portions of this source code are developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * Acknowledgements to contributions from developers:
 *      Ranjit Noronha: noronha@cse.ohio-state.edu
 *      Lei Chai      : chail@cse.ohio-state.edu
 *      Weikuan Yu    : yuw@cse.ohio-state.edu
 *
 */

/*
 * The rpcib plugin. Implements the interface for RDMATF's
 * interaction with IBTF.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/file.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/pathname.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/isa_defs.h>
#include <sys/callb.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/sdt.h>
#include <sys/ib/ibtl/ibti.h>
#include <rpc/rpc.h>
#include <rpc/ib.h>
#include <sys/modctl.h>
#include <sys/kstr.h>
#include <sys/sockio.h>
#include <sys/vnode.h>
#include <sys/tiuser.h>
#include <net/if.h>
#include <net/if_types.h>
#include <sys/cred.h>
#include <rpc/rpc_rdma.h>
#include <nfs/nfs.h>
#include <sys/atomic.h>

#define	NFS_RDMA_PORT	20049


/*
 * Convenience structures for connection management
 */
typedef struct rpcib_ipaddrs {
	void	*ri_list;	/* pointer to list of addresses */
	uint_t	ri_count;	/* number of addresses in list */
	uint_t	ri_size;	/* size of ri_list in bytes */
} rpcib_ipaddrs_t;


typedef struct rpcib_ping {
	rib_hca_t *hca;
	ibt_path_info_t path;
	ibt_ip_addr_t srcip;
	ibt_ip_addr_t dstip;
} rpcib_ping_t;

/*
 * Prototype declarations for driver ops
 */
static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
	void *, void **);
static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
static int rpcib_do_ip_ioctl(int, int, void *);
static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
static int rpcib_cache_kstat_update(kstat_t *, int);
static void rib_force_cleanup(void *);
static void rib_stop_hca_services(rib_hca_t *);
static void rib_attach_hca(void);
static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
	struct netbuf *d_svcaddr, CONN **conn);

struct {
	kstat_named_t cache_limit;
	kstat_named_t cache_allocation;
	kstat_named_t cache_hits;
	kstat_named_t cache_misses;
	kstat_named_t cache_misses_above_the_limit;
} rpcib_kstat = {
	{"cache_limit",			KSTAT_DATA_UINT64 },
	{"cache_allocation",		KSTAT_DATA_UINT64 },
	{"cache_hits",			KSTAT_DATA_UINT64 },
	{"cache_misses",		KSTAT_DATA_UINT64 },
	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
};

/* rpcib cb_ops */
static struct cb_ops rpcib_cbops = {
	nulldev,		/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	nodev,			/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* prop_op */
	NULL,			/* stream */
	D_MP,			/* cb_flag */
	CB_REV,			/* rev */
	nodev,			/* int (*cb_aread)() */
	nodev			/* int (*cb_awrite)() */
};

/*
 * Device options
 */
static struct dev_ops rpcib_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	rpcib_getinfo,		/* info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	rpcib_attach,		/* attach */
	rpcib_detach,		/* detach */
	nodev,			/* reset */
	&rpcib_cbops,		/* driver ops - devctl interfaces */
	NULL,			/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

/*
 * Module linkage information.
 */

static struct modldrv rib_modldrv = {
	&mod_driverops,		/* Driver module */
	"RPCIB plugin driver",	/* Driver name and version */
	&rpcib_ops,		/* Driver ops */
};

static struct modlinkage rib_modlinkage = {
	MODREV_1,
	(void *)&rib_modldrv,
	NULL
};

typedef struct rib_lrc_entry {
	struct rib_lrc_entry *forw;
	struct rib_lrc_entry *back;
	char *lrc_buf;

	uint32_t lrc_len;
	void *avl_node;
	bool_t registered;

	struct mrc lrc_mhandle;
	bool_t lrc_on_freed_list;
} rib_lrc_entry_t;

typedef struct cache_struct {
	rib_lrc_entry_t	r;
	uint32_t	len;
	uint32_t	elements;
	kmutex_t	node_lock;
	avl_node_t	avl_link;
} cache_avl_struct_t;

uint64_t cache_limit = 100 * 1024 * 1024;
static uint64_t cache_watermark = 80 * 1024 * 1024;
static bool_t stats_enabled = FALSE;

static uint64_t max_unsignaled_rws = 5;
int nfs_rdma_port = NFS_RDMA_PORT;

/*
 * rib_stat: private data pointer used when registering
 *	with the IBTF. It is returned to the consumer
 *	in all callbacks.
 */
static rpcib_state_t *rib_stat = NULL;

#define	RNR_RETRIES	IBT_RNR_RETRY_1
#define	MAX_PORTS	2
#define	RDMA_DUMMY_WRID	0x4D3A1D4D3A1D
#define	RDMA_CONN_REAP_RETRY	10	/* 10 secs */

int preposted_rbufs = RDMA_BUFS_GRANT;
int send_threshold = 1;

/*
 * Old cards with Tavor driver have limited memory footprint
 * when booted in 32bit. The rib_max_rbufs tunable can be
 * tuned for more buffers if needed.
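 * (Descriptive note: rib_max_rbufs is used below as the element count for
 * both the per-HCA SEND_BUFFER and RECV_BUFFER pools created through
 * rib_rbufpool_create() in rpcib_open_hcas().)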
 */

#if !defined(_ELF64) && !defined(__sparc)
int rib_max_rbufs = MAX_BUFS;
#else
int rib_max_rbufs = 10 * MAX_BUFS;
#endif	/* !(_ELF64) && !(__sparc) */

int rib_conn_timeout = 60 * 12;	/* 12 minutes */

/*
 * State of the plugin.
 * ACCEPT = accepting new connections and requests.
 * NO_ACCEPT = not accepting new connections and requests.
 * This should eventually move to rpcib_state_t structure, since this
 * will tell in which state the plugin is for a particular type of service
 * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
 * state for one and in no_accept state for the other.
 */
int plugin_state;
kmutex_t plugin_state_lock;

ldi_ident_t rpcib_li;

/*
 * RPCIB RDMATF operations
 */
static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
static rdma_stat rib_disconnect(CONN *conn);
static void rib_listen(struct rdma_svc_data *rd);
static void rib_listen_stop(struct rdma_svc_data *rd);
static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf,
	uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
	struct mrc buf_handle);
static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
	caddr_t buf, uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
	struct mrc buf_handle);
static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf,
	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
	void *lrc);
static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
	caddr_t buf, int len, int cpu);

static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);

static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);

static void rib_rbuf_free(CONN *conn, int ptype, void *buf);

static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
	int addr_type, void *, CONN **);
static rdma_stat rib_conn_release(CONN *conn);
static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
	rpcib_ping_t *, CONN **);
static rdma_stat rib_getinfo(rdma_info_t *info);

static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
static void rib_destroy_cache(rib_hca_t *hca);
static void rib_server_side_cache_reclaim(void *argp);
static int avl_compare(const void *t1, const void *t2);

static void rib_stop_services(rib_hca_t *);
static void rib_close_channels(rib_conn_list_t *);
static void rib_conn_close(void *);

/*
 * RPCIB addressing operations
 */

/*
 * RDMA operations the RPCIB module exports
 */
static rdmaops_t rib_ops = {
	rib_reachable,
	rib_conn_get,
	rib_conn_release,
	rib_listen,
	rib_listen_stop,
	rib_registermem,
	rib_deregistermem,
	rib_registermemsync,
	rib_deregistermemsync,
	rib_syncmem,
	rib_reg_buf_alloc,
	rib_reg_buf_free,
	rib_send,
	rib_send_resp,
	rib_post_resp,
	rib_post_resp_remove,
	rib_post_recv,
	rib_recv,
	rib_read,
	rib_write,
	rib_getinfo,
};

/*
 * RDMATF RPCIB plugin details
 */
static rdma_mod_t rib_mod = {
	"ibtf",		/* api name */
	RDMATF_VERS_1,
	0,
	&rib_ops,	/* rdma op vector for ibtf */
};

static rdma_stat rpcib_open_hcas(rpcib_state_t *);
static rdma_stat rib_qp_init(rib_qp_t *, int);
static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
	rib_qp_t **);
static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
	rib_qp_t **);
static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
static int rib_free_sendwait(struct send_wid *);
static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
static void rdma_done_rem_list(rib_qp_t *);
static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);

static void rib_async_handler(void *,
	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
static int rib_free_svc_recv(struct svc_recv *);
static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
static void rib_free_wid(struct recv_wid *);
static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
static void rib_detach_hca(rib_hca_t *);
static void rib_close_a_channel(CONN *);
static void rib_send_hold(rib_qp_t *);
static void rib_send_rele(rib_qp_t *);

/*
 * Registration with IBTF as a consumer
 */
static struct ibt_clnt_modinfo_s rib_modinfo = {
	IBTI_V_CURR,
	IBT_GENERIC,
	rib_async_handler,	/* async event handler */
	NULL,			/* Memory Region Handler */
	"nfs/ib"
};

/*
 * Global structure
 */

typedef struct rpcib_s {
	dev_info_t	*rpcib_dip;
	kmutex_t	rpcib_mutex;
} rpcib_t;

rpcib_t rpcib;

/*
 * /etc/system controlled variable to control
 * debugging in rpcib kernel module.
 * Set it to values greater than 1 to control
 * the amount of debugging messages required.
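 * For example, an /etc/system line of the form "set rpcib:rib_debug = 2"
 * (shown here only as an illustration of the usual module:variable syntax)
 * would request more verbose debugging output.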
424 */ 425 int rib_debug = 0; 426 427 int 428 _init(void) 429 { 430 int error; 431 432 error = mod_install((struct modlinkage *)&rib_modlinkage); 433 if (error != 0) { 434 /* 435 * Could not load module 436 */ 437 return (error); 438 } 439 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL); 440 return (0); 441 } 442 443 int 444 _fini() 445 { 446 int status; 447 448 /* 449 * Remove module 450 */ 451 if ((status = mod_remove(&rib_modlinkage)) != 0) { 452 return (status); 453 } 454 mutex_destroy(&plugin_state_lock); 455 return (0); 456 } 457 458 int 459 _info(struct modinfo *modinfop) 460 { 461 return (mod_info(&rib_modlinkage, modinfop)); 462 } 463 464 /* 465 * rpcib_getinfo() 466 * Given the device number, return the devinfo pointer or the 467 * instance number. 468 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach. 469 */ 470 471 /*ARGSUSED*/ 472 static int 473 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 474 { 475 int ret = DDI_SUCCESS; 476 477 switch (cmd) { 478 case DDI_INFO_DEVT2DEVINFO: 479 if (rpcib.rpcib_dip != NULL) 480 *result = rpcib.rpcib_dip; 481 else { 482 *result = NULL; 483 ret = DDI_FAILURE; 484 } 485 break; 486 487 case DDI_INFO_DEVT2INSTANCE: 488 *result = NULL; 489 break; 490 491 default: 492 ret = DDI_FAILURE; 493 } 494 return (ret); 495 } 496 497 static void 498 rpcib_free_hca_list() 499 { 500 rib_hca_t *hca, *hcap; 501 502 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); 503 hca = rib_stat->hcas_list; 504 rib_stat->hcas_list = NULL; 505 rw_exit(&rib_stat->hcas_list_lock); 506 while (hca != NULL) { 507 rw_enter(&hca->state_lock, RW_WRITER); 508 hcap = hca; 509 hca = hca->next; 510 rib_stat->nhca_inited--; 511 rib_mod.rdma_count--; 512 hcap->state = HCA_DETACHED; 513 rw_exit(&hcap->state_lock); 514 rib_stop_hca_services(hcap); 515 516 kmem_free(hcap, sizeof (*hcap)); 517 } 518 } 519 520 static rdma_stat 521 rpcib_free_service_list() 522 { 523 rib_service_t *service; 524 ibt_status_t ret; 525 526 rw_enter(&rib_stat->service_list_lock, RW_WRITER); 527 while (rib_stat->service_list != NULL) { 528 service = rib_stat->service_list; 529 ret = ibt_unbind_all_services(service->srv_hdl); 530 if (ret != IBT_SUCCESS) { 531 rw_exit(&rib_stat->service_list_lock); 532 #ifdef DEBUG 533 cmn_err(CE_NOTE, "rpcib_free_service_list: " 534 "ibt_unbind_all_services failed (%d)\n", (int)ret); 535 #endif 536 return (RDMA_FAILED); 537 } 538 ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl, 539 service->srv_hdl); 540 if (ret != IBT_SUCCESS) { 541 rw_exit(&rib_stat->service_list_lock); 542 #ifdef DEBUG 543 cmn_err(CE_NOTE, "rpcib_free_service_list: " 544 "ibt_deregister_service failed (%d)\n", (int)ret); 545 #endif 546 return (RDMA_FAILED); 547 } 548 rib_stat->service_list = service->next; 549 kmem_free(service, sizeof (rib_service_t)); 550 } 551 rw_exit(&rib_stat->service_list_lock); 552 553 return (RDMA_SUCCESS); 554 } 555 556 static int 557 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 558 { 559 ibt_status_t ibt_status; 560 rdma_stat r_status; 561 562 switch (cmd) { 563 case DDI_ATTACH: 564 break; 565 case DDI_RESUME: 566 return (DDI_SUCCESS); 567 default: 568 return (DDI_FAILURE); 569 } 570 571 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL); 572 573 mutex_enter(&rpcib.rpcib_mutex); 574 if (rpcib.rpcib_dip != NULL) { 575 mutex_exit(&rpcib.rpcib_mutex); 576 return (DDI_FAILURE); 577 } 578 rpcib.rpcib_dip = dip; 579 mutex_exit(&rpcib.rpcib_mutex); 580 /* 581 * Create the "rpcib" minor-node. 
582 */ 583 if (ddi_create_minor_node(dip, 584 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { 585 /* Error message, no cmn_err as they print on console */ 586 return (DDI_FAILURE); 587 } 588 589 if (rib_stat == NULL) { 590 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); 591 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); 592 rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL); 593 mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL); 594 } 595 596 rib_stat->hca_count = ibt_get_hca_list(NULL); 597 if (rib_stat->hca_count < 1) { 598 mutex_destroy(&rib_stat->listen_lock); 599 rw_destroy(&rib_stat->hcas_list_lock); 600 mutex_destroy(&rib_stat->open_hca_lock); 601 kmem_free(rib_stat, sizeof (*rib_stat)); 602 rib_stat = NULL; 603 return (DDI_FAILURE); 604 } 605 606 ibt_status = ibt_attach(&rib_modinfo, dip, 607 (void *)rib_stat, &rib_stat->ibt_clnt_hdl); 608 609 if (ibt_status != IBT_SUCCESS) { 610 mutex_destroy(&rib_stat->listen_lock); 611 rw_destroy(&rib_stat->hcas_list_lock); 612 mutex_destroy(&rib_stat->open_hca_lock); 613 kmem_free(rib_stat, sizeof (*rib_stat)); 614 rib_stat = NULL; 615 return (DDI_FAILURE); 616 } 617 618 rib_stat->service_list = NULL; 619 rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL); 620 mutex_enter(&rib_stat->open_hca_lock); 621 if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) { 622 mutex_exit(&rib_stat->open_hca_lock); 623 goto open_fail; 624 } 625 mutex_exit(&rib_stat->open_hca_lock); 626 627 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 628 DDI_PROP_SUCCESS) { 629 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update " 630 "failed."); 631 goto register_fail; 632 } 633 634 /* 635 * Register with rdmatf 636 */ 637 r_status = rdma_register_mod(&rib_mod); 638 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { 639 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, " 640 "status = %d", r_status); 641 goto register_fail; 642 } 643 644 return (DDI_SUCCESS); 645 646 register_fail: 647 648 open_fail: 649 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 650 rpcib_free_hca_list(); 651 (void) rpcib_free_service_list(); 652 mutex_destroy(&rib_stat->listen_lock); 653 rw_destroy(&rib_stat->hcas_list_lock); 654 mutex_destroy(&rib_stat->open_hca_lock); 655 rw_destroy(&rib_stat->service_list_lock); 656 kmem_free(rib_stat, sizeof (*rib_stat)); 657 rib_stat = NULL; 658 return (DDI_FAILURE); 659 } 660 661 /*ARGSUSED*/ 662 static int 663 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 664 { 665 switch (cmd) { 666 667 case DDI_DETACH: 668 break; 669 670 case DDI_SUSPEND: 671 default: 672 return (DDI_FAILURE); 673 } 674 675 /* 676 * Detach the hca and free resources 677 */ 678 mutex_enter(&plugin_state_lock); 679 plugin_state = NO_ACCEPT; 680 mutex_exit(&plugin_state_lock); 681 682 if (rpcib_free_service_list() != RDMA_SUCCESS) 683 return (DDI_FAILURE); 684 rpcib_free_hca_list(); 685 686 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 687 mutex_destroy(&rib_stat->listen_lock); 688 rw_destroy(&rib_stat->hcas_list_lock); 689 mutex_destroy(&rib_stat->open_hca_lock); 690 rw_destroy(&rib_stat->service_list_lock); 691 692 kmem_free(rib_stat, sizeof (*rib_stat)); 693 rib_stat = NULL; 694 695 mutex_enter(&rpcib.rpcib_mutex); 696 rpcib.rpcib_dip = NULL; 697 mutex_exit(&rpcib.rpcib_mutex); 698 mutex_destroy(&rpcib.rpcib_mutex); 699 return (DDI_SUCCESS); 700 } 701 702 703 static void rib_rbufpool_free(rib_hca_t *, int); 704 static void rib_rbufpool_deregister(rib_hca_t *, int); 705 static void 
rib_rbufpool_destroy(rib_hca_t *hca, int ptype); 706 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); 707 static rdma_stat rib_rem_replylist(rib_qp_t *); 708 static int rib_remreply(rib_qp_t *, struct reply *); 709 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); 710 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); 711 712 713 /* 714 * One CQ pair per HCA 715 */ 716 static rdma_stat 717 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, 718 rib_cq_t **cqp) 719 { 720 rib_cq_t *cq; 721 ibt_cq_attr_t cq_attr; 722 uint32_t real_size; 723 ibt_status_t status; 724 rdma_stat error = RDMA_SUCCESS; 725 726 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); 727 cq->rib_hca = hca; 728 cq_attr.cq_size = cq_size; 729 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 730 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, 731 &real_size); 732 if (status != IBT_SUCCESS) { 733 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed," 734 " status=%d", status); 735 error = RDMA_FAILED; 736 goto fail; 737 } 738 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca); 739 740 /* 741 * Enable CQ callbacks. CQ Callbacks are single shot 742 * (e.g. you have to call ibt_enable_cq_notify() 743 * after each callback to get another one). 744 */ 745 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION); 746 if (status != IBT_SUCCESS) { 747 cmn_err(CE_WARN, "rib_create_cq: " 748 "enable_cq_notify failed, status %d", status); 749 error = RDMA_FAILED; 750 goto fail; 751 } 752 *cqp = cq; 753 754 return (error); 755 fail: 756 if (cq->rib_cq_hdl) 757 (void) ibt_free_cq(cq->rib_cq_hdl); 758 if (cq) 759 kmem_free(cq, sizeof (rib_cq_t)); 760 return (error); 761 } 762 763 /* 764 * rpcib_find_hca 765 * 766 * Caller should have already locked the hcas_lock before calling 767 * this function. 768 */ 769 static rib_hca_t * 770 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid) 771 { 772 rib_hca_t *hca = ribstat->hcas_list; 773 774 while (hca && hca->hca_guid != guid) 775 hca = hca->next; 776 777 return (hca); 778 } 779 780 static rdma_stat 781 rpcib_open_hcas(rpcib_state_t *ribstat) 782 { 783 rib_hca_t *hca; 784 ibt_status_t ibt_status; 785 rdma_stat status; 786 ibt_hca_portinfo_t *pinfop; 787 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; 788 uint_t size, cq_size; 789 int i; 790 kstat_t *ksp; 791 cache_avl_struct_t example_avl_node; 792 char rssc_name[32]; 793 int old_nhca_inited = ribstat->nhca_inited; 794 ib_guid_t *hca_guids; 795 796 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); 797 798 ribstat->hca_count = ibt_get_hca_list(&hca_guids); 799 if (ribstat->hca_count == 0) 800 return (RDMA_FAILED); 801 802 rw_enter(&ribstat->hcas_list_lock, RW_WRITER); 803 /* 804 * Open a hca and setup for RDMA 805 */ 806 for (i = 0; i < ribstat->hca_count; i++) { 807 if (rpcib_find_hca(ribstat, hca_guids[i])) 808 continue; 809 hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP); 810 811 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, 812 hca_guids[i], &hca->hca_hdl); 813 if (ibt_status != IBT_SUCCESS) { 814 kmem_free(hca, sizeof (rib_hca_t)); 815 continue; 816 } 817 hca->hca_guid = hca_guids[i]; 818 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; 819 hca->state = HCA_INITED; 820 821 /* 822 * query HCA info 823 */ 824 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs); 825 if (ibt_status != IBT_SUCCESS) { 826 goto fail1; 827 } 828 829 /* 830 * One PD (Protection Domain) per HCA. 
831 * A qp is allowed to access a memory region 832 * only when it's in the same PD as that of 833 * the memory region. 834 */ 835 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl); 836 if (ibt_status != IBT_SUCCESS) { 837 goto fail1; 838 } 839 840 /* 841 * query HCA ports 842 */ 843 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 844 0, &pinfop, &hca->hca_nports, &size); 845 if (ibt_status != IBT_SUCCESS) { 846 goto fail2; 847 } 848 hca->hca_ports = pinfop; 849 hca->hca_pinfosz = size; 850 pinfop = NULL; 851 852 cq_size = DEF_CQ_SIZE; /* default cq size */ 853 /* 854 * Create 2 pairs of cq's (1 pair for client 855 * and the other pair for server) on this hca. 856 * If number of qp's gets too large, then several 857 * cq's will be needed. 858 */ 859 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, 860 &hca->svc_rcq); 861 if (status != RDMA_SUCCESS) { 862 goto fail3; 863 } 864 865 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, 866 &hca->svc_scq); 867 if (status != RDMA_SUCCESS) { 868 goto fail3; 869 } 870 871 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, 872 &hca->clnt_rcq); 873 if (status != RDMA_SUCCESS) { 874 goto fail3; 875 } 876 877 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, 878 &hca->clnt_scq); 879 if (status != RDMA_SUCCESS) { 880 goto fail3; 881 } 882 883 /* 884 * Create buffer pools. 885 * Note rib_rbuf_create also allocates memory windows. 886 */ 887 hca->recv_pool = rib_rbufpool_create(hca, 888 RECV_BUFFER, rib_max_rbufs); 889 if (hca->recv_pool == NULL) { 890 goto fail3; 891 } 892 893 hca->send_pool = rib_rbufpool_create(hca, 894 SEND_BUFFER, rib_max_rbufs); 895 if (hca->send_pool == NULL) { 896 rib_rbufpool_destroy(hca, RECV_BUFFER); 897 goto fail3; 898 } 899 900 if (hca->server_side_cache == NULL) { 901 (void) sprintf(rssc_name, 902 "rib_srvr_cache_%llx", 903 (long long unsigned int) hca->hca_guid); 904 hca->server_side_cache = kmem_cache_create( 905 rssc_name, 906 sizeof (cache_avl_struct_t), 0, 907 NULL, 908 NULL, 909 rib_server_side_cache_reclaim, 910 hca, NULL, 0); 911 } 912 913 avl_create(&hca->avl_tree, 914 avl_compare, 915 sizeof (cache_avl_struct_t), 916 (uint_t)(uintptr_t)&example_avl_node.avl_link- 917 (uint_t)(uintptr_t)&example_avl_node); 918 919 rw_init(&hca->bound_services_lock, NULL, RW_DRIVER, 920 hca->iblock); 921 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); 922 rw_init(&hca->avl_rw_lock, 923 NULL, RW_DRIVER, hca->iblock); 924 mutex_init(&hca->cache_allocation_lock, 925 NULL, MUTEX_DRIVER, NULL); 926 hca->avl_init = TRUE; 927 928 /* Create kstats for the cache */ 929 ASSERT(INGLOBALZONE(curproc)); 930 931 if (!stats_enabled) { 932 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc", 933 KSTAT_TYPE_NAMED, 934 sizeof (rpcib_kstat) / sizeof (kstat_named_t), 935 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, 936 GLOBAL_ZONEID); 937 if (ksp) { 938 ksp->ks_data = (void *) &rpcib_kstat; 939 ksp->ks_update = rpcib_cache_kstat_update; 940 kstat_install(ksp); 941 stats_enabled = TRUE; 942 } 943 } 944 if (hca->cleanup_helper == NULL) { 945 char tq_name[sizeof (hca->hca_guid) * 2 + 1]; 946 947 (void) snprintf(tq_name, sizeof (tq_name), "%llX", 948 (unsigned long long int) hca->hca_guid); 949 hca->cleanup_helper = ddi_taskq_create(NULL, 950 tq_name, 1, TASKQ_DEFAULTPRI, 0); 951 } 952 953 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 954 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); 955 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, 956 hca->iblock); 957 
rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, 958 hca->iblock); 959 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); 960 hca->inuse = TRUE; 961 962 hca->next = ribstat->hcas_list; 963 ribstat->hcas_list = hca; 964 ribstat->nhca_inited++; 965 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 966 continue; 967 968 fail3: 969 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 970 fail2: 971 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 972 fail1: 973 (void) ibt_close_hca(hca->hca_hdl); 974 kmem_free(hca, sizeof (rib_hca_t)); 975 } 976 rw_exit(&ribstat->hcas_list_lock); 977 ibt_free_hca_list(hca_guids, ribstat->hca_count); 978 rib_mod.rdma_count = rib_stat->nhca_inited; 979 980 /* 981 * return success if at least one new hca has been configured. 982 */ 983 if (ribstat->nhca_inited != old_nhca_inited) 984 return (RDMA_SUCCESS); 985 else 986 return (RDMA_FAILED); 987 } 988 989 /* 990 * Callback routines 991 */ 992 993 /* 994 * SCQ handlers 995 */ 996 /* ARGSUSED */ 997 static void 998 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 999 { 1000 ibt_status_t ibt_status; 1001 ibt_wc_t wc; 1002 struct send_wid *wd; 1003 CONN *conn; 1004 rib_qp_t *qp; 1005 int i; 1006 1007 /* 1008 * Re-enable cq notify here to avoid missing any 1009 * completion queue notification. 1010 */ 1011 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1012 1013 ibt_status = IBT_SUCCESS; 1014 while (ibt_status != IBT_CQ_EMPTY) { 1015 bzero(&wc, sizeof (wc)); 1016 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1017 if (ibt_status != IBT_SUCCESS) 1018 return; 1019 1020 /* 1021 * Got a send completion 1022 */ 1023 if (wc.wc_id != RDMA_DUMMY_WRID) { 1024 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1025 qp = wd->qp; 1026 conn = qptoc(qp); 1027 1028 mutex_enter(&wd->sendwait_lock); 1029 switch (wc.wc_status) { 1030 case IBT_WC_SUCCESS: 1031 wd->status = RDMA_SUCCESS; 1032 break; 1033 default: 1034 /* 1035 * RC Send Q Error Code Local state Remote State 1036 * ==================== =========== ============ 1037 * IBT_WC_BAD_RESPONSE_ERR ERROR None 1038 * IBT_WC_LOCAL_LEN_ERR ERROR None 1039 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None 1040 * IBT_WC_LOCAL_PROTECT_ERR ERROR None 1041 * IBT_WC_MEM_WIN_BIND_ERR ERROR None 1042 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR 1043 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR 1044 * IBT_WC_REMOTE_OP_ERR ERROR ERROR 1045 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None 1046 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None 1047 * IBT_WC_WR_FLUSHED_ERR ERROR None 1048 */ 1049 /* 1050 * Channel in error state. Set connection to 1051 * ERROR and cleanup will happen either from 1052 * conn_release or from rib_conn_get 1053 */ 1054 wd->status = RDMA_FAILED; 1055 mutex_enter(&conn->c_lock); 1056 if (conn->c_state != C_DISCONN_PEND) 1057 conn->c_state = C_ERROR_CONN; 1058 mutex_exit(&conn->c_lock); 1059 break; 1060 } 1061 1062 if (wd->cv_sig == 1) { 1063 /* 1064 * Notify poster 1065 */ 1066 cv_signal(&wd->wait_cv); 1067 mutex_exit(&wd->sendwait_lock); 1068 } else { 1069 /* 1070 * Poster not waiting for notification. 
1071 * Free the send buffers and send_wid 1072 */ 1073 for (i = 0; i < wd->nsbufs; i++) { 1074 rib_rbuf_free(qptoc(wd->qp), 1075 SEND_BUFFER, 1076 (void *)(uintptr_t)wd->sbufaddr[i]); 1077 } 1078 1079 /* decrement the send ref count */ 1080 rib_send_rele(qp); 1081 1082 mutex_exit(&wd->sendwait_lock); 1083 (void) rib_free_sendwait(wd); 1084 } 1085 } 1086 } 1087 } 1088 1089 /* ARGSUSED */ 1090 static void 1091 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1092 { 1093 ibt_status_t ibt_status; 1094 ibt_wc_t wc; 1095 struct send_wid *wd; 1096 rib_qp_t *qp; 1097 CONN *conn; 1098 int i; 1099 1100 /* 1101 * Re-enable cq notify here to avoid missing any 1102 * completion queue notification. 1103 */ 1104 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1105 1106 ibt_status = IBT_SUCCESS; 1107 while (ibt_status != IBT_CQ_EMPTY) { 1108 bzero(&wc, sizeof (wc)); 1109 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1110 if (ibt_status != IBT_SUCCESS) 1111 return; 1112 1113 /* 1114 * Got a send completion 1115 */ 1116 if (wc.wc_id != RDMA_DUMMY_WRID) { 1117 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1118 qp = wd->qp; 1119 conn = qptoc(qp); 1120 mutex_enter(&wd->sendwait_lock); 1121 1122 switch (wc.wc_status) { 1123 case IBT_WC_SUCCESS: 1124 wd->status = RDMA_SUCCESS; 1125 break; 1126 default: 1127 /* 1128 * Channel in error state. Set connection to 1129 * ERROR and cleanup will happen either from 1130 * conn_release or conn timeout. 1131 */ 1132 wd->status = RDMA_FAILED; 1133 mutex_enter(&conn->c_lock); 1134 if (conn->c_state != C_DISCONN_PEND) 1135 conn->c_state = C_ERROR_CONN; 1136 mutex_exit(&conn->c_lock); 1137 break; 1138 } 1139 1140 if (wd->cv_sig == 1) { 1141 /* 1142 * Update completion status and notify poster 1143 */ 1144 cv_signal(&wd->wait_cv); 1145 mutex_exit(&wd->sendwait_lock); 1146 } else { 1147 /* 1148 * Poster not waiting for notification. 1149 * Free the send buffers and send_wid 1150 */ 1151 for (i = 0; i < wd->nsbufs; i++) { 1152 rib_rbuf_free(qptoc(wd->qp), 1153 SEND_BUFFER, 1154 (void *)(uintptr_t)wd->sbufaddr[i]); 1155 } 1156 1157 /* decrement the send ref count */ 1158 rib_send_rele(qp); 1159 1160 mutex_exit(&wd->sendwait_lock); 1161 (void) rib_free_sendwait(wd); 1162 } 1163 } 1164 } 1165 } 1166 1167 /* 1168 * RCQ handler 1169 */ 1170 /* ARGSUSED */ 1171 static void 1172 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1173 { 1174 rib_qp_t *qp; 1175 ibt_status_t ibt_status; 1176 ibt_wc_t wc; 1177 struct recv_wid *rwid; 1178 1179 /* 1180 * Re-enable cq notify here to avoid missing any 1181 * completion queue notification. 1182 */ 1183 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1184 1185 ibt_status = IBT_SUCCESS; 1186 while (ibt_status != IBT_CQ_EMPTY) { 1187 bzero(&wc, sizeof (wc)); 1188 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1189 if (ibt_status != IBT_SUCCESS) 1190 return; 1191 1192 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id; 1193 qp = rwid->qp; 1194 if (wc.wc_status == IBT_WC_SUCCESS) { 1195 XDR inxdrs, *xdrs; 1196 uint_t xid, vers, op, find_xid = 0; 1197 struct reply *r; 1198 CONN *conn = qptoc(qp); 1199 uint32_t rdma_credit = 0; 1200 1201 xdrs = &inxdrs; 1202 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr, 1203 wc.wc_bytes_xfer, XDR_DECODE); 1204 /* 1205 * Treat xid as opaque (xid is the first entity 1206 * in the rpc rdma message). 1207 */ 1208 xid = *(uint32_t *)(uintptr_t)rwid->addr; 1209 1210 /* Skip xid and set the xdr position accordingly. 
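 * (Descriptive note: the RPC/RDMA transport header begins with four 32-bit
 * words: xid, vers, rdma credit and op. xid was read above directly from
 * the receive buffer; the remaining three words are decoded via XDR below.)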
*/ 1211 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1212 (void) xdr_u_int(xdrs, &vers); 1213 (void) xdr_u_int(xdrs, &rdma_credit); 1214 (void) xdr_u_int(xdrs, &op); 1215 XDR_DESTROY(xdrs); 1216 1217 if (vers != RPCRDMA_VERS) { 1218 /* 1219 * Invalid RPC/RDMA version. Cannot 1220 * interoperate. Set connection to 1221 * ERROR state and bail out. 1222 */ 1223 mutex_enter(&conn->c_lock); 1224 if (conn->c_state != C_DISCONN_PEND) 1225 conn->c_state = C_ERROR_CONN; 1226 mutex_exit(&conn->c_lock); 1227 rib_rbuf_free(conn, RECV_BUFFER, 1228 (void *)(uintptr_t)rwid->addr); 1229 rib_free_wid(rwid); 1230 continue; 1231 } 1232 1233 mutex_enter(&qp->replylist_lock); 1234 for (r = qp->replylist; r != NULL; r = r->next) { 1235 if (r->xid == xid) { 1236 find_xid = 1; 1237 switch (op) { 1238 case RDMA_MSG: 1239 case RDMA_NOMSG: 1240 case RDMA_MSGP: 1241 r->status = RDMA_SUCCESS; 1242 r->vaddr_cq = rwid->addr; 1243 r->bytes_xfer = 1244 wc.wc_bytes_xfer; 1245 cv_signal(&r->wait_cv); 1246 break; 1247 default: 1248 rib_rbuf_free(qptoc(qp), 1249 RECV_BUFFER, 1250 (void *)(uintptr_t) 1251 rwid->addr); 1252 break; 1253 } 1254 break; 1255 } 1256 } 1257 mutex_exit(&qp->replylist_lock); 1258 if (find_xid == 0) { 1259 /* RPC caller not waiting for reply */ 1260 1261 DTRACE_PROBE1(rpcib__i__nomatchxid1, 1262 int, xid); 1263 1264 rib_rbuf_free(qptoc(qp), RECV_BUFFER, 1265 (void *)(uintptr_t)rwid->addr); 1266 } 1267 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) { 1268 CONN *conn = qptoc(qp); 1269 1270 /* 1271 * Connection being flushed. Just free 1272 * the posted buffer 1273 */ 1274 rib_rbuf_free(conn, RECV_BUFFER, 1275 (void *)(uintptr_t)rwid->addr); 1276 } else { 1277 CONN *conn = qptoc(qp); 1278 /* 1279 * RC Recv Q Error Code Local state Remote State 1280 * ==================== =========== ============ 1281 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd 1282 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd 1283 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd 1284 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd 1285 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd 1286 * IBT_WC_WR_FLUSHED_ERR None None 1287 */ 1288 /* 1289 * Channel in error state. Set connection 1290 * in ERROR state. 1291 */ 1292 mutex_enter(&conn->c_lock); 1293 if (conn->c_state != C_DISCONN_PEND) 1294 conn->c_state = C_ERROR_CONN; 1295 mutex_exit(&conn->c_lock); 1296 rib_rbuf_free(conn, RECV_BUFFER, 1297 (void *)(uintptr_t)rwid->addr); 1298 } 1299 rib_free_wid(rwid); 1300 } 1301 } 1302 1303 /* Server side */ 1304 /* ARGSUSED */ 1305 static void 1306 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1307 { 1308 rdma_recv_data_t *rdp; 1309 rib_qp_t *qp; 1310 ibt_status_t ibt_status; 1311 ibt_wc_t wc; 1312 struct svc_recv *s_recvp; 1313 CONN *conn; 1314 mblk_t *mp; 1315 1316 /* 1317 * Re-enable cq notify here to avoid missing any 1318 * completion queue notification. 
1319 */ 1320 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1321 1322 ibt_status = IBT_SUCCESS; 1323 while (ibt_status != IBT_CQ_EMPTY) { 1324 bzero(&wc, sizeof (wc)); 1325 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1326 if (ibt_status != IBT_SUCCESS) 1327 return; 1328 1329 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id; 1330 qp = s_recvp->qp; 1331 conn = qptoc(qp); 1332 mutex_enter(&qp->posted_rbufs_lock); 1333 qp->n_posted_rbufs--; 1334 if (qp->n_posted_rbufs == 0) 1335 cv_signal(&qp->posted_rbufs_cv); 1336 mutex_exit(&qp->posted_rbufs_lock); 1337 1338 if (wc.wc_status == IBT_WC_SUCCESS) { 1339 XDR inxdrs, *xdrs; 1340 uint_t xid, vers, op; 1341 uint32_t rdma_credit; 1342 1343 xdrs = &inxdrs; 1344 /* s_recvp->vaddr stores data */ 1345 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr, 1346 wc.wc_bytes_xfer, XDR_DECODE); 1347 1348 /* 1349 * Treat xid as opaque (xid is the first entity 1350 * in the rpc rdma message). 1351 */ 1352 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr; 1353 /* Skip xid and set the xdr position accordingly. */ 1354 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1355 if (!xdr_u_int(xdrs, &vers) || 1356 !xdr_u_int(xdrs, &rdma_credit) || 1357 !xdr_u_int(xdrs, &op)) { 1358 rib_rbuf_free(conn, RECV_BUFFER, 1359 (void *)(uintptr_t)s_recvp->vaddr); 1360 XDR_DESTROY(xdrs); 1361 (void) rib_free_svc_recv(s_recvp); 1362 continue; 1363 } 1364 XDR_DESTROY(xdrs); 1365 1366 if (vers != RPCRDMA_VERS) { 1367 /* 1368 * Invalid RPC/RDMA version. 1369 * Drop rpc rdma message. 1370 */ 1371 rib_rbuf_free(conn, RECV_BUFFER, 1372 (void *)(uintptr_t)s_recvp->vaddr); 1373 (void) rib_free_svc_recv(s_recvp); 1374 continue; 1375 } 1376 /* 1377 * Is this for RDMA_DONE? 1378 */ 1379 if (op == RDMA_DONE) { 1380 rib_rbuf_free(conn, RECV_BUFFER, 1381 (void *)(uintptr_t)s_recvp->vaddr); 1382 /* 1383 * Wake up the thread waiting on 1384 * a RDMA_DONE for xid 1385 */ 1386 mutex_enter(&qp->rdlist_lock); 1387 rdma_done_notify(qp, xid); 1388 mutex_exit(&qp->rdlist_lock); 1389 (void) rib_free_svc_recv(s_recvp); 1390 continue; 1391 } 1392 1393 mutex_enter(&plugin_state_lock); 1394 if (plugin_state == ACCEPT) { 1395 while ((mp = allocb(sizeof (*rdp), BPRI_LO)) 1396 == NULL) 1397 (void) strwaitbuf( 1398 sizeof (*rdp), BPRI_LO); 1399 /* 1400 * Plugin is in accept state, hence the master 1401 * transport queue for this is still accepting 1402 * requests. Hence we can call svc_queuereq to 1403 * queue this recieved msg. 1404 */ 1405 rdp = (rdma_recv_data_t *)mp->b_rptr; 1406 rdp->conn = conn; 1407 rdp->rpcmsg.addr = 1408 (caddr_t)(uintptr_t)s_recvp->vaddr; 1409 rdp->rpcmsg.type = RECV_BUFFER; 1410 rdp->rpcmsg.len = wc.wc_bytes_xfer; 1411 rdp->status = wc.wc_status; 1412 mutex_enter(&conn->c_lock); 1413 conn->c_ref++; 1414 mutex_exit(&conn->c_lock); 1415 mp->b_wptr += sizeof (*rdp); 1416 svc_queuereq((queue_t *)rib_stat->q, mp); 1417 mutex_exit(&plugin_state_lock); 1418 } else { 1419 /* 1420 * The master transport for this is going 1421 * away and the queue is not accepting anymore 1422 * requests for krpc, so don't do anything, just 1423 * free the msg. 
1424 */ 1425 mutex_exit(&plugin_state_lock); 1426 rib_rbuf_free(conn, RECV_BUFFER, 1427 (void *)(uintptr_t)s_recvp->vaddr); 1428 } 1429 } else { 1430 rib_rbuf_free(conn, RECV_BUFFER, 1431 (void *)(uintptr_t)s_recvp->vaddr); 1432 } 1433 (void) rib_free_svc_recv(s_recvp); 1434 } 1435 } 1436 1437 static void 1438 rib_attach_hca() 1439 { 1440 mutex_enter(&rib_stat->open_hca_lock); 1441 (void) rpcib_open_hcas(rib_stat); 1442 rib_listen(NULL); 1443 mutex_exit(&rib_stat->open_hca_lock); 1444 } 1445 1446 /* 1447 * Handles DR event of IBT_HCA_DETACH_EVENT. 1448 */ 1449 /* ARGSUSED */ 1450 static void 1451 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 1452 ibt_async_code_t code, ibt_async_event_t *event) 1453 { 1454 switch (code) { 1455 case IBT_HCA_ATTACH_EVENT: 1456 rib_attach_hca(); 1457 break; 1458 case IBT_HCA_DETACH_EVENT: 1459 { 1460 rib_hca_t *hca; 1461 1462 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 1463 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 1464 rw_enter(&hca->state_lock, RW_READER); 1465 if ((hca->state != HCA_DETACHED) && 1466 (hca->hca_hdl == hca_hdl)) { 1467 rw_exit(&hca->state_lock); 1468 break; 1469 } 1470 rw_exit(&hca->state_lock); 1471 } 1472 rw_exit(&rib_stat->hcas_list_lock); 1473 1474 if (hca == NULL) 1475 return; 1476 ASSERT(hca->hca_hdl == hca_hdl); 1477 rib_detach_hca(hca); 1478 #ifdef DEBUG 1479 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); 1480 #endif 1481 break; 1482 } 1483 case IBT_EVENT_PORT_UP: 1484 /* 1485 * A port is up. We should call rib_listen() since there is 1486 * a chance that rib_listen() may have failed during 1487 * rib_attach_hca() because the port had not been up yet. 1488 */ 1489 rib_listen(NULL); 1490 #ifdef DEBUG 1491 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); 1492 #endif 1493 break; 1494 #ifdef DEBUG 1495 case IBT_EVENT_PATH_MIGRATED: 1496 cmn_err(CE_NOTE, "rib_async_handler(): " 1497 "IBT_EVENT_PATH_MIGRATED\n"); 1498 break; 1499 case IBT_EVENT_SQD: 1500 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n"); 1501 break; 1502 case IBT_EVENT_COM_EST: 1503 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n"); 1504 break; 1505 case IBT_ERROR_CATASTROPHIC_CHAN: 1506 cmn_err(CE_NOTE, "rib_async_handler(): " 1507 "IBT_ERROR_CATASTROPHIC_CHAN\n"); 1508 break; 1509 case IBT_ERROR_INVALID_REQUEST_CHAN: 1510 cmn_err(CE_NOTE, "rib_async_handler(): " 1511 "IBT_ERROR_INVALID_REQUEST_CHAN\n"); 1512 break; 1513 case IBT_ERROR_ACCESS_VIOLATION_CHAN: 1514 cmn_err(CE_NOTE, "rib_async_handler(): " 1515 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n"); 1516 break; 1517 case IBT_ERROR_PATH_MIGRATE_REQ: 1518 cmn_err(CE_NOTE, "rib_async_handler(): " 1519 "IBT_ERROR_PATH_MIGRATE_REQ\n"); 1520 break; 1521 case IBT_ERROR_CQ: 1522 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n"); 1523 break; 1524 case IBT_ERROR_PORT_DOWN: 1525 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); 1526 break; 1527 case IBT_ASYNC_OPAQUE1: 1528 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); 1529 break; 1530 case IBT_ASYNC_OPAQUE2: 1531 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n"); 1532 break; 1533 case IBT_ASYNC_OPAQUE3: 1534 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n"); 1535 break; 1536 case IBT_ASYNC_OPAQUE4: 1537 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n"); 1538 break; 1539 #endif 1540 default: 1541 break; 1542 } 1543 } 1544 1545 /* 1546 * Client's reachable function. 
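 * Reachability is determined by actually establishing a connection to the
 * server (rib_connect() with a zeroed source address) and releasing it on
 * success; the HCA used for the ping is returned as the opaque handle.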
1547 */ 1548 static rdma_stat 1549 rib_reachable(int addr_type, struct netbuf *raddr, void **handle) 1550 { 1551 rdma_stat status; 1552 rpcib_ping_t rpt; 1553 struct netbuf saddr; 1554 CONN *conn; 1555 1556 bzero(&saddr, sizeof (struct netbuf)); 1557 status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn); 1558 1559 if (status == RDMA_SUCCESS) { 1560 *handle = (void *)rpt.hca; 1561 /* release the reference */ 1562 (void) rib_conn_release(conn); 1563 return (RDMA_SUCCESS); 1564 } else { 1565 *handle = NULL; 1566 DTRACE_PROBE(rpcib__i__pingfailed); 1567 return (RDMA_FAILED); 1568 } 1569 } 1570 1571 /* Client side qp creation */ 1572 static rdma_stat 1573 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) 1574 { 1575 rib_qp_t *kqp = NULL; 1576 CONN *conn; 1577 rdma_clnt_cred_ctrl_t *cc_info; 1578 1579 ASSERT(qp != NULL); 1580 *qp = NULL; 1581 1582 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1583 conn = qptoc(kqp); 1584 kqp->hca = hca; 1585 kqp->rdmaconn.c_rdmamod = &rib_mod; 1586 kqp->rdmaconn.c_private = (caddr_t)kqp; 1587 1588 kqp->mode = RIB_CLIENT; 1589 kqp->chan_flags = IBT_BLOCKING; 1590 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); 1591 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); 1592 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; 1593 /* 1594 * Initialize 1595 */ 1596 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1597 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1598 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1599 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1600 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1601 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); 1602 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1603 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1604 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1605 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1606 /* 1607 * Initialize the client credit control 1608 * portion of the rdmaconn struct. 
1609 */ 1610 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT; 1611 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1612 cc_info->clnt_cc_granted_ops = 0; 1613 cc_info->clnt_cc_in_flight_ops = 0; 1614 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL); 1615 1616 *qp = kqp; 1617 return (RDMA_SUCCESS); 1618 } 1619 1620 /* Server side qp creation */ 1621 static rdma_stat 1622 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) 1623 { 1624 rib_qp_t *kqp = NULL; 1625 ibt_chan_sizes_t chan_sizes; 1626 ibt_rc_chan_alloc_args_t qp_attr; 1627 ibt_status_t ibt_status; 1628 rdma_srv_cred_ctrl_t *cc_info; 1629 1630 *qp = NULL; 1631 1632 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1633 kqp->hca = hca; 1634 kqp->port_num = port; 1635 kqp->rdmaconn.c_rdmamod = &rib_mod; 1636 kqp->rdmaconn.c_private = (caddr_t)kqp; 1637 1638 /* 1639 * Create the qp handle 1640 */ 1641 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1642 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; 1643 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; 1644 qp_attr.rc_pd = hca->pd_hdl; 1645 qp_attr.rc_hca_port_num = port; 1646 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1647 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1648 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1649 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1650 qp_attr.rc_clone_chan = NULL; 1651 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1652 qp_attr.rc_flags = IBT_WR_SIGNALED; 1653 1654 rw_enter(&hca->state_lock, RW_READER); 1655 if (hca->state != HCA_DETACHED) { 1656 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1657 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl, 1658 &chan_sizes); 1659 } else { 1660 rw_exit(&hca->state_lock); 1661 goto fail; 1662 } 1663 rw_exit(&hca->state_lock); 1664 1665 if (ibt_status != IBT_SUCCESS) { 1666 DTRACE_PROBE1(rpcib__i_svccreatechanfail, 1667 int, ibt_status); 1668 goto fail; 1669 } 1670 1671 kqp->mode = RIB_SERVER; 1672 kqp->chan_flags = IBT_BLOCKING; 1673 kqp->q = q; /* server ONLY */ 1674 1675 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1676 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1677 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1678 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1679 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1680 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1681 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1682 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1683 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1684 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1685 /* 1686 * Set the private data area to qp to be used in callbacks 1687 */ 1688 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); 1689 kqp->rdmaconn.c_state = C_CONNECTED; 1690 1691 /* 1692 * Initialize the server credit control 1693 * portion of the rdmaconn struct. 
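 * (As set up below, the initial grant equals the number of preposted
 * receive buffers, preposted_rbufs i.e. RDMA_BUFS_GRANT, with no buffers
 * in use yet.)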
1694 */ 1695 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV; 1696 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc; 1697 cc_info->srv_cc_buffers_granted = preposted_rbufs; 1698 cc_info->srv_cc_cur_buffers_used = 0; 1699 cc_info->srv_cc_posted = preposted_rbufs; 1700 1701 *qp = kqp; 1702 1703 return (RDMA_SUCCESS); 1704 fail: 1705 if (kqp) 1706 kmem_free(kqp, sizeof (rib_qp_t)); 1707 1708 return (RDMA_FAILED); 1709 } 1710 1711 /* ARGSUSED */ 1712 ibt_cm_status_t 1713 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event, 1714 ibt_cm_return_args_t *ret_args, void *priv_data, 1715 ibt_priv_data_len_t len) 1716 { 1717 rib_hca_t *hca; 1718 1719 hca = (rib_hca_t *)clnt_hdl; 1720 1721 switch (event->cm_type) { 1722 1723 /* got a connection close event */ 1724 case IBT_CM_EVENT_CONN_CLOSED: 1725 { 1726 CONN *conn; 1727 rib_qp_t *qp; 1728 1729 /* check reason why connection was closed */ 1730 switch (event->cm_event.closed) { 1731 case IBT_CM_CLOSED_DREP_RCVD: 1732 case IBT_CM_CLOSED_DREQ_TIMEOUT: 1733 case IBT_CM_CLOSED_DUP: 1734 case IBT_CM_CLOSED_ABORT: 1735 case IBT_CM_CLOSED_ALREADY: 1736 /* 1737 * These cases indicate the local end initiated 1738 * the closing of the channel. Nothing to do here. 1739 */ 1740 break; 1741 default: 1742 /* 1743 * Reason for CONN_CLOSED event must be one of 1744 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 1745 * or IBT_CM_CLOSED_STALE. These indicate cases were 1746 * the remote end is closing the channel. In these 1747 * cases free the channel and transition to error 1748 * state 1749 */ 1750 qp = ibt_get_chan_private(event->cm_channel); 1751 conn = qptoc(qp); 1752 mutex_enter(&conn->c_lock); 1753 if (conn->c_state == C_DISCONN_PEND) { 1754 mutex_exit(&conn->c_lock); 1755 break; 1756 } 1757 1758 conn->c_state = C_ERROR_CONN; 1759 1760 /* 1761 * Free the conn if c_ref is down to 0 already 1762 */ 1763 if (conn->c_ref == 0) { 1764 /* 1765 * Remove from list and free conn 1766 */ 1767 conn->c_state = C_DISCONN_PEND; 1768 mutex_exit(&conn->c_lock); 1769 rw_enter(&hca->state_lock, RW_READER); 1770 if (hca->state != HCA_DETACHED) 1771 (void) rib_disconnect_channel(conn, 1772 &hca->cl_conn_list); 1773 rw_exit(&hca->state_lock); 1774 } else { 1775 /* 1776 * conn will be freed when c_ref goes to 0. 1777 * Indicate to cleaning thread not to close 1778 * the connection, but just free the channel. 1779 */ 1780 conn->c_flags |= C_CLOSE_NOTNEEDED; 1781 mutex_exit(&conn->c_lock); 1782 } 1783 #ifdef DEBUG 1784 if (rib_debug) 1785 cmn_err(CE_NOTE, "rib_clnt_cm_handler: " 1786 "(CONN_CLOSED) channel disconnected"); 1787 #endif 1788 break; 1789 } 1790 break; 1791 } 1792 default: 1793 break; 1794 } 1795 return (IBT_CM_ACCEPT); 1796 } 1797 1798 /* 1799 * Connect to the server. 
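 * The RC channel is opened with IP-addressed CM private data built by
 * ibt_format_ip_private_data() and a service id derived from nfs_rdma_port
 * via ibt_get_ip_sid(); if the open fails with IBT_CM_CONN_STALE, it is
 * retried up to REFRESH_ATTEMPTS times.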
1800 */ 1801 rdma_stat 1802 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp) 1803 { 1804 ibt_chan_open_args_t chan_args; /* channel args */ 1805 ibt_chan_sizes_t chan_sizes; 1806 ibt_rc_chan_alloc_args_t qp_attr; 1807 ibt_status_t ibt_status; 1808 ibt_rc_returns_t ret_args; /* conn reject info */ 1809 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */ 1810 ibt_ip_cm_info_t ipcm_info; 1811 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ]; 1812 1813 1814 (void) bzero(&chan_args, sizeof (chan_args)); 1815 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1816 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t)); 1817 1818 ipcm_info.src_addr.family = rptp->srcip.family; 1819 switch (ipcm_info.src_addr.family) { 1820 case AF_INET: 1821 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr; 1822 break; 1823 case AF_INET6: 1824 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr; 1825 break; 1826 } 1827 1828 ipcm_info.dst_addr.family = rptp->srcip.family; 1829 switch (ipcm_info.dst_addr.family) { 1830 case AF_INET: 1831 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr; 1832 break; 1833 case AF_INET6: 1834 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr; 1835 break; 1836 } 1837 1838 ipcm_info.src_port = (in_port_t)nfs_rdma_port; 1839 1840 ibt_status = ibt_format_ip_private_data(&ipcm_info, 1841 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt); 1842 1843 if (ibt_status != IBT_SUCCESS) { 1844 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n"); 1845 return (-1); 1846 } 1847 1848 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num; 1849 /* Alloc a RC channel */ 1850 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; 1851 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; 1852 qp_attr.rc_pd = hca->pd_hdl; 1853 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1854 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1855 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1856 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1857 qp_attr.rc_clone_chan = NULL; 1858 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1859 qp_attr.rc_flags = IBT_WR_SIGNALED; 1860 1861 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port); 1862 chan_args.oc_path = &rptp->path; 1863 1864 chan_args.oc_cm_handler = rib_clnt_cm_handler; 1865 chan_args.oc_cm_clnt_private = (void *)hca; 1866 chan_args.oc_rdma_ra_out = 4; 1867 chan_args.oc_rdma_ra_in = 4; 1868 chan_args.oc_path_retry_cnt = 2; 1869 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; 1870 chan_args.oc_priv_data = cmp_ip_pvt; 1871 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ; 1872 1873 refresh: 1874 rw_enter(&hca->state_lock, RW_READER); 1875 if (hca->state != HCA_DETACHED) { 1876 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1877 IBT_ACHAN_NO_FLAGS, 1878 &qp_attr, &qp->qp_hdl, 1879 &chan_sizes); 1880 } else { 1881 rw_exit(&hca->state_lock); 1882 return (RDMA_FAILED); 1883 } 1884 rw_exit(&hca->state_lock); 1885 1886 if (ibt_status != IBT_SUCCESS) { 1887 DTRACE_PROBE1(rpcib__i_conntosrv, 1888 int, ibt_status); 1889 return (RDMA_FAILED); 1890 } 1891 1892 /* Connect to the Server */ 1893 (void) bzero(&ret_args, sizeof (ret_args)); 1894 mutex_enter(&qp->cb_lock); 1895 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS, 1896 IBT_BLOCKING, &chan_args, &ret_args); 1897 if (ibt_status != IBT_SUCCESS) { 1898 DTRACE_PROBE2(rpcib__i_openrctosrv, 1899 int, ibt_status, int, ret_args.rc_status); 1900 1901 (void) ibt_free_channel(qp->qp_hdl); 1902 qp->qp_hdl = NULL; 1903 mutex_exit(&qp->cb_lock); 1904 if (refresh-- && ibt_status == IBT_CM_FAILURE 
&& 1905 ret_args.rc_status == IBT_CM_CONN_STALE) { 1906 /* 1907 * Got IBT_CM_CONN_STALE probably because of stale 1908 * data on the passive end of a channel that existed 1909 * prior to reboot. Retry establishing a channel 1910 * REFRESH_ATTEMPTS times, during which time the 1911 * stale conditions on the server might clear up. 1912 */ 1913 goto refresh; 1914 } 1915 return (RDMA_FAILED); 1916 } 1917 mutex_exit(&qp->cb_lock); 1918 /* 1919 * Set the private data area to qp to be used in callbacks 1920 */ 1921 ibt_set_chan_private(qp->qp_hdl, (void *)qp); 1922 return (RDMA_SUCCESS); 1923 } 1924 1925 rdma_stat 1926 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) 1927 { 1928 uint_t i, addr_count; 1929 ibt_status_t ibt_status; 1930 uint8_t num_paths_p; 1931 ibt_ip_path_attr_t ipattr; 1932 ibt_path_ip_src_t srcip; 1933 rpcib_ipaddrs_t addrs4; 1934 rpcib_ipaddrs_t addrs6; 1935 struct sockaddr_in *sinp; 1936 struct sockaddr_in6 *sin6p; 1937 rdma_stat retval = RDMA_FAILED; 1938 rib_hca_t *hca; 1939 1940 if ((addr_type != AF_INET) && (addr_type != AF_INET6)) 1941 return (RDMA_INVAL); 1942 ASSERT(raddr->buf != NULL); 1943 1944 bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); 1945 1946 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || 1947 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { 1948 retval = RDMA_FAILED; 1949 goto done2; 1950 } 1951 1952 if (addr_type == AF_INET) { 1953 addr_count = addrs4.ri_count; 1954 sinp = (struct sockaddr_in *)raddr->buf; 1955 rptp->dstip.family = AF_INET; 1956 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr; 1957 sinp = addrs4.ri_list; 1958 } else { 1959 addr_count = addrs6.ri_count; 1960 sin6p = (struct sockaddr_in6 *)raddr->buf; 1961 rptp->dstip.family = AF_INET6; 1962 rptp->dstip.un.ip6addr = sin6p->sin6_addr; 1963 sin6p = addrs6.ri_list; 1964 } 1965 1966 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 1967 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 1968 rw_enter(&hca->state_lock, RW_READER); 1969 if (hca->state == HCA_DETACHED) { 1970 rw_exit(&hca->state_lock); 1971 continue; 1972 } 1973 1974 ipattr.ipa_dst_ip = &rptp->dstip; 1975 ipattr.ipa_hca_guid = hca->hca_guid; 1976 ipattr.ipa_ndst = 1; 1977 ipattr.ipa_max_paths = 1; 1978 ipattr.ipa_src_ip.family = rptp->dstip.family; 1979 for (i = 0; i < addr_count; i++) { 1980 num_paths_p = 0; 1981 if (addr_type == AF_INET) { 1982 ipattr.ipa_src_ip.un.ip4addr = 1983 sinp[i].sin_addr.s_addr; 1984 } else { 1985 ipattr.ipa_src_ip.un.ip6addr = 1986 sin6p[i].sin6_addr; 1987 } 1988 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1989 1990 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1991 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, 1992 &num_paths_p, &srcip); 1993 if (ibt_status == IBT_SUCCESS && 1994 num_paths_p != 0 && 1995 rptp->path.pi_hca_guid == hca->hca_guid) { 1996 rptp->hca = hca; 1997 rw_exit(&hca->state_lock); 1998 if (addr_type == AF_INET) { 1999 rptp->srcip.family = AF_INET; 2000 rptp->srcip.un.ip4addr = 2001 srcip.ip_primary.un.ip4addr; 2002 } else { 2003 rptp->srcip.family = AF_INET6; 2004 rptp->srcip.un.ip6addr = 2005 srcip.ip_primary.un.ip6addr; 2006 2007 } 2008 retval = RDMA_SUCCESS; 2009 goto done1; 2010 } 2011 } 2012 rw_exit(&hca->state_lock); 2013 } 2014 done1: 2015 rw_exit(&rib_stat->hcas_list_lock); 2016 done2: 2017 if (addrs4.ri_size > 0) 2018 kmem_free(addrs4.ri_list, addrs4.ri_size); 2019 if (addrs6.ri_size > 0) 2020 kmem_free(addrs6.ri_list, addrs6.ri_size); 2021 return (retval); 2022 } 2023 2024 /* 2025 * Close channel, remove from connection list and 2026 * free up 
resources allocated for that channel. 2027 */ 2028 rdma_stat 2029 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list) 2030 { 2031 rib_qp_t *qp = ctoqp(conn); 2032 rib_hca_t *hca; 2033 2034 mutex_enter(&conn->c_lock); 2035 if (conn->c_timeout != NULL) { 2036 mutex_exit(&conn->c_lock); 2037 (void) untimeout(conn->c_timeout); 2038 mutex_enter(&conn->c_lock); 2039 } 2040 2041 while (conn->c_flags & C_CLOSE_PENDING) { 2042 cv_wait(&conn->c_cv, &conn->c_lock); 2043 } 2044 mutex_exit(&conn->c_lock); 2045 2046 /* 2047 * c_ref == 0 and connection is in C_DISCONN_PEND 2048 */ 2049 hca = qp->hca; 2050 if (conn_list != NULL) 2051 (void) rib_rm_conn(conn, conn_list); 2052 2053 /* 2054 * There is only one case where we get here with 2055 * qp_hdl = NULL, which is during connection setup on 2056 * the client. In such a case there are no posted 2057 * send/recv buffers. 2058 */ 2059 if (qp->qp_hdl != NULL) { 2060 mutex_enter(&qp->posted_rbufs_lock); 2061 while (qp->n_posted_rbufs) 2062 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock); 2063 mutex_exit(&qp->posted_rbufs_lock); 2064 2065 mutex_enter(&qp->send_rbufs_lock); 2066 while (qp->n_send_rbufs) 2067 cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock); 2068 mutex_exit(&qp->send_rbufs_lock); 2069 2070 (void) ibt_free_channel(qp->qp_hdl); 2071 qp->qp_hdl = NULL; 2072 } 2073 2074 ASSERT(qp->rdlist == NULL); 2075 2076 if (qp->replylist != NULL) { 2077 (void) rib_rem_replylist(qp); 2078 } 2079 2080 cv_destroy(&qp->cb_conn_cv); 2081 cv_destroy(&qp->posted_rbufs_cv); 2082 cv_destroy(&qp->send_rbufs_cv); 2083 mutex_destroy(&qp->cb_lock); 2084 mutex_destroy(&qp->replylist_lock); 2085 mutex_destroy(&qp->posted_rbufs_lock); 2086 mutex_destroy(&qp->send_rbufs_lock); 2087 mutex_destroy(&qp->rdlist_lock); 2088 2089 cv_destroy(&conn->c_cv); 2090 mutex_destroy(&conn->c_lock); 2091 2092 if (conn->c_raddr.buf != NULL) { 2093 kmem_free(conn->c_raddr.buf, conn->c_raddr.len); 2094 } 2095 if (conn->c_laddr.buf != NULL) { 2096 kmem_free(conn->c_laddr.buf, conn->c_laddr.len); 2097 } 2098 2099 /* 2100 * Credit control cleanup. 2101 */ 2102 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) { 2103 rdma_clnt_cred_ctrl_t *cc_info; 2104 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 2105 cv_destroy(&cc_info->clnt_cc_cv); 2106 } 2107 2108 kmem_free(qp, sizeof (rib_qp_t)); 2109 2110 /* 2111 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL, 2112 * then the hca is no longer being used. 2113 */ 2114 if (conn_list != NULL) { 2115 rw_enter(&hca->state_lock, RW_READER); 2116 if (hca->state == HCA_DETACHED) { 2117 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 2118 if (hca->srv_conn_list.conn_hd == NULL) { 2119 rw_enter(&hca->cl_conn_list.conn_lock, 2120 RW_READER); 2121 2122 if (hca->cl_conn_list.conn_hd == NULL) { 2123 mutex_enter(&hca->inuse_lock); 2124 hca->inuse = FALSE; 2125 cv_signal(&hca->cb_cv); 2126 mutex_exit(&hca->inuse_lock); 2127 } 2128 rw_exit(&hca->cl_conn_list.conn_lock); 2129 } 2130 rw_exit(&hca->srv_conn_list.conn_lock); 2131 } 2132 rw_exit(&hca->state_lock); 2133 } 2134 2135 return (RDMA_SUCCESS); 2136 } 2137 2138 /* 2139 * All sends are done under the protection of 2140 * the wdesc->sendwait_lock. n_send_rbufs count 2141 * is protected using the send_rbufs_lock. 
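 * rib_send_hold() bumps n_send_rbufs once a signaled send has been posted
 * and rib_send_rele() drops it when the completion is reaped; this lets
 * rib_disconnect_channel() wait on send_rbufs_cv for outstanding sends to
 * drain before the channel is freed.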
2142 * lock ordering is: 2143 * sendwait_lock -> send_rbufs_lock 2144 */ 2145 2146 void 2147 rib_send_hold(rib_qp_t *qp) 2148 { 2149 mutex_enter(&qp->send_rbufs_lock); 2150 qp->n_send_rbufs++; 2151 mutex_exit(&qp->send_rbufs_lock); 2152 } 2153 2154 void 2155 rib_send_rele(rib_qp_t *qp) 2156 { 2157 mutex_enter(&qp->send_rbufs_lock); 2158 qp->n_send_rbufs--; 2159 if (qp->n_send_rbufs == 0) 2160 cv_signal(&qp->send_rbufs_cv); 2161 mutex_exit(&qp->send_rbufs_lock); 2162 } 2163 2164 /* 2165 * Wait for send completion notification. Only on receiving a 2166 * notification be it a successful or error completion, free the 2167 * send_wid. 2168 */ 2169 static rdma_stat 2170 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 2171 { 2172 clock_t timout, cv_wait_ret; 2173 rdma_stat error = RDMA_SUCCESS; 2174 int i; 2175 2176 /* 2177 * Wait for send to complete 2178 */ 2179 ASSERT(wd != NULL); 2180 mutex_enter(&wd->sendwait_lock); 2181 if (wd->status == (uint_t)SEND_WAIT) { 2182 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 2183 ddi_get_lbolt(); 2184 2185 if (qp->mode == RIB_SERVER) { 2186 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, 2187 &wd->sendwait_lock, timout)) > 0 && 2188 wd->status == (uint_t)SEND_WAIT) 2189 ; 2190 switch (cv_wait_ret) { 2191 case -1: /* timeout */ 2192 DTRACE_PROBE(rpcib__i__srvsendwait__timeout); 2193 2194 wd->cv_sig = 0; /* no signal needed */ 2195 error = RDMA_TIMEDOUT; 2196 break; 2197 default: /* got send completion */ 2198 break; 2199 } 2200 } else { 2201 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, 2202 &wd->sendwait_lock, timout)) > 0 && 2203 wd->status == (uint_t)SEND_WAIT) 2204 ; 2205 switch (cv_wait_ret) { 2206 case -1: /* timeout */ 2207 DTRACE_PROBE(rpcib__i__clntsendwait__timeout); 2208 2209 wd->cv_sig = 0; /* no signal needed */ 2210 error = RDMA_TIMEDOUT; 2211 break; 2212 case 0: /* interrupted */ 2213 DTRACE_PROBE(rpcib__i__clntsendwait__intr); 2214 2215 wd->cv_sig = 0; /* no signal needed */ 2216 error = RDMA_INTR; 2217 break; 2218 default: /* got send completion */ 2219 break; 2220 } 2221 } 2222 } 2223 2224 if (wd->status != (uint_t)SEND_WAIT) { 2225 /* got send completion */ 2226 if (wd->status != RDMA_SUCCESS) { 2227 switch (wd->status) { 2228 case RDMA_CONNLOST: 2229 error = RDMA_CONNLOST; 2230 break; 2231 default: 2232 error = RDMA_FAILED; 2233 break; 2234 } 2235 } 2236 for (i = 0; i < wd->nsbufs; i++) { 2237 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2238 (void *)(uintptr_t)wd->sbufaddr[i]); 2239 } 2240 2241 rib_send_rele(qp); 2242 2243 mutex_exit(&wd->sendwait_lock); 2244 (void) rib_free_sendwait(wd); 2245 2246 } else { 2247 mutex_exit(&wd->sendwait_lock); 2248 } 2249 return (error); 2250 } 2251 2252 static struct send_wid * 2253 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) 2254 { 2255 struct send_wid *wd; 2256 2257 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); 2258 wd->xid = xid; 2259 wd->cv_sig = cv_sig; 2260 wd->qp = qp; 2261 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); 2262 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); 2263 wd->status = (uint_t)SEND_WAIT; 2264 2265 return (wd); 2266 } 2267 2268 static int 2269 rib_free_sendwait(struct send_wid *wdesc) 2270 { 2271 cv_destroy(&wdesc->wait_cv); 2272 mutex_destroy(&wdesc->sendwait_lock); 2273 kmem_free(wdesc, sizeof (*wdesc)); 2274 2275 return (0); 2276 } 2277 2278 static rdma_stat 2279 rib_rem_rep(rib_qp_t *qp, struct reply *rep) 2280 { 2281 mutex_enter(&qp->replylist_lock); 2282 if (rep != NULL) { 2283 (void) rib_remreply(qp, rep); 2284 
mutex_exit(&qp->replylist_lock); 2285 return (RDMA_SUCCESS); 2286 } 2287 mutex_exit(&qp->replylist_lock); 2288 return (RDMA_FAILED); 2289 } 2290 2291 /* 2292 * Send buffers are freed here only in case of error in posting 2293 * on QP. If the post succeeded, the send buffers are freed upon 2294 * send completion in rib_sendwait() or in the scq_handler. 2295 */ 2296 rdma_stat 2297 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2298 int send_sig, int cv_sig, caddr_t *swid) 2299 { 2300 struct send_wid *wdesc; 2301 struct clist *clp; 2302 ibt_status_t ibt_status = IBT_SUCCESS; 2303 rdma_stat ret = RDMA_SUCCESS; 2304 ibt_send_wr_t tx_wr; 2305 int i, nds; 2306 ibt_wr_ds_t sgl[DSEG_MAX]; 2307 uint_t total_msg_size; 2308 rib_qp_t *qp; 2309 2310 qp = ctoqp(conn); 2311 2312 ASSERT(cl != NULL); 2313 2314 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2315 2316 nds = 0; 2317 total_msg_size = 0; 2318 clp = cl; 2319 while (clp != NULL) { 2320 if (nds >= DSEG_MAX) { 2321 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); 2322 return (RDMA_FAILED); 2323 } 2324 sgl[nds].ds_va = clp->w.c_saddr; 2325 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2326 sgl[nds].ds_len = clp->c_len; 2327 total_msg_size += clp->c_len; 2328 clp = clp->c_next; 2329 nds++; 2330 } 2331 2332 if (send_sig) { 2333 /* Set SEND_SIGNAL flag. */ 2334 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2335 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2336 *swid = (caddr_t)wdesc; 2337 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2338 mutex_enter(&wdesc->sendwait_lock); 2339 wdesc->nsbufs = nds; 2340 for (i = 0; i < nds; i++) { 2341 wdesc->sbufaddr[i] = sgl[i].ds_va; 2342 } 2343 } else { 2344 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2345 *swid = NULL; 2346 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2347 } 2348 2349 tx_wr.wr_opcode = IBT_WRC_SEND; 2350 tx_wr.wr_trans = IBT_RC_SRV; 2351 tx_wr.wr_nds = nds; 2352 tx_wr.wr_sgl = sgl; 2353 2354 mutex_enter(&conn->c_lock); 2355 if (conn->c_state == C_CONNECTED) { 2356 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2357 } 2358 if (conn->c_state != C_CONNECTED || 2359 ibt_status != IBT_SUCCESS) { 2360 if (conn->c_state != C_DISCONN_PEND) 2361 conn->c_state = C_ERROR_CONN; 2362 mutex_exit(&conn->c_lock); 2363 if (send_sig) { 2364 for (i = 0; i < nds; i++) { 2365 rib_rbuf_free(conn, SEND_BUFFER, 2366 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2367 } 2368 mutex_exit(&wdesc->sendwait_lock); 2369 (void) rib_free_sendwait(wdesc); 2370 } 2371 return (RDMA_CONNLOST); 2372 } 2373 2374 mutex_exit(&conn->c_lock); 2375 2376 if (send_sig) { 2377 rib_send_hold(qp); 2378 mutex_exit(&wdesc->sendwait_lock); 2379 if (cv_sig) { 2380 /* 2381 * cv_wait for send to complete. 2382 * We can fail due to a timeout or signal or 2383 * unsuccessful send. 2384 */ 2385 ret = rib_sendwait(qp, wdesc); 2386 2387 return (ret); 2388 } 2389 } 2390 2391 return (RDMA_SUCCESS); 2392 } 2393 2394 2395 rdma_stat 2396 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2397 { 2398 rdma_stat ret; 2399 caddr_t wd; 2400 2401 /* send-wait & cv_signal */ 2402 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); 2403 return (ret); 2404 } 2405 2406 /* 2407 * Deprecated/obsolete interface not used currently 2408 * but earlier used for READ-READ protocol. 2409 * Send RPC reply and wait for RDMA_DONE. 
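 * The reply is posted as a signaled send with no cv_sig, so this routine
 * does not block in rib_sendwait(); instead it waits on rd->rdma_done_cv
 * (entry added via rdma_done_add()) for up to REPLY_WAIT_TIME seconds for
 * the peer's RDMA_DONE.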
2410 */ 2411 rdma_stat 2412 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2413 { 2414 rdma_stat ret = RDMA_SUCCESS; 2415 struct rdma_done_list *rd; 2416 clock_t timout, cv_wait_ret; 2417 caddr_t *wid = NULL; 2418 rib_qp_t *qp = ctoqp(conn); 2419 2420 mutex_enter(&qp->rdlist_lock); 2421 rd = rdma_done_add(qp, msgid); 2422 2423 /* No cv_signal (whether send-wait or no-send-wait) */ 2424 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); 2425 2426 if (ret != RDMA_SUCCESS) { 2427 rdma_done_rm(qp, rd); 2428 } else { 2429 /* 2430 * Wait for RDMA_DONE from remote end 2431 */ 2432 timout = 2433 drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt(); 2434 cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, 2435 &qp->rdlist_lock, 2436 timout); 2437 2438 rdma_done_rm(qp, rd); 2439 2440 if (cv_wait_ret < 0) { 2441 ret = RDMA_TIMEDOUT; 2442 } 2443 } 2444 2445 mutex_exit(&qp->rdlist_lock); 2446 return (ret); 2447 } 2448 2449 static struct recv_wid * 2450 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2451 { 2452 struct recv_wid *rwid; 2453 2454 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2455 rwid->xid = msgid; 2456 rwid->addr = sgl->ds_va; 2457 rwid->qp = qp; 2458 2459 return (rwid); 2460 } 2461 2462 static void 2463 rib_free_wid(struct recv_wid *rwid) 2464 { 2465 kmem_free(rwid, sizeof (struct recv_wid)); 2466 } 2467 2468 rdma_stat 2469 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2470 { 2471 rib_qp_t *qp = ctoqp(conn); 2472 struct clist *clp = cl; 2473 struct reply *rep; 2474 struct recv_wid *rwid; 2475 int nds; 2476 ibt_wr_ds_t sgl[DSEG_MAX]; 2477 ibt_recv_wr_t recv_wr; 2478 rdma_stat ret; 2479 ibt_status_t ibt_status; 2480 2481 /* 2482 * rdma_clnt_postrecv uses RECV_BUFFER. 2483 */ 2484 2485 nds = 0; 2486 while (cl != NULL) { 2487 if (nds >= DSEG_MAX) { 2488 ret = RDMA_FAILED; 2489 goto done; 2490 } 2491 sgl[nds].ds_va = cl->w.c_saddr; 2492 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2493 sgl[nds].ds_len = cl->c_len; 2494 cl = cl->c_next; 2495 nds++; 2496 } 2497 2498 if (nds != 1) { 2499 ret = RDMA_FAILED; 2500 goto done; 2501 } 2502 2503 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2504 recv_wr.wr_nds = nds; 2505 recv_wr.wr_sgl = sgl; 2506 2507 rwid = rib_create_wid(qp, &sgl[0], msgid); 2508 if (rwid) { 2509 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; 2510 } else { 2511 ret = RDMA_NORESOURCE; 2512 goto done; 2513 } 2514 rep = rib_addreplylist(qp, msgid); 2515 if (!rep) { 2516 rib_free_wid(rwid); 2517 ret = RDMA_NORESOURCE; 2518 goto done; 2519 } 2520 2521 mutex_enter(&conn->c_lock); 2522 2523 if (conn->c_state == C_CONNECTED) { 2524 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2525 } 2526 2527 if (conn->c_state != C_CONNECTED || 2528 ibt_status != IBT_SUCCESS) { 2529 if (conn->c_state != C_DISCONN_PEND) 2530 conn->c_state = C_ERROR_CONN; 2531 mutex_exit(&conn->c_lock); 2532 rib_free_wid(rwid); 2533 (void) rib_rem_rep(qp, rep); 2534 ret = RDMA_CONNLOST; 2535 goto done; 2536 } 2537 mutex_exit(&conn->c_lock); 2538 return (RDMA_SUCCESS); 2539 2540 done: 2541 while (clp != NULL) { 2542 rib_rbuf_free(conn, RECV_BUFFER, 2543 (void *)(uintptr_t)clp->w.c_saddr3); 2544 clp = clp->c_next; 2545 } 2546 return (ret); 2547 } 2548 2549 rdma_stat 2550 rib_svc_post(CONN* conn, struct clist *cl) 2551 { 2552 rib_qp_t *qp = ctoqp(conn); 2553 struct svc_recv *s_recvp; 2554 int nds; 2555 ibt_wr_ds_t sgl[DSEG_MAX]; 2556 ibt_recv_wr_t recv_wr; 2557 ibt_status_t ibt_status; 2558 2559 nds = 0; 2560 while (cl != NULL) { 2561 if (nds >= DSEG_MAX) { 
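/* Chunk list needs more SGEs than a single recv WR allows (DSEG_MAX). */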
2562 return (RDMA_FAILED); 2563 } 2564 sgl[nds].ds_va = cl->w.c_saddr; 2565 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2566 sgl[nds].ds_len = cl->c_len; 2567 cl = cl->c_next; 2568 nds++; 2569 } 2570 2571 if (nds != 1) { 2572 rib_rbuf_free(conn, RECV_BUFFER, 2573 (caddr_t)(uintptr_t)sgl[0].ds_va); 2574 2575 return (RDMA_FAILED); 2576 } 2577 2578 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2579 recv_wr.wr_nds = nds; 2580 recv_wr.wr_sgl = sgl; 2581 2582 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2583 /* Use s_recvp's addr as wr id */ 2584 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; 2585 mutex_enter(&conn->c_lock); 2586 if (conn->c_state == C_CONNECTED) { 2587 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2588 } 2589 if (conn->c_state != C_CONNECTED || 2590 ibt_status != IBT_SUCCESS) { 2591 if (conn->c_state != C_DISCONN_PEND) 2592 conn->c_state = C_ERROR_CONN; 2593 mutex_exit(&conn->c_lock); 2594 rib_rbuf_free(conn, RECV_BUFFER, 2595 (caddr_t)(uintptr_t)sgl[0].ds_va); 2596 (void) rib_free_svc_recv(s_recvp); 2597 2598 return (RDMA_CONNLOST); 2599 } 2600 mutex_exit(&conn->c_lock); 2601 2602 return (RDMA_SUCCESS); 2603 } 2604 2605 /* Client */ 2606 rdma_stat 2607 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2608 { 2609 2610 return (rib_clnt_post(conn, cl, msgid)); 2611 } 2612 2613 /* Client */ 2614 rdma_stat 2615 rib_post_resp_remove(CONN* conn, uint32_t msgid) 2616 { 2617 rib_qp_t *qp = ctoqp(conn); 2618 struct reply *rep; 2619 2620 mutex_enter(&qp->replylist_lock); 2621 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2622 if (rep->xid == msgid) { 2623 if (rep->vaddr_cq) { 2624 rib_rbuf_free(conn, RECV_BUFFER, 2625 (caddr_t)(uintptr_t)rep->vaddr_cq); 2626 } 2627 (void) rib_remreply(qp, rep); 2628 break; 2629 } 2630 } 2631 mutex_exit(&qp->replylist_lock); 2632 2633 return (RDMA_SUCCESS); 2634 } 2635 2636 /* Server */ 2637 rdma_stat 2638 rib_post_recv(CONN *conn, struct clist *cl) 2639 { 2640 rib_qp_t *qp = ctoqp(conn); 2641 2642 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2643 mutex_enter(&qp->posted_rbufs_lock); 2644 qp->n_posted_rbufs++; 2645 mutex_exit(&qp->posted_rbufs_lock); 2646 return (RDMA_SUCCESS); 2647 } 2648 return (RDMA_FAILED); 2649 } 2650 2651 /* 2652 * Client side only interface to "recv" the rpc reply buf 2653 * posted earlier by rib_post_resp(conn, cl, msgid). 2654 */ 2655 rdma_stat 2656 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2657 { 2658 struct reply *rep = NULL; 2659 clock_t timout, cv_wait_ret; 2660 rdma_stat ret = RDMA_SUCCESS; 2661 rib_qp_t *qp = ctoqp(conn); 2662 2663 /* 2664 * Find the reply structure for this msgid 2665 */ 2666 mutex_enter(&qp->replylist_lock); 2667 2668 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2669 if (rep->xid == msgid) 2670 break; 2671 } 2672 2673 if (rep != NULL) { 2674 /* 2675 * If message not yet received, wait. 
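 * The wait is interruptible and bounded by REPLY_WAIT_TIME; it ends as soon
 * as the reply's status is updated and wait_cv is signalled by the receive
 * completion path.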
2676 */ 2677 if (rep->status == (uint_t)REPLY_WAIT) { 2678 timout = ddi_get_lbolt() + 2679 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2680 2681 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2682 &qp->replylist_lock, timout)) > 0 && 2683 rep->status == (uint_t)REPLY_WAIT) 2684 ; 2685 2686 switch (cv_wait_ret) { 2687 case -1: /* timeout */ 2688 ret = RDMA_TIMEDOUT; 2689 break; 2690 case 0: 2691 ret = RDMA_INTR; 2692 break; 2693 default: 2694 break; 2695 } 2696 } 2697 2698 if (rep->status == RDMA_SUCCESS) { 2699 struct clist *cl = NULL; 2700 2701 /* 2702 * Got message successfully 2703 */ 2704 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2705 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); 2706 *clp = cl; 2707 } else { 2708 if (rep->status != (uint_t)REPLY_WAIT) { 2709 /* 2710 * Got error in reply message. Free 2711 * recv buffer here. 2712 */ 2713 ret = rep->status; 2714 rib_rbuf_free(conn, RECV_BUFFER, 2715 (caddr_t)(uintptr_t)rep->vaddr_cq); 2716 } 2717 } 2718 (void) rib_remreply(qp, rep); 2719 } else { 2720 /* 2721 * No matching reply structure found for given msgid on the 2722 * reply wait list. 2723 */ 2724 ret = RDMA_INVAL; 2725 DTRACE_PROBE(rpcib__i__nomatchxid2); 2726 } 2727 2728 /* 2729 * Done. 2730 */ 2731 mutex_exit(&qp->replylist_lock); 2732 return (ret); 2733 } 2734 2735 /* 2736 * RDMA write a buffer to the remote address. 2737 */ 2738 rdma_stat 2739 rib_write(CONN *conn, struct clist *cl, int wait) 2740 { 2741 ibt_send_wr_t tx_wr; 2742 int cv_sig; 2743 ibt_wr_ds_t sgl[DSEG_MAX]; 2744 struct send_wid *wdesc; 2745 ibt_status_t ibt_status; 2746 rdma_stat ret = RDMA_SUCCESS; 2747 rib_qp_t *qp = ctoqp(conn); 2748 uint64_t n_writes = 0; 2749 2750 if (cl == NULL) { 2751 return (RDMA_FAILED); 2752 } 2753 2754 while ((cl != NULL)) { 2755 if (cl->c_len > 0) { 2756 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2757 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; 2758 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = 2759 cl->c_dmemhandle.mrc_rmr; /* rkey */ 2760 sgl[0].ds_va = cl->w.c_saddr; 2761 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2762 sgl[0].ds_len = cl->c_len; 2763 2764 if (wait) { 2765 cv_sig = 1; 2766 } else { 2767 if (n_writes > max_unsignaled_rws) { 2768 n_writes = 0; 2769 cv_sig = 1; 2770 } else { 2771 cv_sig = 0; 2772 } 2773 } 2774 2775 if (cv_sig) { 2776 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2777 wdesc = rib_init_sendwait(0, cv_sig, qp); 2778 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2779 mutex_enter(&wdesc->sendwait_lock); 2780 } else { 2781 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2782 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2783 } 2784 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2785 tx_wr.wr_trans = IBT_RC_SRV; 2786 tx_wr.wr_nds = 1; 2787 tx_wr.wr_sgl = sgl; 2788 2789 mutex_enter(&conn->c_lock); 2790 if (conn->c_state == C_CONNECTED) { 2791 ibt_status = 2792 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2793 } 2794 if (conn->c_state != C_CONNECTED || 2795 ibt_status != IBT_SUCCESS) { 2796 if (conn->c_state != C_DISCONN_PEND) 2797 conn->c_state = C_ERROR_CONN; 2798 mutex_exit(&conn->c_lock); 2799 if (cv_sig) { 2800 mutex_exit(&wdesc->sendwait_lock); 2801 (void) rib_free_sendwait(wdesc); 2802 } 2803 return (RDMA_CONNLOST); 2804 } 2805 2806 mutex_exit(&conn->c_lock); 2807 2808 /* 2809 * Wait for send to complete 2810 */ 2811 if (cv_sig) { 2812 2813 rib_send_hold(qp); 2814 mutex_exit(&wdesc->sendwait_lock); 2815 2816 ret = rib_sendwait(qp, wdesc); 2817 if (ret != 0) 2818 return (ret); 2819 } 2820 n_writes ++; 2821 } 2822 cl = cl->c_next; 2823 } 2824 return (RDMA_SUCCESS); 2825 } 2826 2827 /* 2828 
* RDMA Read a buffer from the remote address. 2829 */ 2830 rdma_stat 2831 rib_read(CONN *conn, struct clist *cl, int wait) 2832 { 2833 ibt_send_wr_t rx_wr; 2834 int cv_sig = 0; 2835 ibt_wr_ds_t sgl; 2836 struct send_wid *wdesc; 2837 ibt_status_t ibt_status = IBT_SUCCESS; 2838 rdma_stat ret = RDMA_SUCCESS; 2839 rib_qp_t *qp = ctoqp(conn); 2840 2841 if (cl == NULL) { 2842 return (RDMA_FAILED); 2843 } 2844 2845 while (cl != NULL) { 2846 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2847 /* 2848 * Remote address is at the head chunk item in list. 2849 */ 2850 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; 2851 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; 2852 2853 sgl.ds_va = cl->u.c_daddr; 2854 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2855 sgl.ds_len = cl->c_len; 2856 2857 /* 2858 * If there are multiple chunks to be read, and 2859 * wait is set, ask for signal only for the last chunk 2860 * and wait only on the last chunk. The completion of 2861 * RDMA_READ on last chunk ensures that reads on all 2862 * previous chunks are also completed. 2863 */ 2864 if (wait && (cl->c_next == NULL)) { 2865 cv_sig = 1; 2866 wdesc = rib_init_sendwait(0, cv_sig, qp); 2867 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2868 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2869 mutex_enter(&wdesc->sendwait_lock); 2870 } else { 2871 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2872 rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2873 } 2874 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2875 rx_wr.wr_trans = IBT_RC_SRV; 2876 rx_wr.wr_nds = 1; 2877 rx_wr.wr_sgl = &sgl; 2878 2879 mutex_enter(&conn->c_lock); 2880 if (conn->c_state == C_CONNECTED) { 2881 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2882 } 2883 if (conn->c_state != C_CONNECTED || 2884 ibt_status != IBT_SUCCESS) { 2885 if (conn->c_state != C_DISCONN_PEND) 2886 conn->c_state = C_ERROR_CONN; 2887 mutex_exit(&conn->c_lock); 2888 if (wait && (cl->c_next == NULL)) { 2889 mutex_exit(&wdesc->sendwait_lock); 2890 (void) rib_free_sendwait(wdesc); 2891 } 2892 return (RDMA_CONNLOST); 2893 } 2894 2895 mutex_exit(&conn->c_lock); 2896 2897 /* 2898 * Wait for send to complete if this is the 2899 * last item in the list. 2900 */ 2901 if (wait && cl->c_next == NULL) { 2902 rib_send_hold(qp); 2903 mutex_exit(&wdesc->sendwait_lock); 2904 2905 ret = rib_sendwait(qp, wdesc); 2906 2907 if (ret != 0) 2908 return (ret); 2909 } 2910 cl = cl->c_next; 2911 } 2912 return (RDMA_SUCCESS); 2913 } 2914 2915 /* 2916 * rib_srv_cm_handler() 2917 * Connection Manager callback to handle RC connection requests. 2918 */ 2919 /* ARGSUSED */ 2920 static ibt_cm_status_t 2921 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2922 ibt_cm_return_args_t *ret_args, void *priv_data, 2923 ibt_priv_data_len_t len) 2924 { 2925 queue_t *q; 2926 rib_qp_t *qp; 2927 rib_hca_t *hca; 2928 rdma_stat status = RDMA_SUCCESS; 2929 int i; 2930 struct clist cl; 2931 rdma_buf_t rdbuf = {0}; 2932 void *buf = NULL; 2933 CONN *conn; 2934 ibt_ip_cm_info_t ipinfo; 2935 struct sockaddr_in *s; 2936 struct sockaddr_in6 *s6; 2937 int sin_size = sizeof (struct sockaddr_in); 2938 int in_size = sizeof (struct in_addr); 2939 int sin6_size = sizeof (struct sockaddr_in6); 2940 2941 ASSERT(any != NULL); 2942 ASSERT(event != NULL); 2943 2944 hca = (rib_hca_t *)any; 2945 2946 /* got a connection request */ 2947 switch (event->cm_type) { 2948 case IBT_CM_EVENT_REQ_RCV: 2949 /* 2950 * If the plugin is in the NO_ACCEPT state, bail out. 
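 * plugin_state is set to ACCEPT in rib_register_service() once at least one
 * service bind succeeds, and back to NO_ACCEPT in rib_listen_stop().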
2951 */ 2952 mutex_enter(&plugin_state_lock); 2953 if (plugin_state == NO_ACCEPT) { 2954 mutex_exit(&plugin_state_lock); 2955 return (IBT_CM_REJECT); 2956 } 2957 mutex_exit(&plugin_state_lock); 2958 2959 /* 2960 * Need to send a MRA MAD to CM so that it does not 2961 * timeout on us. 2962 */ 2963 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2964 event->cm_event.req.req_timeout * 8, NULL, 0); 2965 2966 mutex_enter(&rib_stat->open_hca_lock); 2967 q = rib_stat->q; 2968 mutex_exit(&rib_stat->open_hca_lock); 2969 2970 status = rib_svc_create_chan(hca, (caddr_t)q, 2971 event->cm_event.req.req_prim_hca_port, &qp); 2972 2973 if (status) { 2974 return (IBT_CM_REJECT); 2975 } 2976 2977 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2978 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 2979 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 2980 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2981 2982 /* 2983 * Pre-posts RECV buffers 2984 */ 2985 conn = qptoc(qp); 2986 for (i = 0; i < preposted_rbufs; i++) { 2987 bzero(&rdbuf, sizeof (rdbuf)); 2988 rdbuf.type = RECV_BUFFER; 2989 buf = rib_rbuf_alloc(conn, &rdbuf); 2990 if (buf == NULL) { 2991 /* 2992 * A connection is not established yet. 2993 * Just flush the channel. Buffers 2994 * posted till now will error out with 2995 * IBT_WC_WR_FLUSHED_ERR. 2996 */ 2997 (void) ibt_flush_channel(qp->qp_hdl); 2998 (void) rib_disconnect_channel(conn, NULL); 2999 return (IBT_CM_REJECT); 3000 } 3001 3002 bzero(&cl, sizeof (cl)); 3003 cl.w.c_saddr3 = (caddr_t)rdbuf.addr; 3004 cl.c_len = rdbuf.len; 3005 cl.c_smemhandle.mrc_lmr = 3006 rdbuf.handle.mrc_lmr; /* lkey */ 3007 cl.c_next = NULL; 3008 status = rib_post_recv(conn, &cl); 3009 if (status != RDMA_SUCCESS) { 3010 /* 3011 * A connection is not established yet. 3012 * Just flush the channel. Buffers 3013 * posted till now will error out with 3014 * IBT_WC_WR_FLUSHED_ERR. 
3015 */ 3016 (void) ibt_flush_channel(qp->qp_hdl); 3017 (void) rib_disconnect_channel(conn, NULL); 3018 return (IBT_CM_REJECT); 3019 } 3020 } 3021 (void) rib_add_connlist(conn, &hca->srv_conn_list); 3022 3023 /* 3024 * Get the address translation 3025 */ 3026 rw_enter(&hca->state_lock, RW_READER); 3027 if (hca->state == HCA_DETACHED) { 3028 rw_exit(&hca->state_lock); 3029 return (IBT_CM_REJECT); 3030 } 3031 rw_exit(&hca->state_lock); 3032 3033 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t)); 3034 3035 if (ibt_get_ip_data(event->cm_priv_data_len, 3036 event->cm_priv_data, 3037 &ipinfo) != IBT_SUCCESS) { 3038 3039 return (IBT_CM_REJECT); 3040 } 3041 3042 switch (ipinfo.src_addr.family) { 3043 case AF_INET: 3044 3045 conn->c_raddr.maxlen = 3046 conn->c_raddr.len = sin_size; 3047 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3048 3049 s = (struct sockaddr_in *)conn->c_raddr.buf; 3050 s->sin_family = AF_INET; 3051 3052 bcopy((void *)&ipinfo.src_addr.un.ip4addr, 3053 &s->sin_addr, in_size); 3054 3055 break; 3056 3057 case AF_INET6: 3058 3059 conn->c_raddr.maxlen = 3060 conn->c_raddr.len = sin6_size; 3061 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3062 3063 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 3064 s6->sin6_family = AF_INET6; 3065 bcopy((void *)&ipinfo.src_addr.un.ip6addr, 3066 &s6->sin6_addr, 3067 sizeof (struct in6_addr)); 3068 3069 break; 3070 3071 default: 3072 return (IBT_CM_REJECT); 3073 } 3074 3075 break; 3076 3077 case IBT_CM_EVENT_CONN_CLOSED: 3078 { 3079 CONN *conn; 3080 rib_qp_t *qp; 3081 3082 switch (event->cm_event.closed) { 3083 case IBT_CM_CLOSED_DREP_RCVD: 3084 case IBT_CM_CLOSED_DREQ_TIMEOUT: 3085 case IBT_CM_CLOSED_DUP: 3086 case IBT_CM_CLOSED_ABORT: 3087 case IBT_CM_CLOSED_ALREADY: 3088 /* 3089 * These cases indicate the local end initiated 3090 * the closing of the channel. Nothing to do here. 3091 */ 3092 break; 3093 default: 3094 /* 3095 * Reason for CONN_CLOSED event must be one of 3096 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 3097 * or IBT_CM_CLOSED_STALE. These indicate cases where 3098 * the remote end is closing the channel. In these 3099 * cases free the channel and transition to error 3100 * state. 3101 */ 3102 qp = ibt_get_chan_private(event->cm_channel); 3103 conn = qptoc(qp); 3104 mutex_enter(&conn->c_lock); 3105 if (conn->c_state == C_DISCONN_PEND) { 3106 mutex_exit(&conn->c_lock); 3107 break; 3108 } 3109 conn->c_state = C_ERROR_CONN; 3110 3111 /* 3112 * Free the conn if c_ref goes down to 0 3113 */ 3114 if (conn->c_ref == 0) { 3115 /* 3116 * Remove from list and free conn 3117 */ 3118 conn->c_state = C_DISCONN_PEND; 3119 mutex_exit(&conn->c_lock); 3120 (void) rib_disconnect_channel(conn, 3121 &hca->srv_conn_list); 3122 } else { 3123 /* 3124 * conn will be freed when c_ref goes to 0. 3125 * Indicate to the cleaning thread not to close 3126 * the connection, but just free the channel. 3127 */ 3128 conn->c_flags |= C_CLOSE_NOTNEEDED; 3129 mutex_exit(&conn->c_lock); 3130 } 3131 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect); 3132 break; 3133 } 3134 break; 3135 } 3136 case IBT_CM_EVENT_CONN_EST: 3137 /* 3138 * RTU received, hence connection established. 3139 */ 3140 if (rib_debug > 1) 3141 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3142 "(CONN_EST) channel established"); 3143 break; 3144 3145 default: 3146 if (rib_debug > 2) { 3147 /* Let CM handle the following events.
*/ 3148 if (event->cm_type == IBT_CM_EVENT_REP_RCV) { 3149 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3150 "server recv'ed IBT_CM_EVENT_REP_RCV\n"); 3151 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) { 3152 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3153 "server recv'ed IBT_CM_EVENT_LAP_RCV\n"); 3154 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) { 3155 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3156 "server recv'ed IBT_CM_EVENT_MRA_RCV\n"); 3157 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) { 3158 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3159 "server recv'ed IBT_CM_EVENT_APR_RCV\n"); 3160 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) { 3161 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3162 "server recv'ed IBT_CM_EVENT_FAILURE\n"); 3163 } 3164 } 3165 return (IBT_CM_DEFAULT); 3166 } 3167 3168 /* accept all other CM messages (i.e. let the CM handle them) */ 3169 return (IBT_CM_ACCEPT); 3170 } 3171 3172 static rdma_stat 3173 rib_register_service(rib_hca_t *hca, int service_type, 3174 uint8_t protocol_num, in_port_t dst_port) 3175 { 3176 ibt_srv_desc_t sdesc; 3177 ibt_hca_portinfo_t *port_infop; 3178 ib_svc_id_t srv_id; 3179 ibt_srv_hdl_t srv_hdl; 3180 uint_t port_size; 3181 uint_t pki, i, num_ports, nbinds; 3182 ibt_status_t ibt_status; 3183 rib_service_t *service; 3184 ib_pkey_t pkey; 3185 3186 /* 3187 * Query all ports for the given HCA 3188 */ 3189 rw_enter(&hca->state_lock, RW_READER); 3190 if (hca->state != HCA_DETACHED) { 3191 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop, 3192 &num_ports, &port_size); 3193 rw_exit(&hca->state_lock); 3194 } else { 3195 rw_exit(&hca->state_lock); 3196 return (RDMA_FAILED); 3197 } 3198 if (ibt_status != IBT_SUCCESS) { 3199 return (RDMA_FAILED); 3200 } 3201 3202 DTRACE_PROBE1(rpcib__i__regservice_numports, 3203 int, num_ports); 3204 3205 for (i = 0; i < num_ports; i++) { 3206 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) { 3207 DTRACE_PROBE1(rpcib__i__regservice__portinactive, 3208 int, i+1); 3209 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) { 3210 DTRACE_PROBE1(rpcib__i__regservice__portactive, 3211 int, i+1); 3212 } 3213 } 3214 3215 /* 3216 * Get all the IP addresses on this system to register the 3217 * given "service type" on all DNS recognized IP addrs. 3218 * Each service type such as NFS will have all the system's 3219 * IP addresses as its different names. For now the only 3220 * type of service we support in RPCIB is NFS. 3221 */ 3222 rw_enter(&rib_stat->service_list_lock, RW_WRITER); 3223 /* 3224 * Start registering and binding the service on the 3225 * active ports of this HCA. 3226 */ 3227 nbinds = 0; 3228 for (service = rib_stat->service_list; 3229 service && (service->srv_type != service_type); 3230 service = service->next) 3231 ; 3232 3233 if (service == NULL) { 3234 /* 3235 * We use IP addresses as the service names for 3236 * service registration. Register each of them 3237 * with CM to obtain a svc_id and svc_hdl. We do not 3238 * register the service with the machine's loopback address.
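 * (A single service is registered for the (protocol, port) service ID
 * obtained from ibt_get_ip_sid(); it is then bound to each active port GID
 * in the loop below.)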
3239 */ 3240 (void) bzero(&srv_id, sizeof (ib_svc_id_t)); 3241 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t)); 3242 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t)); 3243 sdesc.sd_handler = rib_srv_cm_handler; 3244 sdesc.sd_flags = 0; 3245 ibt_status = ibt_register_service(hca->ibt_clnt_hdl, 3246 &sdesc, ibt_get_ip_sid(protocol_num, dst_port), 3247 1, &srv_hdl, &srv_id); 3248 if ((ibt_status != IBT_SUCCESS) && 3249 (ibt_status != IBT_CM_SERVICE_EXISTS)) { 3250 rw_exit(&rib_stat->service_list_lock); 3251 DTRACE_PROBE1(rpcib__i__regservice__ibtres, 3252 int, ibt_status); 3253 ibt_free_portinfo(port_infop, port_size); 3254 return (RDMA_FAILED); 3255 } 3256 3257 /* 3258 * Allocate and prepare a service entry 3259 */ 3260 service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP); 3261 3262 service->srv_type = service_type; 3263 service->srv_hdl = srv_hdl; 3264 service->srv_id = srv_id; 3265 3266 service->next = rib_stat->service_list; 3267 rib_stat->service_list = service; 3268 DTRACE_PROBE1(rpcib__i__regservice__new__service, 3269 int, service->srv_type); 3270 } else { 3271 srv_hdl = service->srv_hdl; 3272 srv_id = service->srv_id; 3273 DTRACE_PROBE1(rpcib__i__regservice__existing__service, 3274 int, service->srv_type); 3275 } 3276 3277 for (i = 0; i < num_ports; i++) { 3278 ibt_sbind_hdl_t sbp; 3279 rib_hca_service_t *hca_srv; 3280 ib_gid_t gid; 3281 3282 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) 3283 continue; 3284 3285 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) { 3286 pkey = port_infop[i].p_pkey_tbl[pki]; 3287 3288 rw_enter(&hca->bound_services_lock, RW_READER); 3289 gid = port_infop[i].p_sgid_tbl[0]; 3290 for (hca_srv = hca->bound_services; hca_srv; 3291 hca_srv = hca_srv->next) { 3292 if ((hca_srv->srv_id == service->srv_id) && 3293 (hca_srv->gid.gid_prefix == 3294 gid.gid_prefix) && 3295 (hca_srv->gid.gid_guid == gid.gid_guid)) 3296 break; 3297 } 3298 rw_exit(&hca->bound_services_lock); 3299 if (hca_srv != NULL) { 3300 /* 3301 * port is already bound to the service 3302 */ 3303 DTRACE_PROBE1( 3304 rpcib__i__regservice__already__bound, 3305 int, i+1); 3306 nbinds++; 3307 continue; 3308 } 3309 3310 if ((pkey & IBSRM_HB) && 3311 (pkey != IB_PKEY_INVALID_FULL)) { 3312 3313 sbp = NULL; 3314 ibt_status = ibt_bind_service(srv_hdl, 3315 gid, NULL, hca, &sbp); 3316 3317 if (ibt_status == IBT_SUCCESS) { 3318 hca_srv = kmem_zalloc( 3319 sizeof (rib_hca_service_t), 3320 KM_SLEEP); 3321 hca_srv->srv_id = srv_id; 3322 hca_srv->gid = gid; 3323 hca_srv->sbind_hdl = sbp; 3324 3325 rw_enter(&hca->bound_services_lock, 3326 RW_WRITER); 3327 hca_srv->next = hca->bound_services; 3328 hca->bound_services = hca_srv; 3329 rw_exit(&hca->bound_services_lock); 3330 nbinds++; 3331 } 3332 3333 DTRACE_PROBE1(rpcib__i__regservice__bindres, 3334 int, ibt_status); 3335 } 3336 } 3337 } 3338 rw_exit(&rib_stat->service_list_lock); 3339 3340 ibt_free_portinfo(port_infop, port_size); 3341 3342 if (nbinds == 0) { 3343 return (RDMA_FAILED); 3344 } else { 3345 /* 3346 * Put this plugin into accept state, since at least 3347 * one registration was successful.
3348 */ 3349 mutex_enter(&plugin_state_lock); 3350 plugin_state = ACCEPT; 3351 mutex_exit(&plugin_state_lock); 3352 return (RDMA_SUCCESS); 3353 } 3354 } 3355 3356 void 3357 rib_listen(struct rdma_svc_data *rd) 3358 { 3359 rdma_stat status; 3360 int n_listening = 0; 3361 rib_hca_t *hca; 3362 3363 mutex_enter(&rib_stat->listen_lock); 3364 /* 3365 * If the rd parameter is NULL, it means that rib_stat->q is 3366 * already initialized by a call from RDMA and we just want to 3367 * add a newly attached HCA to the same listening state as the 3368 * other HCAs. 3369 */ 3370 if (rd == NULL) { 3371 if (rib_stat->q == NULL) { 3372 mutex_exit(&rib_stat->listen_lock); 3373 return; 3374 } 3375 } else { 3376 rib_stat->q = &rd->q; 3377 } 3378 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 3379 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 3380 /* 3381 * First check if the HCA is still attached 3382 */ 3383 rw_enter(&hca->state_lock, RW_READER); 3384 if (hca->state != HCA_INITED) { 3385 rw_exit(&hca->state_lock); 3386 continue; 3387 } 3388 rw_exit(&hca->state_lock); 3389 3390 /* 3391 * Right now the only service type is NFS. Hence 3392 * force feed this value. Ideally to communicate 3393 * the service type it should be passed down in 3394 * rdma_svc_data. 3395 */ 3396 status = rib_register_service(hca, NFS, 3397 IPPROTO_TCP, nfs_rdma_port); 3398 if (status == RDMA_SUCCESS) 3399 n_listening++; 3400 } 3401 rw_exit(&rib_stat->hcas_list_lock); 3402 3403 /* 3404 * Report whether the service is active on at least one HCA; 3405 * rd->err_code carries a more specific error on failure. 3406 */ 3407 if (rd) { 3408 if (n_listening > 0) { 3409 rd->active = 1; 3410 rd->err_code = RDMA_SUCCESS; 3411 } else { 3412 rd->active = 0; 3413 rd->err_code = RDMA_FAILED; 3414 } 3415 } 3416 mutex_exit(&rib_stat->listen_lock); 3417 } 3418 3419 /* XXXX */ 3420 /* ARGSUSED */ 3421 static void 3422 rib_listen_stop(struct rdma_svc_data *svcdata) 3423 { 3424 rib_hca_t *hca; 3425 3426 mutex_enter(&rib_stat->listen_lock); 3427 /* 3428 * KRPC called the RDMATF to stop the listeners. This means we 3429 * stop sending incoming or received requests to the KRPC master 3430 * transport handle for RDMA-IB. It also means that the 3431 * master transport handle, responsible for us, is going away. 3432 */ 3433 mutex_enter(&plugin_state_lock); 3434 plugin_state = NO_ACCEPT; 3435 if (svcdata != NULL) 3436 svcdata->active = 0; 3437 mutex_exit(&plugin_state_lock); 3438 3439 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 3440 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 3441 /* 3442 * First check if the HCA is still attached 3443 */ 3444 rw_enter(&hca->state_lock, RW_READER); 3445 if (hca->state == HCA_DETACHED) { 3446 rw_exit(&hca->state_lock); 3447 continue; 3448 } 3449 rib_close_channels(&hca->srv_conn_list); 3450 rib_stop_services(hca); 3451 rw_exit(&hca->state_lock); 3452 } 3453 rw_exit(&rib_stat->hcas_list_lock); 3454 3455 /* 3456 * Avoid rib_listen() using the stale q field. 3457 * This could happen if a port goes up after all services 3458 * are already unregistered. 3459 */ 3460 rib_stat->q = NULL; 3461 mutex_exit(&rib_stat->listen_lock); 3462 } 3463 3464 /* 3465 * Traverse the HCA's service list to unbind and deregister services. 3466 * For each bound service of the HCA to be removed, first find the 3467 * corresponding service handle (srv_hdl) and then unbind the service by 3468 * calling ibt_unbind_service().
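 * The service registration itself (srv_hdl/srv_id on rib_stat->service_list)
 * is left in place, so it can be bound again if a port later comes back up.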
3469 */ 3470 static void 3471 rib_stop_services(rib_hca_t *hca) 3472 { 3473 rib_hca_service_t *srv_list, *to_remove; 3474 3475 /* 3476 * unbind and deregister the services for this service type. 3477 * Right now there is only one service type. In future it will 3478 * be passed down to this function. 3479 */ 3480 rw_enter(&hca->bound_services_lock, RW_READER); 3481 srv_list = hca->bound_services; 3482 hca->bound_services = NULL; 3483 rw_exit(&hca->bound_services_lock); 3484 3485 while (srv_list != NULL) { 3486 rib_service_t *sc; 3487 3488 to_remove = srv_list; 3489 srv_list = to_remove->next; 3490 rw_enter(&rib_stat->service_list_lock, RW_READER); 3491 for (sc = rib_stat->service_list; 3492 sc && (sc->srv_id != to_remove->srv_id); 3493 sc = sc->next) 3494 ; 3495 /* 3496 * if sc is NULL then the service doesn't exist anymore, 3497 * probably just removed completely through rib_stat. 3498 */ 3499 if (sc != NULL) 3500 (void) ibt_unbind_service(sc->srv_hdl, 3501 to_remove->sbind_hdl); 3502 rw_exit(&rib_stat->service_list_lock); 3503 kmem_free(to_remove, sizeof (rib_hca_service_t)); 3504 } 3505 } 3506 3507 static struct svc_recv * 3508 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) 3509 { 3510 struct svc_recv *recvp; 3511 3512 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); 3513 recvp->vaddr = sgl->ds_va; 3514 recvp->qp = qp; 3515 recvp->bytes_xfer = 0; 3516 return (recvp); 3517 } 3518 3519 static int 3520 rib_free_svc_recv(struct svc_recv *recvp) 3521 { 3522 kmem_free(recvp, sizeof (*recvp)); 3523 3524 return (0); 3525 } 3526 3527 static struct reply * 3528 rib_addreplylist(rib_qp_t *qp, uint32_t msgid) 3529 { 3530 struct reply *rep; 3531 3532 3533 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); 3534 if (rep == NULL) { 3535 DTRACE_PROBE(rpcib__i__addrreply__nomem); 3536 return (NULL); 3537 } 3538 rep->xid = msgid; 3539 rep->vaddr_cq = NULL; 3540 rep->bytes_xfer = 0; 3541 rep->status = (uint_t)REPLY_WAIT; 3542 rep->prev = NULL; 3543 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); 3544 3545 mutex_enter(&qp->replylist_lock); 3546 if (qp->replylist) { 3547 rep->next = qp->replylist; 3548 qp->replylist->prev = rep; 3549 } 3550 qp->rep_list_size++; 3551 3552 DTRACE_PROBE1(rpcib__i__addrreply__listsize, 3553 int, qp->rep_list_size); 3554 3555 qp->replylist = rep; 3556 mutex_exit(&qp->replylist_lock); 3557 3558 return (rep); 3559 } 3560 3561 static rdma_stat 3562 rib_rem_replylist(rib_qp_t *qp) 3563 { 3564 struct reply *r, *n; 3565 3566 mutex_enter(&qp->replylist_lock); 3567 for (r = qp->replylist; r != NULL; r = n) { 3568 n = r->next; 3569 (void) rib_remreply(qp, r); 3570 } 3571 mutex_exit(&qp->replylist_lock); 3572 3573 return (RDMA_SUCCESS); 3574 } 3575 3576 static int 3577 rib_remreply(rib_qp_t *qp, struct reply *rep) 3578 { 3579 3580 ASSERT(MUTEX_HELD(&qp->replylist_lock)); 3581 if (rep->prev) { 3582 rep->prev->next = rep->next; 3583 } 3584 if (rep->next) { 3585 rep->next->prev = rep->prev; 3586 } 3587 if (qp->replylist == rep) 3588 qp->replylist = rep->next; 3589 3590 cv_destroy(&rep->wait_cv); 3591 qp->rep_list_size--; 3592 3593 DTRACE_PROBE1(rpcib__i__remreply__listsize, 3594 int, qp->rep_list_size); 3595 3596 kmem_free(rep, sizeof (*rep)); 3597 3598 return (0); 3599 } 3600 3601 rdma_stat 3602 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3603 struct mrc *buf_handle) 3604 { 3605 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3606 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3607 rdma_stat status; 3608 rib_hca_t *hca = (ctoqp(conn))->hca; 3609 
3610 /* 3611 * Note: ALL buffer pools use the same memory type RDMARW. 3612 */ 3613 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3614 if (status == RDMA_SUCCESS) { 3615 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3616 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3617 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3618 } else { 3619 buf_handle->mrc_linfo = NULL; 3620 buf_handle->mrc_lmr = 0; 3621 buf_handle->mrc_rmr = 0; 3622 } 3623 return (status); 3624 } 3625 3626 static rdma_stat 3627 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, 3628 ibt_mr_flags_t spec, 3629 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3630 { 3631 ibt_mr_attr_t mem_attr; 3632 ibt_status_t ibt_status; 3633 mem_attr.mr_vaddr = (uintptr_t)buf; 3634 mem_attr.mr_len = (ib_msglen_t)size; 3635 mem_attr.mr_as = (struct as *)(caddr_t)adsp; 3636 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3637 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3638 IBT_MR_ENABLE_WINDOW_BIND | spec; 3639 3640 rw_enter(&hca->state_lock, RW_READER); 3641 if (hca->state != HCA_DETACHED) { 3642 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3643 &mem_attr, mr_hdlp, mr_descp); 3644 rw_exit(&hca->state_lock); 3645 } else { 3646 rw_exit(&hca->state_lock); 3647 return (RDMA_FAILED); 3648 } 3649 3650 if (ibt_status != IBT_SUCCESS) { 3651 return (RDMA_FAILED); 3652 } 3653 return (RDMA_SUCCESS); 3654 } 3655 3656 rdma_stat 3657 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3658 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 3659 { 3660 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3661 rib_lrc_entry_t *l; 3662 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3663 rdma_stat status; 3664 rib_hca_t *hca = (ctoqp(conn))->hca; 3665 3666 /* 3667 * Non-coherent memory registration. 3668 */ 3669 l = (rib_lrc_entry_t *)lrc; 3670 if (l) { 3671 if (l->registered) { 3672 buf_handle->mrc_linfo = 3673 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3674 buf_handle->mrc_lmr = 3675 (uint32_t)l->lrc_mhandle.mrc_lmr; 3676 buf_handle->mrc_rmr = 3677 (uint32_t)l->lrc_mhandle.mrc_rmr; 3678 *sync_handle = (RIB_SYNCMEM_HANDLE) 3679 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3680 return (RDMA_SUCCESS); 3681 } else { 3682 /* Always register the whole buffer */ 3683 buf = (caddr_t)l->lrc_buf; 3684 buflen = l->lrc_len; 3685 } 3686 } 3687 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3688 3689 if (status == RDMA_SUCCESS) { 3690 if (l) { 3691 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; 3692 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey; 3693 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey; 3694 l->registered = TRUE; 3695 } 3696 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3697 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3698 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3699 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3700 } else { 3701 buf_handle->mrc_linfo = NULL; 3702 buf_handle->mrc_lmr = 0; 3703 buf_handle->mrc_rmr = 0; 3704 } 3705 return (status); 3706 } 3707 3708 /* ARGSUSED */ 3709 rdma_stat 3710 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3711 { 3712 rib_hca_t *hca = (ctoqp(conn))->hca; 3713 /* 3714 * Allow memory deregistration even if HCA is 3715 * getting detached. Need all outstanding 3716 * memory registrations to be deregistered 3717 * before HCA_DETACH_EVENT can be accepted. 
3718 */ 3719 (void) ibt_deregister_mr(hca->hca_hdl, 3720 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 3721 return (RDMA_SUCCESS); 3722 } 3723 3724 /* ARGSUSED */ 3725 rdma_stat 3726 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3727 RIB_SYNCMEM_HANDLE sync_handle, void *lrc) 3728 { 3729 rib_lrc_entry_t *l; 3730 l = (rib_lrc_entry_t *)lrc; 3731 if (l) 3732 if (l->registered) 3733 return (RDMA_SUCCESS); 3734 3735 (void) rib_deregistermem(conn, buf, buf_handle); 3736 3737 return (RDMA_SUCCESS); 3738 } 3739 3740 /* ARGSUSED */ 3741 rdma_stat 3742 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3743 int len, int cpu) 3744 { 3745 ibt_status_t status; 3746 rib_hca_t *hca = (ctoqp(conn))->hca; 3747 ibt_mr_sync_t mr_segment; 3748 3749 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3750 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 3751 mr_segment.ms_len = (ib_memlen_t)len; 3752 if (cpu) { 3753 /* make incoming data visible to memory */ 3754 mr_segment.ms_flags = IBT_SYNC_WRITE; 3755 } else { 3756 /* make memory changes visible to IO */ 3757 mr_segment.ms_flags = IBT_SYNC_READ; 3758 } 3759 rw_enter(&hca->state_lock, RW_READER); 3760 if (hca->state != HCA_DETACHED) { 3761 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3762 rw_exit(&hca->state_lock); 3763 } else { 3764 rw_exit(&hca->state_lock); 3765 return (RDMA_FAILED); 3766 } 3767 3768 if (status == IBT_SUCCESS) 3769 return (RDMA_SUCCESS); 3770 else { 3771 return (RDMA_FAILED); 3772 } 3773 } 3774 3775 /* 3776 * XXXX ???? 3777 */ 3778 static rdma_stat 3779 rib_getinfo(rdma_info_t *info) 3780 { 3781 /* 3782 * XXXX Hack! 3783 */ 3784 info->addrlen = 16; 3785 info->mts = 1000000; 3786 info->mtu = 1000000; 3787 3788 return (RDMA_SUCCESS); 3789 } 3790 3791 rib_bufpool_t * 3792 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3793 { 3794 rib_bufpool_t *rbp = NULL; 3795 bufpool_t *bp = NULL; 3796 caddr_t buf; 3797 ibt_mr_attr_t mem_attr; 3798 ibt_status_t ibt_status; 3799 int i, j; 3800 3801 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3802 3803 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3804 num * sizeof (void *), KM_SLEEP); 3805 3806 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3807 bp->numelems = num; 3808 3809 3810 switch (ptype) { 3811 case SEND_BUFFER: 3812 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3813 bp->rsize = RPC_MSG_SZ; 3814 break; 3815 case RECV_BUFFER: 3816 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3817 bp->rsize = RPC_BUF_SIZE; 3818 break; 3819 default: 3820 goto fail; 3821 } 3822 3823 /* 3824 * Register the pool. 
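 * A single kmem buffer of num * rsize bytes backs the pool; each rsize slice
 * is registered separately with ibt_register_mr() so every buffer has its
 * own lkey/rkey. On any registration failure the regions registered so far
 * are torn down and the pool allocation fails.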
3825 */ 3826 bp->bufsize = num * bp->rsize; 3827 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3828 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3829 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3830 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3831 sizeof (ibt_mr_desc_t), KM_SLEEP); 3832 rw_enter(&hca->state_lock, RW_READER); 3833 3834 if (hca->state == HCA_DETACHED) { 3835 rw_exit(&hca->state_lock); 3836 goto fail; 3837 } 3838 3839 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3840 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3841 mem_attr.mr_vaddr = (uintptr_t)buf; 3842 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3843 mem_attr.mr_as = NULL; 3844 ibt_status = ibt_register_mr(hca->hca_hdl, 3845 hca->pd_hdl, &mem_attr, 3846 &rbp->mr_hdl[i], 3847 &rbp->mr_desc[i]); 3848 if (ibt_status != IBT_SUCCESS) { 3849 for (j = 0; j < i; j++) { 3850 (void) ibt_deregister_mr(hca->hca_hdl, 3851 rbp->mr_hdl[j]); 3852 } 3853 rw_exit(&hca->state_lock); 3854 goto fail; 3855 } 3856 } 3857 rw_exit(&hca->state_lock); 3858 buf = (caddr_t)bp->buf; 3859 for (i = 0; i < num; i++, buf += bp->rsize) { 3860 bp->buflist[i] = (void *)buf; 3861 } 3862 bp->buffree = num - 1; /* no. of free buffers */ 3863 rbp->bpool = bp; 3864 3865 return (rbp); 3866 fail: 3867 if (bp) { 3868 if (bp->buf) 3869 kmem_free(bp->buf, bp->bufsize); 3870 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3871 } 3872 if (rbp) { 3873 if (rbp->mr_hdl) 3874 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3875 if (rbp->mr_desc) 3876 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3877 kmem_free(rbp, sizeof (rib_bufpool_t)); 3878 } 3879 return (NULL); 3880 } 3881 3882 static void 3883 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3884 { 3885 int i; 3886 rib_bufpool_t *rbp = NULL; 3887 bufpool_t *bp; 3888 3889 /* 3890 * Obtain pool address based on type of pool 3891 */ 3892 switch (ptype) { 3893 case SEND_BUFFER: 3894 rbp = hca->send_pool; 3895 break; 3896 case RECV_BUFFER: 3897 rbp = hca->recv_pool; 3898 break; 3899 default: 3900 return; 3901 } 3902 if (rbp == NULL) 3903 return; 3904 3905 bp = rbp->bpool; 3906 3907 /* 3908 * Deregister the pool memory and free it. 3909 */ 3910 for (i = 0; i < bp->numelems; i++) { 3911 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3912 } 3913 } 3914 3915 static void 3916 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3917 { 3918 3919 rib_bufpool_t *rbp = NULL; 3920 bufpool_t *bp; 3921 3922 /* 3923 * Obtain pool address based on type of pool 3924 */ 3925 switch (ptype) { 3926 case SEND_BUFFER: 3927 rbp = hca->send_pool; 3928 break; 3929 case RECV_BUFFER: 3930 rbp = hca->recv_pool; 3931 break; 3932 default: 3933 return; 3934 } 3935 if (rbp == NULL) 3936 return; 3937 3938 bp = rbp->bpool; 3939 3940 /* 3941 * Free the pool memory. 3942 */ 3943 if (rbp->mr_hdl) 3944 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 3945 3946 if (rbp->mr_desc) 3947 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 3948 if (bp->buf) 3949 kmem_free(bp->buf, bp->bufsize); 3950 mutex_destroy(&bp->buflock); 3951 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 3952 kmem_free(rbp, sizeof (rib_bufpool_t)); 3953 } 3954 3955 void 3956 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 3957 { 3958 /* 3959 * Deregister the pool memory and free it. 3960 */ 3961 rib_rbufpool_deregister(hca, ptype); 3962 rib_rbufpool_free(hca, ptype); 3963 } 3964 3965 /* 3966 * Fetch a buffer from the pool of type specified in rdbuf->type. 
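 * RDMA_LONG_BUFFER requests are satisfied from the long-reply cache via
 * rib_get_cache_buf(); SEND_BUFFER and RECV_BUFFER requests come from the
 * pre-registered HCA pools via rib_rbuf_alloc().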
3967 */ 3968 static rdma_stat 3969 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 3970 { 3971 rib_lrc_entry_t *rlep; 3972 3973 if (rdbuf->type == RDMA_LONG_BUFFER) { 3974 rlep = rib_get_cache_buf(conn, rdbuf->len); 3975 rdbuf->rb_private = (caddr_t)rlep; 3976 rdbuf->addr = rlep->lrc_buf; 3977 rdbuf->handle = rlep->lrc_mhandle; 3978 return (RDMA_SUCCESS); 3979 } 3980 3981 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 3982 if (rdbuf->addr) { 3983 switch (rdbuf->type) { 3984 case SEND_BUFFER: 3985 rdbuf->len = RPC_MSG_SZ; /* 1K */ 3986 break; 3987 case RECV_BUFFER: 3988 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 3989 break; 3990 default: 3991 rdbuf->len = 0; 3992 } 3993 return (RDMA_SUCCESS); 3994 } else 3995 return (RDMA_FAILED); 3996 } 3997 3998 /* 3999 * Fetch a buffer of specified type. 4000 * Note that rdbuf->handle is mw's rkey. 4001 */ 4002 static void * 4003 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4004 { 4005 rib_qp_t *qp = ctoqp(conn); 4006 rib_hca_t *hca = qp->hca; 4007 rdma_btype ptype = rdbuf->type; 4008 void *buf; 4009 rib_bufpool_t *rbp = NULL; 4010 bufpool_t *bp; 4011 int i; 4012 4013 /* 4014 * Obtain pool address based on type of pool 4015 */ 4016 switch (ptype) { 4017 case SEND_BUFFER: 4018 rbp = hca->send_pool; 4019 break; 4020 case RECV_BUFFER: 4021 rbp = hca->recv_pool; 4022 break; 4023 default: 4024 return (NULL); 4025 } 4026 if (rbp == NULL) 4027 return (NULL); 4028 4029 bp = rbp->bpool; 4030 4031 mutex_enter(&bp->buflock); 4032 if (bp->buffree < 0) { 4033 mutex_exit(&bp->buflock); 4034 return (NULL); 4035 } 4036 4037 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */ 4038 buf = bp->buflist[bp->buffree]; 4039 rdbuf->addr = buf; 4040 rdbuf->len = bp->rsize; 4041 for (i = bp->numelems - 1; i >= 0; i--) { 4042 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 4043 rdbuf->handle.mrc_rmr = 4044 (uint32_t)rbp->mr_desc[i].md_rkey; 4045 rdbuf->handle.mrc_linfo = 4046 (uintptr_t)rbp->mr_hdl[i]; 4047 rdbuf->handle.mrc_lmr = 4048 (uint32_t)rbp->mr_desc[i].md_lkey; 4049 bp->buffree--; 4050 4051 mutex_exit(&bp->buflock); 4052 4053 return (buf); 4054 } 4055 } 4056 4057 mutex_exit(&bp->buflock); 4058 4059 return (NULL); 4060 } 4061 4062 static void 4063 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 4064 { 4065 4066 if (rdbuf->type == RDMA_LONG_BUFFER) { 4067 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private); 4068 rdbuf->rb_private = NULL; 4069 return; 4070 } 4071 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 4072 } 4073 4074 static void 4075 rib_rbuf_free(CONN *conn, int ptype, void *buf) 4076 { 4077 rib_qp_t *qp = ctoqp(conn); 4078 rib_hca_t *hca = qp->hca; 4079 rib_bufpool_t *rbp = NULL; 4080 bufpool_t *bp; 4081 4082 /* 4083 * Obtain pool address based on type of pool 4084 */ 4085 switch (ptype) { 4086 case SEND_BUFFER: 4087 rbp = hca->send_pool; 4088 break; 4089 case RECV_BUFFER: 4090 rbp = hca->recv_pool; 4091 break; 4092 default: 4093 return; 4094 } 4095 if (rbp == NULL) 4096 return; 4097 4098 bp = rbp->bpool; 4099 4100 mutex_enter(&bp->buflock); 4101 if (++bp->buffree >= bp->numelems) { 4102 /* 4103 * Should never happen 4104 */ 4105 bp->buffree--; 4106 } else { 4107 bp->buflist[bp->buffree] = buf; 4108 } 4109 mutex_exit(&bp->buflock); 4110 } 4111 4112 static rdma_stat 4113 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 4114 { 4115 rw_enter(&connlist->conn_lock, RW_WRITER); 4116 if (connlist->conn_hd) { 4117 cn->c_next = connlist->conn_hd; 4118 connlist->conn_hd->c_prev = cn; 4119 } 4120 connlist->conn_hd = cn; 4121 
rw_exit(&connlist->conn_lock); 4122 4123 return (RDMA_SUCCESS); 4124 } 4125 4126 static rdma_stat 4127 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) 4128 { 4129 rw_enter(&connlist->conn_lock, RW_WRITER); 4130 if (cn->c_prev) { 4131 cn->c_prev->c_next = cn->c_next; 4132 } 4133 if (cn->c_next) { 4134 cn->c_next->c_prev = cn->c_prev; 4135 } 4136 if (connlist->conn_hd == cn) 4137 connlist->conn_hd = cn->c_next; 4138 rw_exit(&connlist->conn_lock); 4139 4140 return (RDMA_SUCCESS); 4141 } 4142 4143 /* ARGSUSED */ 4144 static rdma_stat 4145 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, 4146 int addr_type, void *handle, CONN **conn) 4147 { 4148 rdma_stat status; 4149 rpcib_ping_t rpt; 4150 4151 status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn); 4152 return (status); 4153 } 4154 4155 /* 4156 * rib_find_hca_connection 4157 * 4158 * if there is an existing connection to the specified address then 4159 * it will be returned in conn, otherwise conn will be set to NULL. 4160 * Also cleans up any connection that is in error state. 4161 */ 4162 static int 4163 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, 4164 struct netbuf *d_svcaddr, CONN **conn) 4165 { 4166 CONN *cn; 4167 clock_t cv_stat, timout; 4168 4169 *conn = NULL; 4170 again: 4171 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4172 cn = hca->cl_conn_list.conn_hd; 4173 while (cn != NULL) { 4174 /* 4175 * First, clear up any connection in the ERROR state 4176 */ 4177 mutex_enter(&cn->c_lock); 4178 if (cn->c_state == C_ERROR_CONN) { 4179 if (cn->c_ref == 0) { 4180 /* 4181 * Remove connection from list and destroy it. 4182 */ 4183 cn->c_state = C_DISCONN_PEND; 4184 mutex_exit(&cn->c_lock); 4185 rw_exit(&hca->cl_conn_list.conn_lock); 4186 rib_conn_close((void *)cn); 4187 goto again; 4188 } 4189 mutex_exit(&cn->c_lock); 4190 cn = cn->c_next; 4191 continue; 4192 } 4193 if (cn->c_state == C_DISCONN_PEND) { 4194 mutex_exit(&cn->c_lock); 4195 cn = cn->c_next; 4196 continue; 4197 } 4198 4199 /* 4200 * source address is only checked for if there is one, 4201 * this is the case for retries. 4202 */ 4203 if ((cn->c_raddr.len == d_svcaddr->len) && 4204 (bcmp(d_svcaddr->buf, cn->c_raddr.buf, 4205 d_svcaddr->len) == 0) && 4206 ((s_svcaddr->len == 0) || 4207 ((cn->c_laddr.len == s_svcaddr->len) && 4208 (bcmp(s_svcaddr->buf, cn->c_laddr.buf, 4209 s_svcaddr->len) == 0)))) { 4210 /* 4211 * Our connection. Give up conn list lock 4212 * as we are done traversing the list. 4213 */ 4214 rw_exit(&hca->cl_conn_list.conn_lock); 4215 if (cn->c_state == C_CONNECTED) { 4216 cn->c_ref++; /* sharing a conn */ 4217 mutex_exit(&cn->c_lock); 4218 *conn = cn; 4219 return (RDMA_SUCCESS); 4220 } 4221 if (cn->c_state == C_CONN_PEND) { 4222 /* 4223 * Hold a reference to this conn before 4224 * we give up the lock. 
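 * A non-zero c_ref also keeps rib_conn_timeout_call() from scheduling
 * this connection for closing while we wait.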
/*
 * Connection management.
 * IBTF does not support recycling of channels, so a connection is only
 * ever in one of four states - C_CONN_PEND, C_CONNECTED, C_ERROR_CONN or
 * C_DISCONN_PEND; there is no C_IDLE state.
 * C_CONN_PEND state: Connection establishment to the server is in progress.
 * C_CONNECTED state: A connection, once created, is in the C_CONNECTED state.
 * It has an RC channel associated with it. ibt_post_send/recv are allowed
 * only in this state.
 * C_ERROR_CONN state: A connection transitions to this state when WRs on the
 * channel are completed in error, an IBT_CM_EVENT_CONN_CLOSED event
 * happens on the channel, or an IBT_HCA_DETACH_EVENT occurs on the HCA.
 * C_DISCONN_PEND state: When a connection is in the C_ERROR_CONN state and
 * c_ref drops to 0 (indicating that RPC has no more references to the
 * connection), the connection should be destroyed. A connection transitions
 * into this state when it is being destroyed.
 */
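/*
 * Illustrative sketch (not in the original source): the state transitions
 * described above and used by the code below, expressed as a small
 * validity check.  The helper and its name are hypothetical; the driver
 * changes c_state directly under c_lock rather than through a function
 * like this.
 */
#if 0	/* example only, never compiled */
static boolean_t
example_conn_state_change_ok(int from, int to)
{
	switch (from) {
	case C_CONN_PEND:
		/* Connect attempt either succeeds or errors out. */
		return (to == C_CONNECTED || to == C_ERROR_CONN);
	case C_CONNECTED:
		/*
		 * CQ, CM or HCA events push the conn to C_ERROR_CONN;
		 * an idle-timeout reap can move it straight to
		 * C_DISCONN_PEND.
		 */
		return (to == C_ERROR_CONN || to == C_DISCONN_PEND);
	case C_ERROR_CONN:
		/* Once c_ref reaches 0 the teardown path takes over. */
		return (to == C_DISCONN_PEND);
	case C_DISCONN_PEND:
		/* Terminal: the connection is being destroyed. */
		return (B_FALSE);
	default:
		return (B_FALSE);
	}
}
#endif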
/* ARGSUSED */
static rdma_stat
rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
    int addr_type, rpcib_ping_t *rpt, CONN **conn)
{
	CONN *cn;
	int status;
	rib_hca_t *hca;
	rib_qp_t *qp;
	int s_addr_len;
	char *s_addr_buf;

	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state != HCA_DETACHED) {
			status = rib_find_hca_connection(hca, s_svcaddr,
			    d_svcaddr, conn);
			rw_exit(&hca->state_lock);
			if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
				rw_exit(&rib_stat->hcas_list_lock);
				return (status);
			}
		} else
			rw_exit(&hca->state_lock);
	}
	rw_exit(&rib_stat->hcas_list_lock);

	/*
	 * No existing connection found, establish a new connection.
	 */
	bzero(rpt, sizeof (rpcib_ping_t));

	status = rib_ping_srv(addr_type, d_svcaddr, rpt);
	if (status != RDMA_SUCCESS) {
		return (RDMA_FAILED);
	}
	hca = rpt->hca;

	if (rpt->srcip.family == AF_INET) {
		s_addr_len = sizeof (rpt->srcip.un.ip4addr);
		s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
	} else if (rpt->srcip.family == AF_INET6) {
		s_addr_len = sizeof (rpt->srcip.un.ip6addr);
		s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
	} else {
		return (RDMA_FAILED);
	}

	/*
	 * Channel to server doesn't exist yet, create one.
	 */
	if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
		return (RDMA_FAILED);
	}
	cn = qptoc(qp);
	cn->c_state = C_CONN_PEND;
	cn->c_ref = 1;

	cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
	bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
	cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;

	/*
	 * Add to conn list.
	 * We had given up the READER lock. In the time since then,
	 * another thread might have created the connection we are
	 * trying to establish here. For now, that is quite all right -
	 * there might be two connections between a pair of hosts
	 * instead of one. If we really want to close that window,
	 * then we need to re-check the list after acquiring the
	 * WRITER lock.
	 */
	(void) rib_add_connlist(cn, &hca->cl_conn_list);
	status = rib_conn_to_srv(hca, qp, rpt);
	mutex_enter(&cn->c_lock);
	if (status == RDMA_SUCCESS) {
		cn->c_state = C_CONNECTED;
		*conn = cn;
	} else {
		cn->c_state = C_ERROR_CONN;
		cn->c_ref--;
	}
	cv_broadcast(&cn->c_cv);
	mutex_exit(&cn->c_lock);
	return (status);
}
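/*
 * Illustrative sketch (not in the original source): one way to close the
 * duplicate-connection window mentioned in the comment above would be to
 * re-scan the client connection list while already holding the WRITER
 * lock and only link the new connection in if no equivalent entry has
 * appeared in the meantime.  The helper below is hypothetical and elides
 * the cleanup of the freshly created channel when a duplicate is found.
 */
#if 0	/* example only, never compiled */
static rdma_stat
example_add_conn_nodup(CONN *cn, rib_conn_list_t *connlist,
    struct netbuf *d_svcaddr)
{
	CONN *c;

	rw_enter(&connlist->conn_lock, RW_WRITER);
	for (c = connlist->conn_hd; c != NULL; c = c->c_next) {
		if (c->c_raddr.len == d_svcaddr->len &&
		    bcmp(d_svcaddr->buf, c->c_raddr.buf,
		    d_svcaddr->len) == 0) {
			/* Someone beat us to it; caller should reuse `c'. */
			rw_exit(&connlist->conn_lock);
			return (RDMA_FAILED);
		}
	}
	if (connlist->conn_hd) {
		cn->c_next = connlist->conn_hd;
		connlist->conn_hd->c_prev = cn;
	}
	connlist->conn_hd = cn;
	rw_exit(&connlist->conn_lock);
	return (RDMA_SUCCESS);
}
#endif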
static void
rib_conn_close(void *rarg)
{
	CONN *conn = (CONN *)rarg;
	rib_qp_t *qp = ctoqp(conn);

	mutex_enter(&conn->c_lock);
	if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {

		conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
		/*
		 * Live connection in CONNECTED state.
		 */
		if (conn->c_state == C_CONNECTED) {
			conn->c_state = C_ERROR_CONN;
		}
		mutex_exit(&conn->c_lock);

		rib_close_a_channel(conn);

		mutex_enter(&conn->c_lock);
		conn->c_flags &= ~C_CLOSE_PENDING;
		cv_signal(&conn->c_cv);
	}

	mutex_exit(&conn->c_lock);

	if (qp->mode == RIB_SERVER)
		(void) rib_disconnect_channel(conn,
		    &qp->hca->srv_conn_list);
	else
		(void) rib_disconnect_channel(conn,
		    &qp->hca->cl_conn_list);
}

static void
rib_conn_timeout_call(void *carg)
{
	time_t idle_time;
	CONN *conn = (CONN *)carg;
	rib_hca_t *hca = ctoqp(conn)->hca;
	int error;

	mutex_enter(&conn->c_lock);
	if ((conn->c_ref > 0) ||
	    (conn->c_state == C_DISCONN_PEND)) {
		conn->c_timeout = NULL;
		mutex_exit(&conn->c_lock);
		return;
	}

	idle_time = (gethrestime_sec() - conn->c_last_used);

	if ((idle_time <= rib_conn_timeout) &&
	    (conn->c_state != C_ERROR_CONN)) {
		/*
		 * There was activity after the last timeout.
		 * Extend the conn's life, unless the conn is
		 * already in the error state.
		 */
		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
		    SEC_TO_TICK(rib_conn_timeout - idle_time));
		mutex_exit(&conn->c_lock);
		return;
	}

	error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
	    (void *)conn, DDI_NOSLEEP);

	/*
	 * If the taskq dispatch above fails, reset the timeout
	 * to try again after 10 secs.
	 */

	if (error != DDI_SUCCESS) {
		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
		    SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
		mutex_exit(&conn->c_lock);
		return;
	}

	conn->c_state = C_DISCONN_PEND;
	mutex_exit(&conn->c_lock);
}

static rdma_stat
rib_conn_release(CONN *conn)
{

	mutex_enter(&conn->c_lock);
	conn->c_ref--;

	conn->c_last_used = gethrestime_sec();
	if (conn->c_ref > 0) {
		mutex_exit(&conn->c_lock);
		return (RDMA_SUCCESS);
	}

	/*
	 * If a conn is C_ERROR_CONN, close the channel.
	 */
	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
		conn->c_state = C_DISCONN_PEND;
		mutex_exit(&conn->c_lock);
		rib_conn_close((void *)conn);
		return (RDMA_SUCCESS);
	}

	/*
	 * c_ref == 0, set a timeout for conn release
	 */

	if (conn->c_timeout == NULL) {
		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
		    SEC_TO_TICK(rib_conn_timeout));
	}

	mutex_exit(&conn->c_lock);
	return (RDMA_SUCCESS);
}

/*
 * Add at front of list
 */
static struct rdma_done_list *
rdma_done_add(rib_qp_t *qp, uint32_t xid)
{
	struct rdma_done_list *rd;

	ASSERT(MUTEX_HELD(&qp->rdlist_lock));

	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
	rd->xid = xid;
	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);

	rd->prev = NULL;
	rd->next = qp->rdlist;
	if (qp->rdlist != NULL)
		qp->rdlist->prev = rd;
	qp->rdlist = rd;

	return (rd);
}

static void
rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
{
	struct rdma_done_list *r;

	ASSERT(MUTEX_HELD(&qp->rdlist_lock));

	r = rd->next;
	if (r != NULL) {
		r->prev = rd->prev;
	}

	r = rd->prev;
	if (r != NULL) {
		r->next = rd->next;
	} else {
		qp->rdlist = rd->next;
	}

	cv_destroy(&rd->rdma_done_cv);
	kmem_free(rd, sizeof (*rd));
}

static void
rdma_done_rem_list(rib_qp_t *qp)
{
	struct rdma_done_list *r, *n;

	mutex_enter(&qp->rdlist_lock);
	for (r = qp->rdlist; r != NULL; r = n) {
		n = r->next;
		rdma_done_rm(qp, r);
	}
	mutex_exit(&qp->rdlist_lock);
}

static void
rdma_done_notify(rib_qp_t *qp, uint32_t xid)
{
	struct rdma_done_list *r = qp->rdlist;

	ASSERT(MUTEX_HELD(&qp->rdlist_lock));

	while (r) {
		if (r->xid == xid) {
			cv_signal(&r->rdma_done_cv);
			return;
		} else {
			r = r->next;
		}
	}
	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
	    int, xid);
}
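/*
 * Illustrative sketch (not in the original source): the rdma_done list
 * above is a simple xid-keyed wait list.  A thread that expects an
 * RDMA_DONE-style completion adds an entry and sleeps on rdma_done_cv;
 * the completion path calls rdma_done_notify() with the matching xid.
 * The helper and its timeout policy below are hypothetical.
 */
#if 0	/* example only, never compiled */
static rdma_stat
example_wait_for_rdma_done(rib_qp_t *qp, uint32_t xid, clock_t ticks)
{
	struct rdma_done_list *rd;
	clock_t rc;

	mutex_enter(&qp->rdlist_lock);
	rd = rdma_done_add(qp, xid);

	/* Sleep until rdma_done_notify(qp, xid) signals us, or we time out. */
	rc = cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock,
	    ddi_get_lbolt() + ticks);

	rdma_done_rm(qp, rd);
	mutex_exit(&qp->rdlist_lock);

	return (rc < 0 ? RDMA_TIMEDOUT : RDMA_SUCCESS);
}
#endif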
/*
 * Close the RC channel associated with this connection.
 * Callers set C_CLOSE_PENDING in c_flags under conn->c_lock and then
 * drop the lock before calling in here.
 */

static void
rib_close_a_channel(CONN *conn)
{
	rib_qp_t	*qp;
	qp = ctoqp(conn);

	if (qp->qp_hdl == NULL) {
		/* channel already freed */
		return;
	}

	/*
	 * Call ibt_close_rc_channel in blocking mode
	 * with no callbacks.
	 */
	(void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
	    NULL, 0, NULL, NULL, 0);
}

/*
 * Goes through all connections and closes the channel.
 * This will cause all the WRs on those channels to be
 * flushed.
 */
static void
rib_close_channels(rib_conn_list_t *connlist)
{
	CONN	*conn, *tmp;

	rw_enter(&connlist->conn_lock, RW_READER);
	conn = connlist->conn_hd;
	while (conn != NULL) {
		mutex_enter(&conn->c_lock);
		tmp = conn->c_next;
		if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {

			conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);

			/*
			 * Live connection in CONNECTED state.
			 */
			if (conn->c_state == C_CONNECTED)
				conn->c_state = C_ERROR_CONN;
			mutex_exit(&conn->c_lock);

			rib_close_a_channel(conn);

			mutex_enter(&conn->c_lock);
			conn->c_flags &= ~C_CLOSE_PENDING;
			/* Signal a pending rib_disconnect_channel() */
			cv_signal(&conn->c_cv);
		}
		mutex_exit(&conn->c_lock);
		conn = tmp;
	}
	rw_exit(&connlist->conn_lock);
}

/*
 * Frees up all connections that are no longer being referenced
 */
static void
rib_purge_connlist(rib_conn_list_t *connlist)
{
	CONN	*conn;

top:
	rw_enter(&connlist->conn_lock, RW_READER);
	conn = connlist->conn_hd;
	while (conn != NULL) {
		mutex_enter(&conn->c_lock);

		/*
		 * At this point the connection is either in the ERROR
		 * or the DISCONN_PEND state. If in DISCONN_PEND state
		 * then some other thread is culling that connection.
		 * If not, and if c_ref is 0, then destroy the connection.
		 */
		if (conn->c_ref == 0 &&
		    conn->c_state != C_DISCONN_PEND) {
			/*
			 * Cull the connection
			 */
			conn->c_state = C_DISCONN_PEND;
			mutex_exit(&conn->c_lock);
			rw_exit(&connlist->conn_lock);
			(void) rib_disconnect_channel(conn, connlist);
			goto top;
		} else {
			/*
			 * conn disconnect already scheduled or will
			 * happen from conn_release when c_ref drops to 0.
			 */
			mutex_exit(&conn->c_lock);
		}
		conn = conn->c_next;
	}
	rw_exit(&connlist->conn_lock);

	/*
	 * At this point, only connections with c_ref != 0 are on the list
	 */
}
/*
 * Free all the HCA resources and close
 * the hca.
 */

static void
rib_free_hca(rib_hca_t *hca)
{
	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);

	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
	kmem_free(hca->svc_scq, sizeof (rib_cq_t));

	rib_rbufpool_destroy(hca, RECV_BUFFER);
	rib_rbufpool_destroy(hca, SEND_BUFFER);
	rib_destroy_cache(hca);
	if (rib_mod.rdma_count == 0)
		rdma_unregister_mod(&rib_mod);
	(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
	(void) ibt_close_hca(hca->hca_hdl);
	hca->hca_hdl = NULL;
}


static void
rib_stop_hca_services(rib_hca_t *hca)
{
	rib_stop_services(hca);
	rib_close_channels(&hca->cl_conn_list);
	rib_close_channels(&hca->srv_conn_list);

	rib_purge_connlist(&hca->cl_conn_list);
	rib_purge_connlist(&hca->srv_conn_list);

	if ((rib_stat->hcas_list == NULL) && stats_enabled) {
		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
		    GLOBAL_ZONEID);
		stats_enabled = FALSE;
	}

	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	if (hca->srv_conn_list.conn_hd == NULL &&
	    hca->cl_conn_list.conn_hd == NULL) {
		/*
		 * conn_lists are NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_free_hca(hca);
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	rw_exit(&hca->srv_conn_list.conn_lock);

	if (hca->hca_hdl != NULL) {
		mutex_enter(&hca->inuse_lock);
		while (hca->inuse)
			cv_wait(&hca->cb_cv, &hca->inuse_lock);
		mutex_exit(&hca->inuse_lock);

		rib_free_hca(hca);
	}
	rw_destroy(&hca->bound_services_lock);

	if (hca->cleanup_helper != NULL) {
		ddi_taskq_destroy(hca->cleanup_helper);
		hca->cleanup_helper = NULL;
	}
}

/*
 * Cleans up and closes down all uses of the HCA
 */
static void
rib_detach_hca(rib_hca_t *hca)
{
	rib_hca_t **hcap;

	/*
	 * Stop all services on the HCA
	 * Go through cl_conn_list and close all rc_channels
	 * Go through svr_conn_list and close all rc_channels
	 * Free connections whose c_ref has dropped to 0
	 * Destroy all CQs
	 * Deregister and release all buffer pool memory after all
	 * connections are destroyed
	 * Free the protection domain
	 * ibt_close_hca()
	 */
	rw_enter(&hca->state_lock, RW_WRITER);
	if (hca->state == HCA_DETACHED) {
		rw_exit(&hca->state_lock);
		return;
	}

	hca->state = HCA_DETACHED;
	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
	for (hcap = &rib_stat->hcas_list; *hcap && (*hcap != hca);
	    hcap = &(*hcap)->next)
		;
	ASSERT(*hcap == hca);
	*hcap = hca->next;
	rib_stat->nhca_inited--;
	rib_mod.rdma_count--;
	rw_exit(&rib_stat->hcas_list_lock);
	rw_exit(&hca->state_lock);

	rib_stop_hca_services(hca);

	kmem_free(hca, sizeof (*hca));
}
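/*
 * Illustrative sketch (not in the original source): the unlink loop in
 * rib_detach_hca() above uses the classic "pointer to the link field"
 * idiom, which removes an element from a singly linked list without a
 * special case for the head.  Shown here on a hypothetical node type.
 */
#if 0	/* example only, never compiled */
struct example_node {
	struct example_node *next;
	int data;
};

static void
example_unlink(struct example_node **headp, struct example_node *doomed)
{
	struct example_node **npp;

	/* Walk the chain of `next' pointers until one points at `doomed'. */
	for (npp = headp; *npp != NULL && *npp != doomed;
	    npp = &(*npp)->next)
		;
	if (*npp == doomed)
		*npp = doomed->next;	/* splice it out of the list */
}
#endif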
static void
rib_server_side_cache_reclaim(void *argp)
{
	cache_avl_struct_t *rcas;
	rib_lrc_entry_t *rb;
	rib_hca_t *hca = (rib_hca_t *)argp;

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_first(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);

			hca->cache_allocation -= rb->lrc_len;
			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		kmem_cache_free(hca->server_side_cache, rcas);
		rcas = avl_first(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}

static void
rib_server_side_cache_cleanup(void *argp)
{
	cache_avl_struct_t *rcas;
	rib_lrc_entry_t *rb;
	rib_hca_t *hca = (rib_hca_t *)argp;

	mutex_enter(&hca->cache_allocation_lock);
	if (hca->cache_allocation < cache_limit) {
		mutex_exit(&hca->cache_allocation_lock);
		return;
	}
	mutex_exit(&hca->cache_allocation_lock);

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_last(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);

			hca->cache_allocation -= rb->lrc_len;

			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		if (hca->server_side_cache) {
			kmem_cache_free(hca->server_side_cache, rcas);
		}

		if (hca->cache_allocation < cache_limit) {
			rw_exit(&hca->avl_rw_lock);
			return;
		}

		rcas = avl_last(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}

static int
avl_compare(const void *t1, const void *t2)
{
	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
		return (0);

	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
		return (-1);

	return (1);
}

static void
rib_destroy_cache(rib_hca_t *hca)
{
	if (hca->avl_init) {
		rib_server_side_cache_reclaim((void *)hca);
		if (hca->server_side_cache) {
			kmem_cache_destroy(hca->server_side_cache);
			hca->server_side_cache = NULL;
		}
		avl_destroy(&hca->avl_tree);
		mutex_destroy(&hca->cache_allocation_lock);
		rw_destroy(&hca->avl_rw_lock);
	}
	hca->avl_init = FALSE;
}

static void
rib_force_cleanup(void *hca)
{
	if (((rib_hca_t *)hca)->cleanup_helper != NULL)
		(void) ddi_taskq_dispatch(
		    ((rib_hca_t *)hca)->cleanup_helper,
		    rib_server_side_cache_cleanup,
		    (void *)hca, DDI_NOSLEEP);
}
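/*
 * Illustrative sketch (not in the original source): the server-side
 * cache is an AVL tree of cache_avl_struct_t nodes ordered by buffer
 * length via avl_compare() above; each node anchors a circular queue
 * (r.forw/r.back) of free registered buffers of that length.  A tree
 * using this comparator would be initialized roughly as below; the
 * actual setup is performed elsewhere in this file, and the helper
 * name here is hypothetical.
 */
#if 0	/* example only, never compiled */
static void
example_cache_tree_init(rib_hca_t *hca)
{
	avl_create(&hca->avl_tree, avl_compare,
	    sizeof (cache_avl_struct_t),
	    offsetof(cache_avl_struct_t, avl_link));
	rw_init(&hca->avl_rw_lock, NULL, RW_DRIVER, NULL);
	mutex_init(&hca->cache_allocation_lock, NULL, MUTEX_DEFAULT, NULL);
	hca->avl_init = TRUE;
}
#endif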
static rib_lrc_entry_t *
rib_get_cache_buf(CONN *conn, uint32_t len)
{
	cache_avl_struct_t cas, *rcas;
	rib_hca_t *hca = (ctoqp(conn))->hca;
	rib_lrc_entry_t *reply_buf;
	avl_index_t where = NULL;
	uint64_t c_alloc = 0;

	if (!hca->avl_init)
		goto error_alloc;

	cas.len = len;

	rw_enter(&hca->avl_rw_lock, RW_READER);

	mutex_enter(&hca->cache_allocation_lock);
	c_alloc = hca->cache_allocation;
	mutex_exit(&hca->cache_allocation_lock);

	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
	    &where)) == NULL) {
		/* Am I above the cache limit */
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);
			mutex_enter(&hca->cache_allocation_lock);
			hca->cache_misses_above_the_limit++;
			mutex_exit(&hca->cache_allocation_lock);

			/* Allocate and register the buffer directly */
			goto error_alloc;
		}

		rw_exit(&hca->avl_rw_lock);
		rw_enter(&hca->avl_rw_lock, RW_WRITER);

		/* Recheck to make sure no other thread added the entry in */
		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
		    &cas, &where)) == NULL) {
			/* Allocate an avl tree entry */
			rcas = (cache_avl_struct_t *)
			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);

			bzero(rcas, sizeof (cache_avl_struct_t));
			rcas->elements = 0;
			rcas->r.forw = &rcas->r;
			rcas->r.back = &rcas->r;
			rcas->len = len;
			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
			avl_insert(&hca->avl_tree, rcas, where);
		}
	}

	mutex_enter(&rcas->node_lock);

	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
		reply_buf = rcas->r.forw;
		remque(reply_buf);
		rcas->elements--;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);

		mutex_enter(&hca->cache_allocation_lock);
		hca->cache_hits++;
		hca->cache_allocation -= len;
		mutex_exit(&hca->cache_allocation_lock);
	} else {
		/* Am I above the cache limit */
		mutex_exit(&rcas->node_lock);
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);

			mutex_enter(&hca->cache_allocation_lock);
			hca->cache_misses_above_the_limit++;
			mutex_exit(&hca->cache_allocation_lock);
			/* Allocate and register the buffer directly */
			goto error_alloc;
		}
		rw_exit(&hca->avl_rw_lock);
		mutex_enter(&hca->cache_allocation_lock);
		hca->cache_misses++;
		mutex_exit(&hca->cache_allocation_lock);
		/* Allocate a reply_buf entry */
		reply_buf = (rib_lrc_entry_t *)
		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
		bzero(reply_buf, sizeof (rib_lrc_entry_t));
		reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
		reply_buf->lrc_len = len;
		reply_buf->registered = FALSE;
		reply_buf->avl_node = (void *)rcas;
	}

	return (reply_buf);

error_alloc:
	reply_buf = (rib_lrc_entry_t *)
	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
	bzero(reply_buf, sizeof (rib_lrc_entry_t));
	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
	reply_buf->lrc_len = len;
	reply_buf->registered = FALSE;
	reply_buf->avl_node = NULL;

	return (reply_buf);
}
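/*
 * Illustrative sketch (not in the original source): rib_get_cache_buf()
 * above uses a common "optimistic READER lookup, then re-check under the
 * WRITER lock before inserting" pattern when it has to add a new AVL
 * node.  The generic shape of that pattern, on a hypothetical node type
 * and lookup/insert pair, is shown below.
 */
#if 0	/* example only, never compiled */
typedef struct example_node {
	avl_node_t link;
	uint32_t len;
} example_node_t;

static example_node_t *
example_lookup_or_insert(krwlock_t *lockp, avl_tree_t *tree,
    example_node_t *key)
{
	example_node_t *node;
	avl_index_t where;

	rw_enter(lockp, RW_READER);
	if ((node = avl_find(tree, key, &where)) != NULL) {
		rw_exit(lockp);
		return (node);		/* fast path: found under READER */
	}
	rw_exit(lockp);

	/* Re-acquire as WRITER; the tree may have changed in between. */
	rw_enter(lockp, RW_WRITER);
	if ((node = avl_find(tree, key, &where)) == NULL) {
		node = kmem_zalloc(sizeof (*node), KM_SLEEP);
		node->len = key->len;
		avl_insert(tree, node, where);
	}
	rw_exit(lockp);
	return (node);
}
#endif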
/*
 * Return a pre-registered buffer back to the cache (without
 * unregistering it).
 */

static void
rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
{
	cache_avl_struct_t cas, *rcas;
	avl_index_t where = NULL;
	rib_hca_t *hca = (ctoqp(conn))->hca;

	if (!hca->avl_init)
		goto error_free;

	cas.len = reg_buf->lrc_len;
	rw_enter(&hca->avl_rw_lock, RW_READER);
	if ((rcas = (cache_avl_struct_t *)
	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
		rw_exit(&hca->avl_rw_lock);
		goto error_free;
	} else {
		cas.len = reg_buf->lrc_len;
		mutex_enter(&rcas->node_lock);
		insque(reg_buf, &rcas->r);
		rcas->elements++;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);
		mutex_enter(&hca->cache_allocation_lock);
		hca->cache_allocation += cas.len;
		mutex_exit(&hca->cache_allocation_lock);
	}

	return;

error_free:

	if (reg_buf->registered)
		(void) rib_deregistermem_via_hca(hca,
		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
}

static rdma_stat
rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
    uint_t buflen, struct mrc *buf_handle)
{
	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
	rdma_stat	status;


	/*
	 * Note: ALL buffer pools use the same memory type RDMARW.
	 */
	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
	} else {
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
	return (status);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
    struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
{

	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
	return (RDMA_SUCCESS);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
{

	(void) ibt_deregister_mr(hca->hca_hdl,
	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
	return (RDMA_SUCCESS);
}
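/*
 * Illustrative sketch (not in the original source): how the struct mrc
 * handle filled in by rib_registermem_via_hca() is typically used - the
 * local side keeps mrc_linfo (the ibt_mr_hdl_t) and mrc_lmr for its own
 * work requests, while mrc_rmr is the rkey that can be advertised to the
 * peer for RDMA access.  The wrapper below and its use of a NULL address
 * space pointer for kernel memory are hypothetical.
 */
#if 0	/* example only, never compiled */
static rdma_stat
example_register_long_reply(rib_hca_t *hca, rib_lrc_entry_t *lrc)
{
	rdma_stat status;

	status = rib_registermem_via_hca(hca, NULL, lrc->lrc_buf,
	    lrc->lrc_len, &lrc->lrc_mhandle);
	if (status != RDMA_SUCCESS)
		return (status);

	lrc->registered = TRUE;

	/*
	 * lrc->lrc_mhandle.mrc_rmr can now be handed to the peer as the
	 * rkey for this buffer; the registration is torn down later with
	 * rib_deregistermem_via_hca(hca, lrc->lrc_buf, lrc->lrc_mhandle).
	 */
	return (RDMA_SUCCESS);
}
#endif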
/*
 * Check if the IP interface named by `lifrp' is RDMA-capable.
 */
static boolean_t
rpcib_rdma_capable_interface(struct lifreq *lifrp)
{
	char ifname[LIFNAMSIZ];
	char *cp;

	if (lifrp->lifr_type == IFT_IB)
		return (B_TRUE);

	/*
	 * Strip off the logical interface portion before getting
	 * intimate with the name.
	 */
	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
	if ((cp = strchr(ifname, ':')) != NULL)
		*cp = '\0';

	return (strcmp("lo0", ifname) == 0);
}

static int
rpcib_do_ip_ioctl(int cmd, int len, void *arg)
{
	vnode_t *kvp, *vp;
	TIUSER  *tiptr;
	struct  strioctl iocb;
	k_sigset_t smask;
	int	err = 0;

	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) {
		if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE,
		    &tiptr, CRED()) == 0) {
			vp = tiptr->fp->f_vnode;
		} else {
			VN_RELE(kvp);
			return (EPROTO);
		}
	} else {
		return (EPROTO);
	}

	iocb.ic_cmd = cmd;
	iocb.ic_timout = 0;
	iocb.ic_len = len;
	iocb.ic_dp = (caddr_t)arg;
	sigintr(&smask, 0);
	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
	sigunintr(&smask);
	(void) t_kclose(tiptr, 0);
	VN_RELE(kvp);
	return (err);
}

/*
 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
 */
static int
rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
{
	int err;
	struct lifnum lifn;

	bzero(&lifn, sizeof (struct lifnum));
	lifn.lifn_family = AF_UNSPEC;

	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
	if (err != 0)
		return (err);

	/*
	 * Pad the interface count to account for additional interfaces that
	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
	 */
	lifn.lifn_count += 4;

	bzero(lifcp, sizeof (struct lifconf));
	lifcp->lifc_family = AF_UNSPEC;
	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);

	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
	if (err != 0) {
		kmem_free(lifcp->lifc_buf, *bufsizep);
		return (err);
	}
	return (0);
}
static boolean_t
rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
{
	uint_t i, nifs;
	uint_t bufsize;
	struct lifconf lifc;
	struct lifreq *lifrp;
	struct sockaddr_in *sinp;
	struct sockaddr_in6 *sin6p;

	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
	bzero(addrs6, sizeof (rpcib_ipaddrs_t));

	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
		return (B_FALSE);

	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
		kmem_free(lifc.lifc_buf, bufsize);
		return (B_FALSE);
	}

	/*
	 * Worst case is that all of the addresses are IB-capable and have
	 * the same address family, so size our buffers accordingly.
	 */
	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);

	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
		if (!rpcib_rdma_capable_interface(lifrp))
			continue;

		if (lifrp->lifr_addr.ss_family == AF_INET) {
			sinp = addrs4->ri_list;
			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
			    sizeof (struct sockaddr_in));
		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
			sin6p = addrs6->ri_list;
			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
			    sizeof (struct sockaddr_in6));
		}
	}

	kmem_free(lifc.lifc_buf, bufsize);
	return (B_TRUE);
}

/* ARGSUSED */
static int
rpcib_cache_kstat_update(kstat_t *ksp, int rw)
{
	rib_hca_t *hca;

	if (KSTAT_WRITE == rw) {
		return (EACCES);
	}

	rpcib_kstat.cache_limit.value.ui64 =
	    (uint64_t)cache_limit;
	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
		rpcib_kstat.cache_allocation.value.ui64 +=
		    (uint64_t)hca->cache_allocation;
		rpcib_kstat.cache_hits.value.ui64 +=
		    (uint64_t)hca->cache_hits;
		rpcib_kstat.cache_misses.value.ui64 +=
		    (uint64_t)hca->cache_misses;
		rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
		    (uint64_t)hca->cache_misses_above_the_limit;
	}
	rw_exit(&rib_stat->hcas_list_lock);
	return (0);
}
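/*
 * Illustrative sketch (not in the original source): how an update routine
 * like rpcib_cache_kstat_update() is typically wired up with
 * kstat_create_zone()/kstat_install().  The module, instance and
 * "rpcib_cache" name mirror the kstat_delete_byname_zone() call in
 * rib_stop_hca_services(); the helper itself is hypothetical - the driver
 * performs this setup elsewhere when stats are first enabled.
 */
#if 0	/* example only, never compiled */
static void
example_install_cache_kstat(void)
{
	kstat_t *ksp;

	ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
	    KSTAT_TYPE_NAMED,
	    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
	if (ksp == NULL)
		return;

	ksp->ks_data = (void *)&rpcib_kstat;	/* the named counters above */
	ksp->ks_update = rpcib_cache_kstat_update;
	kstat_install(ksp);
}
#endif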