/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright (c) 2007, The Ohio State University. All rights reserved.
 *
 * Portions of this source code are developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * Acknowledgements to contributions from developers:
 *    Ranjit Noronha: noronha@cse.ohio-state.edu
 *    Lei Chai:       chail@cse.ohio-state.edu
 *    Weikuan Yu:     yuw@cse.ohio-state.edu
 *
 */

/*
 * The rpcib plugin. Implements the interface for RDMATF's
 * interaction with IBTF.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/file.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/pathname.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/isa_defs.h>
#include <sys/callb.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/sdt.h>
#include <sys/ib/ibtl/ibti.h>
#include <rpc/rpc.h>
#include <rpc/ib.h>
#include <sys/modctl.h>
#include <sys/kstr.h>
#include <sys/sockio.h>
#include <sys/vnode.h>
#include <sys/tiuser.h>
#include <net/if.h>
#include <net/if_types.h>
#include <sys/cred.h>
#include <rpc/rpc_rdma.h>
#include <nfs/nfs.h>
#include <sys/atomic.h>

#define NFS_RDMA_PORT   20049


/*
 * Convenience structures for connection management
 */
typedef struct rpcib_ipaddrs {
    void    *ri_list;   /* pointer to list of addresses */
    uint_t  ri_count;   /* number of addresses in list */
    uint_t  ri_size;    /* size of ri_list in bytes */
} rpcib_ipaddrs_t;


typedef struct rpcib_ping {
    rib_hca_t       *hca;
    ibt_path_info_t path;
    ibt_ip_addr_t   srcip;
    ibt_ip_addr_t   dstip;
} rpcib_ping_t;
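/*
 * Note: rpcib_ping_t carries the result of path resolution for one
 * destination: the HCA that owns the path, the IBTF path itself, and the
 * chosen source/destination IP addresses.  rib_ping_srv() fills it in,
 * and rib_connect()/rib_conn_to_srv() consume it when opening the
 * reliable-connected (RC) channel.
 */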
/*
 * Prototype declarations for driver ops
 */
static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
static int rpcib_do_ip_ioctl(int, int, void *);
static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
static int rpcib_cache_kstat_update(kstat_t *, int);
static void rib_force_cleanup(void *);
static void rib_stop_hca_services(rib_hca_t *);
static void rib_attach_hca(void);
static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
    struct netbuf *d_svcaddr, CONN **conn);

struct {
    kstat_named_t cache_limit;
    kstat_named_t cache_allocation;
    kstat_named_t cache_hits;
    kstat_named_t cache_misses;
    kstat_named_t cache_misses_above_the_limit;
} rpcib_kstat = {
    {"cache_limit",                  KSTAT_DATA_UINT64 },
    {"cache_allocation",             KSTAT_DATA_UINT64 },
    {"cache_hits",                   KSTAT_DATA_UINT64 },
    {"cache_misses",                 KSTAT_DATA_UINT64 },
    {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
};

/* rpcib cb_ops */
static struct cb_ops rpcib_cbops = {
    nulldev,            /* open */
    nulldev,            /* close */
    nodev,              /* strategy */
    nodev,              /* print */
    nodev,              /* dump */
    nodev,              /* read */
    nodev,              /* write */
    nodev,              /* ioctl */
    nodev,              /* devmap */
    nodev,              /* mmap */
    nodev,              /* segmap */
    nochpoll,           /* poll */
    ddi_prop_op,        /* prop_op */
    NULL,               /* stream */
    D_MP,               /* cb_flag */
    CB_REV,             /* rev */
    nodev,              /* int (*cb_aread)() */
    nodev               /* int (*cb_awrite)() */
};

/*
 * Device options
 */
static struct dev_ops rpcib_ops = {
    DEVO_REV,           /* devo_rev, */
    0,                  /* refcnt */
    rpcib_getinfo,      /* info */
    nulldev,            /* identify */
    nulldev,            /* probe */
    rpcib_attach,       /* attach */
    rpcib_detach,       /* detach */
    nodev,              /* reset */
    &rpcib_cbops,       /* driver ops - devctl interfaces */
    NULL,               /* bus operations */
    NULL,               /* power */
    ddi_quiesce_not_needed, /* quiesce */
};

/*
 * Module linkage information.
 */

static struct modldrv rib_modldrv = {
    &mod_driverops,         /* Driver module */
    "RPCIB plugin driver",  /* Driver name and version */
    &rpcib_ops,             /* Driver ops */
};

static struct modlinkage rib_modlinkage = {
    MODREV_1,
    (void *)&rib_modldrv,
    NULL
};

typedef struct rib_lrc_entry {
    struct rib_lrc_entry *forw;
    struct rib_lrc_entry *back;
    char        *lrc_buf;

    uint32_t    lrc_len;
    void        *avl_node;
    bool_t      registered;

    struct mrc  lrc_mhandle;
    bool_t      lrc_on_freed_list;
} rib_lrc_entry_t;

typedef struct cache_struct {
    rib_lrc_entry_t r;
    uint32_t        len;
    uint32_t        elements;
    kmutex_t        node_lock;
    avl_node_t      avl_link;
} cache_avl_struct_t;

uint64_t        cache_limit = 100 * 1024 * 1024;
static uint64_t cache_watermark = 80 * 1024 * 1024;
static bool_t   stats_enabled = FALSE;

static uint64_t max_unsignaled_rws = 5;
int nfs_rdma_port = NFS_RDMA_PORT;

#define RIBNETID_TCP    "tcp"
#define RIBNETID_TCP6   "tcp6"

/*
 * rib_stat: private data pointer used when registering
 *      with the IBTF. It is returned to the consumer
 *      in all callbacks.
 */
static rpcib_state_t *rib_stat = NULL;

#define RNR_RETRIES             IBT_RNR_RETRY_1
#define MAX_PORTS               2
#define RDMA_DUMMY_WRID         0x4D3A1D4D3A1D
#define RDMA_CONN_REAP_RETRY    10      /* 10 secs */

int preposted_rbufs = RDMA_BUFS_GRANT;
int send_threshold = 1;

/*
 * Old cards with Tavor driver have limited memory footprint
 * when booted in 32bit. The rib_max_rbufs tunable can be
 * tuned for more buffers if needed.
 */

#if !defined(_ELF64) && !defined(__sparc)
int rib_max_rbufs = MAX_BUFS;
#else
int rib_max_rbufs = 10 * MAX_BUFS;
#endif  /* !(_ELF64) && !(__sparc) */
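/*
 * Illustrative note: rib_max_rbufs, nfs_rdma_port and the other global
 * variables above are tunables, so they can be overridden at boot time
 * from /etc/system, e.g.:
 *
 *      set rpcib:rib_max_rbufs = 256
 *      set rpcib:nfs_rdma_port = 20049
 *
 * The values shown are arbitrary examples, not recommendations.
 */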
int rib_conn_timeout = 60 * 12; /* 12 minutes */

/*
 * State of the plugin.
 * ACCEPT = accepting new connections and requests.
 * NO_ACCEPT = not accepting new connection and requests.
 * This should eventually move to rpcib_state_t structure, since this
 * will tell in which state the plugin is for a particular type of service
 * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
 * state for one and in no_accept state for the other.
 */
int         plugin_state;
kmutex_t    plugin_state_lock;

ldi_ident_t rpcib_li;

/*
 * RPCIB RDMATF operations
 */
static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
static rdma_stat rib_disconnect(CONN *conn);
static void rib_listen(struct rdma_svc_data *rd);
static void rib_listen_stop(struct rdma_svc_data *rd);
static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf,
    uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
    struct mrc buf_handle);
static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
    caddr_t buf, uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
    struct mrc buf_handle);
static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf,
    uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
    void *lrc);
static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
    struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
    caddr_t buf, int len, int cpu);

static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);

static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);

static void rib_rbuf_free(CONN *conn, int ptype, void *buf);

static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
    int addr_type, void *, CONN **);
static rdma_stat rib_conn_release(CONN *conn);
static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
    rpcib_ping_t *, CONN **);
static rdma_stat rib_getinfo(rdma_info_t *info);

static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
static void rib_destroy_cache(rib_hca_t *hca);
static void rib_server_side_cache_reclaim(void *argp);
static int avl_compare(const void *t1, const void *t2);

static void rib_stop_services(rib_hca_t *);
static void rib_close_channels(rib_conn_list_t *);
static void rib_conn_close(void *);
static void rib_recv_rele(rib_qp_t *);
static rdma_stat rib_conn_release_locked(CONN *conn);
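/*
 * The RDMATF entry points declared above are collected into the rib_ops
 * vector below and handed to the RDMA transport framework through
 * rdma_register_mod(&rib_mod) in rpcib_attach(), so KRPC reaches this
 * plugin only through that operations vector.
 */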
/*
 * RPCIB addressing operations
 */

/*
 * RDMA operations the RPCIB module exports
 */
static rdmaops_t rib_ops = {
    rib_reachable,
    rib_conn_get,
    rib_conn_release,
    rib_listen,
    rib_listen_stop,
    rib_registermem,
    rib_deregistermem,
    rib_registermemsync,
    rib_deregistermemsync,
    rib_syncmem,
    rib_reg_buf_alloc,
    rib_reg_buf_free,
    rib_send,
    rib_send_resp,
    rib_post_resp,
    rib_post_resp_remove,
    rib_post_recv,
    rib_recv,
    rib_read,
    rib_write,
    rib_getinfo,
};

/*
 * RDMATF RPCIB plugin details
 */
static rdma_mod_t rib_mod = {
    "ibtf",         /* api name */
    RDMATF_VERS_1,
    0,
    &rib_ops,       /* rdma op vector for ibtf */
};

static rdma_stat rpcib_open_hcas(rpcib_state_t *);
static rdma_stat rib_qp_init(rib_qp_t *, int);
static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
    ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
    ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
    rib_qp_t **);
static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
    rib_qp_t **);
static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
static int rib_free_sendwait(struct send_wid *);
static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
static void rdma_done_rem_list(rib_qp_t *);
static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);

static void rib_async_handler(void *,
    ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
static int rib_free_svc_recv(struct svc_recv *);
static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
static void rib_free_wid(struct recv_wid *);
static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
static void rib_detach_hca(ibt_hca_hdl_t);
static void rib_close_a_channel(CONN *);
static void rib_send_hold(rib_qp_t *);
static void rib_send_rele(rib_qp_t *);

/*
 * Registration with IBTF as a consumer
 */
static struct ibt_clnt_modinfo_s rib_modinfo = {
    IBTI_V_CURR,
    IBT_GENERIC,
    rib_async_handler,  /* async event handler */
    NULL,               /* Memory Region Handler */
    "nfs/ib"
};

/*
 * Global structure
 */

typedef struct rpcib_s {
    dev_info_t  *rpcib_dip;
    kmutex_t    rpcib_mutex;
} rpcib_t;

rpcib_t rpcib;

/*
 * /etc/system controlled variable to control
 * debugging in rpcib kernel module.
 * Set it to values greater than 1 to control
 * the amount of debugging messages required.
 */
int rib_debug = 0;
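/*
 * Loadable module entry points.  _init() installs the driver described by
 * rib_modlinkage and initializes plugin_state_lock, _fini() removes the
 * module, and _info() reports module information.  The IBTF/RDMATF setup
 * itself is deferred to rpcib_attach().
 */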
int
_init(void)
{
    int error;

    error = mod_install((struct modlinkage *)&rib_modlinkage);
    if (error != 0) {
        /*
         * Could not load module
         */
        return (error);
    }
    mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
    return (0);
}

int
_fini()
{
    int status;

    /*
     * Remove module
     */
    if ((status = mod_remove(&rib_modlinkage)) != 0) {
        return (status);
    }
    mutex_destroy(&plugin_state_lock);
    return (0);
}

int
_info(struct modinfo *modinfop)
{
    return (mod_info(&rib_modlinkage, modinfop));
}

/*
 * rpcib_getinfo()
 * Given the device number, return the devinfo pointer or the
 * instance number.
 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
 */

/*ARGSUSED*/
static int
rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
    int ret = DDI_SUCCESS;

    switch (cmd) {
    case DDI_INFO_DEVT2DEVINFO:
        if (rpcib.rpcib_dip != NULL)
            *result = rpcib.rpcib_dip;
        else {
            *result = NULL;
            ret = DDI_FAILURE;
        }
        break;

    case DDI_INFO_DEVT2INSTANCE:
        *result = NULL;
        break;

    default:
        ret = DDI_FAILURE;
    }
    return (ret);
}

static void
rpcib_free_hca_list()
{
    rib_hca_t *hca, *hcap;

    rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
    hca = rib_stat->hcas_list;
    rib_stat->hcas_list = NULL;
    rw_exit(&rib_stat->hcas_list_lock);
    while (hca != NULL) {
        rw_enter(&hca->state_lock, RW_WRITER);
        hcap = hca;
        hca = hca->next;
        rib_stat->nhca_inited--;
        rib_mod.rdma_count--;
        hcap->state = HCA_DETACHED;
        rw_exit(&hcap->state_lock);
        rib_stop_hca_services(hcap);

        kmem_free(hcap, sizeof (*hcap));
    }
}

static rdma_stat
rpcib_free_service_list()
{
    rib_service_t *service;
    ibt_status_t ret;

    rw_enter(&rib_stat->service_list_lock, RW_WRITER);
    while (rib_stat->service_list != NULL) {
        service = rib_stat->service_list;
        ret = ibt_unbind_all_services(service->srv_hdl);
        if (ret != IBT_SUCCESS) {
            rw_exit(&rib_stat->service_list_lock);
#ifdef DEBUG
            cmn_err(CE_NOTE, "rpcib_free_service_list: "
                "ibt_unbind_all_services failed (%d)\n", (int)ret);
#endif
            return (RDMA_FAILED);
        }
        ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
            service->srv_hdl);
        if (ret != IBT_SUCCESS) {
            rw_exit(&rib_stat->service_list_lock);
#ifdef DEBUG
            cmn_err(CE_NOTE, "rpcib_free_service_list: "
                "ibt_deregister_service failed (%d)\n", (int)ret);
#endif
            return (RDMA_FAILED);
        }
        rib_stat->service_list = service->next;
        kmem_free(service, sizeof (rib_service_t));
    }
    rw_exit(&rib_stat->service_list_lock);

    return (RDMA_SUCCESS);
}

static int
rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
    ibt_status_t    ibt_status;
    rdma_stat       r_status;

    switch (cmd) {
    case DDI_ATTACH:
        break;
    case DDI_RESUME:
        return (DDI_SUCCESS);
    default:
        return (DDI_FAILURE);
    }

    mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);

    mutex_enter(&rpcib.rpcib_mutex);
    if (rpcib.rpcib_dip != NULL) {
        mutex_exit(&rpcib.rpcib_mutex);
        return (DDI_FAILURE);
    }
    rpcib.rpcib_dip = dip;
mutex_exit(&rpcib.rpcib_mutex); 584 /* 585 * Create the "rpcib" minor-node. 586 */ 587 if (ddi_create_minor_node(dip, 588 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { 589 /* Error message, no cmn_err as they print on console */ 590 return (DDI_FAILURE); 591 } 592 593 if (rib_stat == NULL) { 594 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); 595 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); 596 rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL); 597 mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL); 598 } 599 600 rib_stat->hca_count = ibt_get_hca_list(NULL); 601 if (rib_stat->hca_count < 1) { 602 mutex_destroy(&rib_stat->listen_lock); 603 rw_destroy(&rib_stat->hcas_list_lock); 604 mutex_destroy(&rib_stat->open_hca_lock); 605 kmem_free(rib_stat, sizeof (*rib_stat)); 606 rib_stat = NULL; 607 return (DDI_FAILURE); 608 } 609 610 ibt_status = ibt_attach(&rib_modinfo, dip, 611 (void *)rib_stat, &rib_stat->ibt_clnt_hdl); 612 613 if (ibt_status != IBT_SUCCESS) { 614 mutex_destroy(&rib_stat->listen_lock); 615 rw_destroy(&rib_stat->hcas_list_lock); 616 mutex_destroy(&rib_stat->open_hca_lock); 617 kmem_free(rib_stat, sizeof (*rib_stat)); 618 rib_stat = NULL; 619 return (DDI_FAILURE); 620 } 621 622 rib_stat->service_list = NULL; 623 rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL); 624 mutex_enter(&rib_stat->open_hca_lock); 625 if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) { 626 mutex_exit(&rib_stat->open_hca_lock); 627 goto open_fail; 628 } 629 mutex_exit(&rib_stat->open_hca_lock); 630 631 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 632 DDI_PROP_SUCCESS) { 633 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update " 634 "failed."); 635 goto register_fail; 636 } 637 638 /* 639 * Register with rdmatf 640 */ 641 r_status = rdma_register_mod(&rib_mod); 642 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { 643 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, " 644 "status = %d", r_status); 645 goto register_fail; 646 } 647 648 return (DDI_SUCCESS); 649 650 register_fail: 651 652 open_fail: 653 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 654 rpcib_free_hca_list(); 655 (void) rpcib_free_service_list(); 656 mutex_destroy(&rib_stat->listen_lock); 657 rw_destroy(&rib_stat->hcas_list_lock); 658 mutex_destroy(&rib_stat->open_hca_lock); 659 rw_destroy(&rib_stat->service_list_lock); 660 kmem_free(rib_stat, sizeof (*rib_stat)); 661 rib_stat = NULL; 662 return (DDI_FAILURE); 663 } 664 665 /*ARGSUSED*/ 666 static int 667 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 668 { 669 switch (cmd) { 670 671 case DDI_DETACH: 672 break; 673 674 case DDI_SUSPEND: 675 default: 676 return (DDI_FAILURE); 677 } 678 679 /* 680 * Detach the hca and free resources 681 */ 682 mutex_enter(&plugin_state_lock); 683 plugin_state = NO_ACCEPT; 684 mutex_exit(&plugin_state_lock); 685 686 if (rpcib_free_service_list() != RDMA_SUCCESS) 687 return (DDI_FAILURE); 688 rpcib_free_hca_list(); 689 690 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 691 mutex_destroy(&rib_stat->listen_lock); 692 rw_destroy(&rib_stat->hcas_list_lock); 693 mutex_destroy(&rib_stat->open_hca_lock); 694 rw_destroy(&rib_stat->service_list_lock); 695 696 kmem_free(rib_stat, sizeof (*rib_stat)); 697 rib_stat = NULL; 698 699 mutex_enter(&rpcib.rpcib_mutex); 700 rpcib.rpcib_dip = NULL; 701 mutex_exit(&rpcib.rpcib_mutex); 702 mutex_destroy(&rpcib.rpcib_mutex); 703 return (DDI_SUCCESS); 704 } 705 706 707 static void rib_rbufpool_free(rib_hca_t *, int); 708 static 
void rib_rbufpool_deregister(rib_hca_t *, int); 709 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype); 710 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); 711 static rdma_stat rib_rem_replylist(rib_qp_t *); 712 static int rib_remreply(rib_qp_t *, struct reply *); 713 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); 714 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); 715 716 717 /* 718 * One CQ pair per HCA 719 */ 720 static rdma_stat 721 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, 722 rib_cq_t **cqp) 723 { 724 rib_cq_t *cq; 725 ibt_cq_attr_t cq_attr; 726 uint32_t real_size; 727 ibt_status_t status; 728 rdma_stat error = RDMA_SUCCESS; 729 730 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); 731 cq->rib_hca = hca; 732 bzero(&cq_attr, sizeof (cq_attr)); 733 cq_attr.cq_size = cq_size; 734 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 735 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, 736 &real_size); 737 if (status != IBT_SUCCESS) { 738 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed," 739 " status=%d", status); 740 error = RDMA_FAILED; 741 goto fail; 742 } 743 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca); 744 745 /* 746 * Enable CQ callbacks. CQ Callbacks are single shot 747 * (e.g. you have to call ibt_enable_cq_notify() 748 * after each callback to get another one). 749 */ 750 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION); 751 if (status != IBT_SUCCESS) { 752 cmn_err(CE_WARN, "rib_create_cq: " 753 "enable_cq_notify failed, status %d", status); 754 error = RDMA_FAILED; 755 goto fail; 756 } 757 *cqp = cq; 758 759 return (error); 760 fail: 761 if (cq->rib_cq_hdl) 762 (void) ibt_free_cq(cq->rib_cq_hdl); 763 if (cq) 764 kmem_free(cq, sizeof (rib_cq_t)); 765 return (error); 766 } 767 768 /* 769 * rpcib_find_hca 770 * 771 * Caller should have already locked the hcas_lock before calling 772 * this function. 773 */ 774 static rib_hca_t * 775 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid) 776 { 777 rib_hca_t *hca = ribstat->hcas_list; 778 779 while (hca && hca->hca_guid != guid) 780 hca = hca->next; 781 782 return (hca); 783 } 784 785 static rdma_stat 786 rpcib_open_hcas(rpcib_state_t *ribstat) 787 { 788 rib_hca_t *hca; 789 ibt_status_t ibt_status; 790 rdma_stat status; 791 ibt_hca_portinfo_t *pinfop; 792 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; 793 uint_t size, cq_size; 794 int i; 795 kstat_t *ksp; 796 cache_avl_struct_t example_avl_node; 797 char rssc_name[32]; 798 int old_nhca_inited = ribstat->nhca_inited; 799 ib_guid_t *hca_guids; 800 801 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); 802 803 ribstat->hca_count = ibt_get_hca_list(&hca_guids); 804 if (ribstat->hca_count == 0) 805 return (RDMA_FAILED); 806 807 rw_enter(&ribstat->hcas_list_lock, RW_WRITER); 808 /* 809 * Open a hca and setup for RDMA 810 */ 811 for (i = 0; i < ribstat->hca_count; i++) { 812 if (rpcib_find_hca(ribstat, hca_guids[i])) 813 continue; 814 hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP); 815 816 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, 817 hca_guids[i], &hca->hca_hdl); 818 if (ibt_status != IBT_SUCCESS) { 819 kmem_free(hca, sizeof (rib_hca_t)); 820 continue; 821 } 822 hca->hca_guid = hca_guids[i]; 823 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; 824 hca->state = HCA_INITED; 825 826 /* 827 * query HCA info 828 */ 829 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs); 830 if (ibt_status != IBT_SUCCESS) { 831 goto fail1; 832 } 833 834 /* 835 * One PD (Protection Domain) per HCA. 
836 * A qp is allowed to access a memory region 837 * only when it's in the same PD as that of 838 * the memory region. 839 */ 840 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl); 841 if (ibt_status != IBT_SUCCESS) { 842 goto fail1; 843 } 844 845 /* 846 * query HCA ports 847 */ 848 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 849 0, &pinfop, &hca->hca_nports, &size); 850 if (ibt_status != IBT_SUCCESS) { 851 goto fail2; 852 } 853 hca->hca_ports = pinfop; 854 hca->hca_pinfosz = size; 855 pinfop = NULL; 856 857 cq_size = DEF_CQ_SIZE; /* default cq size */ 858 /* 859 * Create 2 pairs of cq's (1 pair for client 860 * and the other pair for server) on this hca. 861 * If number of qp's gets too large, then several 862 * cq's will be needed. 863 */ 864 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, 865 &hca->svc_rcq); 866 if (status != RDMA_SUCCESS) { 867 goto fail3; 868 } 869 870 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, 871 &hca->svc_scq); 872 if (status != RDMA_SUCCESS) { 873 goto fail3; 874 } 875 876 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, 877 &hca->clnt_rcq); 878 if (status != RDMA_SUCCESS) { 879 goto fail3; 880 } 881 882 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, 883 &hca->clnt_scq); 884 if (status != RDMA_SUCCESS) { 885 goto fail3; 886 } 887 888 /* 889 * Create buffer pools. 890 * Note rib_rbuf_create also allocates memory windows. 891 */ 892 hca->recv_pool = rib_rbufpool_create(hca, 893 RECV_BUFFER, rib_max_rbufs); 894 if (hca->recv_pool == NULL) { 895 goto fail3; 896 } 897 898 hca->send_pool = rib_rbufpool_create(hca, 899 SEND_BUFFER, rib_max_rbufs); 900 if (hca->send_pool == NULL) { 901 rib_rbufpool_destroy(hca, RECV_BUFFER); 902 goto fail3; 903 } 904 905 if (hca->server_side_cache == NULL) { 906 (void) sprintf(rssc_name, 907 "rib_srvr_cache_%llx", 908 (long long unsigned int) hca->hca_guid); 909 hca->server_side_cache = kmem_cache_create( 910 rssc_name, 911 sizeof (cache_avl_struct_t), 0, 912 NULL, 913 NULL, 914 rib_server_side_cache_reclaim, 915 hca, NULL, 0); 916 } 917 918 avl_create(&hca->avl_tree, 919 avl_compare, 920 sizeof (cache_avl_struct_t), 921 (uint_t)(uintptr_t)&example_avl_node.avl_link- 922 (uint_t)(uintptr_t)&example_avl_node); 923 924 rw_init(&hca->bound_services_lock, NULL, RW_DRIVER, 925 hca->iblock); 926 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); 927 rw_init(&hca->avl_rw_lock, 928 NULL, RW_DRIVER, hca->iblock); 929 mutex_init(&hca->cache_allocation_lock, 930 NULL, MUTEX_DRIVER, NULL); 931 hca->avl_init = TRUE; 932 933 /* Create kstats for the cache */ 934 ASSERT(INGLOBALZONE(curproc)); 935 936 if (!stats_enabled) { 937 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc", 938 KSTAT_TYPE_NAMED, 939 sizeof (rpcib_kstat) / sizeof (kstat_named_t), 940 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, 941 GLOBAL_ZONEID); 942 if (ksp) { 943 ksp->ks_data = (void *) &rpcib_kstat; 944 ksp->ks_update = rpcib_cache_kstat_update; 945 kstat_install(ksp); 946 stats_enabled = TRUE; 947 } 948 } 949 if (hca->cleanup_helper == NULL) { 950 char tq_name[sizeof (hca->hca_guid) * 2 + 1]; 951 952 (void) snprintf(tq_name, sizeof (tq_name), "%llX", 953 (unsigned long long int) hca->hca_guid); 954 hca->cleanup_helper = ddi_taskq_create(NULL, 955 tq_name, 1, TASKQ_DEFAULTPRI, 0); 956 } 957 958 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 959 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); 960 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, 961 hca->iblock); 962 
rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, 963 hca->iblock); 964 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); 965 hca->inuse = TRUE; 966 967 hca->next = ribstat->hcas_list; 968 ribstat->hcas_list = hca; 969 ribstat->nhca_inited++; 970 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 971 continue; 972 973 fail3: 974 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 975 fail2: 976 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 977 fail1: 978 (void) ibt_close_hca(hca->hca_hdl); 979 kmem_free(hca, sizeof (rib_hca_t)); 980 } 981 rw_exit(&ribstat->hcas_list_lock); 982 ibt_free_hca_list(hca_guids, ribstat->hca_count); 983 rib_mod.rdma_count = rib_stat->nhca_inited; 984 985 /* 986 * return success if at least one new hca has been configured. 987 */ 988 if (ribstat->nhca_inited != old_nhca_inited) 989 return (RDMA_SUCCESS); 990 else 991 return (RDMA_FAILED); 992 } 993 994 /* 995 * Callback routines 996 */ 997 998 /* 999 * SCQ handlers 1000 */ 1001 /* ARGSUSED */ 1002 static void 1003 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1004 { 1005 ibt_status_t ibt_status; 1006 ibt_wc_t wc; 1007 struct send_wid *wd; 1008 CONN *conn; 1009 rib_qp_t *qp; 1010 int i; 1011 1012 /* 1013 * Re-enable cq notify here to avoid missing any 1014 * completion queue notification. 1015 */ 1016 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1017 1018 ibt_status = IBT_SUCCESS; 1019 while (ibt_status != IBT_CQ_EMPTY) { 1020 bzero(&wc, sizeof (wc)); 1021 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1022 if (ibt_status != IBT_SUCCESS) 1023 return; 1024 1025 /* 1026 * Got a send completion 1027 */ 1028 if (wc.wc_id != RDMA_DUMMY_WRID) { 1029 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1030 qp = wd->qp; 1031 conn = qptoc(qp); 1032 1033 mutex_enter(&wd->sendwait_lock); 1034 switch (wc.wc_status) { 1035 case IBT_WC_SUCCESS: 1036 wd->status = RDMA_SUCCESS; 1037 break; 1038 default: 1039 /* 1040 * RC Send Q Error Code Local state Remote State 1041 * ==================== =========== ============ 1042 * IBT_WC_BAD_RESPONSE_ERR ERROR None 1043 * IBT_WC_LOCAL_LEN_ERR ERROR None 1044 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None 1045 * IBT_WC_LOCAL_PROTECT_ERR ERROR None 1046 * IBT_WC_MEM_WIN_BIND_ERR ERROR None 1047 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR 1048 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR 1049 * IBT_WC_REMOTE_OP_ERR ERROR ERROR 1050 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None 1051 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None 1052 * IBT_WC_WR_FLUSHED_ERR ERROR None 1053 */ 1054 /* 1055 * Channel in error state. Set connection to 1056 * ERROR and cleanup will happen either from 1057 * conn_release or from rib_conn_get 1058 */ 1059 wd->status = RDMA_FAILED; 1060 mutex_enter(&conn->c_lock); 1061 if (conn->c_state != C_DISCONN_PEND) 1062 conn->c_state = C_ERROR_CONN; 1063 mutex_exit(&conn->c_lock); 1064 break; 1065 } 1066 1067 if (wd->cv_sig == 1) { 1068 /* 1069 * Notify poster 1070 */ 1071 cv_signal(&wd->wait_cv); 1072 mutex_exit(&wd->sendwait_lock); 1073 } else { 1074 /* 1075 * Poster not waiting for notification. 
1076 * Free the send buffers and send_wid 1077 */ 1078 for (i = 0; i < wd->nsbufs; i++) { 1079 rib_rbuf_free(qptoc(wd->qp), 1080 SEND_BUFFER, 1081 (void *)(uintptr_t)wd->sbufaddr[i]); 1082 } 1083 1084 /* decrement the send ref count */ 1085 rib_send_rele(qp); 1086 1087 mutex_exit(&wd->sendwait_lock); 1088 (void) rib_free_sendwait(wd); 1089 } 1090 } 1091 } 1092 } 1093 1094 /* ARGSUSED */ 1095 static void 1096 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1097 { 1098 ibt_status_t ibt_status; 1099 ibt_wc_t wc; 1100 struct send_wid *wd; 1101 rib_qp_t *qp; 1102 CONN *conn; 1103 int i; 1104 1105 /* 1106 * Re-enable cq notify here to avoid missing any 1107 * completion queue notification. 1108 */ 1109 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1110 1111 ibt_status = IBT_SUCCESS; 1112 while (ibt_status != IBT_CQ_EMPTY) { 1113 bzero(&wc, sizeof (wc)); 1114 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1115 if (ibt_status != IBT_SUCCESS) 1116 return; 1117 1118 /* 1119 * Got a send completion 1120 */ 1121 if (wc.wc_id != RDMA_DUMMY_WRID) { 1122 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1123 qp = wd->qp; 1124 conn = qptoc(qp); 1125 mutex_enter(&wd->sendwait_lock); 1126 1127 switch (wc.wc_status) { 1128 case IBT_WC_SUCCESS: 1129 wd->status = RDMA_SUCCESS; 1130 break; 1131 default: 1132 /* 1133 * Channel in error state. Set connection to 1134 * ERROR and cleanup will happen either from 1135 * conn_release or conn timeout. 1136 */ 1137 wd->status = RDMA_FAILED; 1138 mutex_enter(&conn->c_lock); 1139 if (conn->c_state != C_DISCONN_PEND) 1140 conn->c_state = C_ERROR_CONN; 1141 mutex_exit(&conn->c_lock); 1142 break; 1143 } 1144 1145 if (wd->cv_sig == 1) { 1146 /* 1147 * Update completion status and notify poster 1148 */ 1149 cv_signal(&wd->wait_cv); 1150 mutex_exit(&wd->sendwait_lock); 1151 } else { 1152 /* 1153 * Poster not waiting for notification. 1154 * Free the send buffers and send_wid 1155 */ 1156 for (i = 0; i < wd->nsbufs; i++) { 1157 rib_rbuf_free(qptoc(wd->qp), 1158 SEND_BUFFER, 1159 (void *)(uintptr_t)wd->sbufaddr[i]); 1160 } 1161 1162 /* decrement the send ref count */ 1163 rib_send_rele(qp); 1164 1165 mutex_exit(&wd->sendwait_lock); 1166 (void) rib_free_sendwait(wd); 1167 } 1168 } 1169 } 1170 } 1171 1172 /* 1173 * RCQ handler 1174 */ 1175 /* ARGSUSED */ 1176 static void 1177 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1178 { 1179 rib_qp_t *qp; 1180 ibt_status_t ibt_status; 1181 ibt_wc_t wc; 1182 struct recv_wid *rwid; 1183 1184 /* 1185 * Re-enable cq notify here to avoid missing any 1186 * completion queue notification. 1187 */ 1188 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1189 1190 ibt_status = IBT_SUCCESS; 1191 while (ibt_status != IBT_CQ_EMPTY) { 1192 bzero(&wc, sizeof (wc)); 1193 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1194 if (ibt_status != IBT_SUCCESS) 1195 return; 1196 1197 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id; 1198 qp = rwid->qp; 1199 1200 if (wc.wc_status == IBT_WC_SUCCESS) { 1201 XDR inxdrs, *xdrs; 1202 uint_t xid, vers, op, find_xid = 0; 1203 struct reply *r; 1204 CONN *conn = qptoc(qp); 1205 uint32_t rdma_credit = 0; 1206 1207 xdrs = &inxdrs; 1208 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr, 1209 wc.wc_bytes_xfer, XDR_DECODE); 1210 /* 1211 * Treat xid as opaque (xid is the first entity 1212 * in the rpc rdma message). 1213 */ 1214 xid = *(uint32_t *)(uintptr_t)rwid->addr; 1215 1216 /* Skip xid and set the xdr position accordingly. 
*/ 1217 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1218 (void) xdr_u_int(xdrs, &vers); 1219 (void) xdr_u_int(xdrs, &rdma_credit); 1220 (void) xdr_u_int(xdrs, &op); 1221 XDR_DESTROY(xdrs); 1222 1223 if (vers != RPCRDMA_VERS) { 1224 /* 1225 * Invalid RPC/RDMA version. Cannot 1226 * interoperate. Set connection to 1227 * ERROR state and bail out. 1228 */ 1229 mutex_enter(&conn->c_lock); 1230 if (conn->c_state != C_DISCONN_PEND) 1231 conn->c_state = C_ERROR_CONN; 1232 mutex_exit(&conn->c_lock); 1233 rib_rbuf_free(conn, RECV_BUFFER, 1234 (void *)(uintptr_t)rwid->addr); 1235 rib_free_wid(rwid); 1236 rib_recv_rele(qp); 1237 continue; 1238 } 1239 1240 mutex_enter(&qp->replylist_lock); 1241 for (r = qp->replylist; r != NULL; r = r->next) { 1242 if (r->xid == xid) { 1243 find_xid = 1; 1244 switch (op) { 1245 case RDMA_MSG: 1246 case RDMA_NOMSG: 1247 case RDMA_MSGP: 1248 r->status = RDMA_SUCCESS; 1249 r->vaddr_cq = rwid->addr; 1250 r->bytes_xfer = 1251 wc.wc_bytes_xfer; 1252 cv_signal(&r->wait_cv); 1253 break; 1254 default: 1255 rib_rbuf_free(qptoc(qp), 1256 RECV_BUFFER, 1257 (void *)(uintptr_t) 1258 rwid->addr); 1259 break; 1260 } 1261 break; 1262 } 1263 } 1264 mutex_exit(&qp->replylist_lock); 1265 if (find_xid == 0) { 1266 /* RPC caller not waiting for reply */ 1267 1268 DTRACE_PROBE1(rpcib__i__nomatchxid1, 1269 int, xid); 1270 1271 rib_rbuf_free(qptoc(qp), RECV_BUFFER, 1272 (void *)(uintptr_t)rwid->addr); 1273 } 1274 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) { 1275 CONN *conn = qptoc(qp); 1276 1277 /* 1278 * Connection being flushed. Just free 1279 * the posted buffer 1280 */ 1281 rib_rbuf_free(conn, RECV_BUFFER, 1282 (void *)(uintptr_t)rwid->addr); 1283 } else { 1284 CONN *conn = qptoc(qp); 1285 /* 1286 * RC Recv Q Error Code Local state Remote State 1287 * ==================== =========== ============ 1288 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd 1289 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd 1290 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd 1291 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd 1292 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd 1293 * IBT_WC_WR_FLUSHED_ERR None None 1294 */ 1295 /* 1296 * Channel in error state. Set connection 1297 * in ERROR state. 1298 */ 1299 mutex_enter(&conn->c_lock); 1300 if (conn->c_state != C_DISCONN_PEND) 1301 conn->c_state = C_ERROR_CONN; 1302 mutex_exit(&conn->c_lock); 1303 rib_rbuf_free(conn, RECV_BUFFER, 1304 (void *)(uintptr_t)rwid->addr); 1305 } 1306 rib_free_wid(rwid); 1307 rib_recv_rele(qp); 1308 } 1309 } 1310 1311 /* Server side */ 1312 /* ARGSUSED */ 1313 static void 1314 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1315 { 1316 rdma_recv_data_t *rdp; 1317 rib_qp_t *qp; 1318 ibt_status_t ibt_status; 1319 ibt_wc_t wc; 1320 struct svc_recv *s_recvp; 1321 CONN *conn; 1322 mblk_t *mp; 1323 1324 /* 1325 * Re-enable cq notify here to avoid missing any 1326 * completion queue notification. 
     */
    (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

    ibt_status = IBT_SUCCESS;
    while (ibt_status != IBT_CQ_EMPTY) {
        bzero(&wc, sizeof (wc));
        ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
        if (ibt_status != IBT_SUCCESS)
            return;

        s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
        qp = s_recvp->qp;
        conn = qptoc(qp);

        if (wc.wc_status == IBT_WC_SUCCESS) {
            XDR     inxdrs, *xdrs;
            uint_t  xid, vers, op;
            uint32_t rdma_credit;

            xdrs = &inxdrs;
            /* s_recvp->vaddr stores data */
            xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
                wc.wc_bytes_xfer, XDR_DECODE);

            /*
             * Treat xid as opaque (xid is the first entity
             * in the rpc rdma message).
             */
            xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
            /* Skip xid and set the xdr position accordingly. */
            XDR_SETPOS(xdrs, sizeof (uint32_t));
            if (!xdr_u_int(xdrs, &vers) ||
                !xdr_u_int(xdrs, &rdma_credit) ||
                !xdr_u_int(xdrs, &op)) {
                rib_rbuf_free(conn, RECV_BUFFER,
                    (void *)(uintptr_t)s_recvp->vaddr);
                XDR_DESTROY(xdrs);
                rib_recv_rele(qp);
                (void) rib_free_svc_recv(s_recvp);
                continue;
            }
            XDR_DESTROY(xdrs);

            if (vers != RPCRDMA_VERS) {
                /*
                 * Invalid RPC/RDMA version.
                 * Drop rpc rdma message.
                 */
                rib_rbuf_free(conn, RECV_BUFFER,
                    (void *)(uintptr_t)s_recvp->vaddr);
                rib_recv_rele(qp);
                (void) rib_free_svc_recv(s_recvp);
                continue;
            }
            /*
             * Is this for RDMA_DONE?
             */
            if (op == RDMA_DONE) {
                rib_rbuf_free(conn, RECV_BUFFER,
                    (void *)(uintptr_t)s_recvp->vaddr);
                /*
                 * Wake up the thread waiting on
                 * a RDMA_DONE for xid
                 */
                mutex_enter(&qp->rdlist_lock);
                rdma_done_notify(qp, xid);
                mutex_exit(&qp->rdlist_lock);
                rib_recv_rele(qp);
                (void) rib_free_svc_recv(s_recvp);
                continue;
            }

            mutex_enter(&plugin_state_lock);
            mutex_enter(&conn->c_lock);
            if ((plugin_state == ACCEPT) &&
                (conn->c_state == C_CONNECTED)) {
                conn->c_ref++;
                mutex_exit(&conn->c_lock);
                while ((mp = allocb(sizeof (*rdp), BPRI_LO))
                    == NULL)
                    (void) strwaitbuf(
                        sizeof (*rdp), BPRI_LO);
                /*
                 * Plugin is in accept state, hence the master
                 * transport queue for this is still accepting
                 * requests. Hence we can call svc_queuereq to
                 * queue this received msg.
                 */
                rdp = (rdma_recv_data_t *)mp->b_rptr;
                rdp->conn = conn;
                rdp->rpcmsg.addr =
                    (caddr_t)(uintptr_t)s_recvp->vaddr;
                rdp->rpcmsg.type = RECV_BUFFER;
                rdp->rpcmsg.len = wc.wc_bytes_xfer;
                rdp->status = wc.wc_status;
                mp->b_wptr += sizeof (*rdp);
                svc_queuereq((queue_t *)rib_stat->q, mp);
                mutex_exit(&plugin_state_lock);
            } else {
                /*
                 * The master transport for this is going
                 * away and the queue is not accepting any more
                 * requests for krpc, so don't do anything, just
                 * free the msg.
1431 */ 1432 mutex_exit(&conn->c_lock); 1433 mutex_exit(&plugin_state_lock); 1434 rib_rbuf_free(conn, RECV_BUFFER, 1435 (void *)(uintptr_t)s_recvp->vaddr); 1436 } 1437 } else { 1438 rib_rbuf_free(conn, RECV_BUFFER, 1439 (void *)(uintptr_t)s_recvp->vaddr); 1440 } 1441 rib_recv_rele(qp); 1442 (void) rib_free_svc_recv(s_recvp); 1443 } 1444 } 1445 1446 static void 1447 rib_attach_hca() 1448 { 1449 mutex_enter(&rib_stat->open_hca_lock); 1450 (void) rpcib_open_hcas(rib_stat); 1451 rib_listen(NULL); 1452 mutex_exit(&rib_stat->open_hca_lock); 1453 } 1454 1455 /* 1456 * Handles DR event of IBT_HCA_DETACH_EVENT. 1457 */ 1458 /* ARGSUSED */ 1459 static void 1460 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 1461 ibt_async_code_t code, ibt_async_event_t *event) 1462 { 1463 switch (code) { 1464 case IBT_HCA_ATTACH_EVENT: 1465 rib_attach_hca(); 1466 break; 1467 case IBT_HCA_DETACH_EVENT: 1468 rib_detach_hca(hca_hdl); 1469 #ifdef DEBUG 1470 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); 1471 #endif 1472 break; 1473 case IBT_EVENT_PORT_UP: 1474 /* 1475 * A port is up. We should call rib_listen() since there is 1476 * a chance that rib_listen() may have failed during 1477 * rib_attach_hca() because the port had not been up yet. 1478 */ 1479 rib_listen(NULL); 1480 #ifdef DEBUG 1481 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); 1482 #endif 1483 break; 1484 #ifdef DEBUG 1485 case IBT_EVENT_PATH_MIGRATED: 1486 cmn_err(CE_NOTE, "rib_async_handler(): " 1487 "IBT_EVENT_PATH_MIGRATED\n"); 1488 break; 1489 case IBT_EVENT_SQD: 1490 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n"); 1491 break; 1492 case IBT_EVENT_COM_EST: 1493 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n"); 1494 break; 1495 case IBT_ERROR_CATASTROPHIC_CHAN: 1496 cmn_err(CE_NOTE, "rib_async_handler(): " 1497 "IBT_ERROR_CATASTROPHIC_CHAN\n"); 1498 break; 1499 case IBT_ERROR_INVALID_REQUEST_CHAN: 1500 cmn_err(CE_NOTE, "rib_async_handler(): " 1501 "IBT_ERROR_INVALID_REQUEST_CHAN\n"); 1502 break; 1503 case IBT_ERROR_ACCESS_VIOLATION_CHAN: 1504 cmn_err(CE_NOTE, "rib_async_handler(): " 1505 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n"); 1506 break; 1507 case IBT_ERROR_PATH_MIGRATE_REQ: 1508 cmn_err(CE_NOTE, "rib_async_handler(): " 1509 "IBT_ERROR_PATH_MIGRATE_REQ\n"); 1510 break; 1511 case IBT_ERROR_CQ: 1512 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n"); 1513 break; 1514 case IBT_ERROR_PORT_DOWN: 1515 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); 1516 break; 1517 case IBT_ASYNC_OPAQUE1: 1518 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); 1519 break; 1520 case IBT_ASYNC_OPAQUE2: 1521 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n"); 1522 break; 1523 case IBT_ASYNC_OPAQUE3: 1524 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n"); 1525 break; 1526 case IBT_ASYNC_OPAQUE4: 1527 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n"); 1528 break; 1529 #endif 1530 default: 1531 break; 1532 } 1533 } 1534 1535 /* 1536 * Client's reachable function. 
1537 */ 1538 static rdma_stat 1539 rib_reachable(int addr_type, struct netbuf *raddr, void **handle) 1540 { 1541 rdma_stat status; 1542 rpcib_ping_t rpt; 1543 struct netbuf saddr; 1544 CONN *conn; 1545 1546 bzero(&saddr, sizeof (struct netbuf)); 1547 status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn); 1548 1549 if (status == RDMA_SUCCESS) { 1550 *handle = (void *)rpt.hca; 1551 /* release the reference */ 1552 (void) rib_conn_release(conn); 1553 return (RDMA_SUCCESS); 1554 } else { 1555 *handle = NULL; 1556 DTRACE_PROBE(rpcib__i__pingfailed); 1557 return (RDMA_FAILED); 1558 } 1559 } 1560 1561 /* Client side qp creation */ 1562 static rdma_stat 1563 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) 1564 { 1565 rib_qp_t *kqp = NULL; 1566 CONN *conn; 1567 rdma_clnt_cred_ctrl_t *cc_info; 1568 1569 ASSERT(qp != NULL); 1570 *qp = NULL; 1571 1572 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1573 conn = qptoc(kqp); 1574 kqp->hca = hca; 1575 kqp->rdmaconn.c_rdmamod = &rib_mod; 1576 kqp->rdmaconn.c_private = (caddr_t)kqp; 1577 1578 kqp->mode = RIB_CLIENT; 1579 kqp->chan_flags = IBT_BLOCKING; 1580 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); 1581 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); 1582 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; 1583 /* 1584 * Initialize 1585 */ 1586 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1587 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1588 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1589 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1590 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1591 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); 1592 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1593 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1594 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1595 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1596 /* 1597 * Initialize the client credit control 1598 * portion of the rdmaconn struct. 
1599 */ 1600 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT; 1601 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1602 cc_info->clnt_cc_granted_ops = 0; 1603 cc_info->clnt_cc_in_flight_ops = 0; 1604 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL); 1605 1606 *qp = kqp; 1607 return (RDMA_SUCCESS); 1608 } 1609 1610 /* Server side qp creation */ 1611 static rdma_stat 1612 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) 1613 { 1614 rib_qp_t *kqp = NULL; 1615 ibt_chan_sizes_t chan_sizes; 1616 ibt_rc_chan_alloc_args_t qp_attr; 1617 ibt_status_t ibt_status; 1618 rdma_srv_cred_ctrl_t *cc_info; 1619 1620 *qp = NULL; 1621 1622 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1623 kqp->hca = hca; 1624 kqp->port_num = port; 1625 kqp->rdmaconn.c_rdmamod = &rib_mod; 1626 kqp->rdmaconn.c_private = (caddr_t)kqp; 1627 1628 /* 1629 * Create the qp handle 1630 */ 1631 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1632 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; 1633 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; 1634 qp_attr.rc_pd = hca->pd_hdl; 1635 qp_attr.rc_hca_port_num = port; 1636 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1637 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1638 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1639 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1640 qp_attr.rc_clone_chan = NULL; 1641 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1642 qp_attr.rc_flags = IBT_WR_SIGNALED; 1643 1644 rw_enter(&hca->state_lock, RW_READER); 1645 if (hca->state != HCA_DETACHED) { 1646 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1647 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl, 1648 &chan_sizes); 1649 } else { 1650 rw_exit(&hca->state_lock); 1651 goto fail; 1652 } 1653 rw_exit(&hca->state_lock); 1654 1655 if (ibt_status != IBT_SUCCESS) { 1656 DTRACE_PROBE1(rpcib__i_svccreatechanfail, 1657 int, ibt_status); 1658 goto fail; 1659 } 1660 1661 kqp->mode = RIB_SERVER; 1662 kqp->chan_flags = IBT_BLOCKING; 1663 kqp->q = q; /* server ONLY */ 1664 1665 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1666 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1667 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1668 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1669 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1670 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1671 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1672 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1673 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1674 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1675 /* 1676 * Set the private data area to qp to be used in callbacks 1677 */ 1678 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); 1679 kqp->rdmaconn.c_state = C_CONNECTED; 1680 1681 /* 1682 * Initialize the server credit control 1683 * portion of the rdmaconn struct. 
     */
    kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
    cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
    cc_info->srv_cc_buffers_granted = preposted_rbufs;
    cc_info->srv_cc_cur_buffers_used = 0;
    cc_info->srv_cc_posted = preposted_rbufs;

    *qp = kqp;

    return (RDMA_SUCCESS);
fail:
    if (kqp)
        kmem_free(kqp, sizeof (rib_qp_t));

    return (RDMA_FAILED);
}

/* ARGSUSED */
ibt_cm_status_t
rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
    ibt_cm_return_args_t *ret_args, void *priv_data,
    ibt_priv_data_len_t len)
{
    rib_hca_t *hca;

    hca = (rib_hca_t *)clnt_hdl;

    switch (event->cm_type) {

    /* got a connection close event */
    case IBT_CM_EVENT_CONN_CLOSED:
    {
        CONN        *conn;
        rib_qp_t    *qp;

        /* check reason why connection was closed */
        switch (event->cm_event.closed) {
        case IBT_CM_CLOSED_DREP_RCVD:
        case IBT_CM_CLOSED_DREQ_TIMEOUT:
        case IBT_CM_CLOSED_DUP:
        case IBT_CM_CLOSED_ABORT:
        case IBT_CM_CLOSED_ALREADY:
            /*
             * These cases indicate the local end initiated
             * the closing of the channel. Nothing to do here.
             */
            break;
        default:
            /*
             * Reason for CONN_CLOSED event must be one of
             * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
             * or IBT_CM_CLOSED_STALE. These indicate cases where
             * the remote end is closing the channel. In these
             * cases free the channel and transition to error
             * state
             */
            qp = ibt_get_chan_private(event->cm_channel);
            conn = qptoc(qp);
            mutex_enter(&conn->c_lock);
            if (conn->c_state == C_DISCONN_PEND) {
                mutex_exit(&conn->c_lock);
                break;
            }

            conn->c_state = C_ERROR_CONN;

            /*
             * Free the conn if c_ref is down to 0 already
             */
            if (conn->c_ref == 0) {
                /*
                 * Remove from list and free conn
                 */
                conn->c_state = C_DISCONN_PEND;
                mutex_exit(&conn->c_lock);
                rw_enter(&hca->state_lock, RW_READER);
                if (hca->state != HCA_DETACHED)
                    (void) rib_disconnect_channel(conn,
                        &hca->cl_conn_list);
                rw_exit(&hca->state_lock);
            } else {
                /*
                 * conn will be freed when c_ref goes to 0.
                 * Indicate to cleaning thread not to close
                 * the connection, but just free the channel.
                 */
                conn->c_flags |= C_CLOSE_NOTNEEDED;
                mutex_exit(&conn->c_lock);
            }
#ifdef DEBUG
            if (rib_debug)
                cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
                    "(CONN_CLOSED) channel disconnected");
#endif
            break;
        }
        break;
    }
    default:
        break;
    }
    return (IBT_CM_ACCEPT);
}

/*
 * Connect to the server.
1790 */ 1791 rdma_stat 1792 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp) 1793 { 1794 ibt_chan_open_args_t chan_args; /* channel args */ 1795 ibt_chan_sizes_t chan_sizes; 1796 ibt_rc_chan_alloc_args_t qp_attr; 1797 ibt_status_t ibt_status; 1798 ibt_rc_returns_t ret_args; /* conn reject info */ 1799 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */ 1800 ibt_ip_cm_info_t ipcm_info; 1801 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ]; 1802 1803 1804 (void) bzero(&chan_args, sizeof (chan_args)); 1805 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1806 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t)); 1807 1808 ipcm_info.src_addr.family = rptp->srcip.family; 1809 switch (ipcm_info.src_addr.family) { 1810 case AF_INET: 1811 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr; 1812 break; 1813 case AF_INET6: 1814 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr; 1815 break; 1816 } 1817 1818 ipcm_info.dst_addr.family = rptp->srcip.family; 1819 switch (ipcm_info.dst_addr.family) { 1820 case AF_INET: 1821 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr; 1822 break; 1823 case AF_INET6: 1824 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr; 1825 break; 1826 } 1827 1828 ipcm_info.src_port = (in_port_t)nfs_rdma_port; 1829 1830 ibt_status = ibt_format_ip_private_data(&ipcm_info, 1831 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt); 1832 1833 if (ibt_status != IBT_SUCCESS) { 1834 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n"); 1835 return (-1); 1836 } 1837 1838 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num; 1839 /* Alloc a RC channel */ 1840 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; 1841 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; 1842 qp_attr.rc_pd = hca->pd_hdl; 1843 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1844 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1845 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1846 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1847 qp_attr.rc_clone_chan = NULL; 1848 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1849 qp_attr.rc_flags = IBT_WR_SIGNALED; 1850 1851 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port); 1852 chan_args.oc_path = &rptp->path; 1853 1854 chan_args.oc_cm_handler = rib_clnt_cm_handler; 1855 chan_args.oc_cm_clnt_private = (void *)hca; 1856 chan_args.oc_rdma_ra_out = 4; 1857 chan_args.oc_rdma_ra_in = 4; 1858 chan_args.oc_path_retry_cnt = 2; 1859 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; 1860 chan_args.oc_priv_data = cmp_ip_pvt; 1861 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ; 1862 1863 refresh: 1864 rw_enter(&hca->state_lock, RW_READER); 1865 if (hca->state != HCA_DETACHED) { 1866 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1867 IBT_ACHAN_NO_FLAGS, 1868 &qp_attr, &qp->qp_hdl, 1869 &chan_sizes); 1870 } else { 1871 rw_exit(&hca->state_lock); 1872 return (RDMA_FAILED); 1873 } 1874 rw_exit(&hca->state_lock); 1875 1876 if (ibt_status != IBT_SUCCESS) { 1877 DTRACE_PROBE1(rpcib__i_conntosrv, 1878 int, ibt_status); 1879 return (RDMA_FAILED); 1880 } 1881 1882 /* Connect to the Server */ 1883 (void) bzero(&ret_args, sizeof (ret_args)); 1884 mutex_enter(&qp->cb_lock); 1885 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS, 1886 IBT_BLOCKING, &chan_args, &ret_args); 1887 if (ibt_status != IBT_SUCCESS) { 1888 DTRACE_PROBE2(rpcib__i_openrctosrv, 1889 int, ibt_status, int, ret_args.rc_status); 1890 1891 (void) ibt_free_channel(qp->qp_hdl); 1892 qp->qp_hdl = NULL; 1893 mutex_exit(&qp->cb_lock); 1894 if (refresh-- && ibt_status == IBT_CM_FAILURE 
&& 1895 ret_args.rc_status == IBT_CM_CONN_STALE) { 1896 /* 1897 * Got IBT_CM_CONN_STALE probably because of stale 1898 * data on the passive end of a channel that existed 1899 * prior to reboot. Retry establishing a channel 1900 * REFRESH_ATTEMPTS times, during which time the 1901 * stale conditions on the server might clear up. 1902 */ 1903 goto refresh; 1904 } 1905 return (RDMA_FAILED); 1906 } 1907 mutex_exit(&qp->cb_lock); 1908 /* 1909 * Set the private data area to qp to be used in callbacks 1910 */ 1911 ibt_set_chan_private(qp->qp_hdl, (void *)qp); 1912 return (RDMA_SUCCESS); 1913 } 1914 1915 rdma_stat 1916 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) 1917 { 1918 uint_t i, addr_count; 1919 ibt_status_t ibt_status; 1920 uint8_t num_paths_p; 1921 ibt_ip_path_attr_t ipattr; 1922 ibt_path_ip_src_t srcip; 1923 rpcib_ipaddrs_t addrs4; 1924 rpcib_ipaddrs_t addrs6; 1925 struct sockaddr_in *sinp; 1926 struct sockaddr_in6 *sin6p; 1927 rdma_stat retval = RDMA_FAILED; 1928 rib_hca_t *hca; 1929 1930 if ((addr_type != AF_INET) && (addr_type != AF_INET6)) 1931 return (RDMA_INVAL); 1932 ASSERT(raddr->buf != NULL); 1933 1934 bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); 1935 1936 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || 1937 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { 1938 retval = RDMA_FAILED; 1939 goto done2; 1940 } 1941 1942 if (addr_type == AF_INET) { 1943 addr_count = addrs4.ri_count; 1944 sinp = (struct sockaddr_in *)raddr->buf; 1945 rptp->dstip.family = AF_INET; 1946 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr; 1947 sinp = addrs4.ri_list; 1948 } else { 1949 addr_count = addrs6.ri_count; 1950 sin6p = (struct sockaddr_in6 *)raddr->buf; 1951 rptp->dstip.family = AF_INET6; 1952 rptp->dstip.un.ip6addr = sin6p->sin6_addr; 1953 sin6p = addrs6.ri_list; 1954 } 1955 1956 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 1957 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 1958 rw_enter(&hca->state_lock, RW_READER); 1959 if (hca->state == HCA_DETACHED) { 1960 rw_exit(&hca->state_lock); 1961 continue; 1962 } 1963 1964 ipattr.ipa_dst_ip = &rptp->dstip; 1965 ipattr.ipa_hca_guid = hca->hca_guid; 1966 ipattr.ipa_ndst = 1; 1967 ipattr.ipa_max_paths = 1; 1968 ipattr.ipa_src_ip.family = rptp->dstip.family; 1969 for (i = 0; i < addr_count; i++) { 1970 num_paths_p = 0; 1971 if (addr_type == AF_INET) { 1972 ipattr.ipa_src_ip.un.ip4addr = 1973 sinp[i].sin_addr.s_addr; 1974 } else { 1975 ipattr.ipa_src_ip.un.ip6addr = 1976 sin6p[i].sin6_addr; 1977 } 1978 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1979 1980 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1981 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, 1982 &num_paths_p, &srcip); 1983 if (ibt_status == IBT_SUCCESS && 1984 num_paths_p != 0 && 1985 rptp->path.pi_hca_guid == hca->hca_guid) { 1986 rptp->hca = hca; 1987 rw_exit(&hca->state_lock); 1988 if (addr_type == AF_INET) { 1989 rptp->srcip.family = AF_INET; 1990 rptp->srcip.un.ip4addr = 1991 srcip.ip_primary.un.ip4addr; 1992 } else { 1993 rptp->srcip.family = AF_INET6; 1994 rptp->srcip.un.ip6addr = 1995 srcip.ip_primary.un.ip6addr; 1996 1997 } 1998 retval = RDMA_SUCCESS; 1999 goto done1; 2000 } 2001 } 2002 rw_exit(&hca->state_lock); 2003 } 2004 done1: 2005 rw_exit(&rib_stat->hcas_list_lock); 2006 done2: 2007 if (addrs4.ri_size > 0) 2008 kmem_free(addrs4.ri_list, addrs4.ri_size); 2009 if (addrs6.ri_size > 0) 2010 kmem_free(addrs6.ri_list, addrs6.ri_size); 2011 return (retval); 2012 } 2013 2014 /* 2015 * Close channel, remove from connection list and 2016 * free up 
resources allocated for that channel. 2017 */ 2018 rdma_stat 2019 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list) 2020 { 2021 rib_qp_t *qp = ctoqp(conn); 2022 rib_hca_t *hca; 2023 2024 mutex_enter(&conn->c_lock); 2025 if (conn->c_timeout != NULL) { 2026 mutex_exit(&conn->c_lock); 2027 (void) untimeout(conn->c_timeout); 2028 mutex_enter(&conn->c_lock); 2029 } 2030 2031 while (conn->c_flags & C_CLOSE_PENDING) { 2032 cv_wait(&conn->c_cv, &conn->c_lock); 2033 } 2034 mutex_exit(&conn->c_lock); 2035 2036 /* 2037 * c_ref == 0 and connection is in C_DISCONN_PEND 2038 */ 2039 hca = qp->hca; 2040 if (conn_list != NULL) 2041 (void) rib_rm_conn(conn, conn_list); 2042 2043 /* 2044 * There is only one case where we get here with 2045 * qp_hdl = NULL, which is during connection setup on 2046 * the client. In such a case there are no posted 2047 * send/recv buffers. 2048 */ 2049 if (qp->qp_hdl != NULL) { 2050 mutex_enter(&qp->posted_rbufs_lock); 2051 while (qp->n_posted_rbufs) 2052 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock); 2053 mutex_exit(&qp->posted_rbufs_lock); 2054 2055 mutex_enter(&qp->send_rbufs_lock); 2056 while (qp->n_send_rbufs) 2057 cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock); 2058 mutex_exit(&qp->send_rbufs_lock); 2059 2060 (void) ibt_free_channel(qp->qp_hdl); 2061 qp->qp_hdl = NULL; 2062 } 2063 2064 ASSERT(qp->rdlist == NULL); 2065 2066 if (qp->replylist != NULL) { 2067 (void) rib_rem_replylist(qp); 2068 } 2069 2070 cv_destroy(&qp->cb_conn_cv); 2071 cv_destroy(&qp->posted_rbufs_cv); 2072 cv_destroy(&qp->send_rbufs_cv); 2073 mutex_destroy(&qp->cb_lock); 2074 mutex_destroy(&qp->replylist_lock); 2075 mutex_destroy(&qp->posted_rbufs_lock); 2076 mutex_destroy(&qp->send_rbufs_lock); 2077 mutex_destroy(&qp->rdlist_lock); 2078 2079 cv_destroy(&conn->c_cv); 2080 mutex_destroy(&conn->c_lock); 2081 2082 if (conn->c_raddr.buf != NULL) { 2083 kmem_free(conn->c_raddr.buf, conn->c_raddr.len); 2084 } 2085 if (conn->c_laddr.buf != NULL) { 2086 kmem_free(conn->c_laddr.buf, conn->c_laddr.len); 2087 } 2088 if (conn->c_netid != NULL) { 2089 kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1)); 2090 } 2091 if (conn->c_addrmask.buf != NULL) { 2092 kmem_free(conn->c_addrmask.buf, conn->c_addrmask.len); 2093 } 2094 2095 /* 2096 * Credit control cleanup. 2097 */ 2098 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) { 2099 rdma_clnt_cred_ctrl_t *cc_info; 2100 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 2101 cv_destroy(&cc_info->clnt_cc_cv); 2102 } 2103 2104 kmem_free(qp, sizeof (rib_qp_t)); 2105 2106 /* 2107 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL, 2108 * then the hca is no longer being used. 2109 */ 2110 if (conn_list != NULL) { 2111 rw_enter(&hca->state_lock, RW_READER); 2112 if (hca->state == HCA_DETACHED) { 2113 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 2114 if (hca->srv_conn_list.conn_hd == NULL) { 2115 rw_enter(&hca->cl_conn_list.conn_lock, 2116 RW_READER); 2117 2118 if (hca->cl_conn_list.conn_hd == NULL) { 2119 mutex_enter(&hca->inuse_lock); 2120 hca->inuse = FALSE; 2121 cv_signal(&hca->cb_cv); 2122 mutex_exit(&hca->inuse_lock); 2123 } 2124 rw_exit(&hca->cl_conn_list.conn_lock); 2125 } 2126 rw_exit(&hca->srv_conn_list.conn_lock); 2127 } 2128 rw_exit(&hca->state_lock); 2129 } 2130 2131 return (RDMA_SUCCESS); 2132 } 2133 2134 /* 2135 * All sends are done under the protection of 2136 * the wdesc->sendwait_lock. n_send_rbufs count 2137 * is protected using the send_rbufs_lock. 
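 * rib_send_hold()/rib_send_rele() below bracket every signaled send:
 * rib_send_and_wait(), rib_write() and rib_read() take a hold after a
 * successful ibt_post_send(), and the completion path (rib_sendwait()
 * or the send CQ handler) drops it. When n_send_rbufs reaches zero the
 * release signals send_rbufs_cv so rib_disconnect_channel() can proceed.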
2138 * lock ordering is: 2139 * sendwait_lock -> send_rbufs_lock 2140 */ 2141 2142 void 2143 rib_send_hold(rib_qp_t *qp) 2144 { 2145 mutex_enter(&qp->send_rbufs_lock); 2146 qp->n_send_rbufs++; 2147 mutex_exit(&qp->send_rbufs_lock); 2148 } 2149 2150 void 2151 rib_send_rele(rib_qp_t *qp) 2152 { 2153 mutex_enter(&qp->send_rbufs_lock); 2154 qp->n_send_rbufs--; 2155 if (qp->n_send_rbufs == 0) 2156 cv_signal(&qp->send_rbufs_cv); 2157 mutex_exit(&qp->send_rbufs_lock); 2158 } 2159 2160 void 2161 rib_recv_rele(rib_qp_t *qp) 2162 { 2163 mutex_enter(&qp->posted_rbufs_lock); 2164 qp->n_posted_rbufs--; 2165 if (qp->n_posted_rbufs == 0) 2166 cv_signal(&qp->posted_rbufs_cv); 2167 mutex_exit(&qp->posted_rbufs_lock); 2168 } 2169 2170 /* 2171 * Wait for send completion notification. Only on receiving a 2172 * notification be it a successful or error completion, free the 2173 * send_wid. 2174 */ 2175 static rdma_stat 2176 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 2177 { 2178 clock_t timout, cv_wait_ret; 2179 rdma_stat error = RDMA_SUCCESS; 2180 int i; 2181 2182 /* 2183 * Wait for send to complete 2184 */ 2185 ASSERT(wd != NULL); 2186 mutex_enter(&wd->sendwait_lock); 2187 if (wd->status == (uint_t)SEND_WAIT) { 2188 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 2189 ddi_get_lbolt(); 2190 2191 if (qp->mode == RIB_SERVER) { 2192 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, 2193 &wd->sendwait_lock, timout)) > 0 && 2194 wd->status == (uint_t)SEND_WAIT) 2195 ; 2196 switch (cv_wait_ret) { 2197 case -1: /* timeout */ 2198 DTRACE_PROBE(rpcib__i__srvsendwait__timeout); 2199 2200 wd->cv_sig = 0; /* no signal needed */ 2201 error = RDMA_TIMEDOUT; 2202 break; 2203 default: /* got send completion */ 2204 break; 2205 } 2206 } else { 2207 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, 2208 &wd->sendwait_lock, timout)) > 0 && 2209 wd->status == (uint_t)SEND_WAIT) 2210 ; 2211 switch (cv_wait_ret) { 2212 case -1: /* timeout */ 2213 DTRACE_PROBE(rpcib__i__clntsendwait__timeout); 2214 2215 wd->cv_sig = 0; /* no signal needed */ 2216 error = RDMA_TIMEDOUT; 2217 break; 2218 case 0: /* interrupted */ 2219 DTRACE_PROBE(rpcib__i__clntsendwait__intr); 2220 2221 wd->cv_sig = 0; /* no signal needed */ 2222 error = RDMA_INTR; 2223 break; 2224 default: /* got send completion */ 2225 break; 2226 } 2227 } 2228 } 2229 2230 if (wd->status != (uint_t)SEND_WAIT) { 2231 /* got send completion */ 2232 if (wd->status != RDMA_SUCCESS) { 2233 switch (wd->status) { 2234 case RDMA_CONNLOST: 2235 error = RDMA_CONNLOST; 2236 break; 2237 default: 2238 error = RDMA_FAILED; 2239 break; 2240 } 2241 } 2242 for (i = 0; i < wd->nsbufs; i++) { 2243 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2244 (void *)(uintptr_t)wd->sbufaddr[i]); 2245 } 2246 2247 rib_send_rele(qp); 2248 2249 mutex_exit(&wd->sendwait_lock); 2250 (void) rib_free_sendwait(wd); 2251 2252 } else { 2253 mutex_exit(&wd->sendwait_lock); 2254 } 2255 return (error); 2256 } 2257 2258 static struct send_wid * 2259 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) 2260 { 2261 struct send_wid *wd; 2262 2263 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); 2264 wd->xid = xid; 2265 wd->cv_sig = cv_sig; 2266 wd->qp = qp; 2267 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); 2268 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); 2269 wd->status = (uint_t)SEND_WAIT; 2270 2271 return (wd); 2272 } 2273 2274 static int 2275 rib_free_sendwait(struct send_wid *wdesc) 2276 { 2277 cv_destroy(&wdesc->wait_cv); 2278 mutex_destroy(&wdesc->sendwait_lock); 2279 kmem_free(wdesc, sizeof 
(*wdesc)); 2280 2281 return (0); 2282 } 2283 2284 static rdma_stat 2285 rib_rem_rep(rib_qp_t *qp, struct reply *rep) 2286 { 2287 mutex_enter(&qp->replylist_lock); 2288 if (rep != NULL) { 2289 (void) rib_remreply(qp, rep); 2290 mutex_exit(&qp->replylist_lock); 2291 return (RDMA_SUCCESS); 2292 } 2293 mutex_exit(&qp->replylist_lock); 2294 return (RDMA_FAILED); 2295 } 2296 2297 /* 2298 * Send buffers are freed here only in case of error in posting 2299 * on QP. If the post succeeded, the send buffers are freed upon 2300 * send completion in rib_sendwait() or in the scq_handler. 2301 */ 2302 rdma_stat 2303 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2304 int send_sig, int cv_sig, caddr_t *swid) 2305 { 2306 struct send_wid *wdesc; 2307 struct clist *clp; 2308 ibt_status_t ibt_status = IBT_SUCCESS; 2309 rdma_stat ret = RDMA_SUCCESS; 2310 ibt_send_wr_t tx_wr; 2311 int i, nds; 2312 ibt_wr_ds_t sgl[DSEG_MAX]; 2313 uint_t total_msg_size; 2314 rib_qp_t *qp; 2315 2316 qp = ctoqp(conn); 2317 2318 ASSERT(cl != NULL); 2319 2320 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2321 2322 nds = 0; 2323 total_msg_size = 0; 2324 clp = cl; 2325 while (clp != NULL) { 2326 if (nds >= DSEG_MAX) { 2327 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); 2328 return (RDMA_FAILED); 2329 } 2330 sgl[nds].ds_va = clp->w.c_saddr; 2331 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2332 sgl[nds].ds_len = clp->c_len; 2333 total_msg_size += clp->c_len; 2334 clp = clp->c_next; 2335 nds++; 2336 } 2337 2338 if (send_sig) { 2339 /* Set SEND_SIGNAL flag. */ 2340 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2341 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2342 *swid = (caddr_t)wdesc; 2343 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2344 mutex_enter(&wdesc->sendwait_lock); 2345 wdesc->nsbufs = nds; 2346 for (i = 0; i < nds; i++) { 2347 wdesc->sbufaddr[i] = sgl[i].ds_va; 2348 } 2349 } else { 2350 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2351 *swid = NULL; 2352 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2353 } 2354 2355 tx_wr.wr_opcode = IBT_WRC_SEND; 2356 tx_wr.wr_trans = IBT_RC_SRV; 2357 tx_wr.wr_nds = nds; 2358 tx_wr.wr_sgl = sgl; 2359 2360 mutex_enter(&conn->c_lock); 2361 if (conn->c_state == C_CONNECTED) { 2362 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2363 } 2364 if (conn->c_state != C_CONNECTED || 2365 ibt_status != IBT_SUCCESS) { 2366 if (conn->c_state != C_DISCONN_PEND) 2367 conn->c_state = C_ERROR_CONN; 2368 mutex_exit(&conn->c_lock); 2369 if (send_sig) { 2370 for (i = 0; i < nds; i++) { 2371 rib_rbuf_free(conn, SEND_BUFFER, 2372 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2373 } 2374 mutex_exit(&wdesc->sendwait_lock); 2375 (void) rib_free_sendwait(wdesc); 2376 } 2377 return (RDMA_CONNLOST); 2378 } 2379 2380 mutex_exit(&conn->c_lock); 2381 2382 if (send_sig) { 2383 rib_send_hold(qp); 2384 mutex_exit(&wdesc->sendwait_lock); 2385 if (cv_sig) { 2386 /* 2387 * cv_wait for send to complete. 2388 * We can fail due to a timeout or signal or 2389 * unsuccessful send. 2390 */ 2391 ret = rib_sendwait(qp, wdesc); 2392 2393 return (ret); 2394 } 2395 } 2396 2397 return (RDMA_SUCCESS); 2398 } 2399 2400 2401 rdma_stat 2402 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2403 { 2404 rdma_stat ret; 2405 caddr_t wd; 2406 2407 /* send-wait & cv_signal */ 2408 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); 2409 return (ret); 2410 } 2411 2412 /* 2413 * Deprecated/obsolete interface not used currently 2414 * but earlier used for READ-READ protocol. 2415 * Send RPC reply and wait for RDMA_DONE. 
2416 */ 2417 rdma_stat 2418 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2419 { 2420 rdma_stat ret = RDMA_SUCCESS; 2421 struct rdma_done_list *rd; 2422 clock_t cv_wait_ret; 2423 caddr_t *wid = NULL; 2424 rib_qp_t *qp = ctoqp(conn); 2425 2426 mutex_enter(&qp->rdlist_lock); 2427 rd = rdma_done_add(qp, msgid); 2428 2429 /* No cv_signal (whether send-wait or no-send-wait) */ 2430 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); 2431 2432 if (ret != RDMA_SUCCESS) { 2433 rdma_done_rm(qp, rd); 2434 } else { 2435 /* 2436 * Wait for RDMA_DONE from remote end 2437 */ 2438 cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv, 2439 &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000), 2440 TR_CLOCK_TICK); 2441 2442 rdma_done_rm(qp, rd); 2443 2444 if (cv_wait_ret < 0) { 2445 ret = RDMA_TIMEDOUT; 2446 } 2447 } 2448 2449 mutex_exit(&qp->rdlist_lock); 2450 return (ret); 2451 } 2452 2453 static struct recv_wid * 2454 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2455 { 2456 struct recv_wid *rwid; 2457 2458 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2459 rwid->xid = msgid; 2460 rwid->addr = sgl->ds_va; 2461 rwid->qp = qp; 2462 2463 return (rwid); 2464 } 2465 2466 static void 2467 rib_free_wid(struct recv_wid *rwid) 2468 { 2469 kmem_free(rwid, sizeof (struct recv_wid)); 2470 } 2471 2472 rdma_stat 2473 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2474 { 2475 rib_qp_t *qp = ctoqp(conn); 2476 struct clist *clp = cl; 2477 struct reply *rep; 2478 struct recv_wid *rwid; 2479 int nds; 2480 ibt_wr_ds_t sgl[DSEG_MAX]; 2481 ibt_recv_wr_t recv_wr; 2482 rdma_stat ret; 2483 ibt_status_t ibt_status; 2484 2485 /* 2486 * rdma_clnt_postrecv uses RECV_BUFFER. 2487 */ 2488 2489 nds = 0; 2490 while (cl != NULL) { 2491 if (nds >= DSEG_MAX) { 2492 ret = RDMA_FAILED; 2493 goto done; 2494 } 2495 sgl[nds].ds_va = cl->w.c_saddr; 2496 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2497 sgl[nds].ds_len = cl->c_len; 2498 cl = cl->c_next; 2499 nds++; 2500 } 2501 2502 if (nds != 1) { 2503 ret = RDMA_FAILED; 2504 goto done; 2505 } 2506 2507 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2508 recv_wr.wr_nds = nds; 2509 recv_wr.wr_sgl = sgl; 2510 2511 rwid = rib_create_wid(qp, &sgl[0], msgid); 2512 if (rwid) { 2513 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; 2514 } else { 2515 ret = RDMA_NORESOURCE; 2516 goto done; 2517 } 2518 rep = rib_addreplylist(qp, msgid); 2519 if (!rep) { 2520 rib_free_wid(rwid); 2521 ret = RDMA_NORESOURCE; 2522 goto done; 2523 } 2524 2525 mutex_enter(&conn->c_lock); 2526 2527 if (conn->c_state == C_CONNECTED) { 2528 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2529 } 2530 2531 if (conn->c_state != C_CONNECTED || 2532 ibt_status != IBT_SUCCESS) { 2533 if (conn->c_state != C_DISCONN_PEND) 2534 conn->c_state = C_ERROR_CONN; 2535 mutex_exit(&conn->c_lock); 2536 rib_free_wid(rwid); 2537 (void) rib_rem_rep(qp, rep); 2538 ret = RDMA_CONNLOST; 2539 goto done; 2540 } 2541 2542 mutex_enter(&qp->posted_rbufs_lock); 2543 qp->n_posted_rbufs++; 2544 mutex_exit(&qp->posted_rbufs_lock); 2545 2546 mutex_exit(&conn->c_lock); 2547 return (RDMA_SUCCESS); 2548 2549 done: 2550 while (clp != NULL) { 2551 rib_rbuf_free(conn, RECV_BUFFER, 2552 (void *)(uintptr_t)clp->w.c_saddr3); 2553 clp = clp->c_next; 2554 } 2555 return (ret); 2556 } 2557 2558 rdma_stat 2559 rib_svc_post(CONN* conn, struct clist *cl) 2560 { 2561 rib_qp_t *qp = ctoqp(conn); 2562 struct svc_recv *s_recvp; 2563 int nds; 2564 ibt_wr_ds_t sgl[DSEG_MAX]; 2565 ibt_recv_wr_t recv_wr; 2566 ibt_status_t 
ibt_status; 2567 2568 nds = 0; 2569 while (cl != NULL) { 2570 if (nds >= DSEG_MAX) { 2571 return (RDMA_FAILED); 2572 } 2573 sgl[nds].ds_va = cl->w.c_saddr; 2574 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2575 sgl[nds].ds_len = cl->c_len; 2576 cl = cl->c_next; 2577 nds++; 2578 } 2579 2580 if (nds != 1) { 2581 rib_rbuf_free(conn, RECV_BUFFER, 2582 (caddr_t)(uintptr_t)sgl[0].ds_va); 2583 2584 return (RDMA_FAILED); 2585 } 2586 2587 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2588 recv_wr.wr_nds = nds; 2589 recv_wr.wr_sgl = sgl; 2590 2591 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2592 /* Use s_recvp's addr as wr id */ 2593 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; 2594 mutex_enter(&conn->c_lock); 2595 if (conn->c_state == C_CONNECTED) { 2596 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2597 } 2598 if (conn->c_state != C_CONNECTED || 2599 ibt_status != IBT_SUCCESS) { 2600 if (conn->c_state != C_DISCONN_PEND) 2601 conn->c_state = C_ERROR_CONN; 2602 mutex_exit(&conn->c_lock); 2603 rib_rbuf_free(conn, RECV_BUFFER, 2604 (caddr_t)(uintptr_t)sgl[0].ds_va); 2605 (void) rib_free_svc_recv(s_recvp); 2606 2607 return (RDMA_CONNLOST); 2608 } 2609 mutex_exit(&conn->c_lock); 2610 2611 return (RDMA_SUCCESS); 2612 } 2613 2614 /* Client */ 2615 rdma_stat 2616 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2617 { 2618 return (rib_clnt_post(conn, cl, msgid)); 2619 } 2620 2621 /* Client */ 2622 rdma_stat 2623 rib_post_resp_remove(CONN* conn, uint32_t msgid) 2624 { 2625 rib_qp_t *qp = ctoqp(conn); 2626 struct reply *rep; 2627 2628 mutex_enter(&qp->replylist_lock); 2629 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2630 if (rep->xid == msgid) { 2631 if (rep->vaddr_cq) { 2632 rib_rbuf_free(conn, RECV_BUFFER, 2633 (caddr_t)(uintptr_t)rep->vaddr_cq); 2634 } 2635 (void) rib_remreply(qp, rep); 2636 break; 2637 } 2638 } 2639 mutex_exit(&qp->replylist_lock); 2640 2641 return (RDMA_SUCCESS); 2642 } 2643 2644 /* Server */ 2645 rdma_stat 2646 rib_post_recv(CONN *conn, struct clist *cl) 2647 { 2648 rib_qp_t *qp = ctoqp(conn); 2649 2650 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2651 mutex_enter(&qp->posted_rbufs_lock); 2652 qp->n_posted_rbufs++; 2653 mutex_exit(&qp->posted_rbufs_lock); 2654 return (RDMA_SUCCESS); 2655 } 2656 return (RDMA_FAILED); 2657 } 2658 2659 /* 2660 * Client side only interface to "recv" the rpc reply buf 2661 * posted earlier by rib_post_resp(conn, cl, msgid). 2662 */ 2663 rdma_stat 2664 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2665 { 2666 struct reply *rep = NULL; 2667 clock_t timout, cv_wait_ret; 2668 rdma_stat ret = RDMA_SUCCESS; 2669 rib_qp_t *qp = ctoqp(conn); 2670 2671 /* 2672 * Find the reply structure for this msgid 2673 */ 2674 mutex_enter(&qp->replylist_lock); 2675 2676 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2677 if (rep->xid == msgid) 2678 break; 2679 } 2680 2681 if (rep != NULL) { 2682 /* 2683 * If message not yet received, wait. 
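 * The wait below is bounded by REPLY_WAIT_TIME and uses
 * cv_timedwait_sig(), so it can also be cut short by a signal;
 * the caller then sees RDMA_TIMEDOUT or RDMA_INTR respectively.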
2684 */ 2685 if (rep->status == (uint_t)REPLY_WAIT) { 2686 timout = ddi_get_lbolt() + 2687 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2688 2689 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2690 &qp->replylist_lock, timout)) > 0 && 2691 rep->status == (uint_t)REPLY_WAIT) 2692 ; 2693 2694 switch (cv_wait_ret) { 2695 case -1: /* timeout */ 2696 ret = RDMA_TIMEDOUT; 2697 break; 2698 case 0: 2699 ret = RDMA_INTR; 2700 break; 2701 default: 2702 break; 2703 } 2704 } 2705 2706 if (rep->status == RDMA_SUCCESS) { 2707 struct clist *cl = NULL; 2708 2709 /* 2710 * Got message successfully 2711 */ 2712 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2713 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); 2714 *clp = cl; 2715 } else { 2716 if (rep->status != (uint_t)REPLY_WAIT) { 2717 /* 2718 * Got error in reply message. Free 2719 * recv buffer here. 2720 */ 2721 ret = rep->status; 2722 rib_rbuf_free(conn, RECV_BUFFER, 2723 (caddr_t)(uintptr_t)rep->vaddr_cq); 2724 } 2725 } 2726 (void) rib_remreply(qp, rep); 2727 } else { 2728 /* 2729 * No matching reply structure found for given msgid on the 2730 * reply wait list. 2731 */ 2732 ret = RDMA_INVAL; 2733 DTRACE_PROBE(rpcib__i__nomatchxid2); 2734 } 2735 2736 /* 2737 * Done. 2738 */ 2739 mutex_exit(&qp->replylist_lock); 2740 return (ret); 2741 } 2742 2743 /* 2744 * RDMA write a buffer to the remote address. 2745 */ 2746 rdma_stat 2747 rib_write(CONN *conn, struct clist *cl, int wait) 2748 { 2749 ibt_send_wr_t tx_wr; 2750 int cv_sig; 2751 ibt_wr_ds_t sgl[DSEG_MAX]; 2752 struct send_wid *wdesc; 2753 ibt_status_t ibt_status; 2754 rdma_stat ret = RDMA_SUCCESS; 2755 rib_qp_t *qp = ctoqp(conn); 2756 uint64_t n_writes = 0; 2757 2758 if (cl == NULL) { 2759 return (RDMA_FAILED); 2760 } 2761 2762 while ((cl != NULL)) { 2763 if (cl->c_len > 0) { 2764 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2765 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; 2766 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = 2767 cl->c_dmemhandle.mrc_rmr; /* rkey */ 2768 sgl[0].ds_va = cl->w.c_saddr; 2769 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2770 sgl[0].ds_len = cl->c_len; 2771 2772 if (wait) { 2773 cv_sig = 1; 2774 } else { 2775 if (n_writes > max_unsignaled_rws) { 2776 n_writes = 0; 2777 cv_sig = 1; 2778 } else { 2779 cv_sig = 0; 2780 } 2781 } 2782 2783 if (cv_sig) { 2784 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2785 wdesc = rib_init_sendwait(0, cv_sig, qp); 2786 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2787 mutex_enter(&wdesc->sendwait_lock); 2788 } else { 2789 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2790 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2791 } 2792 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2793 tx_wr.wr_trans = IBT_RC_SRV; 2794 tx_wr.wr_nds = 1; 2795 tx_wr.wr_sgl = sgl; 2796 2797 mutex_enter(&conn->c_lock); 2798 if (conn->c_state == C_CONNECTED) { 2799 ibt_status = 2800 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2801 } 2802 if (conn->c_state != C_CONNECTED || 2803 ibt_status != IBT_SUCCESS) { 2804 if (conn->c_state != C_DISCONN_PEND) 2805 conn->c_state = C_ERROR_CONN; 2806 mutex_exit(&conn->c_lock); 2807 if (cv_sig) { 2808 mutex_exit(&wdesc->sendwait_lock); 2809 (void) rib_free_sendwait(wdesc); 2810 } 2811 return (RDMA_CONNLOST); 2812 } 2813 2814 mutex_exit(&conn->c_lock); 2815 2816 /* 2817 * Wait for send to complete 2818 */ 2819 if (cv_sig) { 2820 2821 rib_send_hold(qp); 2822 mutex_exit(&wdesc->sendwait_lock); 2823 2824 ret = rib_sendwait(qp, wdesc); 2825 if (ret != 0) 2826 return (ret); 2827 } 2828 n_writes ++; 2829 } 2830 cl = cl->c_next; 2831 } 2832 return (RDMA_SUCCESS); 2833 } 2834 2835 /* 2836 
* RDMA Read a buffer from the remote address. 2837 */ 2838 rdma_stat 2839 rib_read(CONN *conn, struct clist *cl, int wait) 2840 { 2841 ibt_send_wr_t rx_wr; 2842 int cv_sig = 0; 2843 ibt_wr_ds_t sgl; 2844 struct send_wid *wdesc; 2845 ibt_status_t ibt_status = IBT_SUCCESS; 2846 rdma_stat ret = RDMA_SUCCESS; 2847 rib_qp_t *qp = ctoqp(conn); 2848 2849 if (cl == NULL) { 2850 return (RDMA_FAILED); 2851 } 2852 2853 while (cl != NULL) { 2854 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2855 /* 2856 * Remote address is at the head chunk item in list. 2857 */ 2858 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; 2859 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; 2860 2861 sgl.ds_va = cl->u.c_daddr; 2862 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2863 sgl.ds_len = cl->c_len; 2864 2865 /* 2866 * If there are multiple chunks to be read, and 2867 * wait is set, ask for signal only for the last chunk 2868 * and wait only on the last chunk. The completion of 2869 * RDMA_READ on last chunk ensures that reads on all 2870 * previous chunks are also completed. 2871 */ 2872 if (wait && (cl->c_next == NULL)) { 2873 cv_sig = 1; 2874 wdesc = rib_init_sendwait(0, cv_sig, qp); 2875 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2876 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2877 mutex_enter(&wdesc->sendwait_lock); 2878 } else { 2879 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2880 rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2881 } 2882 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2883 rx_wr.wr_trans = IBT_RC_SRV; 2884 rx_wr.wr_nds = 1; 2885 rx_wr.wr_sgl = &sgl; 2886 2887 mutex_enter(&conn->c_lock); 2888 if (conn->c_state == C_CONNECTED) { 2889 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2890 } 2891 if (conn->c_state != C_CONNECTED || 2892 ibt_status != IBT_SUCCESS) { 2893 if (conn->c_state != C_DISCONN_PEND) 2894 conn->c_state = C_ERROR_CONN; 2895 mutex_exit(&conn->c_lock); 2896 if (wait && (cl->c_next == NULL)) { 2897 mutex_exit(&wdesc->sendwait_lock); 2898 (void) rib_free_sendwait(wdesc); 2899 } 2900 return (RDMA_CONNLOST); 2901 } 2902 2903 mutex_exit(&conn->c_lock); 2904 2905 /* 2906 * Wait for send to complete if this is the 2907 * last item in the list. 2908 */ 2909 if (wait && cl->c_next == NULL) { 2910 rib_send_hold(qp); 2911 mutex_exit(&wdesc->sendwait_lock); 2912 2913 ret = rib_sendwait(qp, wdesc); 2914 2915 if (ret != 0) 2916 return (ret); 2917 } 2918 cl = cl->c_next; 2919 } 2920 return (RDMA_SUCCESS); 2921 } 2922 2923 /* 2924 * rib_srv_cm_handler() 2925 * Connection Manager callback to handle RC connection requests. 2926 */ 2927 /* ARGSUSED */ 2928 static ibt_cm_status_t 2929 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2930 ibt_cm_return_args_t *ret_args, void *priv_data, 2931 ibt_priv_data_len_t len) 2932 { 2933 queue_t *q; 2934 rib_qp_t *qp; 2935 rib_hca_t *hca; 2936 rdma_stat status = RDMA_SUCCESS; 2937 int i; 2938 struct clist cl; 2939 rdma_buf_t rdbuf = {0}; 2940 void *buf = NULL; 2941 CONN *conn; 2942 ibt_ip_cm_info_t ipinfo; 2943 struct sockaddr_in *s; 2944 struct sockaddr_in6 *s6; 2945 int sin_size = sizeof (struct sockaddr_in); 2946 int in_size = sizeof (struct in_addr); 2947 int sin6_size = sizeof (struct sockaddr_in6); 2948 2949 ASSERT(any != NULL); 2950 ASSERT(event != NULL); 2951 2952 hca = (rib_hca_t *)any; 2953 2954 /* got a connection request */ 2955 switch (event->cm_type) { 2956 case IBT_CM_EVENT_REQ_RCV: 2957 /* 2958 * If the plugin is in the NO_ACCEPT state, bail out. 
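 * (plugin_state moves to ACCEPT in rib_register_service() once at
 * least one service bind succeeds, and back to NO_ACCEPT in
 * rib_listen_stop().)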
2959 */ 2960 mutex_enter(&plugin_state_lock); 2961 if (plugin_state == NO_ACCEPT) { 2962 mutex_exit(&plugin_state_lock); 2963 return (IBT_CM_REJECT); 2964 } 2965 mutex_exit(&plugin_state_lock); 2966 2967 /* 2968 * Need to send a MRA MAD to CM so that it does not 2969 * timeout on us. 2970 */ 2971 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2972 event->cm_event.req.req_timeout * 8, NULL, 0); 2973 2974 mutex_enter(&rib_stat->open_hca_lock); 2975 q = rib_stat->q; 2976 mutex_exit(&rib_stat->open_hca_lock); 2977 2978 status = rib_svc_create_chan(hca, (caddr_t)q, 2979 event->cm_event.req.req_prim_hca_port, &qp); 2980 2981 if (status) { 2982 return (IBT_CM_REJECT); 2983 } 2984 2985 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2986 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 2987 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 2988 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2989 2990 /* 2991 * Pre-posts RECV buffers 2992 */ 2993 conn = qptoc(qp); 2994 for (i = 0; i < preposted_rbufs; i++) { 2995 bzero(&rdbuf, sizeof (rdbuf)); 2996 rdbuf.type = RECV_BUFFER; 2997 buf = rib_rbuf_alloc(conn, &rdbuf); 2998 if (buf == NULL) { 2999 /* 3000 * A connection is not established yet. 3001 * Just flush the channel. Buffers 3002 * posted till now will error out with 3003 * IBT_WC_WR_FLUSHED_ERR. 3004 */ 3005 (void) ibt_flush_channel(qp->qp_hdl); 3006 (void) rib_disconnect_channel(conn, NULL); 3007 return (IBT_CM_REJECT); 3008 } 3009 3010 bzero(&cl, sizeof (cl)); 3011 cl.w.c_saddr3 = (caddr_t)rdbuf.addr; 3012 cl.c_len = rdbuf.len; 3013 cl.c_smemhandle.mrc_lmr = 3014 rdbuf.handle.mrc_lmr; /* lkey */ 3015 cl.c_next = NULL; 3016 status = rib_post_recv(conn, &cl); 3017 if (status != RDMA_SUCCESS) { 3018 /* 3019 * A connection is not established yet. 3020 * Just flush the channel. Buffers 3021 * posted till now will error out with 3022 * IBT_WC_WR_FLUSHED_ERR. 
3023 */ 3024 (void) ibt_flush_channel(qp->qp_hdl); 3025 (void) rib_disconnect_channel(conn, NULL); 3026 return (IBT_CM_REJECT); 3027 } 3028 } 3029 (void) rib_add_connlist(conn, &hca->srv_conn_list); 3030 3031 /* 3032 * Get the address translation 3033 */ 3034 rw_enter(&hca->state_lock, RW_READER); 3035 if (hca->state == HCA_DETACHED) { 3036 rw_exit(&hca->state_lock); 3037 return (IBT_CM_REJECT); 3038 } 3039 rw_exit(&hca->state_lock); 3040 3041 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t)); 3042 3043 if (ibt_get_ip_data(event->cm_priv_data_len, 3044 event->cm_priv_data, 3045 &ipinfo) != IBT_SUCCESS) { 3046 3047 return (IBT_CM_REJECT); 3048 } 3049 3050 switch (ipinfo.src_addr.family) { 3051 case AF_INET: 3052 3053 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, 3054 KM_SLEEP); 3055 (void) strcpy(conn->c_netid, RIBNETID_TCP); 3056 3057 conn->c_raddr.maxlen = 3058 conn->c_raddr.len = sin_size; 3059 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3060 3061 s = (struct sockaddr_in *)conn->c_raddr.buf; 3062 s->sin_family = AF_INET; 3063 bcopy((void *)&ipinfo.src_addr.un.ip4addr, 3064 &s->sin_addr, in_size); 3065 3066 conn->c_laddr.maxlen = 3067 conn->c_laddr.len = sin_size; 3068 conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3069 3070 s = (struct sockaddr_in *)conn->c_laddr.buf; 3071 s->sin_family = AF_INET; 3072 bcopy((void *)&ipinfo.dst_addr.un.ip4addr, 3073 &s->sin_addr, in_size); 3074 3075 conn->c_addrmask.maxlen = conn->c_addrmask.len = 3076 sizeof (struct sockaddr_in); 3077 conn->c_addrmask.buf = 3078 kmem_zalloc(conn->c_addrmask.len, KM_SLEEP); 3079 ((struct sockaddr_in *) 3080 conn->c_addrmask.buf)->sin_addr.s_addr = 3081 (uint32_t)~0; 3082 ((struct sockaddr_in *) 3083 conn->c_addrmask.buf)->sin_family = 3084 (sa_family_t)~0; 3085 break; 3086 3087 case AF_INET6: 3088 3089 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, 3090 KM_SLEEP); 3091 (void) strcpy(conn->c_netid, RIBNETID_TCP6); 3092 3093 conn->c_raddr.maxlen = 3094 conn->c_raddr.len = sin6_size; 3095 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3096 3097 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 3098 s6->sin6_family = AF_INET6; 3099 bcopy((void *)&ipinfo.src_addr.un.ip6addr, 3100 &s6->sin6_addr, 3101 sizeof (struct in6_addr)); 3102 3103 conn->c_laddr.maxlen = 3104 conn->c_laddr.len = sin6_size; 3105 conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3106 3107 s6 = (struct sockaddr_in6 *)conn->c_laddr.buf; 3108 s6->sin6_family = AF_INET6; 3109 bcopy((void *)&ipinfo.dst_addr.un.ip6addr, 3110 &s6->sin6_addr, 3111 sizeof (struct in6_addr)); 3112 3113 conn->c_addrmask.maxlen = conn->c_addrmask.len = 3114 sizeof (struct sockaddr_in6); 3115 conn->c_addrmask.buf = 3116 kmem_zalloc(conn->c_addrmask.len, KM_SLEEP); 3117 (void) memset(&((struct sockaddr_in6 *) 3118 conn->c_addrmask.buf)->sin6_addr, (uchar_t)~0, 3119 sizeof (struct in6_addr)); 3120 ((struct sockaddr_in6 *) 3121 conn->c_addrmask.buf)->sin6_family = 3122 (sa_family_t)~0; 3123 break; 3124 3125 default: 3126 return (IBT_CM_REJECT); 3127 } 3128 3129 break; 3130 3131 case IBT_CM_EVENT_CONN_CLOSED: 3132 { 3133 CONN *conn; 3134 rib_qp_t *qp; 3135 3136 switch (event->cm_event.closed) { 3137 case IBT_CM_CLOSED_DREP_RCVD: 3138 case IBT_CM_CLOSED_DREQ_TIMEOUT: 3139 case IBT_CM_CLOSED_DUP: 3140 case IBT_CM_CLOSED_ABORT: 3141 case IBT_CM_CLOSED_ALREADY: 3142 /* 3143 * These cases indicate the local end initiated 3144 * the closing of the channel. Nothing to do here. 
3145 */ 3146 break; 3147 default: 3148 /* 3149 * Reason for CONN_CLOSED event must be one of 3150 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 3151 * or IBT_CM_CLOSED_STALE. These indicate cases were 3152 * the remote end is closing the channel. In these 3153 * cases free the channel and transition to error 3154 * state 3155 */ 3156 qp = ibt_get_chan_private(event->cm_channel); 3157 conn = qptoc(qp); 3158 mutex_enter(&conn->c_lock); 3159 if (conn->c_state == C_DISCONN_PEND) { 3160 mutex_exit(&conn->c_lock); 3161 break; 3162 } 3163 conn->c_state = C_ERROR_CONN; 3164 3165 /* 3166 * Free the conn if c_ref goes down to 0 3167 */ 3168 if (conn->c_ref == 0) { 3169 /* 3170 * Remove from list and free conn 3171 */ 3172 conn->c_state = C_DISCONN_PEND; 3173 mutex_exit(&conn->c_lock); 3174 (void) rib_disconnect_channel(conn, 3175 &hca->srv_conn_list); 3176 } else { 3177 /* 3178 * conn will be freed when c_ref goes to 0. 3179 * Indicate to cleaning thread not to close 3180 * the connection, but just free the channel. 3181 */ 3182 conn->c_flags |= C_CLOSE_NOTNEEDED; 3183 mutex_exit(&conn->c_lock); 3184 } 3185 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect); 3186 break; 3187 } 3188 break; 3189 } 3190 case IBT_CM_EVENT_CONN_EST: 3191 /* 3192 * RTU received, hence connection established. 3193 */ 3194 if (rib_debug > 1) 3195 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3196 "(CONN_EST) channel established"); 3197 break; 3198 3199 default: 3200 if (rib_debug > 2) { 3201 /* Let CM handle the following events. */ 3202 if (event->cm_type == IBT_CM_EVENT_REP_RCV) { 3203 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3204 "server recv'ed IBT_CM_EVENT_REP_RCV\n"); 3205 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) { 3206 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3207 "server recv'ed IBT_CM_EVENT_LAP_RCV\n"); 3208 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) { 3209 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3210 "server recv'ed IBT_CM_EVENT_MRA_RCV\n"); 3211 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) { 3212 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3213 "server recv'ed IBT_CM_EVENT_APR_RCV\n"); 3214 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) { 3215 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3216 "server recv'ed IBT_CM_EVENT_FAILURE\n"); 3217 } 3218 } 3219 return (IBT_CM_DEFAULT); 3220 } 3221 3222 /* accept all other CM messages (i.e. 
let the CM handle them) */ 3223 return (IBT_CM_ACCEPT); 3224 } 3225 3226 static rdma_stat 3227 rib_register_service(rib_hca_t *hca, int service_type, 3228 uint8_t protocol_num, in_port_t dst_port) 3229 { 3230 ibt_srv_desc_t sdesc; 3231 ibt_hca_portinfo_t *port_infop; 3232 ib_svc_id_t srv_id; 3233 ibt_srv_hdl_t srv_hdl; 3234 uint_t port_size; 3235 uint_t pki, i, num_ports, nbinds; 3236 ibt_status_t ibt_status; 3237 rib_service_t *service; 3238 ib_pkey_t pkey; 3239 3240 /* 3241 * Query all ports for the given HCA 3242 */ 3243 rw_enter(&hca->state_lock, RW_READER); 3244 if (hca->state != HCA_DETACHED) { 3245 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop, 3246 &num_ports, &port_size); 3247 rw_exit(&hca->state_lock); 3248 } else { 3249 rw_exit(&hca->state_lock); 3250 return (RDMA_FAILED); 3251 } 3252 if (ibt_status != IBT_SUCCESS) { 3253 return (RDMA_FAILED); 3254 } 3255 3256 DTRACE_PROBE1(rpcib__i__regservice_numports, 3257 int, num_ports); 3258 3259 for (i = 0; i < num_ports; i++) { 3260 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) { 3261 DTRACE_PROBE1(rpcib__i__regservice__portinactive, 3262 int, i+1); 3263 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) { 3264 DTRACE_PROBE1(rpcib__i__regservice__portactive, 3265 int, i+1); 3266 } 3267 } 3268 3269 /* 3270 * Get all the IP addresses on this system to register the 3271 * given "service type" on all DNS recognized IP addrs. 3272 * Each service type such as NFS will have all the systems 3273 * IP addresses as its different names. For now the only 3274 * type of service we support in RPCIB is NFS. 3275 */ 3276 rw_enter(&rib_stat->service_list_lock, RW_WRITER); 3277 /* 3278 * Start registering and binding service to active 3279 * on active ports on this HCA. 3280 */ 3281 nbinds = 0; 3282 for (service = rib_stat->service_list; 3283 service && (service->srv_type != service_type); 3284 service = service->next) 3285 ; 3286 3287 if (service == NULL) { 3288 /* 3289 * We use IP addresses as the service names for 3290 * service registration. Register each of them 3291 * with CM to obtain a svc_id and svc_hdl. We do not 3292 * register the service with machine's loopback address. 
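 * The service id itself is derived from the IP protocol number and
 * port via ibt_get_ip_sid(), so the NFS service is registered for
 * IPPROTO_TCP on nfs_rdma_port.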
3293 */
3294 (void) bzero(&srv_id, sizeof (ib_svc_id_t));
3295 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3296 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3297 sdesc.sd_handler = rib_srv_cm_handler;
3298 sdesc.sd_flags = 0;
3299 ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3300 &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
3301 1, &srv_hdl, &srv_id);
3302 if ((ibt_status != IBT_SUCCESS) &&
3303 (ibt_status != IBT_CM_SERVICE_EXISTS)) {
3304 rw_exit(&rib_stat->service_list_lock);
3305 DTRACE_PROBE1(rpcib__i__regservice__ibtres,
3306 int, ibt_status);
3307 ibt_free_portinfo(port_infop, port_size);
3308 return (RDMA_FAILED);
3309 }
3310
3311 /*
3312 * Allocate and prepare a service entry
3313 */
3314 service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);
3315
3316 service->srv_type = service_type;
3317 service->srv_hdl = srv_hdl;
3318 service->srv_id = srv_id;
3319
3320 service->next = rib_stat->service_list;
3321 rib_stat->service_list = service;
3322 DTRACE_PROBE1(rpcib__i__regservice__new__service,
3323 int, service->srv_type);
3324 } else {
3325 srv_hdl = service->srv_hdl;
3326 srv_id = service->srv_id;
3327 DTRACE_PROBE1(rpcib__i__regservice__existing__service,
3328 int, service->srv_type);
3329 }
3330
3331 for (i = 0; i < num_ports; i++) {
3332 ibt_sbind_hdl_t sbp;
3333 rib_hca_service_t *hca_srv;
3334 ib_gid_t gid;
3335
3336 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3337 continue;
3338
3339 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3340 pkey = port_infop[i].p_pkey_tbl[pki];
3341
3342 rw_enter(&hca->bound_services_lock, RW_READER);
3343 gid = port_infop[i].p_sgid_tbl[0];
3344 for (hca_srv = hca->bound_services; hca_srv;
3345 hca_srv = hca_srv->next) {
3346 if ((hca_srv->srv_id == service->srv_id) &&
3347 (hca_srv->gid.gid_prefix ==
3348 gid.gid_prefix) &&
3349 (hca_srv->gid.gid_guid == gid.gid_guid))
3350 break;
3351 }
3352 rw_exit(&hca->bound_services_lock);
3353 if (hca_srv != NULL) {
3354 /*
3355 * port is already bound to the service
3356 */
3357 DTRACE_PROBE1(
3358 rpcib__i__regservice__already__bound,
3359 int, i+1);
3360 nbinds++;
3361 continue;
3362 }
3363
3364 if ((pkey & IBSRM_HB) &&
3365 (pkey != IB_PKEY_INVALID_FULL)) {
3366
3367 sbp = NULL;
3368 ibt_status = ibt_bind_service(srv_hdl,
3369 gid, NULL, hca, &sbp);
3370
3371 if (ibt_status == IBT_SUCCESS) {
3372 hca_srv = kmem_zalloc(
3373 sizeof (rib_hca_service_t),
3374 KM_SLEEP);
3375 hca_srv->srv_id = srv_id;
3376 hca_srv->gid = gid;
3377 hca_srv->sbind_hdl = sbp;
3378
3379 rw_enter(&hca->bound_services_lock,
3380 RW_WRITER);
3381 hca_srv->next = hca->bound_services;
3382 hca->bound_services = hca_srv;
3383 rw_exit(&hca->bound_services_lock);
3384 nbinds++;
3385 }
3386
3387 DTRACE_PROBE1(rpcib__i__regservice__bindres,
3388 int, ibt_status);
3389 }
3390 }
3391 }
3392 rw_exit(&rib_stat->service_list_lock);
3393
3394 ibt_free_portinfo(port_infop, port_size);
3395
3396 if (nbinds == 0) {
3397 return (RDMA_FAILED);
3398 } else {
3399 /*
3400 * Put this plugin into accept state, since at least
3401 * one registration was successful.
3402 */
3403 mutex_enter(&plugin_state_lock);
3404 plugin_state = ACCEPT;
3405 mutex_exit(&plugin_state_lock);
3406 return (RDMA_SUCCESS);
3407 }
3408 }
3409
3410 void
3411 rib_listen(struct rdma_svc_data *rd)
3412 {
3413 rdma_stat status;
3414 int n_listening = 0;
3415 rib_hca_t *hca;
3416
3417 mutex_enter(&rib_stat->listen_lock);
3418 /*
3419 * If the rd parameter is NULL then it means that rib_stat->q is
3420 * already initialized by a call from RDMA and we just want to
3421 * add a newly attached HCA to the same listening state as other
3422 * HCAs.
3423 */
3424 if (rd == NULL) {
3425 if (rib_stat->q == NULL) {
3426 mutex_exit(&rib_stat->listen_lock);
3427 return;
3428 }
3429 } else {
3430 rib_stat->q = &rd->q;
3431 }
3432 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3433 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3434 /*
3435 * First check if an HCA is still attached
3436 */
3437 rw_enter(&hca->state_lock, RW_READER);
3438 if (hca->state != HCA_INITED) {
3439 rw_exit(&hca->state_lock);
3440 continue;
3441 }
3442 rw_exit(&hca->state_lock);
3443
3444 /*
3445 * Right now the only service type is NFS. Hence
3446 * force feed this value. Ideally the service type
3447 * should be passed down in
3448 * rdma_svc_data.
3449 */
3450 status = rib_register_service(hca, NFS,
3451 IPPROTO_TCP, nfs_rdma_port);
3452 if (status == RDMA_SUCCESS)
3453 n_listening++;
3454 }
3455 rw_exit(&rib_stat->hcas_list_lock);
3456
3457 /*
3458 * Service active on an HCA; check rd->err_code for a
3459 * more detailed error.
3460 */
3461 if (rd) {
3462 if (n_listening > 0) {
3463 rd->active = 1;
3464 rd->err_code = RDMA_SUCCESS;
3465 } else {
3466 rd->active = 0;
3467 rd->err_code = RDMA_FAILED;
3468 }
3469 }
3470 mutex_exit(&rib_stat->listen_lock);
3471 }
3472
3473 /* XXXX */
3474 /* ARGSUSED */
3475 static void
3476 rib_listen_stop(struct rdma_svc_data *svcdata)
3477 {
3478 rib_hca_t *hca;
3479
3480 mutex_enter(&rib_stat->listen_lock);
3481 /*
3482 * KRPC called the RDMATF to stop the listeners. This means we
3483 * stop sending incoming or received requests to the KRPC master
3484 * transport handle for RDMA-IB. It also means that the
3485 * master transport handle, responsible for us, is going away.
3486 */
3487 mutex_enter(&plugin_state_lock);
3488 plugin_state = NO_ACCEPT;
3489 if (svcdata != NULL)
3490 svcdata->active = 0;
3491 mutex_exit(&plugin_state_lock);
3492
3493 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3494 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3495 /*
3496 * First check if an HCA is still attached
3497 */
3498 rw_enter(&hca->state_lock, RW_READER);
3499 if (hca->state == HCA_DETACHED) {
3500 rw_exit(&hca->state_lock);
3501 continue;
3502 }
3503 rib_close_channels(&hca->srv_conn_list);
3504 rib_stop_services(hca);
3505 rw_exit(&hca->state_lock);
3506 }
3507 rw_exit(&rib_stat->hcas_list_lock);
3508
3509 /*
3510 * Avoid rib_listen() using the stale q field.
3511 * This could happen if a port goes up after all services
3512 * are already unregistered.
3513 */
3514 rib_stat->q = NULL;
3515 mutex_exit(&rib_stat->listen_lock);
3516 }
3517
3518 /*
3519 * Traverse the HCA's service list to unbind and deregister services.
3520 * For each bound service of the HCA to be removed, first find the
3521 * corresponding service handle (srv_hdl) and then unbind the service by
3522 * calling ibt_unbind_service().
3523 */ 3524 static void 3525 rib_stop_services(rib_hca_t *hca) 3526 { 3527 rib_hca_service_t *srv_list, *to_remove; 3528 3529 /* 3530 * unbind and deregister the services for this service type. 3531 * Right now there is only one service type. In future it will 3532 * be passed down to this function. 3533 */ 3534 rw_enter(&hca->bound_services_lock, RW_READER); 3535 srv_list = hca->bound_services; 3536 hca->bound_services = NULL; 3537 rw_exit(&hca->bound_services_lock); 3538 3539 while (srv_list != NULL) { 3540 rib_service_t *sc; 3541 3542 to_remove = srv_list; 3543 srv_list = to_remove->next; 3544 rw_enter(&rib_stat->service_list_lock, RW_READER); 3545 for (sc = rib_stat->service_list; 3546 sc && (sc->srv_id != to_remove->srv_id); 3547 sc = sc->next) 3548 ; 3549 /* 3550 * if sc is NULL then the service doesn't exist anymore, 3551 * probably just removed completely through rib_stat. 3552 */ 3553 if (sc != NULL) 3554 (void) ibt_unbind_service(sc->srv_hdl, 3555 to_remove->sbind_hdl); 3556 rw_exit(&rib_stat->service_list_lock); 3557 kmem_free(to_remove, sizeof (rib_hca_service_t)); 3558 } 3559 } 3560 3561 static struct svc_recv * 3562 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) 3563 { 3564 struct svc_recv *recvp; 3565 3566 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); 3567 recvp->vaddr = sgl->ds_va; 3568 recvp->qp = qp; 3569 recvp->bytes_xfer = 0; 3570 return (recvp); 3571 } 3572 3573 static int 3574 rib_free_svc_recv(struct svc_recv *recvp) 3575 { 3576 kmem_free(recvp, sizeof (*recvp)); 3577 3578 return (0); 3579 } 3580 3581 static struct reply * 3582 rib_addreplylist(rib_qp_t *qp, uint32_t msgid) 3583 { 3584 struct reply *rep; 3585 3586 3587 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); 3588 if (rep == NULL) { 3589 DTRACE_PROBE(rpcib__i__addrreply__nomem); 3590 return (NULL); 3591 } 3592 rep->xid = msgid; 3593 rep->vaddr_cq = NULL; 3594 rep->bytes_xfer = 0; 3595 rep->status = (uint_t)REPLY_WAIT; 3596 rep->prev = NULL; 3597 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); 3598 3599 mutex_enter(&qp->replylist_lock); 3600 if (qp->replylist) { 3601 rep->next = qp->replylist; 3602 qp->replylist->prev = rep; 3603 } 3604 qp->rep_list_size++; 3605 3606 DTRACE_PROBE1(rpcib__i__addrreply__listsize, 3607 int, qp->rep_list_size); 3608 3609 qp->replylist = rep; 3610 mutex_exit(&qp->replylist_lock); 3611 3612 return (rep); 3613 } 3614 3615 static rdma_stat 3616 rib_rem_replylist(rib_qp_t *qp) 3617 { 3618 struct reply *r, *n; 3619 3620 mutex_enter(&qp->replylist_lock); 3621 for (r = qp->replylist; r != NULL; r = n) { 3622 n = r->next; 3623 (void) rib_remreply(qp, r); 3624 } 3625 mutex_exit(&qp->replylist_lock); 3626 3627 return (RDMA_SUCCESS); 3628 } 3629 3630 static int 3631 rib_remreply(rib_qp_t *qp, struct reply *rep) 3632 { 3633 3634 ASSERT(MUTEX_HELD(&qp->replylist_lock)); 3635 if (rep->prev) { 3636 rep->prev->next = rep->next; 3637 } 3638 if (rep->next) { 3639 rep->next->prev = rep->prev; 3640 } 3641 if (qp->replylist == rep) 3642 qp->replylist = rep->next; 3643 3644 cv_destroy(&rep->wait_cv); 3645 qp->rep_list_size--; 3646 3647 DTRACE_PROBE1(rpcib__i__remreply__listsize, 3648 int, qp->rep_list_size); 3649 3650 kmem_free(rep, sizeof (*rep)); 3651 3652 return (0); 3653 } 3654 3655 rdma_stat 3656 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3657 struct mrc *buf_handle) 3658 { 3659 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3660 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3661 rdma_stat status; 3662 rib_hca_t *hca = (ctoqp(conn))->hca; 3663 
3664 /* 3665 * Note: ALL buffer pools use the same memory type RDMARW. 3666 */ 3667 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3668 if (status == RDMA_SUCCESS) { 3669 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3670 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3671 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3672 } else { 3673 buf_handle->mrc_linfo = NULL; 3674 buf_handle->mrc_lmr = 0; 3675 buf_handle->mrc_rmr = 0; 3676 } 3677 return (status); 3678 } 3679 3680 static rdma_stat 3681 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, 3682 ibt_mr_flags_t spec, 3683 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3684 { 3685 ibt_mr_attr_t mem_attr; 3686 ibt_status_t ibt_status; 3687 mem_attr.mr_vaddr = (uintptr_t)buf; 3688 mem_attr.mr_len = (ib_msglen_t)size; 3689 mem_attr.mr_as = (struct as *)(caddr_t)adsp; 3690 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3691 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3692 IBT_MR_ENABLE_WINDOW_BIND | spec; 3693 3694 rw_enter(&hca->state_lock, RW_READER); 3695 if (hca->state != HCA_DETACHED) { 3696 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3697 &mem_attr, mr_hdlp, mr_descp); 3698 rw_exit(&hca->state_lock); 3699 } else { 3700 rw_exit(&hca->state_lock); 3701 return (RDMA_FAILED); 3702 } 3703 3704 if (ibt_status != IBT_SUCCESS) { 3705 return (RDMA_FAILED); 3706 } 3707 return (RDMA_SUCCESS); 3708 } 3709 3710 rdma_stat 3711 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3712 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 3713 { 3714 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3715 rib_lrc_entry_t *l; 3716 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3717 rdma_stat status; 3718 rib_hca_t *hca = (ctoqp(conn))->hca; 3719 3720 /* 3721 * Non-coherent memory registration. 3722 */ 3723 l = (rib_lrc_entry_t *)lrc; 3724 if (l) { 3725 if (l->registered) { 3726 buf_handle->mrc_linfo = 3727 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3728 buf_handle->mrc_lmr = 3729 (uint32_t)l->lrc_mhandle.mrc_lmr; 3730 buf_handle->mrc_rmr = 3731 (uint32_t)l->lrc_mhandle.mrc_rmr; 3732 *sync_handle = (RIB_SYNCMEM_HANDLE) 3733 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3734 return (RDMA_SUCCESS); 3735 } else { 3736 /* Always register the whole buffer */ 3737 buf = (caddr_t)l->lrc_buf; 3738 buflen = l->lrc_len; 3739 } 3740 } 3741 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3742 3743 if (status == RDMA_SUCCESS) { 3744 if (l) { 3745 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; 3746 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey; 3747 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey; 3748 l->registered = TRUE; 3749 } 3750 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3751 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3752 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3753 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3754 } else { 3755 buf_handle->mrc_linfo = NULL; 3756 buf_handle->mrc_lmr = 0; 3757 buf_handle->mrc_rmr = 0; 3758 } 3759 return (status); 3760 } 3761 3762 /* ARGSUSED */ 3763 rdma_stat 3764 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3765 { 3766 rib_hca_t *hca = (ctoqp(conn))->hca; 3767 /* 3768 * Allow memory deregistration even if HCA is 3769 * getting detached. Need all outstanding 3770 * memory registrations to be deregistered 3771 * before HCA_DETACH_EVENT can be accepted. 
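 * (Hence, unlike rib_reg_mem() and rib_syncmem(), no hca->state check
 * is made under hca->state_lock here, and the return status of
 * ibt_deregister_mr() is ignored.)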
3772 */ 3773 (void) ibt_deregister_mr(hca->hca_hdl, 3774 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 3775 return (RDMA_SUCCESS); 3776 } 3777 3778 /* ARGSUSED */ 3779 rdma_stat 3780 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3781 RIB_SYNCMEM_HANDLE sync_handle, void *lrc) 3782 { 3783 rib_lrc_entry_t *l; 3784 l = (rib_lrc_entry_t *)lrc; 3785 if (l) 3786 if (l->registered) 3787 return (RDMA_SUCCESS); 3788 3789 (void) rib_deregistermem(conn, buf, buf_handle); 3790 3791 return (RDMA_SUCCESS); 3792 } 3793 3794 /* ARGSUSED */ 3795 rdma_stat 3796 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3797 int len, int cpu) 3798 { 3799 ibt_status_t status; 3800 rib_hca_t *hca = (ctoqp(conn))->hca; 3801 ibt_mr_sync_t mr_segment; 3802 3803 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3804 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 3805 mr_segment.ms_len = (ib_memlen_t)len; 3806 if (cpu) { 3807 /* make incoming data visible to memory */ 3808 mr_segment.ms_flags = IBT_SYNC_WRITE; 3809 } else { 3810 /* make memory changes visible to IO */ 3811 mr_segment.ms_flags = IBT_SYNC_READ; 3812 } 3813 rw_enter(&hca->state_lock, RW_READER); 3814 if (hca->state != HCA_DETACHED) { 3815 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3816 rw_exit(&hca->state_lock); 3817 } else { 3818 rw_exit(&hca->state_lock); 3819 return (RDMA_FAILED); 3820 } 3821 3822 if (status == IBT_SUCCESS) 3823 return (RDMA_SUCCESS); 3824 else { 3825 return (RDMA_FAILED); 3826 } 3827 } 3828 3829 /* 3830 * XXXX ???? 3831 */ 3832 static rdma_stat 3833 rib_getinfo(rdma_info_t *info) 3834 { 3835 /* 3836 * XXXX Hack! 3837 */ 3838 info->addrlen = 16; 3839 info->mts = 1000000; 3840 info->mtu = 1000000; 3841 3842 return (RDMA_SUCCESS); 3843 } 3844 3845 rib_bufpool_t * 3846 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3847 { 3848 rib_bufpool_t *rbp = NULL; 3849 bufpool_t *bp = NULL; 3850 caddr_t buf; 3851 ibt_mr_attr_t mem_attr; 3852 ibt_status_t ibt_status; 3853 int i, j; 3854 3855 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3856 3857 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3858 num * sizeof (void *), KM_SLEEP); 3859 3860 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3861 bp->numelems = num; 3862 3863 3864 switch (ptype) { 3865 case SEND_BUFFER: 3866 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3867 bp->rsize = RPC_MSG_SZ; 3868 break; 3869 case RECV_BUFFER: 3870 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3871 bp->rsize = RPC_BUF_SIZE; 3872 break; 3873 default: 3874 goto fail; 3875 } 3876 3877 /* 3878 * Register the pool. 
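 * The pool is one contiguous allocation of num buffers of rsize
 * bytes each; every buffer is registered with the HCA individually,
 * so each slot has its own entry in mr_hdl[]/mr_desc[].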
3879 */ 3880 bp->bufsize = num * bp->rsize; 3881 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3882 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3883 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3884 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3885 sizeof (ibt_mr_desc_t), KM_SLEEP); 3886 rw_enter(&hca->state_lock, RW_READER); 3887 3888 if (hca->state == HCA_DETACHED) { 3889 rw_exit(&hca->state_lock); 3890 goto fail; 3891 } 3892 3893 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3894 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3895 mem_attr.mr_vaddr = (uintptr_t)buf; 3896 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3897 mem_attr.mr_as = NULL; 3898 ibt_status = ibt_register_mr(hca->hca_hdl, 3899 hca->pd_hdl, &mem_attr, 3900 &rbp->mr_hdl[i], 3901 &rbp->mr_desc[i]); 3902 if (ibt_status != IBT_SUCCESS) { 3903 for (j = 0; j < i; j++) { 3904 (void) ibt_deregister_mr(hca->hca_hdl, 3905 rbp->mr_hdl[j]); 3906 } 3907 rw_exit(&hca->state_lock); 3908 goto fail; 3909 } 3910 } 3911 rw_exit(&hca->state_lock); 3912 buf = (caddr_t)bp->buf; 3913 for (i = 0; i < num; i++, buf += bp->rsize) { 3914 bp->buflist[i] = (void *)buf; 3915 } 3916 bp->buffree = num - 1; /* no. of free buffers */ 3917 rbp->bpool = bp; 3918 3919 return (rbp); 3920 fail: 3921 if (bp) { 3922 if (bp->buf) 3923 kmem_free(bp->buf, bp->bufsize); 3924 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3925 } 3926 if (rbp) { 3927 if (rbp->mr_hdl) 3928 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3929 if (rbp->mr_desc) 3930 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3931 kmem_free(rbp, sizeof (rib_bufpool_t)); 3932 } 3933 return (NULL); 3934 } 3935 3936 static void 3937 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3938 { 3939 int i; 3940 rib_bufpool_t *rbp = NULL; 3941 bufpool_t *bp; 3942 3943 /* 3944 * Obtain pool address based on type of pool 3945 */ 3946 switch (ptype) { 3947 case SEND_BUFFER: 3948 rbp = hca->send_pool; 3949 break; 3950 case RECV_BUFFER: 3951 rbp = hca->recv_pool; 3952 break; 3953 default: 3954 return; 3955 } 3956 if (rbp == NULL) 3957 return; 3958 3959 bp = rbp->bpool; 3960 3961 /* 3962 * Deregister the pool memory and free it. 3963 */ 3964 for (i = 0; i < bp->numelems; i++) { 3965 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3966 } 3967 } 3968 3969 static void 3970 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3971 { 3972 3973 rib_bufpool_t *rbp = NULL; 3974 bufpool_t *bp; 3975 3976 /* 3977 * Obtain pool address based on type of pool 3978 */ 3979 switch (ptype) { 3980 case SEND_BUFFER: 3981 rbp = hca->send_pool; 3982 break; 3983 case RECV_BUFFER: 3984 rbp = hca->recv_pool; 3985 break; 3986 default: 3987 return; 3988 } 3989 if (rbp == NULL) 3990 return; 3991 3992 bp = rbp->bpool; 3993 3994 /* 3995 * Free the pool memory. 3996 */ 3997 if (rbp->mr_hdl) 3998 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 3999 4000 if (rbp->mr_desc) 4001 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 4002 if (bp->buf) 4003 kmem_free(bp->buf, bp->bufsize); 4004 mutex_destroy(&bp->buflock); 4005 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 4006 kmem_free(rbp, sizeof (rib_bufpool_t)); 4007 } 4008 4009 void 4010 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 4011 { 4012 /* 4013 * Deregister the pool memory and free it. 4014 */ 4015 rib_rbufpool_deregister(hca, ptype); 4016 rib_rbufpool_free(hca, ptype); 4017 } 4018 4019 /* 4020 * Fetch a buffer from the pool of type specified in rdbuf->type. 
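 * RDMA_LONG_BUFFER requests are satisfied from the long reply cache
 * via rib_get_cache_buf(); SEND_BUFFER and RECV_BUFFER requests come
 * from the HCA's preregistered send/recv buffer pools via
 * rib_rbuf_alloc().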
4021 */ 4022 static rdma_stat 4023 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4024 { 4025 rib_lrc_entry_t *rlep; 4026 4027 if (rdbuf->type == RDMA_LONG_BUFFER) { 4028 rlep = rib_get_cache_buf(conn, rdbuf->len); 4029 rdbuf->rb_private = (caddr_t)rlep; 4030 rdbuf->addr = rlep->lrc_buf; 4031 rdbuf->handle = rlep->lrc_mhandle; 4032 return (RDMA_SUCCESS); 4033 } 4034 4035 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 4036 if (rdbuf->addr) { 4037 switch (rdbuf->type) { 4038 case SEND_BUFFER: 4039 rdbuf->len = RPC_MSG_SZ; /* 1K */ 4040 break; 4041 case RECV_BUFFER: 4042 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 4043 break; 4044 default: 4045 rdbuf->len = 0; 4046 } 4047 return (RDMA_SUCCESS); 4048 } else 4049 return (RDMA_FAILED); 4050 } 4051 4052 /* 4053 * Fetch a buffer of specified type. 4054 * Note that rdbuf->handle is mw's rkey. 4055 */ 4056 static void * 4057 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4058 { 4059 rib_qp_t *qp = ctoqp(conn); 4060 rib_hca_t *hca = qp->hca; 4061 rdma_btype ptype = rdbuf->type; 4062 void *buf; 4063 rib_bufpool_t *rbp = NULL; 4064 bufpool_t *bp; 4065 int i; 4066 4067 /* 4068 * Obtain pool address based on type of pool 4069 */ 4070 switch (ptype) { 4071 case SEND_BUFFER: 4072 rbp = hca->send_pool; 4073 break; 4074 case RECV_BUFFER: 4075 rbp = hca->recv_pool; 4076 break; 4077 default: 4078 return (NULL); 4079 } 4080 if (rbp == NULL) 4081 return (NULL); 4082 4083 bp = rbp->bpool; 4084 4085 mutex_enter(&bp->buflock); 4086 if (bp->buffree < 0) { 4087 mutex_exit(&bp->buflock); 4088 return (NULL); 4089 } 4090 4091 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */ 4092 buf = bp->buflist[bp->buffree]; 4093 rdbuf->addr = buf; 4094 rdbuf->len = bp->rsize; 4095 for (i = bp->numelems - 1; i >= 0; i--) { 4096 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 4097 rdbuf->handle.mrc_rmr = 4098 (uint32_t)rbp->mr_desc[i].md_rkey; 4099 rdbuf->handle.mrc_linfo = 4100 (uintptr_t)rbp->mr_hdl[i]; 4101 rdbuf->handle.mrc_lmr = 4102 (uint32_t)rbp->mr_desc[i].md_lkey; 4103 bp->buffree--; 4104 4105 mutex_exit(&bp->buflock); 4106 4107 return (buf); 4108 } 4109 } 4110 4111 mutex_exit(&bp->buflock); 4112 4113 return (NULL); 4114 } 4115 4116 static void 4117 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 4118 { 4119 4120 if (rdbuf->type == RDMA_LONG_BUFFER) { 4121 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private); 4122 rdbuf->rb_private = NULL; 4123 return; 4124 } 4125 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 4126 } 4127 4128 static void 4129 rib_rbuf_free(CONN *conn, int ptype, void *buf) 4130 { 4131 rib_qp_t *qp = ctoqp(conn); 4132 rib_hca_t *hca = qp->hca; 4133 rib_bufpool_t *rbp = NULL; 4134 bufpool_t *bp; 4135 4136 /* 4137 * Obtain pool address based on type of pool 4138 */ 4139 switch (ptype) { 4140 case SEND_BUFFER: 4141 rbp = hca->send_pool; 4142 break; 4143 case RECV_BUFFER: 4144 rbp = hca->recv_pool; 4145 break; 4146 default: 4147 return; 4148 } 4149 if (rbp == NULL) 4150 return; 4151 4152 bp = rbp->bpool; 4153 4154 mutex_enter(&bp->buflock); 4155 if (++bp->buffree >= bp->numelems) { 4156 /* 4157 * Should never happen 4158 */ 4159 bp->buffree--; 4160 } else { 4161 bp->buflist[bp->buffree] = buf; 4162 } 4163 mutex_exit(&bp->buflock); 4164 } 4165 4166 static rdma_stat 4167 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 4168 { 4169 rw_enter(&connlist->conn_lock, RW_WRITER); 4170 if (connlist->conn_hd) { 4171 cn->c_next = connlist->conn_hd; 4172 connlist->conn_hd->c_prev = cn; 4173 } 4174 connlist->conn_hd = cn; 4175 
rw_exit(&connlist->conn_lock); 4176 4177 return (RDMA_SUCCESS); 4178 } 4179 4180 static rdma_stat 4181 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) 4182 { 4183 rw_enter(&connlist->conn_lock, RW_WRITER); 4184 if (cn->c_prev) { 4185 cn->c_prev->c_next = cn->c_next; 4186 } 4187 if (cn->c_next) { 4188 cn->c_next->c_prev = cn->c_prev; 4189 } 4190 if (connlist->conn_hd == cn) 4191 connlist->conn_hd = cn->c_next; 4192 rw_exit(&connlist->conn_lock); 4193 4194 return (RDMA_SUCCESS); 4195 } 4196 4197 /* ARGSUSED */ 4198 static rdma_stat 4199 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, 4200 int addr_type, void *handle, CONN **conn) 4201 { 4202 rdma_stat status; 4203 rpcib_ping_t rpt; 4204 4205 status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn); 4206 return (status); 4207 } 4208 4209 /* 4210 * rib_find_hca_connection 4211 * 4212 * if there is an existing connection to the specified address then 4213 * it will be returned in conn, otherwise conn will be set to NULL. 4214 * Also cleans up any connection that is in error state. 4215 */ 4216 static int 4217 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, 4218 struct netbuf *d_svcaddr, CONN **conn) 4219 { 4220 CONN *cn; 4221 clock_t cv_stat, timout; 4222 4223 *conn = NULL; 4224 again: 4225 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4226 cn = hca->cl_conn_list.conn_hd; 4227 while (cn != NULL) { 4228 /* 4229 * First, clear up any connection in the ERROR state 4230 */ 4231 mutex_enter(&cn->c_lock); 4232 if (cn->c_state == C_ERROR_CONN) { 4233 if (cn->c_ref == 0) { 4234 /* 4235 * Remove connection from list and destroy it. 4236 */ 4237 cn->c_state = C_DISCONN_PEND; 4238 mutex_exit(&cn->c_lock); 4239 rw_exit(&hca->cl_conn_list.conn_lock); 4240 rib_conn_close((void *)cn); 4241 goto again; 4242 } 4243 mutex_exit(&cn->c_lock); 4244 cn = cn->c_next; 4245 continue; 4246 } 4247 if (cn->c_state == C_DISCONN_PEND) { 4248 mutex_exit(&cn->c_lock); 4249 cn = cn->c_next; 4250 continue; 4251 } 4252 4253 /* 4254 * source address is only checked for if there is one, 4255 * this is the case for retries. 4256 */ 4257 if ((cn->c_raddr.len == d_svcaddr->len) && 4258 (bcmp(d_svcaddr->buf, cn->c_raddr.buf, 4259 d_svcaddr->len) == 0) && 4260 ((s_svcaddr->len == 0) || 4261 ((cn->c_laddr.len == s_svcaddr->len) && 4262 (bcmp(s_svcaddr->buf, cn->c_laddr.buf, 4263 s_svcaddr->len) == 0)))) { 4264 /* 4265 * Our connection. Give up conn list lock 4266 * as we are done traversing the list. 4267 */ 4268 rw_exit(&hca->cl_conn_list.conn_lock); 4269 if (cn->c_state == C_CONNECTED) { 4270 cn->c_ref++; /* sharing a conn */ 4271 mutex_exit(&cn->c_lock); 4272 *conn = cn; 4273 return (RDMA_SUCCESS); 4274 } 4275 if (cn->c_state == C_CONN_PEND) { 4276 /* 4277 * Hold a reference to this conn before 4278 * we give up the lock. 
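 * The extra hold keeps the connection from being torn down while we
 * wait on c_cv for it to leave the C_CONN_PEND state.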
4279 */ 4280 cn->c_ref++; 4281 timout = ddi_get_lbolt() + 4282 drv_usectohz(CONN_WAIT_TIME * 1000000); 4283 while ((cv_stat = cv_timedwait_sig(&cn->c_cv, 4284 &cn->c_lock, timout)) > 0 && 4285 cn->c_state == C_CONN_PEND) 4286 ; 4287 if (cv_stat == 0) { 4288 (void) rib_conn_release_locked(cn); 4289 return (RDMA_INTR); 4290 } 4291 if (cv_stat < 0) { 4292 (void) rib_conn_release_locked(cn); 4293 return (RDMA_TIMEDOUT); 4294 } 4295 if (cn->c_state == C_CONNECTED) { 4296 *conn = cn; 4297 mutex_exit(&cn->c_lock); 4298 return (RDMA_SUCCESS); 4299 } else { 4300 (void) rib_conn_release_locked(cn); 4301 return (RDMA_TIMEDOUT); 4302 } 4303 } 4304 } 4305 mutex_exit(&cn->c_lock); 4306 cn = cn->c_next; 4307 } 4308 rw_exit(&hca->cl_conn_list.conn_lock); 4309 *conn = NULL; 4310 return (RDMA_FAILED); 4311 } 4312 4313 /* 4314 * Connection management. 4315 * IBTF does not support recycling of channels. So connections are only 4316 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or 4317 * C_DISCONN_PEND state. No C_IDLE state. 4318 * C_CONN_PEND state: Connection establishment in progress to the server. 4319 * C_CONNECTED state: A connection when created is in C_CONNECTED state. 4320 * It has an RC channel associated with it. ibt_post_send/recv are allowed 4321 * only in this state. 4322 * C_ERROR_CONN state: A connection transitions to this state when WRs on the 4323 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event 4324 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA. 4325 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when 4326 * c_ref drops to 0 (this indicates that RPC has no more references to this 4327 * connection), the connection should be destroyed. A connection transitions 4328 * into this state when it is being destroyed. 4329 */ 4330 /* ARGSUSED */ 4331 static rdma_stat 4332 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, 4333 int addr_type, rpcib_ping_t *rpt, CONN **conn) 4334 { 4335 CONN *cn; 4336 int status; 4337 rib_hca_t *hca; 4338 rib_qp_t *qp; 4339 int s_addr_len; 4340 char *s_addr_buf; 4341 4342 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 4343 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 4344 rw_enter(&hca->state_lock, RW_READER); 4345 if (hca->state != HCA_DETACHED) { 4346 status = rib_find_hca_connection(hca, s_svcaddr, 4347 d_svcaddr, conn); 4348 rw_exit(&hca->state_lock); 4349 if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) { 4350 rw_exit(&rib_stat->hcas_list_lock); 4351 return (status); 4352 } 4353 } else 4354 rw_exit(&hca->state_lock); 4355 } 4356 rw_exit(&rib_stat->hcas_list_lock); 4357 4358 /* 4359 * No existing connection found, establish a new connection. 4360 */ 4361 bzero(rpt, sizeof (rpcib_ping_t)); 4362 4363 status = rib_ping_srv(addr_type, d_svcaddr, rpt); 4364 if (status != RDMA_SUCCESS) { 4365 return (RDMA_FAILED); 4366 } 4367 hca = rpt->hca; 4368 4369 if (rpt->srcip.family == AF_INET) { 4370 s_addr_len = sizeof (rpt->srcip.un.ip4addr); 4371 s_addr_buf = (char *)&rpt->srcip.un.ip4addr; 4372 } else if (rpt->srcip.family == AF_INET6) { 4373 s_addr_len = sizeof (rpt->srcip.un.ip6addr); 4374 s_addr_buf = (char *)&rpt->srcip.un.ip6addr; 4375 } else { 4376 return (RDMA_FAILED); 4377 } 4378 4379 /* 4380 * Channel to server doesn't exist yet, create one. 
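 * The new connection starts out in C_CONN_PEND state with a single
 * reference held by the calling thread.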
4381 */ 4382 if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) { 4383 return (RDMA_FAILED); 4384 } 4385 cn = qptoc(qp); 4386 cn->c_state = C_CONN_PEND; 4387 cn->c_ref = 1; 4388 4389 cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP); 4390 bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len); 4391 cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len; 4392 4393 if (rpt->srcip.family == AF_INET) { 4394 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP); 4395 (void) strcpy(cn->c_netid, RIBNETID_TCP); 4396 4397 cn->c_addrmask.len = cn->c_addrmask.maxlen = 4398 sizeof (struct sockaddr_in); 4399 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP); 4400 4401 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_addr.s_addr = 4402 (uint32_t)~0; 4403 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_family = 4404 (ushort_t)~0; 4405 4406 } else { 4407 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP); 4408 (void) strcpy(cn->c_netid, RIBNETID_TCP6); 4409 4410 cn->c_addrmask.len = cn->c_addrmask.maxlen = 4411 sizeof (struct sockaddr_in6); 4412 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP); 4413 4414 (void) memset( 4415 &((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_addr, 4416 (uchar_t)~0, sizeof (struct in6_addr)); 4417 ((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_family = 4418 (sa_family_t)~0; 4419 } 4420 4421 /* 4422 * Add to conn list. 4423 * We had given up the READER lock. In the time since then, 4424 * another thread might have created the connection we are 4425 * trying here. But for now, that is quite all right - there 4426 * might be two connections between a pair of hosts instead 4427 * of one. If we really want to close that window, 4428 * then we need to check the list after acquiring the 4429 * WRITER lock. 4430 */ 4431 (void) rib_add_connlist(cn, &hca->cl_conn_list); 4432 status = rib_conn_to_srv(hca, qp, rpt); 4433 mutex_enter(&cn->c_lock); 4434 4435 if (cn->c_flags & C_CLOSE_PENDING) { 4436 /* 4437 * This handles a case where the module or 4438 * HCA detached while the connection was being 4439 * established. In such a case close the 4440 * connection immediately if this is the 4441 * only reference. 4442 */ 4443 if (cn->c_ref == 1) { 4444 cn->c_ref--; 4445 cn->c_state = C_DISCONN_PEND; 4446 mutex_exit(&cn->c_lock); 4447 rib_conn_close((void *)cn); 4448 return (RDMA_FAILED); 4449 } 4450 4451 /* 4452 * Connection to be closed later when c_ref = 0 4453 */ 4454 status = RDMA_FAILED; 4455 } 4456 4457 if (status == RDMA_SUCCESS) { 4458 cn->c_state = C_CONNECTED; 4459 *conn = cn; 4460 } else { 4461 cn->c_state = C_ERROR_CONN; 4462 cn->c_ref--; 4463 } 4464 cv_signal(&cn->c_cv); 4465 mutex_exit(&cn->c_lock); 4466 return (status); 4467 } 4468 4469 static void 4470 rib_conn_close(void *rarg) 4471 { 4472 CONN *conn = (CONN *)rarg; 4473 rib_qp_t *qp = ctoqp(conn); 4474 4475 mutex_enter(&conn->c_lock); 4476 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { 4477 4478 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); 4479 4480 /* 4481 * Live connection in CONNECTED state.
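 * Move it to C_ERROR_CONN so that no new work is posted on the
 * channel while it is being closed.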
4482 */ 4483 if (conn->c_state == C_CONNECTED) { 4484 conn->c_state = C_ERROR_CONN; 4485 } 4486 mutex_exit(&conn->c_lock); 4487 4488 rib_close_a_channel(conn); 4489 4490 mutex_enter(&conn->c_lock); 4491 conn->c_flags &= ~C_CLOSE_PENDING; 4492 } 4493 4494 mutex_exit(&conn->c_lock); 4495 4496 if (qp->mode == RIB_SERVER) 4497 (void) rib_disconnect_channel(conn, 4498 &qp->hca->srv_conn_list); 4499 else 4500 (void) rib_disconnect_channel(conn, 4501 &qp->hca->cl_conn_list); 4502 } 4503 4504 static void 4505 rib_conn_timeout_call(void *carg) 4506 { 4507 time_t idle_time; 4508 CONN *conn = (CONN *)carg; 4509 rib_hca_t *hca = ctoqp(conn)->hca; 4510 int error; 4511 4512 mutex_enter(&conn->c_lock); 4513 if ((conn->c_ref > 0) || 4514 (conn->c_state == C_DISCONN_PEND)) { 4515 conn->c_timeout = NULL; 4516 mutex_exit(&conn->c_lock); 4517 return; 4518 } 4519 4520 idle_time = (gethrestime_sec() - conn->c_last_used); 4521 4522 if ((idle_time <= rib_conn_timeout) && 4523 (conn->c_state != C_ERROR_CONN)) { 4524 /* 4525 * There was activity after the last timeout. 4526 * Extend the conn life. Unless the conn is 4527 * already in error state. 4528 */ 4529 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4530 SEC_TO_TICK(rib_conn_timeout - idle_time)); 4531 mutex_exit(&conn->c_lock); 4532 return; 4533 } 4534 4535 error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close, 4536 (void *)conn, DDI_NOSLEEP); 4537 4538 /* 4539 * If taskq dispatch fails above, then reset the timeout 4540 * to try again after 10 secs. 4541 */ 4542 4543 if (error != DDI_SUCCESS) { 4544 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4545 SEC_TO_TICK(RDMA_CONN_REAP_RETRY)); 4546 mutex_exit(&conn->c_lock); 4547 return; 4548 } 4549 4550 conn->c_state = C_DISCONN_PEND; 4551 mutex_exit(&conn->c_lock); 4552 } 4553 4554 static rdma_stat 4555 rib_conn_release(CONN *conn) 4556 { 4557 mutex_enter(&conn->c_lock); 4558 return (rib_conn_release_locked(conn)); 4559 } 4560 4561 /* 4562 * Expects conn->c_lock to be held on entry. 4563 * c_lock released on return 4564 */ 4565 static rdma_stat 4566 rib_conn_release_locked(CONN *conn) 4567 { 4568 conn->c_ref--; 4569 4570 conn->c_last_used = gethrestime_sec(); 4571 if (conn->c_ref > 0) { 4572 mutex_exit(&conn->c_lock); 4573 return (RDMA_SUCCESS); 4574 } 4575 4576 /* 4577 * If a conn is C_ERROR_CONN, close the channel. 
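 * right away; otherwise arm a timeout so that an idle connection is
 * reaped after rib_conn_timeout seconds.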
4578 */ 4579 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) { 4580 conn->c_state = C_DISCONN_PEND; 4581 mutex_exit(&conn->c_lock); 4582 rib_conn_close((void *)conn); 4583 return (RDMA_SUCCESS); 4584 } 4585 4586 /* 4587 * c_ref == 0, set a timeout for conn release 4588 */ 4589 4590 if (conn->c_timeout == NULL) { 4591 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4592 SEC_TO_TICK(rib_conn_timeout)); 4593 } 4594 4595 mutex_exit(&conn->c_lock); 4596 return (RDMA_SUCCESS); 4597 } 4598 4599 /* 4600 * Add at front of list 4601 */ 4602 static struct rdma_done_list * 4603 rdma_done_add(rib_qp_t *qp, uint32_t xid) 4604 { 4605 struct rdma_done_list *rd; 4606 4607 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4608 4609 rd = kmem_alloc(sizeof (*rd), KM_SLEEP); 4610 rd->xid = xid; 4611 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL); 4612 4613 rd->prev = NULL; 4614 rd->next = qp->rdlist; 4615 if (qp->rdlist != NULL) 4616 qp->rdlist->prev = rd; 4617 qp->rdlist = rd; 4618 4619 return (rd); 4620 } 4621 4622 static void 4623 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd) 4624 { 4625 struct rdma_done_list *r; 4626 4627 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4628 4629 r = rd->next; 4630 if (r != NULL) { 4631 r->prev = rd->prev; 4632 } 4633 4634 r = rd->prev; 4635 if (r != NULL) { 4636 r->next = rd->next; 4637 } else { 4638 qp->rdlist = rd->next; 4639 } 4640 4641 cv_destroy(&rd->rdma_done_cv); 4642 kmem_free(rd, sizeof (*rd)); 4643 } 4644 4645 static void 4646 rdma_done_rem_list(rib_qp_t *qp) 4647 { 4648 struct rdma_done_list *r, *n; 4649 4650 mutex_enter(&qp->rdlist_lock); 4651 for (r = qp->rdlist; r != NULL; r = n) { 4652 n = r->next; 4653 rdma_done_rm(qp, r); 4654 } 4655 mutex_exit(&qp->rdlist_lock); 4656 } 4657 4658 static void 4659 rdma_done_notify(rib_qp_t *qp, uint32_t xid) 4660 { 4661 struct rdma_done_list *r = qp->rdlist; 4662 4663 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4664 4665 while (r) { 4666 if (r->xid == xid) { 4667 cv_signal(&r->rdma_done_cv); 4668 return; 4669 } else { 4670 r = r->next; 4671 } 4672 } 4673 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid, 4674 int, xid); 4675 } 4676 4677 /* 4678 * Expects conn->c_lock to be held by the caller. 4679 */ 4680 4681 static void 4682 rib_close_a_channel(CONN *conn) 4683 { 4684 rib_qp_t *qp; 4685 qp = ctoqp(conn); 4686 4687 if (qp->qp_hdl == NULL) { 4688 /* channel already freed */ 4689 return; 4690 } 4691 4692 /* 4693 * Call ibt_close_rc_channel in blocking mode 4694 * with no callbacks. 4695 */ 4696 (void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS, 4697 NULL, 0, NULL, NULL, 0); 4698 } 4699 4700 /* 4701 * Goes through all connections and closes the channel 4702 * This will cause all the WRs on those channels to be 4703 * flushed. 4704 */ 4705 static void 4706 rib_close_channels(rib_conn_list_t *connlist) 4707 { 4708 CONN *conn, *tmp; 4709 4710 rw_enter(&connlist->conn_lock, RW_READER); 4711 conn = connlist->conn_hd; 4712 while (conn != NULL) { 4713 mutex_enter(&conn->c_lock); 4714 tmp = conn->c_next; 4715 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { 4716 4717 if (conn->c_state == C_CONN_PEND) { 4718 conn->c_flags |= C_CLOSE_PENDING; 4719 goto next; 4720 } 4721 4722 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); 4723 4724 /* 4725 * Live connection in CONNECTED state. 
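 * Mark it C_ERROR_CONN and close the RC channel, which flushes any
 * outstanding WRs.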
4726 */ 4727 if (conn->c_state == C_CONNECTED) 4728 conn->c_state = C_ERROR_CONN; 4729 mutex_exit(&conn->c_lock); 4730 4731 rib_close_a_channel(conn); 4732 4733 mutex_enter(&conn->c_lock); 4734 conn->c_flags &= ~C_CLOSE_PENDING; 4735 /* Signal a pending rib_disconnect_channel() */ 4736 cv_signal(&conn->c_cv); 4737 } 4738 next: 4739 mutex_exit(&conn->c_lock); 4740 conn = tmp; 4741 } 4742 rw_exit(&connlist->conn_lock); 4743 } 4744 4745 /* 4746 * Frees up all connections that are no longer being referenced 4747 */ 4748 static void 4749 rib_purge_connlist(rib_conn_list_t *connlist) 4750 { 4751 CONN *conn; 4752 4753 top: 4754 rw_enter(&connlist->conn_lock, RW_READER); 4755 conn = connlist->conn_hd; 4756 while (conn != NULL) { 4757 mutex_enter(&conn->c_lock); 4758 4759 /* 4760 * At this point connection is either in ERROR 4761 * or DISCONN_PEND state. If in DISCONN_PEND state 4762 * then some other thread is culling that connection. 4763 * If not and if c_ref is 0, then destroy the connection. 4764 */ 4765 if (conn->c_ref == 0 && 4766 conn->c_state != C_DISCONN_PEND) { 4767 /* 4768 * Cull the connection 4769 */ 4770 conn->c_state = C_DISCONN_PEND; 4771 mutex_exit(&conn->c_lock); 4772 rw_exit(&connlist->conn_lock); 4773 (void) rib_disconnect_channel(conn, connlist); 4774 goto top; 4775 } else { 4776 /* 4777 * conn disconnect already scheduled or will 4778 * happen from conn_release when c_ref drops to 0. 4779 */ 4780 mutex_exit(&conn->c_lock); 4781 } 4782 conn = conn->c_next; 4783 } 4784 rw_exit(&connlist->conn_lock); 4785 4786 /* 4787 * At this point, only connections with c_ref != 0 are on the list 4788 */ 4789 } 4790 4791 /* 4792 * Free all the HCA resources and close 4793 * the hca. 4794 */ 4795 4796 static void 4797 rib_free_hca(rib_hca_t *hca) 4798 { 4799 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl); 4800 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl); 4801 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl); 4802 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl); 4803 4804 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t)); 4805 kmem_free(hca->clnt_scq, sizeof (rib_cq_t)); 4806 kmem_free(hca->svc_rcq, sizeof (rib_cq_t)); 4807 kmem_free(hca->svc_scq, sizeof (rib_cq_t)); 4808 4809 rib_rbufpool_destroy(hca, RECV_BUFFER); 4810 rib_rbufpool_destroy(hca, SEND_BUFFER); 4811 rib_destroy_cache(hca); 4812 if (rib_mod.rdma_count == 0) 4813 (void) rdma_unregister_mod(&rib_mod); 4814 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 4815 (void) ibt_close_hca(hca->hca_hdl); 4816 hca->hca_hdl = NULL; 4817 } 4818 4819 4820 static void 4821 rib_stop_hca_services(rib_hca_t *hca) 4822 { 4823 rib_stop_services(hca); 4824 rib_close_channels(&hca->cl_conn_list); 4825 rib_close_channels(&hca->srv_conn_list); 4826 4827 rib_purge_connlist(&hca->cl_conn_list); 4828 rib_purge_connlist(&hca->srv_conn_list); 4829 4830 if ((rib_stat->hcas_list == NULL) && stats_enabled) { 4831 kstat_delete_byname_zone("unix", 0, "rpcib_cache", 4832 GLOBAL_ZONEID); 4833 stats_enabled = FALSE; 4834 } 4835 4836 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 4837 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4838 if (hca->srv_conn_list.conn_hd == NULL && 4839 hca->cl_conn_list.conn_hd == NULL) { 4840 /* 4841 * conn_lists are NULL, so destroy 4842 * buffers, close hca and be done. 
4843 */ 4844 rib_free_hca(hca); 4845 } 4846 rw_exit(&hca->cl_conn_list.conn_lock); 4847 rw_exit(&hca->srv_conn_list.conn_lock); 4848 4849 if (hca->hca_hdl != NULL) { 4850 mutex_enter(&hca->inuse_lock); 4851 while (hca->inuse) 4852 cv_wait(&hca->cb_cv, &hca->inuse_lock); 4853 mutex_exit(&hca->inuse_lock); 4854 4855 rib_free_hca(hca); 4856 } 4857 rw_destroy(&hca->bound_services_lock); 4858 4859 if (hca->cleanup_helper != NULL) { 4860 ddi_taskq_destroy(hca->cleanup_helper); 4861 hca->cleanup_helper = NULL; 4862 } 4863 } 4864 4865 /* 4866 * Cleans and closes up all uses of the HCA 4867 */ 4868 static void 4869 rib_detach_hca(ibt_hca_hdl_t hca_hdl) 4870 { 4871 rib_hca_t *hca = NULL; 4872 rib_hca_t **hcap; 4873 4874 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); 4875 for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) { 4876 hca = *hcap; 4877 rw_enter(&hca->state_lock, RW_WRITER); 4878 if (hca->hca_hdl == hca_hdl) { 4879 /* 4880 * Mark as detached and remove from 4881 * hca list. 4882 */ 4883 hca->state = HCA_DETACHED; 4884 *hcap = hca->next; 4885 rib_stat->nhca_inited--; 4886 rib_mod.rdma_count--; 4887 rw_exit(&hca->state_lock); 4888 break; 4889 } 4890 rw_exit(&hca->state_lock); 4891 } 4892 rw_exit(&rib_stat->hcas_list_lock); 4893 4894 if (hca == NULL) 4895 return; 4896 ASSERT(hca->hca_hdl == hca_hdl); 4897 4898 /* 4899 * Stop all services on the HCA 4900 * Go through cl_conn_list and close all rc_channels 4901 * Go through svr_conn_list and close all rc_channels 4902 * Free connections whose c_ref has dropped to 0 4903 * Destroy all CQs 4904 * Deregister and released all buffer pool memory after all 4905 * connections are destroyed 4906 * Free the protection domain 4907 * ibt_close_hca() 4908 */ 4909 rib_stop_hca_services(hca); 4910 4911 kmem_free(hca, sizeof (*hca)); 4912 } 4913 4914 static void 4915 rib_server_side_cache_reclaim(void *argp) 4916 { 4917 cache_avl_struct_t *rcas; 4918 rib_lrc_entry_t *rb; 4919 rib_hca_t *hca = (rib_hca_t *)argp; 4920 4921 rw_enter(&hca->avl_rw_lock, RW_WRITER); 4922 rcas = avl_first(&hca->avl_tree); 4923 if (rcas != NULL) 4924 avl_remove(&hca->avl_tree, rcas); 4925 4926 while (rcas != NULL) { 4927 while (rcas->r.forw != &rcas->r) { 4928 rcas->elements--; 4929 rb = rcas->r.forw; 4930 remque(rb); 4931 if (rb->registered) 4932 (void) rib_deregistermem_via_hca(hca, 4933 rb->lrc_buf, rb->lrc_mhandle); 4934 4935 hca->cache_allocation -= rb->lrc_len; 4936 kmem_free(rb->lrc_buf, rb->lrc_len); 4937 kmem_free(rb, sizeof (rib_lrc_entry_t)); 4938 } 4939 mutex_destroy(&rcas->node_lock); 4940 kmem_cache_free(hca->server_side_cache, rcas); 4941 rcas = avl_first(&hca->avl_tree); 4942 if (rcas != NULL) 4943 avl_remove(&hca->avl_tree, rcas); 4944 } 4945 rw_exit(&hca->avl_rw_lock); 4946 } 4947 4948 static void 4949 rib_server_side_cache_cleanup(void *argp) 4950 { 4951 cache_avl_struct_t *rcas; 4952 rib_lrc_entry_t *rb; 4953 rib_hca_t *hca = (rib_hca_t *)argp; 4954 4955 mutex_enter(&hca->cache_allocation_lock); 4956 if (hca->cache_allocation < cache_limit) { 4957 mutex_exit(&hca->cache_allocation_lock); 4958 return; 4959 } 4960 mutex_exit(&hca->cache_allocation_lock); 4961 4962 rw_enter(&hca->avl_rw_lock, RW_WRITER); 4963 rcas = avl_last(&hca->avl_tree); 4964 if (rcas != NULL) 4965 avl_remove(&hca->avl_tree, rcas); 4966 4967 while (rcas != NULL) { 4968 while (rcas->r.forw != &rcas->r) { 4969 rcas->elements--; 4970 rb = rcas->r.forw; 4971 remque(rb); 4972 if (rb->registered) 4973 (void) rib_deregistermem_via_hca(hca, 4974 rb->lrc_buf, rb->lrc_mhandle); 4975 4976 
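/* Adjust the cache accounting before freeing the buffer. */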
hca->cache_allocation -= rb->lrc_len; 4977 4978 kmem_free(rb->lrc_buf, rb->lrc_len); 4979 kmem_free(rb, sizeof (rib_lrc_entry_t)); 4980 } 4981 mutex_destroy(&rcas->node_lock); 4982 if (hca->server_side_cache) { 4983 kmem_cache_free(hca->server_side_cache, rcas); 4984 } 4985 4986 if (hca->cache_allocation < cache_limit) { 4987 rw_exit(&hca->avl_rw_lock); 4988 return; 4989 } 4990 4991 rcas = avl_last(&hca->avl_tree); 4992 if (rcas != NULL) 4993 avl_remove(&hca->avl_tree, rcas); 4994 } 4995 rw_exit(&hca->avl_rw_lock); 4996 } 4997 4998 static int 4999 avl_compare(const void *t1, const void *t2) 5000 { 5001 if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len) 5002 return (0); 5003 5004 if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len) 5005 return (-1); 5006 5007 return (1); 5008 } 5009 5010 static void 5011 rib_destroy_cache(rib_hca_t *hca) 5012 { 5013 if (hca->avl_init) { 5014 rib_server_side_cache_reclaim((void *)hca); 5015 if (hca->server_side_cache) { 5016 kmem_cache_destroy(hca->server_side_cache); 5017 hca->server_side_cache = NULL; 5018 } 5019 avl_destroy(&hca->avl_tree); 5020 mutex_destroy(&hca->cache_allocation_lock); 5021 rw_destroy(&hca->avl_rw_lock); 5022 } 5023 hca->avl_init = FALSE; 5024 } 5025 5026 static void 5027 rib_force_cleanup(void *hca) 5028 { 5029 if (((rib_hca_t *)hca)->cleanup_helper != NULL) 5030 (void) ddi_taskq_dispatch( 5031 ((rib_hca_t *)hca)->cleanup_helper, 5032 rib_server_side_cache_cleanup, 5033 (void *)hca, DDI_NOSLEEP); 5034 } 5035 5036 static rib_lrc_entry_t * 5037 rib_get_cache_buf(CONN *conn, uint32_t len) 5038 { 5039 cache_avl_struct_t cas, *rcas; 5040 rib_hca_t *hca = (ctoqp(conn))->hca; 5041 rib_lrc_entry_t *reply_buf; 5042 avl_index_t where = NULL; 5043 uint64_t c_alloc = 0; 5044 5045 if (!hca->avl_init) 5046 goto error_alloc; 5047 5048 cas.len = len; 5049 5050 rw_enter(&hca->avl_rw_lock, RW_READER); 5051 5052 mutex_enter(&hca->cache_allocation_lock); 5053 c_alloc = hca->cache_allocation; 5054 mutex_exit(&hca->cache_allocation_lock); 5055 5056 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas, 5057 &where)) == NULL) { 5058 /* Am I above the cache limit */ 5059 if ((c_alloc + len) >= cache_limit) { 5060 rib_force_cleanup((void *)hca); 5061 rw_exit(&hca->avl_rw_lock); 5062 mutex_enter(&hca->cache_allocation_lock); 5063 hca->cache_misses_above_the_limit ++; 5064 mutex_exit(&hca->cache_allocation_lock); 5065 5066 /* Allocate and register the buffer directly */ 5067 goto error_alloc; 5068 } 5069 5070 rw_exit(&hca->avl_rw_lock); 5071 rw_enter(&hca->avl_rw_lock, RW_WRITER); 5072 5073 /* Recheck to make sure no other thread added the entry in */ 5074 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, 5075 &cas, &where)) == NULL) { 5076 /* Allocate an avl tree entry */ 5077 rcas = (cache_avl_struct_t *) 5078 kmem_cache_alloc(hca->server_side_cache, KM_SLEEP); 5079 5080 bzero(rcas, sizeof (cache_avl_struct_t)); 5081 rcas->elements = 0; 5082 rcas->r.forw = &rcas->r; 5083 rcas->r.back = &rcas->r; 5084 rcas->len = len; 5085 mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL); 5086 avl_insert(&hca->avl_tree, rcas, where); 5087 } 5088 } 5089 5090 mutex_enter(&rcas->node_lock); 5091 5092 if (rcas->r.forw != &rcas->r && rcas->elements > 0) { 5093 reply_buf = rcas->r.forw; 5094 remque(reply_buf); 5095 rcas->elements--; 5096 mutex_exit(&rcas->node_lock); 5097 rw_exit(&hca->avl_rw_lock); 5098 5099 mutex_enter(&hca->cache_allocation_lock); 5100 hca->cache_hits++; 5101 hca->cache_allocation -= len; 5102 
mutex_exit(&hca->cache_allocation_lock); 5103 } else { 5104 /* Am I above the cache limit */ 5105 mutex_exit(&rcas->node_lock); 5106 if ((c_alloc + len) >= cache_limit) { 5107 rib_force_cleanup((void *)hca); 5108 rw_exit(&hca->avl_rw_lock); 5109 5110 mutex_enter(&hca->cache_allocation_lock); 5111 hca->cache_misses_above_the_limit++; 5112 mutex_exit(&hca->cache_allocation_lock); 5113 /* Allocate and register the buffer directly */ 5114 goto error_alloc; 5115 } 5116 rw_exit(&hca->avl_rw_lock); 5117 mutex_enter(&hca->cache_allocation_lock); 5118 hca->cache_misses++; 5119 mutex_exit(&hca->cache_allocation_lock); 5120 /* Allocate a reply_buf entry */ 5121 reply_buf = (rib_lrc_entry_t *) 5122 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP); 5123 bzero(reply_buf, sizeof (rib_lrc_entry_t)); 5124 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP); 5125 reply_buf->lrc_len = len; 5126 reply_buf->registered = FALSE; 5127 reply_buf->avl_node = (void *)rcas; 5128 } 5129 5130 return (reply_buf); 5131 5132 error_alloc: 5133 reply_buf = (rib_lrc_entry_t *) 5134 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP); 5135 bzero(reply_buf, sizeof (rib_lrc_entry_t)); 5136 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP); 5137 reply_buf->lrc_len = len; 5138 reply_buf->registered = FALSE; 5139 reply_buf->avl_node = NULL; 5140 5141 return (reply_buf); 5142 } 5143 5144 /* 5145 * Return a pre-registered buffer back to the cache (without 5146 * deregistering it). 5147 */ 5148 5149 static void 5150 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf) 5151 { 5152 cache_avl_struct_t cas, *rcas; 5153 avl_index_t where = NULL; 5154 rib_hca_t *hca = (ctoqp(conn))->hca; 5155 5156 if (!hca->avl_init) 5157 goto error_free; 5158 5159 cas.len = reg_buf->lrc_len; 5160 rw_enter(&hca->avl_rw_lock, RW_READER); 5161 if ((rcas = (cache_avl_struct_t *) 5162 avl_find(&hca->avl_tree, &cas, &where)) == NULL) { 5163 rw_exit(&hca->avl_rw_lock); 5164 goto error_free; 5165 } else { 5166 cas.len = reg_buf->lrc_len; 5167 mutex_enter(&rcas->node_lock); 5168 insque(reg_buf, &rcas->r); 5169 rcas->elements++; 5170 mutex_exit(&rcas->node_lock); 5171 rw_exit(&hca->avl_rw_lock); 5172 mutex_enter(&hca->cache_allocation_lock); 5173 hca->cache_allocation += cas.len; 5174 mutex_exit(&hca->cache_allocation_lock); 5175 } 5176 5177 return; 5178 5179 error_free: 5180 5181 if (reg_buf->registered) 5182 (void) rib_deregistermem_via_hca(hca, 5183 reg_buf->lrc_buf, reg_buf->lrc_mhandle); 5184 kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len); 5185 kmem_free(reg_buf, sizeof (rib_lrc_entry_t)); 5186 } 5187 5188 static rdma_stat 5189 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf, 5190 uint_t buflen, struct mrc *buf_handle) 5191 { 5192 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 5193 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 5194 rdma_stat status; 5195 5196 5197 /* 5198 * Note: ALL buffer pools use the same memory type RDMARW.
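 * On success the returned handle carries the MR handle in mrc_linfo
 * and the local/remote keys in mrc_lmr/mrc_rmr.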
5199 */ 5200 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 5201 if (status == RDMA_SUCCESS) { 5202 buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl; 5203 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 5204 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 5205 } else { 5206 buf_handle->mrc_linfo = NULL; 5207 buf_handle->mrc_lmr = 0; 5208 buf_handle->mrc_rmr = 0; 5209 } 5210 return (status); 5211 } 5212 5213 /* ARGSUSED */ 5214 static rdma_stat 5215 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf, 5216 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle) 5217 { 5218 5219 (void) rib_deregistermem_via_hca(hca, buf, buf_handle); 5220 return (RDMA_SUCCESS); 5221 } 5222 5223 /* ARGSUSED */ 5224 static rdma_stat 5225 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle) 5226 { 5227 5228 (void) ibt_deregister_mr(hca->hca_hdl, 5229 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 5230 return (RDMA_SUCCESS); 5231 } 5232 5233 /* 5234 * Check if the IP interface named by `lifrp' is RDMA-capable. 5235 */ 5236 static boolean_t 5237 rpcib_rdma_capable_interface(struct lifreq *lifrp) 5238 { 5239 char ifname[LIFNAMSIZ]; 5240 char *cp; 5241 5242 if (lifrp->lifr_type == IFT_IB) 5243 return (B_TRUE); 5244 5245 /* 5246 * Strip off the logical interface portion before getting 5247 * intimate with the name. 5248 */ 5249 (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); 5250 if ((cp = strchr(ifname, ':')) != NULL) 5251 *cp = '\0'; 5252 5253 return (strcmp("lo0", ifname) == 0); 5254 } 5255 5256 static int 5257 rpcib_do_ip_ioctl(int cmd, int len, void *arg) 5258 { 5259 vnode_t *kkvp, *vp; 5260 TIUSER *tiptr; 5261 struct strioctl iocb; 5262 k_sigset_t smask; 5263 int err = 0; 5264 5265 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) { 5266 if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE, 5267 &tiptr, CRED()) == 0) { 5268 vp = tiptr->fp->f_vnode; 5269 } else { 5270 VN_RELE(kkvp); 5271 return (EPROTO); 5272 } 5273 } else { 5274 return (EPROTO); 5275 } 5276 5277 iocb.ic_cmd = cmd; 5278 iocb.ic_timout = 0; 5279 iocb.ic_len = len; 5280 iocb.ic_dp = (caddr_t)arg; 5281 sigintr(&smask, 0); 5282 err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb); 5283 sigunintr(&smask); 5284 (void) t_kclose(tiptr, 0); 5285 VN_RELE(kkvp); 5286 return (err); 5287 } 5288 5289 /* 5290 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'. 5291 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes. 5292 */ 5293 static int 5294 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep) 5295 { 5296 int err; 5297 struct lifnum lifn; 5298 5299 bzero(&lifn, sizeof (struct lifnum)); 5300 lifn.lifn_family = AF_UNSPEC; 5301 5302 err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn); 5303 if (err != 0) 5304 return (err); 5305 5306 /* 5307 * Pad the interface count to account for additional interfaces that 5308 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 
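 * ioctls. A fixed slack of four extra lifreq entries is used below.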
5309 */ 5310 lifn.lifn_count += 4; 5311 5312 bzero(lifcp, sizeof (struct lifconf)); 5313 lifcp->lifc_family = AF_UNSPEC; 5314 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 5315 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 5316 5317 err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp); 5318 if (err != 0) { 5319 kmem_free(lifcp->lifc_buf, *bufsizep); 5320 return (err); 5321 } 5322 return (0); 5323 } 5324 5325 static boolean_t 5326 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6) 5327 { 5328 uint_t i, nifs; 5329 uint_t bufsize; 5330 struct lifconf lifc; 5331 struct lifreq *lifrp; 5332 struct sockaddr_in *sinp; 5333 struct sockaddr_in6 *sin6p; 5334 5335 bzero(addrs4, sizeof (rpcib_ipaddrs_t)); 5336 bzero(addrs6, sizeof (rpcib_ipaddrs_t)); 5337 5338 if (rpcib_do_lifconf(&lifc, &bufsize) != 0) 5339 return (B_FALSE); 5340 5341 if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) { 5342 kmem_free(lifc.lifc_buf, bufsize); 5343 return (B_FALSE); 5344 } 5345 5346 /* 5347 * Worst case is that all of the addresses are IB-capable and have 5348 * the same address family, so size our buffers accordingly. 5349 */ 5350 addrs4->ri_size = nifs * sizeof (struct sockaddr_in); 5351 addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP); 5352 addrs6->ri_size = nifs * sizeof (struct sockaddr_in6); 5353 addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP); 5354 5355 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { 5356 if (!rpcib_rdma_capable_interface(lifrp)) 5357 continue; 5358 5359 if (lifrp->lifr_addr.ss_family == AF_INET) { 5360 sinp = addrs4->ri_list; 5361 bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++], 5362 sizeof (struct sockaddr_in)); 5363 } else if (lifrp->lifr_addr.ss_family == AF_INET6) { 5364 sin6p = addrs6->ri_list; 5365 bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++], 5366 sizeof (struct sockaddr_in6)); 5367 } 5368 } 5369 5370 kmem_free(lifc.lifc_buf, bufsize); 5371 return (B_TRUE); 5372 } 5373 5374 /* ARGSUSED */ 5375 static int 5376 rpcib_cache_kstat_update(kstat_t *ksp, int rw) 5377 { 5378 rib_hca_t *hca; 5379 5380 if (KSTAT_WRITE == rw) { 5381 return (EACCES); 5382 } 5383 5384 rpcib_kstat.cache_limit.value.ui64 = 5385 (uint64_t)cache_limit; 5386 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 5387 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 5388 rpcib_kstat.cache_allocation.value.ui64 += 5389 (uint64_t)hca->cache_allocation; 5390 rpcib_kstat.cache_hits.value.ui64 += 5391 (uint64_t)hca->cache_hits; 5392 rpcib_kstat.cache_misses.value.ui64 += 5393 (uint64_t)hca->cache_misses; 5394 rpcib_kstat.cache_misses_above_the_limit.value.ui64 += 5395 (uint64_t)hca->cache_misses_above_the_limit; 5396 } 5397 rw_exit(&rib_stat->hcas_list_lock); 5398 return (0); 5399 } 5400