/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright (c) 2007, The Ohio State University. All rights reserved.
 *
 * Portions of this source code were developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * Acknowledgements for contributions from developers:
 *	Ranjit Noronha: noronha@cse.ohio-state.edu
 *	Lei Chai      : chail@cse.ohio-state.edu
 *	Weikuan Yu    : yuw@cse.ohio-state.edu
 *
 */

/*
 * The rpcib plugin. Implements the interface for RDMATF's
 * interaction with IBTF.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/file.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/pathname.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/isa_defs.h>
#include <sys/callb.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/sdt.h>
#include <sys/ib/ibtl/ibti.h>
#include <rpc/rpc.h>
#include <rpc/ib.h>
#include <sys/modctl.h>
#include <sys/kstr.h>
#include <sys/sockio.h>
#include <sys/vnode.h>
#include <sys/tiuser.h>
#include <net/if.h>
#include <net/if_types.h>
#include <sys/cred.h>
#include <rpc/rpc_rdma.h>
#include <nfs/nfs.h>
#include <sys/atomic.h>

#define	NFS_RDMA_PORT	20049


/*
 * Convenience structures for connection management
 */
typedef struct rpcib_ipaddrs {
	void	*ri_list;	/* pointer to list of addresses */
	uint_t	ri_count;	/* number of addresses in list */
	uint_t	ri_size;	/* size of ri_list in bytes */
} rpcib_ipaddrs_t;


typedef struct rpcib_ping {
	rib_hca_t	*hca;
	ibt_path_info_t	path;
	ibt_ip_addr_t	srcip;
	ibt_ip_addr_t	dstip;
} rpcib_ping_t;

/*
 * Prototype declarations for driver ops
 */
static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
		    void *, void **);
static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
static int	rpcib_do_ip_ioctl(int, int, void *);
static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
static int
rpcib_cache_kstat_update(kstat_t *, int); 115 static void rib_force_cleanup(void *); 116 static void rib_stop_hca_services(rib_hca_t *); 117 static void rib_attach_hca(void); 118 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, 119 struct netbuf *d_svcaddr, CONN **conn); 120 121 struct { 122 kstat_named_t cache_limit; 123 kstat_named_t cache_allocation; 124 kstat_named_t cache_hits; 125 kstat_named_t cache_misses; 126 kstat_named_t cache_misses_above_the_limit; 127 } rpcib_kstat = { 128 {"cache_limit", KSTAT_DATA_UINT64 }, 129 {"cache_allocation", KSTAT_DATA_UINT64 }, 130 {"cache_hits", KSTAT_DATA_UINT64 }, 131 {"cache_misses", KSTAT_DATA_UINT64 }, 132 {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 }, 133 }; 134 135 /* rpcib cb_ops */ 136 static struct cb_ops rpcib_cbops = { 137 nulldev, /* open */ 138 nulldev, /* close */ 139 nodev, /* strategy */ 140 nodev, /* print */ 141 nodev, /* dump */ 142 nodev, /* read */ 143 nodev, /* write */ 144 nodev, /* ioctl */ 145 nodev, /* devmap */ 146 nodev, /* mmap */ 147 nodev, /* segmap */ 148 nochpoll, /* poll */ 149 ddi_prop_op, /* prop_op */ 150 NULL, /* stream */ 151 D_MP, /* cb_flag */ 152 CB_REV, /* rev */ 153 nodev, /* int (*cb_aread)() */ 154 nodev /* int (*cb_awrite)() */ 155 }; 156 157 /* 158 * Device options 159 */ 160 static struct dev_ops rpcib_ops = { 161 DEVO_REV, /* devo_rev, */ 162 0, /* refcnt */ 163 rpcib_getinfo, /* info */ 164 nulldev, /* identify */ 165 nulldev, /* probe */ 166 rpcib_attach, /* attach */ 167 rpcib_detach, /* detach */ 168 nodev, /* reset */ 169 &rpcib_cbops, /* driver ops - devctl interfaces */ 170 NULL, /* bus operations */ 171 NULL, /* power */ 172 ddi_quiesce_not_needed, /* quiesce */ 173 }; 174 175 /* 176 * Module linkage information. 177 */ 178 179 static struct modldrv rib_modldrv = { 180 &mod_driverops, /* Driver module */ 181 "RPCIB plugin driver", /* Driver name and version */ 182 &rpcib_ops, /* Driver ops */ 183 }; 184 185 static struct modlinkage rib_modlinkage = { 186 MODREV_1, 187 (void *)&rib_modldrv, 188 NULL 189 }; 190 191 typedef struct rib_lrc_entry { 192 struct rib_lrc_entry *forw; 193 struct rib_lrc_entry *back; 194 char *lrc_buf; 195 196 uint32_t lrc_len; 197 void *avl_node; 198 bool_t registered; 199 200 struct mrc lrc_mhandle; 201 bool_t lrc_on_freed_list; 202 } rib_lrc_entry_t; 203 204 typedef struct cache_struct { 205 rib_lrc_entry_t r; 206 uint32_t len; 207 uint32_t elements; 208 kmutex_t node_lock; 209 avl_node_t avl_link; 210 } cache_avl_struct_t; 211 212 uint64_t cache_limit = 100 * 1024 * 1024; 213 static uint64_t cache_watermark = 80 * 1024 * 1024; 214 static bool_t stats_enabled = FALSE; 215 216 static uint64_t max_unsignaled_rws = 5; 217 int nfs_rdma_port = NFS_RDMA_PORT; 218 219 #define RIBNETID_TCP "tcp" 220 #define RIBNETID_TCP6 "tcp6" 221 222 /* 223 * rib_stat: private data pointer used when registering 224 * with the IBTF. It is returned to the consumer 225 * in all callbacks. 226 */ 227 static rpcib_state_t *rib_stat = NULL; 228 229 #define RNR_RETRIES IBT_RNR_RETRY_1 230 #define MAX_PORTS 2 231 #define RDMA_DUMMY_WRID 0x4D3A1D4D3A1D 232 #define RDMA_CONN_REAP_RETRY 10 /* 10 secs */ 233 234 int preposted_rbufs = RDMA_BUFS_GRANT; 235 int send_threshold = 1; 236 237 /* 238 * Old cards with Tavor driver have limited memory footprint 239 * when booted in 32bit. The rib_max_rbufs tunable can be 240 * tuned for more buffers if needed. 
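 * For example, it can be raised at boot time through /etc/system
 * (e.g. "set rpcib:rib_max_rbufs = <larger value>").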
 */

#if !defined(_ELF64) && !defined(__sparc)
int rib_max_rbufs = MAX_BUFS;
#else
int rib_max_rbufs = 10 * MAX_BUFS;
#endif	/* !(_ELF64) && !(__sparc) */

int rib_conn_timeout = 60 * 12;	/* 12 minutes */

/*
 * State of the plugin.
 * ACCEPT = accepting new connections and requests.
 * NO_ACCEPT = not accepting new connections and requests.
 * This should eventually move to the rpcib_state_t structure, since it
 * will tell in which state the plugin is for a particular type of service
 * like NFS, NLM or the v4 callback daemon. The plugin might be in accept
 * state for one and in no_accept state for the other.
 */
int plugin_state;
kmutex_t plugin_state_lock;

ldi_ident_t rpcib_li;

/*
 * RPCIB RDMATF operations
 */
static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
static rdma_stat rib_disconnect(CONN *conn);
static void rib_listen(struct rdma_svc_data *rd);
static void rib_listen_stop(struct rdma_svc_data *rd);
static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf,
	uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
	struct mrc buf_handle);
static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
	caddr_t buf, uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
	struct mrc buf_handle);
static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf,
	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
	void *lrc);
static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
	caddr_t buf, int len, int cpu);

static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);

static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);

static void rib_rbuf_free(CONN *conn, int ptype, void *buf);

static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
	int addr_type, void *, CONN **);
static rdma_stat rib_conn_release(CONN *conn);
static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
	rpcib_ping_t *, CONN **);
static rdma_stat rib_getinfo(rdma_info_t *info);

static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
static void rib_destroy_cache(rib_hca_t *hca);
static void rib_server_side_cache_reclaim(void *argp);
static int avl_compare(const void *t1, const void *t2);

static void rib_stop_services(rib_hca_t
*); 318 static void rib_close_channels(rib_conn_list_t *); 319 static void rib_conn_close(void *); 320 static void rib_recv_rele(rib_qp_t *); 321 static rdma_stat rib_conn_release_locked(CONN *conn); 322 323 /* 324 * RPCIB addressing operations 325 */ 326 327 /* 328 * RDMA operations the RPCIB module exports 329 */ 330 static rdmaops_t rib_ops = { 331 rib_reachable, 332 rib_conn_get, 333 rib_conn_release, 334 rib_listen, 335 rib_listen_stop, 336 rib_registermem, 337 rib_deregistermem, 338 rib_registermemsync, 339 rib_deregistermemsync, 340 rib_syncmem, 341 rib_reg_buf_alloc, 342 rib_reg_buf_free, 343 rib_send, 344 rib_send_resp, 345 rib_post_resp, 346 rib_post_resp_remove, 347 rib_post_recv, 348 rib_recv, 349 rib_read, 350 rib_write, 351 rib_getinfo, 352 }; 353 354 /* 355 * RDMATF RPCIB plugin details 356 */ 357 static rdma_mod_t rib_mod = { 358 "ibtf", /* api name */ 359 RDMATF_VERS_1, 360 0, 361 &rib_ops, /* rdma op vector for ibtf */ 362 }; 363 364 static rdma_stat rpcib_open_hcas(rpcib_state_t *); 365 static rdma_stat rib_qp_init(rib_qp_t *, int); 366 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *); 367 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *); 368 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *); 369 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *); 370 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num); 371 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t, 372 ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *); 373 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t, 374 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t); 375 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *); 376 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *, 377 rib_qp_t **); 378 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t, 379 rib_qp_t **); 380 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *); 381 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *); 382 static int rib_free_sendwait(struct send_wid *); 383 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid); 384 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd); 385 static void rdma_done_rem_list(rib_qp_t *); 386 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid); 387 388 static void rib_async_handler(void *, 389 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *); 390 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *); 391 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *); 392 static int rib_free_svc_recv(struct svc_recv *); 393 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t); 394 static void rib_free_wid(struct recv_wid *); 395 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *); 396 static void rib_detach_hca(ibt_hca_hdl_t); 397 static void rib_close_a_channel(CONN *); 398 static void rib_send_hold(rib_qp_t *); 399 static void rib_send_rele(rib_qp_t *); 400 401 /* 402 * Registration with IBTF as a consumer 403 */ 404 static struct ibt_clnt_modinfo_s rib_modinfo = { 405 IBTI_V_CURR, 406 IBT_GENERIC, 407 rib_async_handler, /* async event handler */ 408 NULL, /* Memory Region Handler */ 409 "nfs/ib" 410 }; 411 412 /* 413 * Global strucuture 414 */ 415 416 typedef struct rpcib_s { 417 dev_info_t *rpcib_dip; 418 kmutex_t rpcib_mutex; 419 } rpcib_t; 420 421 rpcib_t rpcib; 422 423 /* 424 * /etc/system controlled variable to control 425 * debugging in rpcib kernel 
module. 426 * Set it to values greater that 1 to control 427 * the amount of debugging messages required. 428 */ 429 int rib_debug = 0; 430 431 int 432 _init(void) 433 { 434 int error; 435 436 error = mod_install((struct modlinkage *)&rib_modlinkage); 437 if (error != 0) { 438 /* 439 * Could not load module 440 */ 441 return (error); 442 } 443 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL); 444 return (0); 445 } 446 447 int 448 _fini() 449 { 450 int status; 451 452 /* 453 * Remove module 454 */ 455 if ((status = mod_remove(&rib_modlinkage)) != 0) { 456 return (status); 457 } 458 mutex_destroy(&plugin_state_lock); 459 return (0); 460 } 461 462 int 463 _info(struct modinfo *modinfop) 464 { 465 return (mod_info(&rib_modlinkage, modinfop)); 466 } 467 468 /* 469 * rpcib_getinfo() 470 * Given the device number, return the devinfo pointer or the 471 * instance number. 472 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach. 473 */ 474 475 /*ARGSUSED*/ 476 static int 477 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 478 { 479 int ret = DDI_SUCCESS; 480 481 switch (cmd) { 482 case DDI_INFO_DEVT2DEVINFO: 483 if (rpcib.rpcib_dip != NULL) 484 *result = rpcib.rpcib_dip; 485 else { 486 *result = NULL; 487 ret = DDI_FAILURE; 488 } 489 break; 490 491 case DDI_INFO_DEVT2INSTANCE: 492 *result = NULL; 493 break; 494 495 default: 496 ret = DDI_FAILURE; 497 } 498 return (ret); 499 } 500 501 static void 502 rpcib_free_hca_list() 503 { 504 rib_hca_t *hca, *hcap; 505 506 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); 507 hca = rib_stat->hcas_list; 508 rib_stat->hcas_list = NULL; 509 rw_exit(&rib_stat->hcas_list_lock); 510 while (hca != NULL) { 511 rw_enter(&hca->state_lock, RW_WRITER); 512 hcap = hca; 513 hca = hca->next; 514 rib_stat->nhca_inited--; 515 rib_mod.rdma_count--; 516 hcap->state = HCA_DETACHED; 517 rw_exit(&hcap->state_lock); 518 rib_stop_hca_services(hcap); 519 520 kmem_free(hcap, sizeof (*hcap)); 521 } 522 } 523 524 static rdma_stat 525 rpcib_free_service_list() 526 { 527 rib_service_t *service; 528 ibt_status_t ret; 529 530 rw_enter(&rib_stat->service_list_lock, RW_WRITER); 531 while (rib_stat->service_list != NULL) { 532 service = rib_stat->service_list; 533 ret = ibt_unbind_all_services(service->srv_hdl); 534 if (ret != IBT_SUCCESS) { 535 rw_exit(&rib_stat->service_list_lock); 536 #ifdef DEBUG 537 cmn_err(CE_NOTE, "rpcib_free_service_list: " 538 "ibt_unbind_all_services failed (%d)\n", (int)ret); 539 #endif 540 return (RDMA_FAILED); 541 } 542 ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl, 543 service->srv_hdl); 544 if (ret != IBT_SUCCESS) { 545 rw_exit(&rib_stat->service_list_lock); 546 #ifdef DEBUG 547 cmn_err(CE_NOTE, "rpcib_free_service_list: " 548 "ibt_deregister_service failed (%d)\n", (int)ret); 549 #endif 550 return (RDMA_FAILED); 551 } 552 rib_stat->service_list = service->next; 553 kmem_free(service, sizeof (rib_service_t)); 554 } 555 rw_exit(&rib_stat->service_list_lock); 556 557 return (RDMA_SUCCESS); 558 } 559 560 static int 561 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 562 { 563 ibt_status_t ibt_status; 564 rdma_stat r_status; 565 566 switch (cmd) { 567 case DDI_ATTACH: 568 break; 569 case DDI_RESUME: 570 return (DDI_SUCCESS); 571 default: 572 return (DDI_FAILURE); 573 } 574 575 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL); 576 577 mutex_enter(&rpcib.rpcib_mutex); 578 if (rpcib.rpcib_dip != NULL) { 579 mutex_exit(&rpcib.rpcib_mutex); 580 return (DDI_FAILURE); 581 } 582 rpcib.rpcib_dip = dip; 583 
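	/*
	 * The dip recorded here is what rpcib_getinfo() hands back
	 * for DDI_INFO_DEVT2DEVINFO queries.
	 */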
mutex_exit(&rpcib.rpcib_mutex); 584 /* 585 * Create the "rpcib" minor-node. 586 */ 587 if (ddi_create_minor_node(dip, 588 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { 589 /* Error message, no cmn_err as they print on console */ 590 return (DDI_FAILURE); 591 } 592 593 if (rib_stat == NULL) { 594 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); 595 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); 596 rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL); 597 mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL); 598 } 599 600 rib_stat->hca_count = ibt_get_hca_list(NULL); 601 if (rib_stat->hca_count < 1) { 602 mutex_destroy(&rib_stat->listen_lock); 603 rw_destroy(&rib_stat->hcas_list_lock); 604 mutex_destroy(&rib_stat->open_hca_lock); 605 kmem_free(rib_stat, sizeof (*rib_stat)); 606 rib_stat = NULL; 607 return (DDI_FAILURE); 608 } 609 610 ibt_status = ibt_attach(&rib_modinfo, dip, 611 (void *)rib_stat, &rib_stat->ibt_clnt_hdl); 612 613 if (ibt_status != IBT_SUCCESS) { 614 mutex_destroy(&rib_stat->listen_lock); 615 rw_destroy(&rib_stat->hcas_list_lock); 616 mutex_destroy(&rib_stat->open_hca_lock); 617 kmem_free(rib_stat, sizeof (*rib_stat)); 618 rib_stat = NULL; 619 return (DDI_FAILURE); 620 } 621 622 rib_stat->service_list = NULL; 623 rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL); 624 mutex_enter(&rib_stat->open_hca_lock); 625 if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) { 626 mutex_exit(&rib_stat->open_hca_lock); 627 goto open_fail; 628 } 629 mutex_exit(&rib_stat->open_hca_lock); 630 631 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 632 DDI_PROP_SUCCESS) { 633 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update " 634 "failed."); 635 goto register_fail; 636 } 637 638 /* 639 * Register with rdmatf 640 */ 641 r_status = rdma_register_mod(&rib_mod); 642 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { 643 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, " 644 "status = %d", r_status); 645 goto register_fail; 646 } 647 648 return (DDI_SUCCESS); 649 650 register_fail: 651 652 open_fail: 653 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 654 rpcib_free_hca_list(); 655 (void) rpcib_free_service_list(); 656 mutex_destroy(&rib_stat->listen_lock); 657 rw_destroy(&rib_stat->hcas_list_lock); 658 mutex_destroy(&rib_stat->open_hca_lock); 659 rw_destroy(&rib_stat->service_list_lock); 660 kmem_free(rib_stat, sizeof (*rib_stat)); 661 rib_stat = NULL; 662 return (DDI_FAILURE); 663 } 664 665 /*ARGSUSED*/ 666 static int 667 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 668 { 669 switch (cmd) { 670 671 case DDI_DETACH: 672 break; 673 674 case DDI_SUSPEND: 675 default: 676 return (DDI_FAILURE); 677 } 678 679 /* 680 * Detach the hca and free resources 681 */ 682 mutex_enter(&plugin_state_lock); 683 plugin_state = NO_ACCEPT; 684 mutex_exit(&plugin_state_lock); 685 686 if (rpcib_free_service_list() != RDMA_SUCCESS) 687 return (DDI_FAILURE); 688 rpcib_free_hca_list(); 689 690 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 691 mutex_destroy(&rib_stat->listen_lock); 692 rw_destroy(&rib_stat->hcas_list_lock); 693 mutex_destroy(&rib_stat->open_hca_lock); 694 rw_destroy(&rib_stat->service_list_lock); 695 696 kmem_free(rib_stat, sizeof (*rib_stat)); 697 rib_stat = NULL; 698 699 mutex_enter(&rpcib.rpcib_mutex); 700 rpcib.rpcib_dip = NULL; 701 mutex_exit(&rpcib.rpcib_mutex); 702 mutex_destroy(&rpcib.rpcib_mutex); 703 return (DDI_SUCCESS); 704 } 705 706 707 static void rib_rbufpool_free(rib_hca_t *, int); 708 static 
void rib_rbufpool_deregister(rib_hca_t *, int); 709 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype); 710 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); 711 static rdma_stat rib_rem_replylist(rib_qp_t *); 712 static int rib_remreply(rib_qp_t *, struct reply *); 713 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); 714 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); 715 716 717 /* 718 * One CQ pair per HCA 719 */ 720 static rdma_stat 721 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, 722 rib_cq_t **cqp) 723 { 724 rib_cq_t *cq; 725 ibt_cq_attr_t cq_attr; 726 uint32_t real_size; 727 ibt_status_t status; 728 rdma_stat error = RDMA_SUCCESS; 729 730 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); 731 cq->rib_hca = hca; 732 cq_attr.cq_size = cq_size; 733 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 734 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, 735 &real_size); 736 if (status != IBT_SUCCESS) { 737 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed," 738 " status=%d", status); 739 error = RDMA_FAILED; 740 goto fail; 741 } 742 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca); 743 744 /* 745 * Enable CQ callbacks. CQ Callbacks are single shot 746 * (e.g. you have to call ibt_enable_cq_notify() 747 * after each callback to get another one). 748 */ 749 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION); 750 if (status != IBT_SUCCESS) { 751 cmn_err(CE_WARN, "rib_create_cq: " 752 "enable_cq_notify failed, status %d", status); 753 error = RDMA_FAILED; 754 goto fail; 755 } 756 *cqp = cq; 757 758 return (error); 759 fail: 760 if (cq->rib_cq_hdl) 761 (void) ibt_free_cq(cq->rib_cq_hdl); 762 if (cq) 763 kmem_free(cq, sizeof (rib_cq_t)); 764 return (error); 765 } 766 767 /* 768 * rpcib_find_hca 769 * 770 * Caller should have already locked the hcas_lock before calling 771 * this function. 772 */ 773 static rib_hca_t * 774 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid) 775 { 776 rib_hca_t *hca = ribstat->hcas_list; 777 778 while (hca && hca->hca_guid != guid) 779 hca = hca->next; 780 781 return (hca); 782 } 783 784 static rdma_stat 785 rpcib_open_hcas(rpcib_state_t *ribstat) 786 { 787 rib_hca_t *hca; 788 ibt_status_t ibt_status; 789 rdma_stat status; 790 ibt_hca_portinfo_t *pinfop; 791 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; 792 uint_t size, cq_size; 793 int i; 794 kstat_t *ksp; 795 cache_avl_struct_t example_avl_node; 796 char rssc_name[32]; 797 int old_nhca_inited = ribstat->nhca_inited; 798 ib_guid_t *hca_guids; 799 800 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); 801 802 ribstat->hca_count = ibt_get_hca_list(&hca_guids); 803 if (ribstat->hca_count == 0) 804 return (RDMA_FAILED); 805 806 rw_enter(&ribstat->hcas_list_lock, RW_WRITER); 807 /* 808 * Open a hca and setup for RDMA 809 */ 810 for (i = 0; i < ribstat->hca_count; i++) { 811 if (rpcib_find_hca(ribstat, hca_guids[i])) 812 continue; 813 hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP); 814 815 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, 816 hca_guids[i], &hca->hca_hdl); 817 if (ibt_status != IBT_SUCCESS) { 818 kmem_free(hca, sizeof (rib_hca_t)); 819 continue; 820 } 821 hca->hca_guid = hca_guids[i]; 822 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; 823 hca->state = HCA_INITED; 824 825 /* 826 * query HCA info 827 */ 828 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs); 829 if (ibt_status != IBT_SUCCESS) { 830 goto fail1; 831 } 832 833 /* 834 * One PD (Protection Domain) per HCA. 
835 * A qp is allowed to access a memory region 836 * only when it's in the same PD as that of 837 * the memory region. 838 */ 839 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl); 840 if (ibt_status != IBT_SUCCESS) { 841 goto fail1; 842 } 843 844 /* 845 * query HCA ports 846 */ 847 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 848 0, &pinfop, &hca->hca_nports, &size); 849 if (ibt_status != IBT_SUCCESS) { 850 goto fail2; 851 } 852 hca->hca_ports = pinfop; 853 hca->hca_pinfosz = size; 854 pinfop = NULL; 855 856 cq_size = DEF_CQ_SIZE; /* default cq size */ 857 /* 858 * Create 2 pairs of cq's (1 pair for client 859 * and the other pair for server) on this hca. 860 * If number of qp's gets too large, then several 861 * cq's will be needed. 862 */ 863 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, 864 &hca->svc_rcq); 865 if (status != RDMA_SUCCESS) { 866 goto fail3; 867 } 868 869 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, 870 &hca->svc_scq); 871 if (status != RDMA_SUCCESS) { 872 goto fail3; 873 } 874 875 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, 876 &hca->clnt_rcq); 877 if (status != RDMA_SUCCESS) { 878 goto fail3; 879 } 880 881 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, 882 &hca->clnt_scq); 883 if (status != RDMA_SUCCESS) { 884 goto fail3; 885 } 886 887 /* 888 * Create buffer pools. 889 * Note rib_rbuf_create also allocates memory windows. 890 */ 891 hca->recv_pool = rib_rbufpool_create(hca, 892 RECV_BUFFER, rib_max_rbufs); 893 if (hca->recv_pool == NULL) { 894 goto fail3; 895 } 896 897 hca->send_pool = rib_rbufpool_create(hca, 898 SEND_BUFFER, rib_max_rbufs); 899 if (hca->send_pool == NULL) { 900 rib_rbufpool_destroy(hca, RECV_BUFFER); 901 goto fail3; 902 } 903 904 if (hca->server_side_cache == NULL) { 905 (void) sprintf(rssc_name, 906 "rib_srvr_cache_%llx", 907 (long long unsigned int) hca->hca_guid); 908 hca->server_side_cache = kmem_cache_create( 909 rssc_name, 910 sizeof (cache_avl_struct_t), 0, 911 NULL, 912 NULL, 913 rib_server_side_cache_reclaim, 914 hca, NULL, 0); 915 } 916 917 avl_create(&hca->avl_tree, 918 avl_compare, 919 sizeof (cache_avl_struct_t), 920 (uint_t)(uintptr_t)&example_avl_node.avl_link- 921 (uint_t)(uintptr_t)&example_avl_node); 922 923 rw_init(&hca->bound_services_lock, NULL, RW_DRIVER, 924 hca->iblock); 925 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); 926 rw_init(&hca->avl_rw_lock, 927 NULL, RW_DRIVER, hca->iblock); 928 mutex_init(&hca->cache_allocation_lock, 929 NULL, MUTEX_DRIVER, NULL); 930 hca->avl_init = TRUE; 931 932 /* Create kstats for the cache */ 933 ASSERT(INGLOBALZONE(curproc)); 934 935 if (!stats_enabled) { 936 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc", 937 KSTAT_TYPE_NAMED, 938 sizeof (rpcib_kstat) / sizeof (kstat_named_t), 939 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, 940 GLOBAL_ZONEID); 941 if (ksp) { 942 ksp->ks_data = (void *) &rpcib_kstat; 943 ksp->ks_update = rpcib_cache_kstat_update; 944 kstat_install(ksp); 945 stats_enabled = TRUE; 946 } 947 } 948 if (hca->cleanup_helper == NULL) { 949 char tq_name[sizeof (hca->hca_guid) * 2 + 1]; 950 951 (void) snprintf(tq_name, sizeof (tq_name), "%llX", 952 (unsigned long long int) hca->hca_guid); 953 hca->cleanup_helper = ddi_taskq_create(NULL, 954 tq_name, 1, TASKQ_DEFAULTPRI, 0); 955 } 956 957 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 958 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); 959 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, 960 hca->iblock); 961 
rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, 962 hca->iblock); 963 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); 964 hca->inuse = TRUE; 965 966 hca->next = ribstat->hcas_list; 967 ribstat->hcas_list = hca; 968 ribstat->nhca_inited++; 969 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 970 continue; 971 972 fail3: 973 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 974 fail2: 975 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 976 fail1: 977 (void) ibt_close_hca(hca->hca_hdl); 978 kmem_free(hca, sizeof (rib_hca_t)); 979 } 980 rw_exit(&ribstat->hcas_list_lock); 981 ibt_free_hca_list(hca_guids, ribstat->hca_count); 982 rib_mod.rdma_count = rib_stat->nhca_inited; 983 984 /* 985 * return success if at least one new hca has been configured. 986 */ 987 if (ribstat->nhca_inited != old_nhca_inited) 988 return (RDMA_SUCCESS); 989 else 990 return (RDMA_FAILED); 991 } 992 993 /* 994 * Callback routines 995 */ 996 997 /* 998 * SCQ handlers 999 */ 1000 /* ARGSUSED */ 1001 static void 1002 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1003 { 1004 ibt_status_t ibt_status; 1005 ibt_wc_t wc; 1006 struct send_wid *wd; 1007 CONN *conn; 1008 rib_qp_t *qp; 1009 int i; 1010 1011 /* 1012 * Re-enable cq notify here to avoid missing any 1013 * completion queue notification. 1014 */ 1015 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1016 1017 ibt_status = IBT_SUCCESS; 1018 while (ibt_status != IBT_CQ_EMPTY) { 1019 bzero(&wc, sizeof (wc)); 1020 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1021 if (ibt_status != IBT_SUCCESS) 1022 return; 1023 1024 /* 1025 * Got a send completion 1026 */ 1027 if (wc.wc_id != RDMA_DUMMY_WRID) { 1028 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1029 qp = wd->qp; 1030 conn = qptoc(qp); 1031 1032 mutex_enter(&wd->sendwait_lock); 1033 switch (wc.wc_status) { 1034 case IBT_WC_SUCCESS: 1035 wd->status = RDMA_SUCCESS; 1036 break; 1037 default: 1038 /* 1039 * RC Send Q Error Code Local state Remote State 1040 * ==================== =========== ============ 1041 * IBT_WC_BAD_RESPONSE_ERR ERROR None 1042 * IBT_WC_LOCAL_LEN_ERR ERROR None 1043 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None 1044 * IBT_WC_LOCAL_PROTECT_ERR ERROR None 1045 * IBT_WC_MEM_WIN_BIND_ERR ERROR None 1046 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR 1047 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR 1048 * IBT_WC_REMOTE_OP_ERR ERROR ERROR 1049 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None 1050 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None 1051 * IBT_WC_WR_FLUSHED_ERR ERROR None 1052 */ 1053 /* 1054 * Channel in error state. Set connection to 1055 * ERROR and cleanup will happen either from 1056 * conn_release or from rib_conn_get 1057 */ 1058 wd->status = RDMA_FAILED; 1059 mutex_enter(&conn->c_lock); 1060 if (conn->c_state != C_DISCONN_PEND) 1061 conn->c_state = C_ERROR_CONN; 1062 mutex_exit(&conn->c_lock); 1063 break; 1064 } 1065 1066 if (wd->cv_sig == 1) { 1067 /* 1068 * Notify poster 1069 */ 1070 cv_signal(&wd->wait_cv); 1071 mutex_exit(&wd->sendwait_lock); 1072 } else { 1073 /* 1074 * Poster not waiting for notification. 
1075 * Free the send buffers and send_wid 1076 */ 1077 for (i = 0; i < wd->nsbufs; i++) { 1078 rib_rbuf_free(qptoc(wd->qp), 1079 SEND_BUFFER, 1080 (void *)(uintptr_t)wd->sbufaddr[i]); 1081 } 1082 1083 /* decrement the send ref count */ 1084 rib_send_rele(qp); 1085 1086 mutex_exit(&wd->sendwait_lock); 1087 (void) rib_free_sendwait(wd); 1088 } 1089 } 1090 } 1091 } 1092 1093 /* ARGSUSED */ 1094 static void 1095 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1096 { 1097 ibt_status_t ibt_status; 1098 ibt_wc_t wc; 1099 struct send_wid *wd; 1100 rib_qp_t *qp; 1101 CONN *conn; 1102 int i; 1103 1104 /* 1105 * Re-enable cq notify here to avoid missing any 1106 * completion queue notification. 1107 */ 1108 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1109 1110 ibt_status = IBT_SUCCESS; 1111 while (ibt_status != IBT_CQ_EMPTY) { 1112 bzero(&wc, sizeof (wc)); 1113 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1114 if (ibt_status != IBT_SUCCESS) 1115 return; 1116 1117 /* 1118 * Got a send completion 1119 */ 1120 if (wc.wc_id != RDMA_DUMMY_WRID) { 1121 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1122 qp = wd->qp; 1123 conn = qptoc(qp); 1124 mutex_enter(&wd->sendwait_lock); 1125 1126 switch (wc.wc_status) { 1127 case IBT_WC_SUCCESS: 1128 wd->status = RDMA_SUCCESS; 1129 break; 1130 default: 1131 /* 1132 * Channel in error state. Set connection to 1133 * ERROR and cleanup will happen either from 1134 * conn_release or conn timeout. 1135 */ 1136 wd->status = RDMA_FAILED; 1137 mutex_enter(&conn->c_lock); 1138 if (conn->c_state != C_DISCONN_PEND) 1139 conn->c_state = C_ERROR_CONN; 1140 mutex_exit(&conn->c_lock); 1141 break; 1142 } 1143 1144 if (wd->cv_sig == 1) { 1145 /* 1146 * Update completion status and notify poster 1147 */ 1148 cv_signal(&wd->wait_cv); 1149 mutex_exit(&wd->sendwait_lock); 1150 } else { 1151 /* 1152 * Poster not waiting for notification. 1153 * Free the send buffers and send_wid 1154 */ 1155 for (i = 0; i < wd->nsbufs; i++) { 1156 rib_rbuf_free(qptoc(wd->qp), 1157 SEND_BUFFER, 1158 (void *)(uintptr_t)wd->sbufaddr[i]); 1159 } 1160 1161 /* decrement the send ref count */ 1162 rib_send_rele(qp); 1163 1164 mutex_exit(&wd->sendwait_lock); 1165 (void) rib_free_sendwait(wd); 1166 } 1167 } 1168 } 1169 } 1170 1171 /* 1172 * RCQ handler 1173 */ 1174 /* ARGSUSED */ 1175 static void 1176 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1177 { 1178 rib_qp_t *qp; 1179 ibt_status_t ibt_status; 1180 ibt_wc_t wc; 1181 struct recv_wid *rwid; 1182 1183 /* 1184 * Re-enable cq notify here to avoid missing any 1185 * completion queue notification. 1186 */ 1187 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1188 1189 ibt_status = IBT_SUCCESS; 1190 while (ibt_status != IBT_CQ_EMPTY) { 1191 bzero(&wc, sizeof (wc)); 1192 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1193 if (ibt_status != IBT_SUCCESS) 1194 return; 1195 1196 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id; 1197 qp = rwid->qp; 1198 1199 if (wc.wc_status == IBT_WC_SUCCESS) { 1200 XDR inxdrs, *xdrs; 1201 uint_t xid, vers, op, find_xid = 0; 1202 struct reply *r; 1203 CONN *conn = qptoc(qp); 1204 uint32_t rdma_credit = 0; 1205 1206 xdrs = &inxdrs; 1207 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr, 1208 wc.wc_bytes_xfer, XDR_DECODE); 1209 /* 1210 * Treat xid as opaque (xid is the first entity 1211 * in the rpc rdma message). 1212 */ 1213 xid = *(uint32_t *)(uintptr_t)rwid->addr; 1214 1215 /* Skip xid and set the xdr position accordingly. 
*/ 1216 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1217 (void) xdr_u_int(xdrs, &vers); 1218 (void) xdr_u_int(xdrs, &rdma_credit); 1219 (void) xdr_u_int(xdrs, &op); 1220 XDR_DESTROY(xdrs); 1221 1222 if (vers != RPCRDMA_VERS) { 1223 /* 1224 * Invalid RPC/RDMA version. Cannot 1225 * interoperate. Set connection to 1226 * ERROR state and bail out. 1227 */ 1228 mutex_enter(&conn->c_lock); 1229 if (conn->c_state != C_DISCONN_PEND) 1230 conn->c_state = C_ERROR_CONN; 1231 mutex_exit(&conn->c_lock); 1232 rib_rbuf_free(conn, RECV_BUFFER, 1233 (void *)(uintptr_t)rwid->addr); 1234 rib_free_wid(rwid); 1235 rib_recv_rele(qp); 1236 continue; 1237 } 1238 1239 mutex_enter(&qp->replylist_lock); 1240 for (r = qp->replylist; r != NULL; r = r->next) { 1241 if (r->xid == xid) { 1242 find_xid = 1; 1243 switch (op) { 1244 case RDMA_MSG: 1245 case RDMA_NOMSG: 1246 case RDMA_MSGP: 1247 r->status = RDMA_SUCCESS; 1248 r->vaddr_cq = rwid->addr; 1249 r->bytes_xfer = 1250 wc.wc_bytes_xfer; 1251 cv_signal(&r->wait_cv); 1252 break; 1253 default: 1254 rib_rbuf_free(qptoc(qp), 1255 RECV_BUFFER, 1256 (void *)(uintptr_t) 1257 rwid->addr); 1258 break; 1259 } 1260 break; 1261 } 1262 } 1263 mutex_exit(&qp->replylist_lock); 1264 if (find_xid == 0) { 1265 /* RPC caller not waiting for reply */ 1266 1267 DTRACE_PROBE1(rpcib__i__nomatchxid1, 1268 int, xid); 1269 1270 rib_rbuf_free(qptoc(qp), RECV_BUFFER, 1271 (void *)(uintptr_t)rwid->addr); 1272 } 1273 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) { 1274 CONN *conn = qptoc(qp); 1275 1276 /* 1277 * Connection being flushed. Just free 1278 * the posted buffer 1279 */ 1280 rib_rbuf_free(conn, RECV_BUFFER, 1281 (void *)(uintptr_t)rwid->addr); 1282 } else { 1283 CONN *conn = qptoc(qp); 1284 /* 1285 * RC Recv Q Error Code Local state Remote State 1286 * ==================== =========== ============ 1287 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd 1288 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd 1289 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd 1290 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd 1291 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd 1292 * IBT_WC_WR_FLUSHED_ERR None None 1293 */ 1294 /* 1295 * Channel in error state. Set connection 1296 * in ERROR state. 1297 */ 1298 mutex_enter(&conn->c_lock); 1299 if (conn->c_state != C_DISCONN_PEND) 1300 conn->c_state = C_ERROR_CONN; 1301 mutex_exit(&conn->c_lock); 1302 rib_rbuf_free(conn, RECV_BUFFER, 1303 (void *)(uintptr_t)rwid->addr); 1304 } 1305 rib_free_wid(rwid); 1306 rib_recv_rele(qp); 1307 } 1308 } 1309 1310 /* Server side */ 1311 /* ARGSUSED */ 1312 static void 1313 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1314 { 1315 rdma_recv_data_t *rdp; 1316 rib_qp_t *qp; 1317 ibt_status_t ibt_status; 1318 ibt_wc_t wc; 1319 struct svc_recv *s_recvp; 1320 CONN *conn; 1321 mblk_t *mp; 1322 1323 /* 1324 * Re-enable cq notify here to avoid missing any 1325 * completion queue notification. 
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
		qp = s_recvp->qp;
		conn = qptoc(qp);

		if (wc.wc_status == IBT_WC_SUCCESS) {
			XDR	inxdrs, *xdrs;
			uint_t	xid, vers, op;
			uint32_t rdma_credit;

			xdrs = &inxdrs;
			/* s_recvp->vaddr stores data */
			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
			    wc.wc_bytes_xfer, XDR_DECODE);

			/*
			 * Treat xid as opaque (xid is the first entity
			 * in the rpc rdma message).
			 */
			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
			/* Skip xid and set the xdr position accordingly. */
			XDR_SETPOS(xdrs, sizeof (uint32_t));
			if (!xdr_u_int(xdrs, &vers) ||
			    !xdr_u_int(xdrs, &rdma_credit) ||
			    !xdr_u_int(xdrs, &op)) {
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				XDR_DESTROY(xdrs);
				rib_recv_rele(qp);
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}
			XDR_DESTROY(xdrs);

			if (vers != RPCRDMA_VERS) {
				/*
				 * Invalid RPC/RDMA version.
				 * Drop rpc rdma message.
				 */
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				rib_recv_rele(qp);
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}
			/*
			 * Is this for RDMA_DONE?
			 */
			if (op == RDMA_DONE) {
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				/*
				 * Wake up the thread waiting on
				 * a RDMA_DONE for xid
				 */
				mutex_enter(&qp->rdlist_lock);
				rdma_done_notify(qp, xid);
				mutex_exit(&qp->rdlist_lock);
				rib_recv_rele(qp);
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}

			mutex_enter(&plugin_state_lock);
			mutex_enter(&conn->c_lock);
			if ((plugin_state == ACCEPT) &&
			    (conn->c_state == C_CONNECTED)) {
				conn->c_ref++;
				mutex_exit(&conn->c_lock);
				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
				    == NULL)
					(void) strwaitbuf(
					    sizeof (*rdp), BPRI_LO);
				/*
				 * Plugin is in the accept state, hence the
				 * master transport queue for this is still
				 * accepting requests, so we can call
				 * svc_queuereq to queue this received msg.
				 */
				rdp = (rdma_recv_data_t *)mp->b_rptr;
				rdp->conn = conn;
				rdp->rpcmsg.addr =
				    (caddr_t)(uintptr_t)s_recvp->vaddr;
				rdp->rpcmsg.type = RECV_BUFFER;
				rdp->rpcmsg.len = wc.wc_bytes_xfer;
				rdp->status = wc.wc_status;
				mp->b_wptr += sizeof (*rdp);
				svc_queuereq((queue_t *)rib_stat->q, mp);
				mutex_exit(&plugin_state_lock);
			} else {
				/*
				 * The master transport for this is going
				 * away and the queue is not accepting any more
				 * requests for krpc, so don't do anything, just
				 * free the msg.
1430 */ 1431 mutex_exit(&conn->c_lock); 1432 mutex_exit(&plugin_state_lock); 1433 rib_rbuf_free(conn, RECV_BUFFER, 1434 (void *)(uintptr_t)s_recvp->vaddr); 1435 } 1436 } else { 1437 rib_rbuf_free(conn, RECV_BUFFER, 1438 (void *)(uintptr_t)s_recvp->vaddr); 1439 } 1440 rib_recv_rele(qp); 1441 (void) rib_free_svc_recv(s_recvp); 1442 } 1443 } 1444 1445 static void 1446 rib_attach_hca() 1447 { 1448 mutex_enter(&rib_stat->open_hca_lock); 1449 (void) rpcib_open_hcas(rib_stat); 1450 rib_listen(NULL); 1451 mutex_exit(&rib_stat->open_hca_lock); 1452 } 1453 1454 /* 1455 * Handles DR event of IBT_HCA_DETACH_EVENT. 1456 */ 1457 /* ARGSUSED */ 1458 static void 1459 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 1460 ibt_async_code_t code, ibt_async_event_t *event) 1461 { 1462 switch (code) { 1463 case IBT_HCA_ATTACH_EVENT: 1464 rib_attach_hca(); 1465 break; 1466 case IBT_HCA_DETACH_EVENT: 1467 rib_detach_hca(hca_hdl); 1468 #ifdef DEBUG 1469 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); 1470 #endif 1471 break; 1472 case IBT_EVENT_PORT_UP: 1473 /* 1474 * A port is up. We should call rib_listen() since there is 1475 * a chance that rib_listen() may have failed during 1476 * rib_attach_hca() because the port had not been up yet. 1477 */ 1478 rib_listen(NULL); 1479 #ifdef DEBUG 1480 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); 1481 #endif 1482 break; 1483 #ifdef DEBUG 1484 case IBT_EVENT_PATH_MIGRATED: 1485 cmn_err(CE_NOTE, "rib_async_handler(): " 1486 "IBT_EVENT_PATH_MIGRATED\n"); 1487 break; 1488 case IBT_EVENT_SQD: 1489 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n"); 1490 break; 1491 case IBT_EVENT_COM_EST: 1492 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n"); 1493 break; 1494 case IBT_ERROR_CATASTROPHIC_CHAN: 1495 cmn_err(CE_NOTE, "rib_async_handler(): " 1496 "IBT_ERROR_CATASTROPHIC_CHAN\n"); 1497 break; 1498 case IBT_ERROR_INVALID_REQUEST_CHAN: 1499 cmn_err(CE_NOTE, "rib_async_handler(): " 1500 "IBT_ERROR_INVALID_REQUEST_CHAN\n"); 1501 break; 1502 case IBT_ERROR_ACCESS_VIOLATION_CHAN: 1503 cmn_err(CE_NOTE, "rib_async_handler(): " 1504 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n"); 1505 break; 1506 case IBT_ERROR_PATH_MIGRATE_REQ: 1507 cmn_err(CE_NOTE, "rib_async_handler(): " 1508 "IBT_ERROR_PATH_MIGRATE_REQ\n"); 1509 break; 1510 case IBT_ERROR_CQ: 1511 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n"); 1512 break; 1513 case IBT_ERROR_PORT_DOWN: 1514 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); 1515 break; 1516 case IBT_ASYNC_OPAQUE1: 1517 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); 1518 break; 1519 case IBT_ASYNC_OPAQUE2: 1520 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n"); 1521 break; 1522 case IBT_ASYNC_OPAQUE3: 1523 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n"); 1524 break; 1525 case IBT_ASYNC_OPAQUE4: 1526 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n"); 1527 break; 1528 #endif 1529 default: 1530 break; 1531 } 1532 } 1533 1534 /* 1535 * Client's reachable function. 
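 * Sets up a connection to the remote address simply to verify that the
 * server can be reached over RDMA; on success it returns the HCA used
 * through *handle and releases the connection reference it acquired.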
1536 */ 1537 static rdma_stat 1538 rib_reachable(int addr_type, struct netbuf *raddr, void **handle) 1539 { 1540 rdma_stat status; 1541 rpcib_ping_t rpt; 1542 struct netbuf saddr; 1543 CONN *conn; 1544 1545 bzero(&saddr, sizeof (struct netbuf)); 1546 status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn); 1547 1548 if (status == RDMA_SUCCESS) { 1549 *handle = (void *)rpt.hca; 1550 /* release the reference */ 1551 (void) rib_conn_release(conn); 1552 return (RDMA_SUCCESS); 1553 } else { 1554 *handle = NULL; 1555 DTRACE_PROBE(rpcib__i__pingfailed); 1556 return (RDMA_FAILED); 1557 } 1558 } 1559 1560 /* Client side qp creation */ 1561 static rdma_stat 1562 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) 1563 { 1564 rib_qp_t *kqp = NULL; 1565 CONN *conn; 1566 rdma_clnt_cred_ctrl_t *cc_info; 1567 1568 ASSERT(qp != NULL); 1569 *qp = NULL; 1570 1571 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1572 conn = qptoc(kqp); 1573 kqp->hca = hca; 1574 kqp->rdmaconn.c_rdmamod = &rib_mod; 1575 kqp->rdmaconn.c_private = (caddr_t)kqp; 1576 1577 kqp->mode = RIB_CLIENT; 1578 kqp->chan_flags = IBT_BLOCKING; 1579 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); 1580 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); 1581 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; 1582 /* 1583 * Initialize 1584 */ 1585 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1586 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1587 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1588 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1589 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1590 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); 1591 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1592 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1593 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1594 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1595 /* 1596 * Initialize the client credit control 1597 * portion of the rdmaconn struct. 
1598 */ 1599 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT; 1600 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1601 cc_info->clnt_cc_granted_ops = 0; 1602 cc_info->clnt_cc_in_flight_ops = 0; 1603 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL); 1604 1605 *qp = kqp; 1606 return (RDMA_SUCCESS); 1607 } 1608 1609 /* Server side qp creation */ 1610 static rdma_stat 1611 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) 1612 { 1613 rib_qp_t *kqp = NULL; 1614 ibt_chan_sizes_t chan_sizes; 1615 ibt_rc_chan_alloc_args_t qp_attr; 1616 ibt_status_t ibt_status; 1617 rdma_srv_cred_ctrl_t *cc_info; 1618 1619 *qp = NULL; 1620 1621 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1622 kqp->hca = hca; 1623 kqp->port_num = port; 1624 kqp->rdmaconn.c_rdmamod = &rib_mod; 1625 kqp->rdmaconn.c_private = (caddr_t)kqp; 1626 1627 /* 1628 * Create the qp handle 1629 */ 1630 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1631 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; 1632 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; 1633 qp_attr.rc_pd = hca->pd_hdl; 1634 qp_attr.rc_hca_port_num = port; 1635 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1636 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1637 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1638 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1639 qp_attr.rc_clone_chan = NULL; 1640 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1641 qp_attr.rc_flags = IBT_WR_SIGNALED; 1642 1643 rw_enter(&hca->state_lock, RW_READER); 1644 if (hca->state != HCA_DETACHED) { 1645 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1646 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl, 1647 &chan_sizes); 1648 } else { 1649 rw_exit(&hca->state_lock); 1650 goto fail; 1651 } 1652 rw_exit(&hca->state_lock); 1653 1654 if (ibt_status != IBT_SUCCESS) { 1655 DTRACE_PROBE1(rpcib__i_svccreatechanfail, 1656 int, ibt_status); 1657 goto fail; 1658 } 1659 1660 kqp->mode = RIB_SERVER; 1661 kqp->chan_flags = IBT_BLOCKING; 1662 kqp->q = q; /* server ONLY */ 1663 1664 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1665 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1666 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1667 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1668 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1669 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1670 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1671 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1672 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1673 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1674 /* 1675 * Set the private data area to qp to be used in callbacks 1676 */ 1677 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); 1678 kqp->rdmaconn.c_state = C_CONNECTED; 1679 1680 /* 1681 * Initialize the server credit control 1682 * portion of the rdmaconn struct. 
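	 * The server starts out having granted preposted_rbufs
	 * (RDMA_BUFS_GRANT) receive buffers to the client.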
	 */
	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
	cc_info->srv_cc_buffers_granted = preposted_rbufs;
	cc_info->srv_cc_cur_buffers_used = 0;
	cc_info->srv_cc_posted = preposted_rbufs;

	*qp = kqp;

	return (RDMA_SUCCESS);
fail:
	if (kqp)
		kmem_free(kqp, sizeof (rib_qp_t));

	return (RDMA_FAILED);
}

/* ARGSUSED */
ibt_cm_status_t
rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
    ibt_cm_return_args_t *ret_args, void *priv_data,
    ibt_priv_data_len_t len)
{
	rib_hca_t *hca;

	hca = (rib_hca_t *)clnt_hdl;

	switch (event->cm_type) {

	/* got a connection close event */
	case IBT_CM_EVENT_CONN_CLOSED:
	{
		CONN	*conn;
		rib_qp_t *qp;

		/* check reason why connection was closed */
		switch (event->cm_event.closed) {
		case IBT_CM_CLOSED_DREP_RCVD:
		case IBT_CM_CLOSED_DREQ_TIMEOUT:
		case IBT_CM_CLOSED_DUP:
		case IBT_CM_CLOSED_ABORT:
		case IBT_CM_CLOSED_ALREADY:
			/*
			 * These cases indicate the local end initiated
			 * the closing of the channel. Nothing to do here.
			 */
			break;
		default:
			/*
			 * Reason for CONN_CLOSED event must be one of
			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
			 * or IBT_CM_CLOSED_STALE. These indicate cases where
			 * the remote end is closing the channel. In these
			 * cases free the channel and transition to error
			 * state.
			 */
			qp = ibt_get_chan_private(event->cm_channel);
			conn = qptoc(qp);
			mutex_enter(&conn->c_lock);
			if (conn->c_state == C_DISCONN_PEND) {
				mutex_exit(&conn->c_lock);
				break;
			}

			conn->c_state = C_ERROR_CONN;

			/*
			 * Free the conn if c_ref is down to 0 already
			 */
			if (conn->c_ref == 0) {
				/*
				 * Remove from list and free conn
				 */
				conn->c_state = C_DISCONN_PEND;
				mutex_exit(&conn->c_lock);
				rw_enter(&hca->state_lock, RW_READER);
				if (hca->state != HCA_DETACHED)
					(void) rib_disconnect_channel(conn,
					    &hca->cl_conn_list);
				rw_exit(&hca->state_lock);
			} else {
				/*
				 * conn will be freed when c_ref goes to 0.
				 * Indicate to cleaning thread not to close
				 * the connection, but just free the channel.
				 */
				conn->c_flags |= C_CLOSE_NOTNEEDED;
				mutex_exit(&conn->c_lock);
			}
#ifdef DEBUG
			if (rib_debug)
				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
				    "(CONN_CLOSED) channel disconnected");
#endif
			break;
		}
		break;
	}
	default:
		break;
	}
	return (IBT_CM_ACCEPT);
}

/*
 * Connect to the server.
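 * Allocates an RC channel on the given HCA and opens it to the server's
 * NFS/RDMA service (nfs_rdma_port), retrying up to REFRESH_ATTEMPTS times
 * if the CM reports a stale connection (IBT_CM_CONN_STALE).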
1789 */ 1790 rdma_stat 1791 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp) 1792 { 1793 ibt_chan_open_args_t chan_args; /* channel args */ 1794 ibt_chan_sizes_t chan_sizes; 1795 ibt_rc_chan_alloc_args_t qp_attr; 1796 ibt_status_t ibt_status; 1797 ibt_rc_returns_t ret_args; /* conn reject info */ 1798 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */ 1799 ibt_ip_cm_info_t ipcm_info; 1800 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ]; 1801 1802 1803 (void) bzero(&chan_args, sizeof (chan_args)); 1804 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1805 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t)); 1806 1807 ipcm_info.src_addr.family = rptp->srcip.family; 1808 switch (ipcm_info.src_addr.family) { 1809 case AF_INET: 1810 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr; 1811 break; 1812 case AF_INET6: 1813 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr; 1814 break; 1815 } 1816 1817 ipcm_info.dst_addr.family = rptp->srcip.family; 1818 switch (ipcm_info.dst_addr.family) { 1819 case AF_INET: 1820 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr; 1821 break; 1822 case AF_INET6: 1823 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr; 1824 break; 1825 } 1826 1827 ipcm_info.src_port = (in_port_t)nfs_rdma_port; 1828 1829 ibt_status = ibt_format_ip_private_data(&ipcm_info, 1830 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt); 1831 1832 if (ibt_status != IBT_SUCCESS) { 1833 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n"); 1834 return (-1); 1835 } 1836 1837 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num; 1838 /* Alloc a RC channel */ 1839 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; 1840 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; 1841 qp_attr.rc_pd = hca->pd_hdl; 1842 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1843 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1844 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1845 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1846 qp_attr.rc_clone_chan = NULL; 1847 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1848 qp_attr.rc_flags = IBT_WR_SIGNALED; 1849 1850 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port); 1851 chan_args.oc_path = &rptp->path; 1852 1853 chan_args.oc_cm_handler = rib_clnt_cm_handler; 1854 chan_args.oc_cm_clnt_private = (void *)hca; 1855 chan_args.oc_rdma_ra_out = 4; 1856 chan_args.oc_rdma_ra_in = 4; 1857 chan_args.oc_path_retry_cnt = 2; 1858 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; 1859 chan_args.oc_priv_data = cmp_ip_pvt; 1860 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ; 1861 1862 refresh: 1863 rw_enter(&hca->state_lock, RW_READER); 1864 if (hca->state != HCA_DETACHED) { 1865 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1866 IBT_ACHAN_NO_FLAGS, 1867 &qp_attr, &qp->qp_hdl, 1868 &chan_sizes); 1869 } else { 1870 rw_exit(&hca->state_lock); 1871 return (RDMA_FAILED); 1872 } 1873 rw_exit(&hca->state_lock); 1874 1875 if (ibt_status != IBT_SUCCESS) { 1876 DTRACE_PROBE1(rpcib__i_conntosrv, 1877 int, ibt_status); 1878 return (RDMA_FAILED); 1879 } 1880 1881 /* Connect to the Server */ 1882 (void) bzero(&ret_args, sizeof (ret_args)); 1883 mutex_enter(&qp->cb_lock); 1884 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS, 1885 IBT_BLOCKING, &chan_args, &ret_args); 1886 if (ibt_status != IBT_SUCCESS) { 1887 DTRACE_PROBE2(rpcib__i_openrctosrv, 1888 int, ibt_status, int, ret_args.rc_status); 1889 1890 (void) ibt_free_channel(qp->qp_hdl); 1891 qp->qp_hdl = NULL; 1892 mutex_exit(&qp->cb_lock); 1893 if (refresh-- && ibt_status == IBT_CM_FAILURE 
&& 1894 ret_args.rc_status == IBT_CM_CONN_STALE) { 1895 /* 1896 * Got IBT_CM_CONN_STALE probably because of stale 1897 * data on the passive end of a channel that existed 1898 * prior to reboot. Retry establishing a channel 1899 * REFRESH_ATTEMPTS times, during which time the 1900 * stale conditions on the server might clear up. 1901 */ 1902 goto refresh; 1903 } 1904 return (RDMA_FAILED); 1905 } 1906 mutex_exit(&qp->cb_lock); 1907 /* 1908 * Set the private data area to qp to be used in callbacks 1909 */ 1910 ibt_set_chan_private(qp->qp_hdl, (void *)qp); 1911 return (RDMA_SUCCESS); 1912 } 1913 1914 rdma_stat 1915 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) 1916 { 1917 uint_t i, addr_count; 1918 ibt_status_t ibt_status; 1919 uint8_t num_paths_p; 1920 ibt_ip_path_attr_t ipattr; 1921 ibt_path_ip_src_t srcip; 1922 rpcib_ipaddrs_t addrs4; 1923 rpcib_ipaddrs_t addrs6; 1924 struct sockaddr_in *sinp; 1925 struct sockaddr_in6 *sin6p; 1926 rdma_stat retval = RDMA_FAILED; 1927 rib_hca_t *hca; 1928 1929 if ((addr_type != AF_INET) && (addr_type != AF_INET6)) 1930 return (RDMA_INVAL); 1931 ASSERT(raddr->buf != NULL); 1932 1933 bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); 1934 1935 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || 1936 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { 1937 retval = RDMA_FAILED; 1938 goto done2; 1939 } 1940 1941 if (addr_type == AF_INET) { 1942 addr_count = addrs4.ri_count; 1943 sinp = (struct sockaddr_in *)raddr->buf; 1944 rptp->dstip.family = AF_INET; 1945 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr; 1946 sinp = addrs4.ri_list; 1947 } else { 1948 addr_count = addrs6.ri_count; 1949 sin6p = (struct sockaddr_in6 *)raddr->buf; 1950 rptp->dstip.family = AF_INET6; 1951 rptp->dstip.un.ip6addr = sin6p->sin6_addr; 1952 sin6p = addrs6.ri_list; 1953 } 1954 1955 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 1956 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 1957 rw_enter(&hca->state_lock, RW_READER); 1958 if (hca->state == HCA_DETACHED) { 1959 rw_exit(&hca->state_lock); 1960 continue; 1961 } 1962 1963 ipattr.ipa_dst_ip = &rptp->dstip; 1964 ipattr.ipa_hca_guid = hca->hca_guid; 1965 ipattr.ipa_ndst = 1; 1966 ipattr.ipa_max_paths = 1; 1967 ipattr.ipa_src_ip.family = rptp->dstip.family; 1968 for (i = 0; i < addr_count; i++) { 1969 num_paths_p = 0; 1970 if (addr_type == AF_INET) { 1971 ipattr.ipa_src_ip.un.ip4addr = 1972 sinp[i].sin_addr.s_addr; 1973 } else { 1974 ipattr.ipa_src_ip.un.ip6addr = 1975 sin6p[i].sin6_addr; 1976 } 1977 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1978 1979 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1980 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, 1981 &num_paths_p, &srcip); 1982 if (ibt_status == IBT_SUCCESS && 1983 num_paths_p != 0 && 1984 rptp->path.pi_hca_guid == hca->hca_guid) { 1985 rptp->hca = hca; 1986 rw_exit(&hca->state_lock); 1987 if (addr_type == AF_INET) { 1988 rptp->srcip.family = AF_INET; 1989 rptp->srcip.un.ip4addr = 1990 srcip.ip_primary.un.ip4addr; 1991 } else { 1992 rptp->srcip.family = AF_INET6; 1993 rptp->srcip.un.ip6addr = 1994 srcip.ip_primary.un.ip6addr; 1995 1996 } 1997 retval = RDMA_SUCCESS; 1998 goto done1; 1999 } 2000 } 2001 rw_exit(&hca->state_lock); 2002 } 2003 done1: 2004 rw_exit(&rib_stat->hcas_list_lock); 2005 done2: 2006 if (addrs4.ri_size > 0) 2007 kmem_free(addrs4.ri_list, addrs4.ri_size); 2008 if (addrs6.ri_size > 0) 2009 kmem_free(addrs6.ri_list, addrs6.ri_size); 2010 return (retval); 2011 } 2012 2013 /* 2014 * Close channel, remove from connection list and 2015 * free up 
resources allocated for that channel. 2016 */ 2017 rdma_stat 2018 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list) 2019 { 2020 rib_qp_t *qp = ctoqp(conn); 2021 rib_hca_t *hca; 2022 2023 mutex_enter(&conn->c_lock); 2024 if (conn->c_timeout != NULL) { 2025 mutex_exit(&conn->c_lock); 2026 (void) untimeout(conn->c_timeout); 2027 mutex_enter(&conn->c_lock); 2028 } 2029 2030 while (conn->c_flags & C_CLOSE_PENDING) { 2031 cv_wait(&conn->c_cv, &conn->c_lock); 2032 } 2033 mutex_exit(&conn->c_lock); 2034 2035 /* 2036 * c_ref == 0 and connection is in C_DISCONN_PEND 2037 */ 2038 hca = qp->hca; 2039 if (conn_list != NULL) 2040 (void) rib_rm_conn(conn, conn_list); 2041 2042 /* 2043 * There is only one case where we get here with 2044 * qp_hdl = NULL, which is during connection setup on 2045 * the client. In such a case there are no posted 2046 * send/recv buffers. 2047 */ 2048 if (qp->qp_hdl != NULL) { 2049 mutex_enter(&qp->posted_rbufs_lock); 2050 while (qp->n_posted_rbufs) 2051 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock); 2052 mutex_exit(&qp->posted_rbufs_lock); 2053 2054 mutex_enter(&qp->send_rbufs_lock); 2055 while (qp->n_send_rbufs) 2056 cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock); 2057 mutex_exit(&qp->send_rbufs_lock); 2058 2059 (void) ibt_free_channel(qp->qp_hdl); 2060 qp->qp_hdl = NULL; 2061 } 2062 2063 ASSERT(qp->rdlist == NULL); 2064 2065 if (qp->replylist != NULL) { 2066 (void) rib_rem_replylist(qp); 2067 } 2068 2069 cv_destroy(&qp->cb_conn_cv); 2070 cv_destroy(&qp->posted_rbufs_cv); 2071 cv_destroy(&qp->send_rbufs_cv); 2072 mutex_destroy(&qp->cb_lock); 2073 mutex_destroy(&qp->replylist_lock); 2074 mutex_destroy(&qp->posted_rbufs_lock); 2075 mutex_destroy(&qp->send_rbufs_lock); 2076 mutex_destroy(&qp->rdlist_lock); 2077 2078 cv_destroy(&conn->c_cv); 2079 mutex_destroy(&conn->c_lock); 2080 2081 if (conn->c_raddr.buf != NULL) { 2082 kmem_free(conn->c_raddr.buf, conn->c_raddr.len); 2083 } 2084 if (conn->c_laddr.buf != NULL) { 2085 kmem_free(conn->c_laddr.buf, conn->c_laddr.len); 2086 } 2087 if (conn->c_netid != NULL) { 2088 kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1)); 2089 } 2090 if (conn->c_addrmask.buf != NULL) { 2091 kmem_free(conn->c_addrmask.buf, conn->c_addrmask.len); 2092 } 2093 2094 /* 2095 * Credit control cleanup. 2096 */ 2097 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) { 2098 rdma_clnt_cred_ctrl_t *cc_info; 2099 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 2100 cv_destroy(&cc_info->clnt_cc_cv); 2101 } 2102 2103 kmem_free(qp, sizeof (rib_qp_t)); 2104 2105 /* 2106 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL, 2107 * then the hca is no longer being used. 2108 */ 2109 if (conn_list != NULL) { 2110 rw_enter(&hca->state_lock, RW_READER); 2111 if (hca->state == HCA_DETACHED) { 2112 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 2113 if (hca->srv_conn_list.conn_hd == NULL) { 2114 rw_enter(&hca->cl_conn_list.conn_lock, 2115 RW_READER); 2116 2117 if (hca->cl_conn_list.conn_hd == NULL) { 2118 mutex_enter(&hca->inuse_lock); 2119 hca->inuse = FALSE; 2120 cv_signal(&hca->cb_cv); 2121 mutex_exit(&hca->inuse_lock); 2122 } 2123 rw_exit(&hca->cl_conn_list.conn_lock); 2124 } 2125 rw_exit(&hca->srv_conn_list.conn_lock); 2126 } 2127 rw_exit(&hca->state_lock); 2128 } 2129 2130 return (RDMA_SUCCESS); 2131 } 2132 2133 /* 2134 * All sends are done under the protection of 2135 * the wdesc->sendwait_lock. n_send_rbufs count 2136 * is protected using the send_rbufs_lock. 
2137 * lock ordering is: 2138 * sendwait_lock -> send_rbufs_lock 2139 */ 2140 2141 void 2142 rib_send_hold(rib_qp_t *qp) 2143 { 2144 mutex_enter(&qp->send_rbufs_lock); 2145 qp->n_send_rbufs++; 2146 mutex_exit(&qp->send_rbufs_lock); 2147 } 2148 2149 void 2150 rib_send_rele(rib_qp_t *qp) 2151 { 2152 mutex_enter(&qp->send_rbufs_lock); 2153 qp->n_send_rbufs--; 2154 if (qp->n_send_rbufs == 0) 2155 cv_signal(&qp->send_rbufs_cv); 2156 mutex_exit(&qp->send_rbufs_lock); 2157 } 2158 2159 void 2160 rib_recv_rele(rib_qp_t *qp) 2161 { 2162 mutex_enter(&qp->posted_rbufs_lock); 2163 qp->n_posted_rbufs--; 2164 if (qp->n_posted_rbufs == 0) 2165 cv_signal(&qp->posted_rbufs_cv); 2166 mutex_exit(&qp->posted_rbufs_lock); 2167 } 2168 2169 /* 2170 * Wait for send completion notification. Only on receiving a 2171 * notification be it a successful or error completion, free the 2172 * send_wid. 2173 */ 2174 static rdma_stat 2175 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 2176 { 2177 clock_t timout, cv_wait_ret; 2178 rdma_stat error = RDMA_SUCCESS; 2179 int i; 2180 2181 /* 2182 * Wait for send to complete 2183 */ 2184 ASSERT(wd != NULL); 2185 mutex_enter(&wd->sendwait_lock); 2186 if (wd->status == (uint_t)SEND_WAIT) { 2187 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 2188 ddi_get_lbolt(); 2189 2190 if (qp->mode == RIB_SERVER) { 2191 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, 2192 &wd->sendwait_lock, timout)) > 0 && 2193 wd->status == (uint_t)SEND_WAIT) 2194 ; 2195 switch (cv_wait_ret) { 2196 case -1: /* timeout */ 2197 DTRACE_PROBE(rpcib__i__srvsendwait__timeout); 2198 2199 wd->cv_sig = 0; /* no signal needed */ 2200 error = RDMA_TIMEDOUT; 2201 break; 2202 default: /* got send completion */ 2203 break; 2204 } 2205 } else { 2206 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, 2207 &wd->sendwait_lock, timout)) > 0 && 2208 wd->status == (uint_t)SEND_WAIT) 2209 ; 2210 switch (cv_wait_ret) { 2211 case -1: /* timeout */ 2212 DTRACE_PROBE(rpcib__i__clntsendwait__timeout); 2213 2214 wd->cv_sig = 0; /* no signal needed */ 2215 error = RDMA_TIMEDOUT; 2216 break; 2217 case 0: /* interrupted */ 2218 DTRACE_PROBE(rpcib__i__clntsendwait__intr); 2219 2220 wd->cv_sig = 0; /* no signal needed */ 2221 error = RDMA_INTR; 2222 break; 2223 default: /* got send completion */ 2224 break; 2225 } 2226 } 2227 } 2228 2229 if (wd->status != (uint_t)SEND_WAIT) { 2230 /* got send completion */ 2231 if (wd->status != RDMA_SUCCESS) { 2232 switch (wd->status) { 2233 case RDMA_CONNLOST: 2234 error = RDMA_CONNLOST; 2235 break; 2236 default: 2237 error = RDMA_FAILED; 2238 break; 2239 } 2240 } 2241 for (i = 0; i < wd->nsbufs; i++) { 2242 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2243 (void *)(uintptr_t)wd->sbufaddr[i]); 2244 } 2245 2246 rib_send_rele(qp); 2247 2248 mutex_exit(&wd->sendwait_lock); 2249 (void) rib_free_sendwait(wd); 2250 2251 } else { 2252 mutex_exit(&wd->sendwait_lock); 2253 } 2254 return (error); 2255 } 2256 2257 static struct send_wid * 2258 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) 2259 { 2260 struct send_wid *wd; 2261 2262 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); 2263 wd->xid = xid; 2264 wd->cv_sig = cv_sig; 2265 wd->qp = qp; 2266 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); 2267 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); 2268 wd->status = (uint_t)SEND_WAIT; 2269 2270 return (wd); 2271 } 2272 2273 static int 2274 rib_free_sendwait(struct send_wid *wdesc) 2275 { 2276 cv_destroy(&wdesc->wait_cv); 2277 mutex_destroy(&wdesc->sendwait_lock); 2278 kmem_free(wdesc, sizeof 
(*wdesc)); 2279 2280 return (0); 2281 } 2282 2283 static rdma_stat 2284 rib_rem_rep(rib_qp_t *qp, struct reply *rep) 2285 { 2286 mutex_enter(&qp->replylist_lock); 2287 if (rep != NULL) { 2288 (void) rib_remreply(qp, rep); 2289 mutex_exit(&qp->replylist_lock); 2290 return (RDMA_SUCCESS); 2291 } 2292 mutex_exit(&qp->replylist_lock); 2293 return (RDMA_FAILED); 2294 } 2295 2296 /* 2297 * Send buffers are freed here only in case of error in posting 2298 * on QP. If the post succeeded, the send buffers are freed upon 2299 * send completion in rib_sendwait() or in the scq_handler. 2300 */ 2301 rdma_stat 2302 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2303 int send_sig, int cv_sig, caddr_t *swid) 2304 { 2305 struct send_wid *wdesc; 2306 struct clist *clp; 2307 ibt_status_t ibt_status = IBT_SUCCESS; 2308 rdma_stat ret = RDMA_SUCCESS; 2309 ibt_send_wr_t tx_wr; 2310 int i, nds; 2311 ibt_wr_ds_t sgl[DSEG_MAX]; 2312 uint_t total_msg_size; 2313 rib_qp_t *qp; 2314 2315 qp = ctoqp(conn); 2316 2317 ASSERT(cl != NULL); 2318 2319 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2320 2321 nds = 0; 2322 total_msg_size = 0; 2323 clp = cl; 2324 while (clp != NULL) { 2325 if (nds >= DSEG_MAX) { 2326 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); 2327 return (RDMA_FAILED); 2328 } 2329 sgl[nds].ds_va = clp->w.c_saddr; 2330 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2331 sgl[nds].ds_len = clp->c_len; 2332 total_msg_size += clp->c_len; 2333 clp = clp->c_next; 2334 nds++; 2335 } 2336 2337 if (send_sig) { 2338 /* Set SEND_SIGNAL flag. */ 2339 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2340 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2341 *swid = (caddr_t)wdesc; 2342 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2343 mutex_enter(&wdesc->sendwait_lock); 2344 wdesc->nsbufs = nds; 2345 for (i = 0; i < nds; i++) { 2346 wdesc->sbufaddr[i] = sgl[i].ds_va; 2347 } 2348 } else { 2349 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2350 *swid = NULL; 2351 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2352 } 2353 2354 tx_wr.wr_opcode = IBT_WRC_SEND; 2355 tx_wr.wr_trans = IBT_RC_SRV; 2356 tx_wr.wr_nds = nds; 2357 tx_wr.wr_sgl = sgl; 2358 2359 mutex_enter(&conn->c_lock); 2360 if (conn->c_state == C_CONNECTED) { 2361 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2362 } 2363 if (conn->c_state != C_CONNECTED || 2364 ibt_status != IBT_SUCCESS) { 2365 if (conn->c_state != C_DISCONN_PEND) 2366 conn->c_state = C_ERROR_CONN; 2367 mutex_exit(&conn->c_lock); 2368 if (send_sig) { 2369 for (i = 0; i < nds; i++) { 2370 rib_rbuf_free(conn, SEND_BUFFER, 2371 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2372 } 2373 mutex_exit(&wdesc->sendwait_lock); 2374 (void) rib_free_sendwait(wdesc); 2375 } 2376 return (RDMA_CONNLOST); 2377 } 2378 2379 mutex_exit(&conn->c_lock); 2380 2381 if (send_sig) { 2382 rib_send_hold(qp); 2383 mutex_exit(&wdesc->sendwait_lock); 2384 if (cv_sig) { 2385 /* 2386 * cv_wait for send to complete. 2387 * We can fail due to a timeout or signal or 2388 * unsuccessful send. 2389 */ 2390 ret = rib_sendwait(qp, wdesc); 2391 2392 return (ret); 2393 } 2394 } 2395 2396 return (RDMA_SUCCESS); 2397 } 2398 2399 2400 rdma_stat 2401 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2402 { 2403 rdma_stat ret; 2404 caddr_t wd; 2405 2406 /* send-wait & cv_signal */ 2407 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); 2408 return (ret); 2409 } 2410 2411 /* 2412 * Deprecated/obsolete interface not used currently 2413 * but earlier used for READ-READ protocol. 2414 * Send RPC reply and wait for RDMA_DONE. 
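 *
 * Like rib_send(), this routine rides on rib_send_and_wait(); a
 * minimal usage sketch of that machinery follows (illustrative only,
 * not part of the driver; "conn", "cl" and "xid" are assumed to name
 * an established CONN, a clist of registered SEND_BUFFER chunks and
 * the RPC transaction id):
 *
 *	if (rib_send(conn, cl, xid) != RDMA_SUCCESS)
 *		(the post failed or the send completed in error)
 *
 * Once a post succeeds, the SEND_BUFFERs named by the clist belong to
 * the transport and are freed in rib_sendwait() or in the send CQ
 * handler, never by the caller.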
2415 */ 2416 rdma_stat 2417 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2418 { 2419 rdma_stat ret = RDMA_SUCCESS; 2420 struct rdma_done_list *rd; 2421 clock_t cv_wait_ret; 2422 caddr_t *wid = NULL; 2423 rib_qp_t *qp = ctoqp(conn); 2424 2425 mutex_enter(&qp->rdlist_lock); 2426 rd = rdma_done_add(qp, msgid); 2427 2428 /* No cv_signal (whether send-wait or no-send-wait) */ 2429 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); 2430 2431 if (ret != RDMA_SUCCESS) { 2432 rdma_done_rm(qp, rd); 2433 } else { 2434 /* 2435 * Wait for RDMA_DONE from remote end 2436 */ 2437 cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv, 2438 &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000), 2439 TR_CLOCK_TICK); 2440 2441 rdma_done_rm(qp, rd); 2442 2443 if (cv_wait_ret < 0) { 2444 ret = RDMA_TIMEDOUT; 2445 } 2446 } 2447 2448 mutex_exit(&qp->rdlist_lock); 2449 return (ret); 2450 } 2451 2452 static struct recv_wid * 2453 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2454 { 2455 struct recv_wid *rwid; 2456 2457 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2458 rwid->xid = msgid; 2459 rwid->addr = sgl->ds_va; 2460 rwid->qp = qp; 2461 2462 return (rwid); 2463 } 2464 2465 static void 2466 rib_free_wid(struct recv_wid *rwid) 2467 { 2468 kmem_free(rwid, sizeof (struct recv_wid)); 2469 } 2470 2471 rdma_stat 2472 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2473 { 2474 rib_qp_t *qp = ctoqp(conn); 2475 struct clist *clp = cl; 2476 struct reply *rep; 2477 struct recv_wid *rwid; 2478 int nds; 2479 ibt_wr_ds_t sgl[DSEG_MAX]; 2480 ibt_recv_wr_t recv_wr; 2481 rdma_stat ret; 2482 ibt_status_t ibt_status; 2483 2484 /* 2485 * rdma_clnt_postrecv uses RECV_BUFFER. 2486 */ 2487 2488 nds = 0; 2489 while (cl != NULL) { 2490 if (nds >= DSEG_MAX) { 2491 ret = RDMA_FAILED; 2492 goto done; 2493 } 2494 sgl[nds].ds_va = cl->w.c_saddr; 2495 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2496 sgl[nds].ds_len = cl->c_len; 2497 cl = cl->c_next; 2498 nds++; 2499 } 2500 2501 if (nds != 1) { 2502 ret = RDMA_FAILED; 2503 goto done; 2504 } 2505 2506 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2507 recv_wr.wr_nds = nds; 2508 recv_wr.wr_sgl = sgl; 2509 2510 rwid = rib_create_wid(qp, &sgl[0], msgid); 2511 if (rwid) { 2512 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; 2513 } else { 2514 ret = RDMA_NORESOURCE; 2515 goto done; 2516 } 2517 rep = rib_addreplylist(qp, msgid); 2518 if (!rep) { 2519 rib_free_wid(rwid); 2520 ret = RDMA_NORESOURCE; 2521 goto done; 2522 } 2523 2524 mutex_enter(&conn->c_lock); 2525 2526 if (conn->c_state == C_CONNECTED) { 2527 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2528 } 2529 2530 if (conn->c_state != C_CONNECTED || 2531 ibt_status != IBT_SUCCESS) { 2532 if (conn->c_state != C_DISCONN_PEND) 2533 conn->c_state = C_ERROR_CONN; 2534 mutex_exit(&conn->c_lock); 2535 rib_free_wid(rwid); 2536 (void) rib_rem_rep(qp, rep); 2537 ret = RDMA_CONNLOST; 2538 goto done; 2539 } 2540 2541 mutex_enter(&qp->posted_rbufs_lock); 2542 qp->n_posted_rbufs++; 2543 mutex_exit(&qp->posted_rbufs_lock); 2544 2545 mutex_exit(&conn->c_lock); 2546 return (RDMA_SUCCESS); 2547 2548 done: 2549 while (clp != NULL) { 2550 rib_rbuf_free(conn, RECV_BUFFER, 2551 (void *)(uintptr_t)clp->w.c_saddr3); 2552 clp = clp->c_next; 2553 } 2554 return (ret); 2555 } 2556 2557 rdma_stat 2558 rib_svc_post(CONN* conn, struct clist *cl) 2559 { 2560 rib_qp_t *qp = ctoqp(conn); 2561 struct svc_recv *s_recvp; 2562 int nds; 2563 ibt_wr_ds_t sgl[DSEG_MAX]; 2564 ibt_recv_wr_t recv_wr; 2565 ibt_status_t 
ibt_status; 2566 2567 nds = 0; 2568 while (cl != NULL) { 2569 if (nds >= DSEG_MAX) { 2570 return (RDMA_FAILED); 2571 } 2572 sgl[nds].ds_va = cl->w.c_saddr; 2573 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2574 sgl[nds].ds_len = cl->c_len; 2575 cl = cl->c_next; 2576 nds++; 2577 } 2578 2579 if (nds != 1) { 2580 rib_rbuf_free(conn, RECV_BUFFER, 2581 (caddr_t)(uintptr_t)sgl[0].ds_va); 2582 2583 return (RDMA_FAILED); 2584 } 2585 2586 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2587 recv_wr.wr_nds = nds; 2588 recv_wr.wr_sgl = sgl; 2589 2590 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2591 /* Use s_recvp's addr as wr id */ 2592 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; 2593 mutex_enter(&conn->c_lock); 2594 if (conn->c_state == C_CONNECTED) { 2595 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2596 } 2597 if (conn->c_state != C_CONNECTED || 2598 ibt_status != IBT_SUCCESS) { 2599 if (conn->c_state != C_DISCONN_PEND) 2600 conn->c_state = C_ERROR_CONN; 2601 mutex_exit(&conn->c_lock); 2602 rib_rbuf_free(conn, RECV_BUFFER, 2603 (caddr_t)(uintptr_t)sgl[0].ds_va); 2604 (void) rib_free_svc_recv(s_recvp); 2605 2606 return (RDMA_CONNLOST); 2607 } 2608 mutex_exit(&conn->c_lock); 2609 2610 return (RDMA_SUCCESS); 2611 } 2612 2613 /* Client */ 2614 rdma_stat 2615 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2616 { 2617 return (rib_clnt_post(conn, cl, msgid)); 2618 } 2619 2620 /* Client */ 2621 rdma_stat 2622 rib_post_resp_remove(CONN* conn, uint32_t msgid) 2623 { 2624 rib_qp_t *qp = ctoqp(conn); 2625 struct reply *rep; 2626 2627 mutex_enter(&qp->replylist_lock); 2628 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2629 if (rep->xid == msgid) { 2630 if (rep->vaddr_cq) { 2631 rib_rbuf_free(conn, RECV_BUFFER, 2632 (caddr_t)(uintptr_t)rep->vaddr_cq); 2633 } 2634 (void) rib_remreply(qp, rep); 2635 break; 2636 } 2637 } 2638 mutex_exit(&qp->replylist_lock); 2639 2640 return (RDMA_SUCCESS); 2641 } 2642 2643 /* Server */ 2644 rdma_stat 2645 rib_post_recv(CONN *conn, struct clist *cl) 2646 { 2647 rib_qp_t *qp = ctoqp(conn); 2648 2649 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2650 mutex_enter(&qp->posted_rbufs_lock); 2651 qp->n_posted_rbufs++; 2652 mutex_exit(&qp->posted_rbufs_lock); 2653 return (RDMA_SUCCESS); 2654 } 2655 return (RDMA_FAILED); 2656 } 2657 2658 /* 2659 * Client side only interface to "recv" the rpc reply buf 2660 * posted earlier by rib_post_resp(conn, cl, msgid). 2661 */ 2662 rdma_stat 2663 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2664 { 2665 struct reply *rep = NULL; 2666 clock_t timout, cv_wait_ret; 2667 rdma_stat ret = RDMA_SUCCESS; 2668 rib_qp_t *qp = ctoqp(conn); 2669 2670 /* 2671 * Find the reply structure for this msgid 2672 */ 2673 mutex_enter(&qp->replylist_lock); 2674 2675 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2676 if (rep->xid == msgid) 2677 break; 2678 } 2679 2680 if (rep != NULL) { 2681 /* 2682 * If message not yet received, wait. 
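 *
 * (Client usage, as an illustrative sketch only and not part of the
 * driver; "conn", "cl", "xid" and "cl_reply" are hypothetical names.
 * The reply buffer is pre-posted under an xid via rib_post_resp(),
 * and rib_recv() later blocks on that same xid for at most
 * REPLY_WAIT_TIME seconds.)
 *
 *	(void) rib_post_resp(conn, cl, xid);
 *	(send the RPC call over the connection)
 *	status = rib_recv(conn, &cl_reply, xid);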
2683 */ 2684 if (rep->status == (uint_t)REPLY_WAIT) { 2685 timout = ddi_get_lbolt() + 2686 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2687 2688 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2689 &qp->replylist_lock, timout)) > 0 && 2690 rep->status == (uint_t)REPLY_WAIT) 2691 ; 2692 2693 switch (cv_wait_ret) { 2694 case -1: /* timeout */ 2695 ret = RDMA_TIMEDOUT; 2696 break; 2697 case 0: 2698 ret = RDMA_INTR; 2699 break; 2700 default: 2701 break; 2702 } 2703 } 2704 2705 if (rep->status == RDMA_SUCCESS) { 2706 struct clist *cl = NULL; 2707 2708 /* 2709 * Got message successfully 2710 */ 2711 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2712 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); 2713 *clp = cl; 2714 } else { 2715 if (rep->status != (uint_t)REPLY_WAIT) { 2716 /* 2717 * Got error in reply message. Free 2718 * recv buffer here. 2719 */ 2720 ret = rep->status; 2721 rib_rbuf_free(conn, RECV_BUFFER, 2722 (caddr_t)(uintptr_t)rep->vaddr_cq); 2723 } 2724 } 2725 (void) rib_remreply(qp, rep); 2726 } else { 2727 /* 2728 * No matching reply structure found for given msgid on the 2729 * reply wait list. 2730 */ 2731 ret = RDMA_INVAL; 2732 DTRACE_PROBE(rpcib__i__nomatchxid2); 2733 } 2734 2735 /* 2736 * Done. 2737 */ 2738 mutex_exit(&qp->replylist_lock); 2739 return (ret); 2740 } 2741 2742 /* 2743 * RDMA write a buffer to the remote address. 2744 */ 2745 rdma_stat 2746 rib_write(CONN *conn, struct clist *cl, int wait) 2747 { 2748 ibt_send_wr_t tx_wr; 2749 int cv_sig; 2750 ibt_wr_ds_t sgl[DSEG_MAX]; 2751 struct send_wid *wdesc; 2752 ibt_status_t ibt_status; 2753 rdma_stat ret = RDMA_SUCCESS; 2754 rib_qp_t *qp = ctoqp(conn); 2755 uint64_t n_writes = 0; 2756 2757 if (cl == NULL) { 2758 return (RDMA_FAILED); 2759 } 2760 2761 while ((cl != NULL)) { 2762 if (cl->c_len > 0) { 2763 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2764 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; 2765 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = 2766 cl->c_dmemhandle.mrc_rmr; /* rkey */ 2767 sgl[0].ds_va = cl->w.c_saddr; 2768 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2769 sgl[0].ds_len = cl->c_len; 2770 2771 if (wait) { 2772 cv_sig = 1; 2773 } else { 2774 if (n_writes > max_unsignaled_rws) { 2775 n_writes = 0; 2776 cv_sig = 1; 2777 } else { 2778 cv_sig = 0; 2779 } 2780 } 2781 2782 if (cv_sig) { 2783 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2784 wdesc = rib_init_sendwait(0, cv_sig, qp); 2785 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2786 mutex_enter(&wdesc->sendwait_lock); 2787 } else { 2788 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2789 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2790 } 2791 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2792 tx_wr.wr_trans = IBT_RC_SRV; 2793 tx_wr.wr_nds = 1; 2794 tx_wr.wr_sgl = sgl; 2795 2796 mutex_enter(&conn->c_lock); 2797 if (conn->c_state == C_CONNECTED) { 2798 ibt_status = 2799 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2800 } 2801 if (conn->c_state != C_CONNECTED || 2802 ibt_status != IBT_SUCCESS) { 2803 if (conn->c_state != C_DISCONN_PEND) 2804 conn->c_state = C_ERROR_CONN; 2805 mutex_exit(&conn->c_lock); 2806 if (cv_sig) { 2807 mutex_exit(&wdesc->sendwait_lock); 2808 (void) rib_free_sendwait(wdesc); 2809 } 2810 return (RDMA_CONNLOST); 2811 } 2812 2813 mutex_exit(&conn->c_lock); 2814 2815 /* 2816 * Wait for send to complete 2817 */ 2818 if (cv_sig) { 2819 2820 rib_send_hold(qp); 2821 mutex_exit(&wdesc->sendwait_lock); 2822 2823 ret = rib_sendwait(qp, wdesc); 2824 if (ret != 0) 2825 return (ret); 2826 } 2827 n_writes ++; 2828 } 2829 cl = cl->c_next; 2830 } 2831 return (RDMA_SUCCESS); 2832 } 2833 2834 /* 2835 
* RDMA Read a buffer from the remote address. 2836 */ 2837 rdma_stat 2838 rib_read(CONN *conn, struct clist *cl, int wait) 2839 { 2840 ibt_send_wr_t rx_wr; 2841 int cv_sig = 0; 2842 ibt_wr_ds_t sgl; 2843 struct send_wid *wdesc; 2844 ibt_status_t ibt_status = IBT_SUCCESS; 2845 rdma_stat ret = RDMA_SUCCESS; 2846 rib_qp_t *qp = ctoqp(conn); 2847 2848 if (cl == NULL) { 2849 return (RDMA_FAILED); 2850 } 2851 2852 while (cl != NULL) { 2853 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2854 /* 2855 * Remote address is at the head chunk item in list. 2856 */ 2857 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; 2858 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; 2859 2860 sgl.ds_va = cl->u.c_daddr; 2861 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2862 sgl.ds_len = cl->c_len; 2863 2864 /* 2865 * If there are multiple chunks to be read, and 2866 * wait is set, ask for signal only for the last chunk 2867 * and wait only on the last chunk. The completion of 2868 * RDMA_READ on last chunk ensures that reads on all 2869 * previous chunks are also completed. 2870 */ 2871 if (wait && (cl->c_next == NULL)) { 2872 cv_sig = 1; 2873 wdesc = rib_init_sendwait(0, cv_sig, qp); 2874 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2875 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2876 mutex_enter(&wdesc->sendwait_lock); 2877 } else { 2878 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2879 rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2880 } 2881 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2882 rx_wr.wr_trans = IBT_RC_SRV; 2883 rx_wr.wr_nds = 1; 2884 rx_wr.wr_sgl = &sgl; 2885 2886 mutex_enter(&conn->c_lock); 2887 if (conn->c_state == C_CONNECTED) { 2888 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2889 } 2890 if (conn->c_state != C_CONNECTED || 2891 ibt_status != IBT_SUCCESS) { 2892 if (conn->c_state != C_DISCONN_PEND) 2893 conn->c_state = C_ERROR_CONN; 2894 mutex_exit(&conn->c_lock); 2895 if (wait && (cl->c_next == NULL)) { 2896 mutex_exit(&wdesc->sendwait_lock); 2897 (void) rib_free_sendwait(wdesc); 2898 } 2899 return (RDMA_CONNLOST); 2900 } 2901 2902 mutex_exit(&conn->c_lock); 2903 2904 /* 2905 * Wait for send to complete if this is the 2906 * last item in the list. 2907 */ 2908 if (wait && cl->c_next == NULL) { 2909 rib_send_hold(qp); 2910 mutex_exit(&wdesc->sendwait_lock); 2911 2912 ret = rib_sendwait(qp, wdesc); 2913 2914 if (ret != 0) 2915 return (ret); 2916 } 2917 cl = cl->c_next; 2918 } 2919 return (RDMA_SUCCESS); 2920 } 2921 2922 /* 2923 * rib_srv_cm_handler() 2924 * Connection Manager callback to handle RC connection requests. 2925 */ 2926 /* ARGSUSED */ 2927 static ibt_cm_status_t 2928 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2929 ibt_cm_return_args_t *ret_args, void *priv_data, 2930 ibt_priv_data_len_t len) 2931 { 2932 queue_t *q; 2933 rib_qp_t *qp; 2934 rib_hca_t *hca; 2935 rdma_stat status = RDMA_SUCCESS; 2936 int i; 2937 struct clist cl; 2938 rdma_buf_t rdbuf = {0}; 2939 void *buf = NULL; 2940 CONN *conn; 2941 ibt_ip_cm_info_t ipinfo; 2942 struct sockaddr_in *s; 2943 struct sockaddr_in6 *s6; 2944 int sin_size = sizeof (struct sockaddr_in); 2945 int in_size = sizeof (struct in_addr); 2946 int sin6_size = sizeof (struct sockaddr_in6); 2947 2948 ASSERT(any != NULL); 2949 ASSERT(event != NULL); 2950 2951 hca = (rib_hca_t *)any; 2952 2953 /* got a connection request */ 2954 switch (event->cm_type) { 2955 case IBT_CM_EVENT_REQ_RCV: 2956 /* 2957 * If the plugin is in the NO_ACCEPT state, bail out. 
2958 */ 2959 mutex_enter(&plugin_state_lock); 2960 if (plugin_state == NO_ACCEPT) { 2961 mutex_exit(&plugin_state_lock); 2962 return (IBT_CM_REJECT); 2963 } 2964 mutex_exit(&plugin_state_lock); 2965 2966 /* 2967 * Need to send a MRA MAD to CM so that it does not 2968 * timeout on us. 2969 */ 2970 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2971 event->cm_event.req.req_timeout * 8, NULL, 0); 2972 2973 mutex_enter(&rib_stat->open_hca_lock); 2974 q = rib_stat->q; 2975 mutex_exit(&rib_stat->open_hca_lock); 2976 2977 status = rib_svc_create_chan(hca, (caddr_t)q, 2978 event->cm_event.req.req_prim_hca_port, &qp); 2979 2980 if (status) { 2981 return (IBT_CM_REJECT); 2982 } 2983 2984 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2985 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 2986 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 2987 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2988 2989 /* 2990 * Pre-posts RECV buffers 2991 */ 2992 conn = qptoc(qp); 2993 for (i = 0; i < preposted_rbufs; i++) { 2994 bzero(&rdbuf, sizeof (rdbuf)); 2995 rdbuf.type = RECV_BUFFER; 2996 buf = rib_rbuf_alloc(conn, &rdbuf); 2997 if (buf == NULL) { 2998 /* 2999 * A connection is not established yet. 3000 * Just flush the channel. Buffers 3001 * posted till now will error out with 3002 * IBT_WC_WR_FLUSHED_ERR. 3003 */ 3004 (void) ibt_flush_channel(qp->qp_hdl); 3005 (void) rib_disconnect_channel(conn, NULL); 3006 return (IBT_CM_REJECT); 3007 } 3008 3009 bzero(&cl, sizeof (cl)); 3010 cl.w.c_saddr3 = (caddr_t)rdbuf.addr; 3011 cl.c_len = rdbuf.len; 3012 cl.c_smemhandle.mrc_lmr = 3013 rdbuf.handle.mrc_lmr; /* lkey */ 3014 cl.c_next = NULL; 3015 status = rib_post_recv(conn, &cl); 3016 if (status != RDMA_SUCCESS) { 3017 /* 3018 * A connection is not established yet. 3019 * Just flush the channel. Buffers 3020 * posted till now will error out with 3021 * IBT_WC_WR_FLUSHED_ERR. 
3022 */ 3023 (void) ibt_flush_channel(qp->qp_hdl); 3024 (void) rib_disconnect_channel(conn, NULL); 3025 return (IBT_CM_REJECT); 3026 } 3027 } 3028 (void) rib_add_connlist(conn, &hca->srv_conn_list); 3029 3030 /* 3031 * Get the address translation 3032 */ 3033 rw_enter(&hca->state_lock, RW_READER); 3034 if (hca->state == HCA_DETACHED) { 3035 rw_exit(&hca->state_lock); 3036 return (IBT_CM_REJECT); 3037 } 3038 rw_exit(&hca->state_lock); 3039 3040 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t)); 3041 3042 if (ibt_get_ip_data(event->cm_priv_data_len, 3043 event->cm_priv_data, 3044 &ipinfo) != IBT_SUCCESS) { 3045 3046 return (IBT_CM_REJECT); 3047 } 3048 3049 switch (ipinfo.src_addr.family) { 3050 case AF_INET: 3051 3052 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, 3053 KM_SLEEP); 3054 (void) strcpy(conn->c_netid, RIBNETID_TCP); 3055 3056 conn->c_raddr.maxlen = 3057 conn->c_raddr.len = sin_size; 3058 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3059 3060 s = (struct sockaddr_in *)conn->c_raddr.buf; 3061 s->sin_family = AF_INET; 3062 bcopy((void *)&ipinfo.src_addr.un.ip4addr, 3063 &s->sin_addr, in_size); 3064 3065 conn->c_laddr.maxlen = 3066 conn->c_laddr.len = sin_size; 3067 conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3068 3069 s = (struct sockaddr_in *)conn->c_laddr.buf; 3070 s->sin_family = AF_INET; 3071 bcopy((void *)&ipinfo.dst_addr.un.ip4addr, 3072 &s->sin_addr, in_size); 3073 3074 conn->c_addrmask.maxlen = conn->c_addrmask.len = 3075 sizeof (struct sockaddr_in); 3076 conn->c_addrmask.buf = 3077 kmem_zalloc(conn->c_addrmask.len, KM_SLEEP); 3078 ((struct sockaddr_in *) 3079 conn->c_addrmask.buf)->sin_addr.s_addr = 3080 (uint32_t)~0; 3081 ((struct sockaddr_in *) 3082 conn->c_addrmask.buf)->sin_family = 3083 (sa_family_t)~0; 3084 break; 3085 3086 case AF_INET6: 3087 3088 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, 3089 KM_SLEEP); 3090 (void) strcpy(conn->c_netid, RIBNETID_TCP6); 3091 3092 conn->c_raddr.maxlen = 3093 conn->c_raddr.len = sin6_size; 3094 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3095 3096 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 3097 s6->sin6_family = AF_INET6; 3098 bcopy((void *)&ipinfo.src_addr.un.ip6addr, 3099 &s6->sin6_addr, 3100 sizeof (struct in6_addr)); 3101 3102 conn->c_laddr.maxlen = 3103 conn->c_laddr.len = sin6_size; 3104 conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3105 3106 s6 = (struct sockaddr_in6 *)conn->c_laddr.buf; 3107 s6->sin6_family = AF_INET6; 3108 bcopy((void *)&ipinfo.dst_addr.un.ip6addr, 3109 &s6->sin6_addr, 3110 sizeof (struct in6_addr)); 3111 3112 conn->c_addrmask.maxlen = conn->c_addrmask.len = 3113 sizeof (struct sockaddr_in6); 3114 conn->c_addrmask.buf = 3115 kmem_zalloc(conn->c_addrmask.len, KM_SLEEP); 3116 (void) memset(&((struct sockaddr_in6 *) 3117 conn->c_addrmask.buf)->sin6_addr, (uchar_t)~0, 3118 sizeof (struct in6_addr)); 3119 ((struct sockaddr_in6 *) 3120 conn->c_addrmask.buf)->sin6_family = 3121 (sa_family_t)~0; 3122 break; 3123 3124 default: 3125 return (IBT_CM_REJECT); 3126 } 3127 3128 break; 3129 3130 case IBT_CM_EVENT_CONN_CLOSED: 3131 { 3132 CONN *conn; 3133 rib_qp_t *qp; 3134 3135 switch (event->cm_event.closed) { 3136 case IBT_CM_CLOSED_DREP_RCVD: 3137 case IBT_CM_CLOSED_DREQ_TIMEOUT: 3138 case IBT_CM_CLOSED_DUP: 3139 case IBT_CM_CLOSED_ABORT: 3140 case IBT_CM_CLOSED_ALREADY: 3141 /* 3142 * These cases indicate the local end initiated 3143 * the closing of the channel. Nothing to do here. 
3144 */
3145 break;
3146 default:
3147 /*
3148 * Reason for CONN_CLOSED event must be one of
3149 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
3150 * or IBT_CM_CLOSED_STALE. These indicate cases where
3151 * the remote end is closing the channel. In these
3152 * cases free the channel and transition to error
3153 * state.
3154 */
3155 qp = ibt_get_chan_private(event->cm_channel);
3156 conn = qptoc(qp);
3157 mutex_enter(&conn->c_lock);
3158 if (conn->c_state == C_DISCONN_PEND) {
3159 mutex_exit(&conn->c_lock);
3160 break;
3161 }
3162 conn->c_state = C_ERROR_CONN;
3163
3164 /*
3165 * Free the conn if c_ref goes down to 0
3166 */
3167 if (conn->c_ref == 0) {
3168 /*
3169 * Remove from list and free conn
3170 */
3171 conn->c_state = C_DISCONN_PEND;
3172 mutex_exit(&conn->c_lock);
3173 (void) rib_disconnect_channel(conn,
3174 &hca->srv_conn_list);
3175 } else {
3176 /*
3177 * conn will be freed when c_ref goes to 0.
3178 * Indicate to cleaning thread not to close
3179 * the connection, but just free the channel.
3180 */
3181 conn->c_flags |= C_CLOSE_NOTNEEDED;
3182 mutex_exit(&conn->c_lock);
3183 }
3184 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
3185 break;
3186 }
3187 break;
3188 }
3189 case IBT_CM_EVENT_CONN_EST:
3190 /*
3191 * RTU received, hence connection established.
3192 */
3193 if (rib_debug > 1)
3194 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3195 "(CONN_EST) channel established");
3196 break;
3197
3198 default:
3199 if (rib_debug > 2) {
3200 /* Let CM handle the following events. */
3201 if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
3202 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3203 "server recv'ed IBT_CM_EVENT_REP_RCV\n");
3204 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
3205 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3206 "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
3207 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
3208 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3209 "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
3210 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
3211 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3212 "server recv'ed IBT_CM_EVENT_APR_RCV\n");
3213 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
3214 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3215 "server recv'ed IBT_CM_EVENT_FAILURE\n");
3216 }
3217 }
3218 return (IBT_CM_DEFAULT);
3219 }
3220
3221 /* accept all other CM messages (i.e.
let the CM handle them) */
3222 return (IBT_CM_ACCEPT);
3223 }
3224
3225 static rdma_stat
3226 rib_register_service(rib_hca_t *hca, int service_type,
3227 uint8_t protocol_num, in_port_t dst_port)
3228 {
3229 ibt_srv_desc_t sdesc;
3230 ibt_hca_portinfo_t *port_infop;
3231 ib_svc_id_t srv_id;
3232 ibt_srv_hdl_t srv_hdl;
3233 uint_t port_size;
3234 uint_t pki, i, num_ports, nbinds;
3235 ibt_status_t ibt_status;
3236 rib_service_t *service;
3237 ib_pkey_t pkey;
3238
3239 /*
3240 * Query all ports for the given HCA
3241 */
3242 rw_enter(&hca->state_lock, RW_READER);
3243 if (hca->state != HCA_DETACHED) {
3244 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3245 &num_ports, &port_size);
3246 rw_exit(&hca->state_lock);
3247 } else {
3248 rw_exit(&hca->state_lock);
3249 return (RDMA_FAILED);
3250 }
3251 if (ibt_status != IBT_SUCCESS) {
3252 return (RDMA_FAILED);
3253 }
3254
3255 DTRACE_PROBE1(rpcib__i__regservice_numports,
3256 int, num_ports);
3257
3258 for (i = 0; i < num_ports; i++) {
3259 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3260 DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3261 int, i+1);
3262 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
3263 DTRACE_PROBE1(rpcib__i__regservice__portactive,
3264 int, i+1);
3265 }
3266 }
3267
3268 /*
3269 * Get all the IP addresses on this system to register the
3270 * given "service type" on all DNS recognized IP addrs.
3271 * Each service type such as NFS will have all the system's
3272 * IP addresses as its different names. For now the only
3273 * type of service we support in RPCIB is NFS.
3274 */
3275 rw_enter(&rib_stat->service_list_lock, RW_WRITER);
3276 /*
3277 * Start registering and binding the service to
3278 * active ports on this HCA.
3279 */
3280 nbinds = 0;
3281 for (service = rib_stat->service_list;
3282 service && (service->srv_type != service_type);
3283 service = service->next)
3284 ;
3285
3286 if (service == NULL) {
3287 /*
3288 * We use IP addresses as the service names for
3289 * service registration. Register each of them
3290 * with CM to obtain a svc_id and svc_hdl. We do not
3291 * register the service with the machine's loopback address.
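 *
 * In outline (an illustrative condensation of the code below, not
 * additional driver logic; "sid" is just a local name used here for
 * clarity): the service is registered once per IP-style service id
 * and then bound to every active port/gid of the HCA:
 *
 *	sid = ibt_get_ip_sid(protocol_num, dst_port);
 *	ibt_status = ibt_register_service(hca->ibt_clnt_hdl, &sdesc,
 *	    sid, 1, &srv_hdl, &srv_id);
 *	ibt_status = ibt_bind_service(srv_hdl, gid, NULL, hca, &sbp);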
3292 */
3293 (void) bzero(&srv_id, sizeof (ib_svc_id_t));
3294 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3295 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3296 sdesc.sd_handler = rib_srv_cm_handler;
3297 sdesc.sd_flags = 0;
3298 ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3299 &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
3300 1, &srv_hdl, &srv_id);
3301 if ((ibt_status != IBT_SUCCESS) &&
3302 (ibt_status != IBT_CM_SERVICE_EXISTS)) {
3303 rw_exit(&rib_stat->service_list_lock);
3304 DTRACE_PROBE1(rpcib__i__regservice__ibtres,
3305 int, ibt_status);
3306 ibt_free_portinfo(port_infop, port_size);
3307 return (RDMA_FAILED);
3308 }
3309
3310 /*
3311 * Allocate and prepare a service entry
3312 */
3313 service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);
3314
3315 service->srv_type = service_type;
3316 service->srv_hdl = srv_hdl;
3317 service->srv_id = srv_id;
3318
3319 service->next = rib_stat->service_list;
3320 rib_stat->service_list = service;
3321 DTRACE_PROBE1(rpcib__i__regservice__new__service,
3322 int, service->srv_type);
3323 } else {
3324 srv_hdl = service->srv_hdl;
3325 srv_id = service->srv_id;
3326 DTRACE_PROBE1(rpcib__i__regservice__existing__service,
3327 int, service->srv_type);
3328 }
3329
3330 for (i = 0; i < num_ports; i++) {
3331 ibt_sbind_hdl_t sbp;
3332 rib_hca_service_t *hca_srv;
3333 ib_gid_t gid;
3334
3335 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3336 continue;
3337
3338 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3339 pkey = port_infop[i].p_pkey_tbl[pki];
3340
3341 rw_enter(&hca->bound_services_lock, RW_READER);
3342 gid = port_infop[i].p_sgid_tbl[0];
3343 for (hca_srv = hca->bound_services; hca_srv;
3344 hca_srv = hca_srv->next) {
3345 if ((hca_srv->srv_id == service->srv_id) &&
3346 (hca_srv->gid.gid_prefix ==
3347 gid.gid_prefix) &&
3348 (hca_srv->gid.gid_guid == gid.gid_guid))
3349 break;
3350 }
3351 rw_exit(&hca->bound_services_lock);
3352 if (hca_srv != NULL) {
3353 /*
3354 * port is already bound to the service
3355 */
3356 DTRACE_PROBE1(
3357 rpcib__i__regservice__already__bound,
3358 int, i+1);
3359 nbinds++;
3360 continue;
3361 }
3362
3363 if ((pkey & IBSRM_HB) &&
3364 (pkey != IB_PKEY_INVALID_FULL)) {
3365
3366 sbp = NULL;
3367 ibt_status = ibt_bind_service(srv_hdl,
3368 gid, NULL, hca, &sbp);
3369
3370 if (ibt_status == IBT_SUCCESS) {
3371 hca_srv = kmem_zalloc(
3372 sizeof (rib_hca_service_t),
3373 KM_SLEEP);
3374 hca_srv->srv_id = srv_id;
3375 hca_srv->gid = gid;
3376 hca_srv->sbind_hdl = sbp;
3377
3378 rw_enter(&hca->bound_services_lock,
3379 RW_WRITER);
3380 hca_srv->next = hca->bound_services;
3381 hca->bound_services = hca_srv;
3382 rw_exit(&hca->bound_services_lock);
3383 nbinds++;
3384 }
3385
3386 DTRACE_PROBE1(rpcib__i__regservice__bindres,
3387 int, ibt_status);
3388 }
3389 }
3390 }
3391 rw_exit(&rib_stat->service_list_lock);
3392
3393 ibt_free_portinfo(port_infop, port_size);
3394
3395 if (nbinds == 0) {
3396 return (RDMA_FAILED);
3397 } else {
3398 /*
3399 * Put this plugin into accept state, since at least
3400 * one registration was successful.
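 *
 * (Descriptive note, not in the original comment: the same
 * plugin_state gate is consulted by rib_srv_cm_handler(), which
 * rejects incoming connection requests while the state is NO_ACCEPT;
 * rib_listen_stop() is what moves it back to NO_ACCEPT.)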
3401 */
3402 mutex_enter(&plugin_state_lock);
3403 plugin_state = ACCEPT;
3404 mutex_exit(&plugin_state_lock);
3405 return (RDMA_SUCCESS);
3406 }
3407 }
3408
3409 void
3410 rib_listen(struct rdma_svc_data *rd)
3411 {
3412 rdma_stat status;
3413 int n_listening = 0;
3414 rib_hca_t *hca;
3415
3416 mutex_enter(&rib_stat->listen_lock);
3417 /*
3418 * If the rd parameter is NULL, it means that rib_stat->q is
3419 * already initialized by a call from RDMA and we just want to
3420 * add a newly attached HCA to the same listening state as other
3421 * HCAs.
3422 */
3423 if (rd == NULL) {
3424 if (rib_stat->q == NULL) {
3425 mutex_exit(&rib_stat->listen_lock);
3426 return;
3427 }
3428 } else {
3429 rib_stat->q = &rd->q;
3430 }
3431 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3432 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3433 /*
3434 * First check if the hca is still attached
3435 */
3436 rw_enter(&hca->state_lock, RW_READER);
3437 if (hca->state != HCA_INITED) {
3438 rw_exit(&hca->state_lock);
3439 continue;
3440 }
3441 rw_exit(&hca->state_lock);
3442
3443 /*
3444 * Right now the only service type is NFS, so
3445 * force feed this value. Ideally, the service
3446 * type should be passed down to us in
3447 * rdma_svc_data.
3448 */
3449 status = rib_register_service(hca, NFS,
3450 IPPROTO_TCP, nfs_rdma_port);
3451 if (status == RDMA_SUCCESS)
3452 n_listening++;
3453 }
3454 rw_exit(&rib_stat->hcas_list_lock);
3455
3456 /*
3457 * Report whether a service is active on any HCA; on failure
3458 * rd->err_code carries a more detailed error code.
3459 */
3460 if (rd) {
3461 if (n_listening > 0) {
3462 rd->active = 1;
3463 rd->err_code = RDMA_SUCCESS;
3464 } else {
3465 rd->active = 0;
3466 rd->err_code = RDMA_FAILED;
3467 }
3468 }
3469 mutex_exit(&rib_stat->listen_lock);
3470 }
3471
3472 /* XXXX */
3473 /* ARGSUSED */
3474 static void
3475 rib_listen_stop(struct rdma_svc_data *svcdata)
3476 {
3477 rib_hca_t *hca;
3478
3479 mutex_enter(&rib_stat->listen_lock);
3480 /*
3481 * KRPC called the RDMATF to stop the listeners; this means we
3482 * stop sending incoming or received requests to the KRPC master
3483 * transport handle for RDMA-IB. It also means that the
3484 * master transport handle, responsible for us, is going away.
3485 */
3486 mutex_enter(&plugin_state_lock);
3487 plugin_state = NO_ACCEPT;
3488 if (svcdata != NULL)
3489 svcdata->active = 0;
3490 mutex_exit(&plugin_state_lock);
3491
3492 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3493 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3494 /*
3495 * First check if the hca is still attached
3496 */
3497 rw_enter(&hca->state_lock, RW_READER);
3498 if (hca->state == HCA_DETACHED) {
3499 rw_exit(&hca->state_lock);
3500 continue;
3501 }
3502 rib_close_channels(&hca->srv_conn_list);
3503 rib_stop_services(hca);
3504 rw_exit(&hca->state_lock);
3505 }
3506 rw_exit(&rib_stat->hcas_list_lock);
3507
3508 /*
3509 * Avoid rib_listen() using the stale q field.
3510 * This could happen if a port goes up after all services
3511 * are already unregistered.
3512 */
3513 rib_stat->q = NULL;
3514 mutex_exit(&rib_stat->listen_lock);
3515 }
3516
3517 /*
3518 * Traverse the HCA's service list to unbind and deregister services.
3519 * For each bound service of the HCA to be removed, first find the
3520 * corresponding service handle (srv_hdl) and then unbind the service
3521 * by calling ibt_unbind_service().
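 *
 * Two lists cooperate here (a descriptive note): rib_stat->service_list
 * holds one rib_service_t per registered service id, carrying the
 * ibt_srv_hdl_t, while each HCA keeps a rib_hca_service_t per bound
 * port/gid, carrying the ibt_sbind_hdl_t. Unbinding therefore pairs
 * the two, roughly:
 *
 *	(void) ibt_unbind_service(sc->srv_hdl, to_remove->sbind_hdl);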
3522 */ 3523 static void 3524 rib_stop_services(rib_hca_t *hca) 3525 { 3526 rib_hca_service_t *srv_list, *to_remove; 3527 3528 /* 3529 * unbind and deregister the services for this service type. 3530 * Right now there is only one service type. In future it will 3531 * be passed down to this function. 3532 */ 3533 rw_enter(&hca->bound_services_lock, RW_READER); 3534 srv_list = hca->bound_services; 3535 hca->bound_services = NULL; 3536 rw_exit(&hca->bound_services_lock); 3537 3538 while (srv_list != NULL) { 3539 rib_service_t *sc; 3540 3541 to_remove = srv_list; 3542 srv_list = to_remove->next; 3543 rw_enter(&rib_stat->service_list_lock, RW_READER); 3544 for (sc = rib_stat->service_list; 3545 sc && (sc->srv_id != to_remove->srv_id); 3546 sc = sc->next) 3547 ; 3548 /* 3549 * if sc is NULL then the service doesn't exist anymore, 3550 * probably just removed completely through rib_stat. 3551 */ 3552 if (sc != NULL) 3553 (void) ibt_unbind_service(sc->srv_hdl, 3554 to_remove->sbind_hdl); 3555 rw_exit(&rib_stat->service_list_lock); 3556 kmem_free(to_remove, sizeof (rib_hca_service_t)); 3557 } 3558 } 3559 3560 static struct svc_recv * 3561 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) 3562 { 3563 struct svc_recv *recvp; 3564 3565 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); 3566 recvp->vaddr = sgl->ds_va; 3567 recvp->qp = qp; 3568 recvp->bytes_xfer = 0; 3569 return (recvp); 3570 } 3571 3572 static int 3573 rib_free_svc_recv(struct svc_recv *recvp) 3574 { 3575 kmem_free(recvp, sizeof (*recvp)); 3576 3577 return (0); 3578 } 3579 3580 static struct reply * 3581 rib_addreplylist(rib_qp_t *qp, uint32_t msgid) 3582 { 3583 struct reply *rep; 3584 3585 3586 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); 3587 if (rep == NULL) { 3588 DTRACE_PROBE(rpcib__i__addrreply__nomem); 3589 return (NULL); 3590 } 3591 rep->xid = msgid; 3592 rep->vaddr_cq = NULL; 3593 rep->bytes_xfer = 0; 3594 rep->status = (uint_t)REPLY_WAIT; 3595 rep->prev = NULL; 3596 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); 3597 3598 mutex_enter(&qp->replylist_lock); 3599 if (qp->replylist) { 3600 rep->next = qp->replylist; 3601 qp->replylist->prev = rep; 3602 } 3603 qp->rep_list_size++; 3604 3605 DTRACE_PROBE1(rpcib__i__addrreply__listsize, 3606 int, qp->rep_list_size); 3607 3608 qp->replylist = rep; 3609 mutex_exit(&qp->replylist_lock); 3610 3611 return (rep); 3612 } 3613 3614 static rdma_stat 3615 rib_rem_replylist(rib_qp_t *qp) 3616 { 3617 struct reply *r, *n; 3618 3619 mutex_enter(&qp->replylist_lock); 3620 for (r = qp->replylist; r != NULL; r = n) { 3621 n = r->next; 3622 (void) rib_remreply(qp, r); 3623 } 3624 mutex_exit(&qp->replylist_lock); 3625 3626 return (RDMA_SUCCESS); 3627 } 3628 3629 static int 3630 rib_remreply(rib_qp_t *qp, struct reply *rep) 3631 { 3632 3633 ASSERT(MUTEX_HELD(&qp->replylist_lock)); 3634 if (rep->prev) { 3635 rep->prev->next = rep->next; 3636 } 3637 if (rep->next) { 3638 rep->next->prev = rep->prev; 3639 } 3640 if (qp->replylist == rep) 3641 qp->replylist = rep->next; 3642 3643 cv_destroy(&rep->wait_cv); 3644 qp->rep_list_size--; 3645 3646 DTRACE_PROBE1(rpcib__i__remreply__listsize, 3647 int, qp->rep_list_size); 3648 3649 kmem_free(rep, sizeof (*rep)); 3650 3651 return (0); 3652 } 3653 3654 rdma_stat 3655 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3656 struct mrc *buf_handle) 3657 { 3658 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3659 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3660 rdma_stat status; 3661 rib_hca_t *hca = (ctoqp(conn))->hca; 3662 
3663 /* 3664 * Note: ALL buffer pools use the same memory type RDMARW. 3665 */ 3666 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3667 if (status == RDMA_SUCCESS) { 3668 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3669 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3670 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3671 } else { 3672 buf_handle->mrc_linfo = NULL; 3673 buf_handle->mrc_lmr = 0; 3674 buf_handle->mrc_rmr = 0; 3675 } 3676 return (status); 3677 } 3678 3679 static rdma_stat 3680 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, 3681 ibt_mr_flags_t spec, 3682 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3683 { 3684 ibt_mr_attr_t mem_attr; 3685 ibt_status_t ibt_status; 3686 mem_attr.mr_vaddr = (uintptr_t)buf; 3687 mem_attr.mr_len = (ib_msglen_t)size; 3688 mem_attr.mr_as = (struct as *)(caddr_t)adsp; 3689 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3690 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3691 IBT_MR_ENABLE_WINDOW_BIND | spec; 3692 3693 rw_enter(&hca->state_lock, RW_READER); 3694 if (hca->state != HCA_DETACHED) { 3695 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3696 &mem_attr, mr_hdlp, mr_descp); 3697 rw_exit(&hca->state_lock); 3698 } else { 3699 rw_exit(&hca->state_lock); 3700 return (RDMA_FAILED); 3701 } 3702 3703 if (ibt_status != IBT_SUCCESS) { 3704 return (RDMA_FAILED); 3705 } 3706 return (RDMA_SUCCESS); 3707 } 3708 3709 rdma_stat 3710 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3711 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 3712 { 3713 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3714 rib_lrc_entry_t *l; 3715 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3716 rdma_stat status; 3717 rib_hca_t *hca = (ctoqp(conn))->hca; 3718 3719 /* 3720 * Non-coherent memory registration. 3721 */ 3722 l = (rib_lrc_entry_t *)lrc; 3723 if (l) { 3724 if (l->registered) { 3725 buf_handle->mrc_linfo = 3726 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3727 buf_handle->mrc_lmr = 3728 (uint32_t)l->lrc_mhandle.mrc_lmr; 3729 buf_handle->mrc_rmr = 3730 (uint32_t)l->lrc_mhandle.mrc_rmr; 3731 *sync_handle = (RIB_SYNCMEM_HANDLE) 3732 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3733 return (RDMA_SUCCESS); 3734 } else { 3735 /* Always register the whole buffer */ 3736 buf = (caddr_t)l->lrc_buf; 3737 buflen = l->lrc_len; 3738 } 3739 } 3740 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3741 3742 if (status == RDMA_SUCCESS) { 3743 if (l) { 3744 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; 3745 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey; 3746 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey; 3747 l->registered = TRUE; 3748 } 3749 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3750 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3751 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3752 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3753 } else { 3754 buf_handle->mrc_linfo = NULL; 3755 buf_handle->mrc_lmr = 0; 3756 buf_handle->mrc_rmr = 0; 3757 } 3758 return (status); 3759 } 3760 3761 /* ARGSUSED */ 3762 rdma_stat 3763 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3764 { 3765 rib_hca_t *hca = (ctoqp(conn))->hca; 3766 /* 3767 * Allow memory deregistration even if HCA is 3768 * getting detached. Need all outstanding 3769 * memory registrations to be deregistered 3770 * before HCA_DETACH_EVENT can be accepted. 
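 *
 * A minimal register/deregister pairing, as an illustrative sketch
 * (not part of the driver; "buf" and "len" are hypothetical, and the
 * NULL adsp is assumed to mean kernel memory, as in
 * rib_rbufpool_create()):
 *
 *	struct mrc h;
 *
 *	if (rib_registermem(conn, NULL, buf, len, &h) == RDMA_SUCCESS) {
 *		(use h.mrc_lmr and h.mrc_rmr in work requests)
 *		(void) rib_deregistermem(conn, buf, h);
 *	}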
3771 */ 3772 (void) ibt_deregister_mr(hca->hca_hdl, 3773 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 3774 return (RDMA_SUCCESS); 3775 } 3776 3777 /* ARGSUSED */ 3778 rdma_stat 3779 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3780 RIB_SYNCMEM_HANDLE sync_handle, void *lrc) 3781 { 3782 rib_lrc_entry_t *l; 3783 l = (rib_lrc_entry_t *)lrc; 3784 if (l) 3785 if (l->registered) 3786 return (RDMA_SUCCESS); 3787 3788 (void) rib_deregistermem(conn, buf, buf_handle); 3789 3790 return (RDMA_SUCCESS); 3791 } 3792 3793 /* ARGSUSED */ 3794 rdma_stat 3795 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3796 int len, int cpu) 3797 { 3798 ibt_status_t status; 3799 rib_hca_t *hca = (ctoqp(conn))->hca; 3800 ibt_mr_sync_t mr_segment; 3801 3802 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3803 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 3804 mr_segment.ms_len = (ib_memlen_t)len; 3805 if (cpu) { 3806 /* make incoming data visible to memory */ 3807 mr_segment.ms_flags = IBT_SYNC_WRITE; 3808 } else { 3809 /* make memory changes visible to IO */ 3810 mr_segment.ms_flags = IBT_SYNC_READ; 3811 } 3812 rw_enter(&hca->state_lock, RW_READER); 3813 if (hca->state != HCA_DETACHED) { 3814 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3815 rw_exit(&hca->state_lock); 3816 } else { 3817 rw_exit(&hca->state_lock); 3818 return (RDMA_FAILED); 3819 } 3820 3821 if (status == IBT_SUCCESS) 3822 return (RDMA_SUCCESS); 3823 else { 3824 return (RDMA_FAILED); 3825 } 3826 } 3827 3828 /* 3829 * XXXX ???? 3830 */ 3831 static rdma_stat 3832 rib_getinfo(rdma_info_t *info) 3833 { 3834 /* 3835 * XXXX Hack! 3836 */ 3837 info->addrlen = 16; 3838 info->mts = 1000000; 3839 info->mtu = 1000000; 3840 3841 return (RDMA_SUCCESS); 3842 } 3843 3844 rib_bufpool_t * 3845 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3846 { 3847 rib_bufpool_t *rbp = NULL; 3848 bufpool_t *bp = NULL; 3849 caddr_t buf; 3850 ibt_mr_attr_t mem_attr; 3851 ibt_status_t ibt_status; 3852 int i, j; 3853 3854 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3855 3856 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3857 num * sizeof (void *), KM_SLEEP); 3858 3859 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3860 bp->numelems = num; 3861 3862 3863 switch (ptype) { 3864 case SEND_BUFFER: 3865 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3866 bp->rsize = RPC_MSG_SZ; 3867 break; 3868 case RECV_BUFFER: 3869 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3870 bp->rsize = RPC_BUF_SIZE; 3871 break; 3872 default: 3873 goto fail; 3874 } 3875 3876 /* 3877 * Register the pool. 
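 *
 * (Pool layout, as a descriptive note: bp->buf is one contiguous
 * kmem_zalloc'd area of num * rsize bytes; slot i starts at
 * bp->buf + i * bp->rsize and is registered as its own memory
 * region, so rbp->mr_hdl[i] and rbp->mr_desc[i] run parallel to
 * bp->buflist[i].)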
3878 */ 3879 bp->bufsize = num * bp->rsize; 3880 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3881 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3882 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3883 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3884 sizeof (ibt_mr_desc_t), KM_SLEEP); 3885 rw_enter(&hca->state_lock, RW_READER); 3886 3887 if (hca->state == HCA_DETACHED) { 3888 rw_exit(&hca->state_lock); 3889 goto fail; 3890 } 3891 3892 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3893 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3894 mem_attr.mr_vaddr = (uintptr_t)buf; 3895 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3896 mem_attr.mr_as = NULL; 3897 ibt_status = ibt_register_mr(hca->hca_hdl, 3898 hca->pd_hdl, &mem_attr, 3899 &rbp->mr_hdl[i], 3900 &rbp->mr_desc[i]); 3901 if (ibt_status != IBT_SUCCESS) { 3902 for (j = 0; j < i; j++) { 3903 (void) ibt_deregister_mr(hca->hca_hdl, 3904 rbp->mr_hdl[j]); 3905 } 3906 rw_exit(&hca->state_lock); 3907 goto fail; 3908 } 3909 } 3910 rw_exit(&hca->state_lock); 3911 buf = (caddr_t)bp->buf; 3912 for (i = 0; i < num; i++, buf += bp->rsize) { 3913 bp->buflist[i] = (void *)buf; 3914 } 3915 bp->buffree = num - 1; /* no. of free buffers */ 3916 rbp->bpool = bp; 3917 3918 return (rbp); 3919 fail: 3920 if (bp) { 3921 if (bp->buf) 3922 kmem_free(bp->buf, bp->bufsize); 3923 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3924 } 3925 if (rbp) { 3926 if (rbp->mr_hdl) 3927 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3928 if (rbp->mr_desc) 3929 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3930 kmem_free(rbp, sizeof (rib_bufpool_t)); 3931 } 3932 return (NULL); 3933 } 3934 3935 static void 3936 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3937 { 3938 int i; 3939 rib_bufpool_t *rbp = NULL; 3940 bufpool_t *bp; 3941 3942 /* 3943 * Obtain pool address based on type of pool 3944 */ 3945 switch (ptype) { 3946 case SEND_BUFFER: 3947 rbp = hca->send_pool; 3948 break; 3949 case RECV_BUFFER: 3950 rbp = hca->recv_pool; 3951 break; 3952 default: 3953 return; 3954 } 3955 if (rbp == NULL) 3956 return; 3957 3958 bp = rbp->bpool; 3959 3960 /* 3961 * Deregister the pool memory and free it. 3962 */ 3963 for (i = 0; i < bp->numelems; i++) { 3964 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3965 } 3966 } 3967 3968 static void 3969 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3970 { 3971 3972 rib_bufpool_t *rbp = NULL; 3973 bufpool_t *bp; 3974 3975 /* 3976 * Obtain pool address based on type of pool 3977 */ 3978 switch (ptype) { 3979 case SEND_BUFFER: 3980 rbp = hca->send_pool; 3981 break; 3982 case RECV_BUFFER: 3983 rbp = hca->recv_pool; 3984 break; 3985 default: 3986 return; 3987 } 3988 if (rbp == NULL) 3989 return; 3990 3991 bp = rbp->bpool; 3992 3993 /* 3994 * Free the pool memory. 3995 */ 3996 if (rbp->mr_hdl) 3997 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 3998 3999 if (rbp->mr_desc) 4000 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 4001 if (bp->buf) 4002 kmem_free(bp->buf, bp->bufsize); 4003 mutex_destroy(&bp->buflock); 4004 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 4005 kmem_free(rbp, sizeof (rib_bufpool_t)); 4006 } 4007 4008 void 4009 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 4010 { 4011 /* 4012 * Deregister the pool memory and free it. 4013 */ 4014 rib_rbufpool_deregister(hca, ptype); 4015 rib_rbufpool_free(hca, ptype); 4016 } 4017 4018 /* 4019 * Fetch a buffer from the pool of type specified in rdbuf->type. 
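 *
 * Typical usage, as an illustrative sketch (not part of the driver;
 * "conn" is assumed to name an established CONN):
 *
 *	rdma_buf_t rdbuf;
 *
 *	bzero(&rdbuf, sizeof (rdbuf));
 *	rdbuf.type = SEND_BUFFER;
 *	if (rib_reg_buf_alloc(conn, &rdbuf) == RDMA_SUCCESS) {
 *		(fill rdbuf.addr with up to rdbuf.len bytes)
 *		rib_reg_buf_free(conn, &rdbuf);
 *	}
 *
 * For RDMA_LONG_BUFFER requests the caller must set rdbuf->len before
 * the call, since that length is passed to rib_get_cache_buf().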
4020 */ 4021 static rdma_stat 4022 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4023 { 4024 rib_lrc_entry_t *rlep; 4025 4026 if (rdbuf->type == RDMA_LONG_BUFFER) { 4027 rlep = rib_get_cache_buf(conn, rdbuf->len); 4028 rdbuf->rb_private = (caddr_t)rlep; 4029 rdbuf->addr = rlep->lrc_buf; 4030 rdbuf->handle = rlep->lrc_mhandle; 4031 return (RDMA_SUCCESS); 4032 } 4033 4034 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 4035 if (rdbuf->addr) { 4036 switch (rdbuf->type) { 4037 case SEND_BUFFER: 4038 rdbuf->len = RPC_MSG_SZ; /* 1K */ 4039 break; 4040 case RECV_BUFFER: 4041 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 4042 break; 4043 default: 4044 rdbuf->len = 0; 4045 } 4046 return (RDMA_SUCCESS); 4047 } else 4048 return (RDMA_FAILED); 4049 } 4050 4051 /* 4052 * Fetch a buffer of specified type. 4053 * Note that rdbuf->handle is mw's rkey. 4054 */ 4055 static void * 4056 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4057 { 4058 rib_qp_t *qp = ctoqp(conn); 4059 rib_hca_t *hca = qp->hca; 4060 rdma_btype ptype = rdbuf->type; 4061 void *buf; 4062 rib_bufpool_t *rbp = NULL; 4063 bufpool_t *bp; 4064 int i; 4065 4066 /* 4067 * Obtain pool address based on type of pool 4068 */ 4069 switch (ptype) { 4070 case SEND_BUFFER: 4071 rbp = hca->send_pool; 4072 break; 4073 case RECV_BUFFER: 4074 rbp = hca->recv_pool; 4075 break; 4076 default: 4077 return (NULL); 4078 } 4079 if (rbp == NULL) 4080 return (NULL); 4081 4082 bp = rbp->bpool; 4083 4084 mutex_enter(&bp->buflock); 4085 if (bp->buffree < 0) { 4086 mutex_exit(&bp->buflock); 4087 return (NULL); 4088 } 4089 4090 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */ 4091 buf = bp->buflist[bp->buffree]; 4092 rdbuf->addr = buf; 4093 rdbuf->len = bp->rsize; 4094 for (i = bp->numelems - 1; i >= 0; i--) { 4095 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 4096 rdbuf->handle.mrc_rmr = 4097 (uint32_t)rbp->mr_desc[i].md_rkey; 4098 rdbuf->handle.mrc_linfo = 4099 (uintptr_t)rbp->mr_hdl[i]; 4100 rdbuf->handle.mrc_lmr = 4101 (uint32_t)rbp->mr_desc[i].md_lkey; 4102 bp->buffree--; 4103 4104 mutex_exit(&bp->buflock); 4105 4106 return (buf); 4107 } 4108 } 4109 4110 mutex_exit(&bp->buflock); 4111 4112 return (NULL); 4113 } 4114 4115 static void 4116 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 4117 { 4118 4119 if (rdbuf->type == RDMA_LONG_BUFFER) { 4120 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private); 4121 rdbuf->rb_private = NULL; 4122 return; 4123 } 4124 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 4125 } 4126 4127 static void 4128 rib_rbuf_free(CONN *conn, int ptype, void *buf) 4129 { 4130 rib_qp_t *qp = ctoqp(conn); 4131 rib_hca_t *hca = qp->hca; 4132 rib_bufpool_t *rbp = NULL; 4133 bufpool_t *bp; 4134 4135 /* 4136 * Obtain pool address based on type of pool 4137 */ 4138 switch (ptype) { 4139 case SEND_BUFFER: 4140 rbp = hca->send_pool; 4141 break; 4142 case RECV_BUFFER: 4143 rbp = hca->recv_pool; 4144 break; 4145 default: 4146 return; 4147 } 4148 if (rbp == NULL) 4149 return; 4150 4151 bp = rbp->bpool; 4152 4153 mutex_enter(&bp->buflock); 4154 if (++bp->buffree >= bp->numelems) { 4155 /* 4156 * Should never happen 4157 */ 4158 bp->buffree--; 4159 } else { 4160 bp->buflist[bp->buffree] = buf; 4161 } 4162 mutex_exit(&bp->buflock); 4163 } 4164 4165 static rdma_stat 4166 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 4167 { 4168 rw_enter(&connlist->conn_lock, RW_WRITER); 4169 if (connlist->conn_hd) { 4170 cn->c_next = connlist->conn_hd; 4171 connlist->conn_hd->c_prev = cn; 4172 } 4173 connlist->conn_hd = cn; 4174 
rw_exit(&connlist->conn_lock); 4175 4176 return (RDMA_SUCCESS); 4177 } 4178 4179 static rdma_stat 4180 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) 4181 { 4182 rw_enter(&connlist->conn_lock, RW_WRITER); 4183 if (cn->c_prev) { 4184 cn->c_prev->c_next = cn->c_next; 4185 } 4186 if (cn->c_next) { 4187 cn->c_next->c_prev = cn->c_prev; 4188 } 4189 if (connlist->conn_hd == cn) 4190 connlist->conn_hd = cn->c_next; 4191 rw_exit(&connlist->conn_lock); 4192 4193 return (RDMA_SUCCESS); 4194 } 4195 4196 /* ARGSUSED */ 4197 static rdma_stat 4198 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, 4199 int addr_type, void *handle, CONN **conn) 4200 { 4201 rdma_stat status; 4202 rpcib_ping_t rpt; 4203 4204 status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn); 4205 return (status); 4206 } 4207 4208 /* 4209 * rib_find_hca_connection 4210 * 4211 * if there is an existing connection to the specified address then 4212 * it will be returned in conn, otherwise conn will be set to NULL. 4213 * Also cleans up any connection that is in error state. 4214 */ 4215 static int 4216 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, 4217 struct netbuf *d_svcaddr, CONN **conn) 4218 { 4219 CONN *cn; 4220 clock_t cv_stat, timout; 4221 4222 *conn = NULL; 4223 again: 4224 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4225 cn = hca->cl_conn_list.conn_hd; 4226 while (cn != NULL) { 4227 /* 4228 * First, clear up any connection in the ERROR state 4229 */ 4230 mutex_enter(&cn->c_lock); 4231 if (cn->c_state == C_ERROR_CONN) { 4232 if (cn->c_ref == 0) { 4233 /* 4234 * Remove connection from list and destroy it. 4235 */ 4236 cn->c_state = C_DISCONN_PEND; 4237 mutex_exit(&cn->c_lock); 4238 rw_exit(&hca->cl_conn_list.conn_lock); 4239 rib_conn_close((void *)cn); 4240 goto again; 4241 } 4242 mutex_exit(&cn->c_lock); 4243 cn = cn->c_next; 4244 continue; 4245 } 4246 if (cn->c_state == C_DISCONN_PEND) { 4247 mutex_exit(&cn->c_lock); 4248 cn = cn->c_next; 4249 continue; 4250 } 4251 4252 /* 4253 * source address is only checked for if there is one, 4254 * this is the case for retries. 4255 */ 4256 if ((cn->c_raddr.len == d_svcaddr->len) && 4257 (bcmp(d_svcaddr->buf, cn->c_raddr.buf, 4258 d_svcaddr->len) == 0) && 4259 ((s_svcaddr->len == 0) || 4260 ((cn->c_laddr.len == s_svcaddr->len) && 4261 (bcmp(s_svcaddr->buf, cn->c_laddr.buf, 4262 s_svcaddr->len) == 0)))) { 4263 /* 4264 * Our connection. Give up conn list lock 4265 * as we are done traversing the list. 4266 */ 4267 rw_exit(&hca->cl_conn_list.conn_lock); 4268 if (cn->c_state == C_CONNECTED) { 4269 cn->c_ref++; /* sharing a conn */ 4270 mutex_exit(&cn->c_lock); 4271 *conn = cn; 4272 return (RDMA_SUCCESS); 4273 } 4274 if (cn->c_state == C_CONN_PEND) { 4275 /* 4276 * Hold a reference to this conn before 4277 * we give up the lock. 
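 * Then wait on c_cv for the connection to leave C_CONN_PEND.
 * cv_timedwait_sig() returns 0 when interrupted by a signal and a
 * negative value on timeout, which is why 0 maps to RDMA_INTR and a
 * negative status maps to RDMA_TIMEDOUT below.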
4278 */ 4279 cn->c_ref++; 4280 timout = ddi_get_lbolt() + 4281 drv_usectohz(CONN_WAIT_TIME * 1000000); 4282 while ((cv_stat = cv_timedwait_sig(&cn->c_cv, 4283 &cn->c_lock, timout)) > 0 && 4284 cn->c_state == C_CONN_PEND) 4285 ; 4286 if (cv_stat == 0) { 4287 (void) rib_conn_release_locked(cn); 4288 return (RDMA_INTR); 4289 } 4290 if (cv_stat < 0) { 4291 (void) rib_conn_release_locked(cn); 4292 return (RDMA_TIMEDOUT); 4293 } 4294 if (cn->c_state == C_CONNECTED) { 4295 *conn = cn; 4296 mutex_exit(&cn->c_lock); 4297 return (RDMA_SUCCESS); 4298 } else { 4299 (void) rib_conn_release_locked(cn); 4300 return (RDMA_TIMEDOUT); 4301 } 4302 } 4303 } 4304 mutex_exit(&cn->c_lock); 4305 cn = cn->c_next; 4306 } 4307 rw_exit(&hca->cl_conn_list.conn_lock); 4308 *conn = NULL; 4309 return (RDMA_FAILED); 4310 } 4311 4312 /* 4313 * Connection management. 4314 * IBTF does not support recycling of channels. So connections are only 4315 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or 4316 * C_DISCONN_PEND state. No C_IDLE state. 4317 * C_CONN_PEND state: Connection establishment in progress to the server. 4318 * C_CONNECTED state: A connection when created is in C_CONNECTED state. 4319 * It has an RC channel associated with it. ibt_post_send/recv are allowed 4320 * only in this state. 4321 * C_ERROR_CONN state: A connection transitions to this state when WRs on the 4322 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event 4323 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA. 4324 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when 4325 * c_ref drops to 0 (this indicates that RPC has no more references to this 4326 * connection), the connection should be destroyed. A connection transitions 4327 * into this state when it is being destroyed. 4328 */ 4329 /* ARGSUSED */ 4330 static rdma_stat 4331 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, 4332 int addr_type, rpcib_ping_t *rpt, CONN **conn) 4333 { 4334 CONN *cn; 4335 int status; 4336 rib_hca_t *hca; 4337 rib_qp_t *qp; 4338 int s_addr_len; 4339 char *s_addr_buf; 4340 4341 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 4342 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 4343 rw_enter(&hca->state_lock, RW_READER); 4344 if (hca->state != HCA_DETACHED) { 4345 status = rib_find_hca_connection(hca, s_svcaddr, 4346 d_svcaddr, conn); 4347 rw_exit(&hca->state_lock); 4348 if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) { 4349 rw_exit(&rib_stat->hcas_list_lock); 4350 return (status); 4351 } 4352 } else 4353 rw_exit(&hca->state_lock); 4354 } 4355 rw_exit(&rib_stat->hcas_list_lock); 4356 4357 /* 4358 * No existing connection found, establish a new connection. 4359 */ 4360 bzero(rpt, sizeof (rpcib_ping_t)); 4361 4362 status = rib_ping_srv(addr_type, d_svcaddr, rpt); 4363 if (status != RDMA_SUCCESS) { 4364 return (RDMA_FAILED); 4365 } 4366 hca = rpt->hca; 4367 4368 if (rpt->srcip.family == AF_INET) { 4369 s_addr_len = sizeof (rpt->srcip.un.ip4addr); 4370 s_addr_buf = (char *)&rpt->srcip.un.ip4addr; 4371 } else if (rpt->srcip.family == AF_INET6) { 4372 s_addr_len = sizeof (rpt->srcip.un.ip6addr); 4373 s_addr_buf = (char *)&rpt->srcip.un.ip6addr; 4374 } else { 4375 return (RDMA_FAILED); 4376 } 4377 4378 /* 4379 * Channel to server doesn't exist yet, create one. 
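 * The new connection starts life in C_CONN_PEND with a single
 * reference, is added to hca->cl_conn_list, and only moves to
 * C_CONNECTED once rib_conn_to_srv() succeeds; on failure it is
 * marked C_ERROR_CONN and the reference is dropped.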
4380 */ 4381 if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) { 4382 return (RDMA_FAILED); 4383 } 4384 cn = qptoc(qp); 4385 cn->c_state = C_CONN_PEND; 4386 cn->c_ref = 1; 4387 4388 cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP); 4389 bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len); 4390 cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len; 4391 4392 if (rpt->srcip.family == AF_INET) { 4393 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP); 4394 (void) strcpy(cn->c_netid, RIBNETID_TCP); 4395 4396 cn->c_addrmask.len = cn->c_addrmask.maxlen = 4397 sizeof (struct sockaddr_in); 4398 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP); 4399 4400 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_addr.s_addr = 4401 (uint32_t)~0; 4402 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_family = 4403 (ushort_t)~0; 4404 4405 } else { 4406 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP); 4407 (void) strcpy(cn->c_netid, RIBNETID_TCP6); 4408 4409 cn->c_addrmask.len = cn->c_addrmask.maxlen = 4410 sizeof (struct sockaddr_in6); 4411 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP); 4412 4413 (void) memset( 4414 &((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_addr, 4415 (uchar_t)~0, sizeof (struct in6_addr)); 4416 ((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_family = 4417 (sa_family_t)~0; 4418 } 4419 4420 /* 4421 * Add to conn list. 4422 * We had given up the READER lock. In the time since then, 4423 * another thread might have created the connection we are 4424 * trying to establish here. But for now, that is quite all right - 4425 * there might be two connections between a pair of hosts instead 4426 * of one. If we really want to close that window, 4427 * then we need to check the list again after acquiring the 4428 * WRITER lock. 4429 */ 4430 (void) rib_add_connlist(cn, &hca->cl_conn_list); 4431 status = rib_conn_to_srv(hca, qp, rpt); 4432 mutex_enter(&cn->c_lock); 4433 4434 if (cn->c_flags & C_CLOSE_PENDING) { 4435 /* 4436 * This handles a case where the module or 4437 * HCA detached while the connection was being 4438 * established. In such a case close the 4439 * connection immediately if this is the 4440 * only reference. 4441 */ 4442 if (cn->c_ref == 1) { 4443 cn->c_ref--; 4444 cn->c_state = C_DISCONN_PEND; 4445 mutex_exit(&cn->c_lock); 4446 rib_conn_close((void *)cn); 4447 return (RDMA_FAILED); 4448 } 4449 4450 /* 4451 * Connection to be closed later when c_ref = 0 4452 */ 4453 status = RDMA_FAILED; 4454 } 4455 4456 if (status == RDMA_SUCCESS) { 4457 cn->c_state = C_CONNECTED; 4458 *conn = cn; 4459 } else { 4460 cn->c_state = C_ERROR_CONN; 4461 cn->c_ref--; 4462 } 4463 cv_signal(&cn->c_cv); 4464 mutex_exit(&cn->c_lock); 4465 return (status); 4466 } 4467 4468 static void 4469 rib_conn_close(void *rarg) 4470 { 4471 CONN *conn = (CONN *)rarg; 4472 rib_qp_t *qp = ctoqp(conn); 4473 4474 mutex_enter(&conn->c_lock); 4475 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { 4476 4477 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); 4478 4479 /* 4480 * Live connection in CONNECTED state.
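 * Move it to C_ERROR_CONN before closing the channel below so that
 * no further work requests are posted while ibt_close_rc_channel()
 * is in progress.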
4481 */ 4482 if (conn->c_state == C_CONNECTED) { 4483 conn->c_state = C_ERROR_CONN; 4484 } 4485 mutex_exit(&conn->c_lock); 4486 4487 rib_close_a_channel(conn); 4488 4489 mutex_enter(&conn->c_lock); 4490 conn->c_flags &= ~C_CLOSE_PENDING; 4491 } 4492 4493 mutex_exit(&conn->c_lock); 4494 4495 if (qp->mode == RIB_SERVER) 4496 (void) rib_disconnect_channel(conn, 4497 &qp->hca->srv_conn_list); 4498 else 4499 (void) rib_disconnect_channel(conn, 4500 &qp->hca->cl_conn_list); 4501 } 4502 4503 static void 4504 rib_conn_timeout_call(void *carg) 4505 { 4506 time_t idle_time; 4507 CONN *conn = (CONN *)carg; 4508 rib_hca_t *hca = ctoqp(conn)->hca; 4509 int error; 4510 4511 mutex_enter(&conn->c_lock); 4512 if ((conn->c_ref > 0) || 4513 (conn->c_state == C_DISCONN_PEND)) { 4514 conn->c_timeout = NULL; 4515 mutex_exit(&conn->c_lock); 4516 return; 4517 } 4518 4519 idle_time = (gethrestime_sec() - conn->c_last_used); 4520 4521 if ((idle_time <= rib_conn_timeout) && 4522 (conn->c_state != C_ERROR_CONN)) { 4523 /* 4524 * There was activity after the last timeout. 4525 * Extend the conn life. Unless the conn is 4526 * already in error state. 4527 */ 4528 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4529 SEC_TO_TICK(rib_conn_timeout - idle_time)); 4530 mutex_exit(&conn->c_lock); 4531 return; 4532 } 4533 4534 error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close, 4535 (void *)conn, DDI_NOSLEEP); 4536 4537 /* 4538 * If taskq dispatch fails above, then reset the timeout 4539 * to try again after 10 secs. 4540 */ 4541 4542 if (error != DDI_SUCCESS) { 4543 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4544 SEC_TO_TICK(RDMA_CONN_REAP_RETRY)); 4545 mutex_exit(&conn->c_lock); 4546 return; 4547 } 4548 4549 conn->c_state = C_DISCONN_PEND; 4550 mutex_exit(&conn->c_lock); 4551 } 4552 4553 static rdma_stat 4554 rib_conn_release(CONN *conn) 4555 { 4556 mutex_enter(&conn->c_lock); 4557 return (rib_conn_release_locked(conn)); 4558 } 4559 4560 /* 4561 * Expects conn->c_lock to be held on entry. 4562 * c_lock released on return 4563 */ 4564 static rdma_stat 4565 rib_conn_release_locked(CONN *conn) 4566 { 4567 conn->c_ref--; 4568 4569 conn->c_last_used = gethrestime_sec(); 4570 if (conn->c_ref > 0) { 4571 mutex_exit(&conn->c_lock); 4572 return (RDMA_SUCCESS); 4573 } 4574 4575 /* 4576 * If a conn is C_ERROR_CONN, close the channel. 
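 * Otherwise the idle connection stays on the list and a timeout is
 * armed below so that rib_conn_timeout_call() can reap it after
 * rib_conn_timeout seconds of inactivity.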
4577 */ 4578 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) { 4579 conn->c_state = C_DISCONN_PEND; 4580 mutex_exit(&conn->c_lock); 4581 rib_conn_close((void *)conn); 4582 return (RDMA_SUCCESS); 4583 } 4584 4585 /* 4586 * c_ref == 0, set a timeout for conn release 4587 */ 4588 4589 if (conn->c_timeout == NULL) { 4590 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4591 SEC_TO_TICK(rib_conn_timeout)); 4592 } 4593 4594 mutex_exit(&conn->c_lock); 4595 return (RDMA_SUCCESS); 4596 } 4597 4598 /* 4599 * Add at front of list 4600 */ 4601 static struct rdma_done_list * 4602 rdma_done_add(rib_qp_t *qp, uint32_t xid) 4603 { 4604 struct rdma_done_list *rd; 4605 4606 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4607 4608 rd = kmem_alloc(sizeof (*rd), KM_SLEEP); 4609 rd->xid = xid; 4610 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL); 4611 4612 rd->prev = NULL; 4613 rd->next = qp->rdlist; 4614 if (qp->rdlist != NULL) 4615 qp->rdlist->prev = rd; 4616 qp->rdlist = rd; 4617 4618 return (rd); 4619 } 4620 4621 static void 4622 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd) 4623 { 4624 struct rdma_done_list *r; 4625 4626 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4627 4628 r = rd->next; 4629 if (r != NULL) { 4630 r->prev = rd->prev; 4631 } 4632 4633 r = rd->prev; 4634 if (r != NULL) { 4635 r->next = rd->next; 4636 } else { 4637 qp->rdlist = rd->next; 4638 } 4639 4640 cv_destroy(&rd->rdma_done_cv); 4641 kmem_free(rd, sizeof (*rd)); 4642 } 4643 4644 static void 4645 rdma_done_rem_list(rib_qp_t *qp) 4646 { 4647 struct rdma_done_list *r, *n; 4648 4649 mutex_enter(&qp->rdlist_lock); 4650 for (r = qp->rdlist; r != NULL; r = n) { 4651 n = r->next; 4652 rdma_done_rm(qp, r); 4653 } 4654 mutex_exit(&qp->rdlist_lock); 4655 } 4656 4657 static void 4658 rdma_done_notify(rib_qp_t *qp, uint32_t xid) 4659 { 4660 struct rdma_done_list *r = qp->rdlist; 4661 4662 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4663 4664 while (r) { 4665 if (r->xid == xid) { 4666 cv_signal(&r->rdma_done_cv); 4667 return; 4668 } else { 4669 r = r->next; 4670 } 4671 } 4672 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid, 4673 int, xid); 4674 } 4675 4676 /* 4677 * Expects conn->c_lock to be held by the caller. 4678 */ 4679 4680 static void 4681 rib_close_a_channel(CONN *conn) 4682 { 4683 rib_qp_t *qp; 4684 qp = ctoqp(conn); 4685 4686 if (qp->qp_hdl == NULL) { 4687 /* channel already freed */ 4688 return; 4689 } 4690 4691 /* 4692 * Call ibt_close_rc_channel in blocking mode 4693 * with no callbacks. 4694 */ 4695 (void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS, 4696 NULL, 0, NULL, NULL, 0); 4697 } 4698 4699 /* 4700 * Goes through all connections and closes the channel 4701 * This will cause all the WRs on those channels to be 4702 * flushed. 4703 */ 4704 static void 4705 rib_close_channels(rib_conn_list_t *connlist) 4706 { 4707 CONN *conn, *tmp; 4708 4709 rw_enter(&connlist->conn_lock, RW_READER); 4710 conn = connlist->conn_hd; 4711 while (conn != NULL) { 4712 mutex_enter(&conn->c_lock); 4713 tmp = conn->c_next; 4714 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { 4715 4716 if (conn->c_state == C_CONN_PEND) { 4717 conn->c_flags |= C_CLOSE_PENDING; 4718 goto next; 4719 } 4720 4721 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); 4722 4723 /* 4724 * Live connection in CONNECTED state. 
4725 */ 4726 if (conn->c_state == C_CONNECTED) 4727 conn->c_state = C_ERROR_CONN; 4728 mutex_exit(&conn->c_lock); 4729 4730 rib_close_a_channel(conn); 4731 4732 mutex_enter(&conn->c_lock); 4733 conn->c_flags &= ~C_CLOSE_PENDING; 4734 /* Signal a pending rib_disconnect_channel() */ 4735 cv_signal(&conn->c_cv); 4736 } 4737 next: 4738 mutex_exit(&conn->c_lock); 4739 conn = tmp; 4740 } 4741 rw_exit(&connlist->conn_lock); 4742 } 4743 4744 /* 4745 * Frees up all connections that are no longer being referenced 4746 */ 4747 static void 4748 rib_purge_connlist(rib_conn_list_t *connlist) 4749 { 4750 CONN *conn; 4751 4752 top: 4753 rw_enter(&connlist->conn_lock, RW_READER); 4754 conn = connlist->conn_hd; 4755 while (conn != NULL) { 4756 mutex_enter(&conn->c_lock); 4757 4758 /* 4759 * At this point connection is either in ERROR 4760 * or DISCONN_PEND state. If in DISCONN_PEND state 4761 * then some other thread is culling that connection. 4762 * If not and if c_ref is 0, then destroy the connection. 4763 */ 4764 if (conn->c_ref == 0 && 4765 conn->c_state != C_DISCONN_PEND) { 4766 /* 4767 * Cull the connection 4768 */ 4769 conn->c_state = C_DISCONN_PEND; 4770 mutex_exit(&conn->c_lock); 4771 rw_exit(&connlist->conn_lock); 4772 (void) rib_disconnect_channel(conn, connlist); 4773 goto top; 4774 } else { 4775 /* 4776 * conn disconnect already scheduled or will 4777 * happen from conn_release when c_ref drops to 0. 4778 */ 4779 mutex_exit(&conn->c_lock); 4780 } 4781 conn = conn->c_next; 4782 } 4783 rw_exit(&connlist->conn_lock); 4784 4785 /* 4786 * At this point, only connections with c_ref != 0 are on the list 4787 */ 4788 } 4789 4790 /* 4791 * Free all the HCA resources and close 4792 * the hca. 4793 */ 4794 4795 static void 4796 rib_free_hca(rib_hca_t *hca) 4797 { 4798 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl); 4799 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl); 4800 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl); 4801 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl); 4802 4803 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t)); 4804 kmem_free(hca->clnt_scq, sizeof (rib_cq_t)); 4805 kmem_free(hca->svc_rcq, sizeof (rib_cq_t)); 4806 kmem_free(hca->svc_scq, sizeof (rib_cq_t)); 4807 4808 rib_rbufpool_destroy(hca, RECV_BUFFER); 4809 rib_rbufpool_destroy(hca, SEND_BUFFER); 4810 rib_destroy_cache(hca); 4811 if (rib_mod.rdma_count == 0) 4812 (void) rdma_unregister_mod(&rib_mod); 4813 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 4814 (void) ibt_close_hca(hca->hca_hdl); 4815 hca->hca_hdl = NULL; 4816 } 4817 4818 4819 static void 4820 rib_stop_hca_services(rib_hca_t *hca) 4821 { 4822 rib_stop_services(hca); 4823 rib_close_channels(&hca->cl_conn_list); 4824 rib_close_channels(&hca->srv_conn_list); 4825 4826 rib_purge_connlist(&hca->cl_conn_list); 4827 rib_purge_connlist(&hca->srv_conn_list); 4828 4829 if ((rib_stat->hcas_list == NULL) && stats_enabled) { 4830 kstat_delete_byname_zone("unix", 0, "rpcib_cache", 4831 GLOBAL_ZONEID); 4832 stats_enabled = FALSE; 4833 } 4834 4835 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 4836 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4837 if (hca->srv_conn_list.conn_hd == NULL && 4838 hca->cl_conn_list.conn_hd == NULL) { 4839 /* 4840 * conn_lists are NULL, so destroy 4841 * buffers, close hca and be done. 
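 * Otherwise the HCA handle stays valid for now and rib_free_hca()
 * is called further below, once any in-flight callbacks have
 * drained (the hca->inuse wait).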
4842 */ 4843 rib_free_hca(hca); 4844 } 4845 rw_exit(&hca->cl_conn_list.conn_lock); 4846 rw_exit(&hca->srv_conn_list.conn_lock); 4847 4848 if (hca->hca_hdl != NULL) { 4849 mutex_enter(&hca->inuse_lock); 4850 while (hca->inuse) 4851 cv_wait(&hca->cb_cv, &hca->inuse_lock); 4852 mutex_exit(&hca->inuse_lock); 4853 4854 rib_free_hca(hca); 4855 } 4856 rw_destroy(&hca->bound_services_lock); 4857 4858 if (hca->cleanup_helper != NULL) { 4859 ddi_taskq_destroy(hca->cleanup_helper); 4860 hca->cleanup_helper = NULL; 4861 } 4862 } 4863 4864 /* 4865 * Cleans and closes up all uses of the HCA 4866 */ 4867 static void 4868 rib_detach_hca(ibt_hca_hdl_t hca_hdl) 4869 { 4870 rib_hca_t *hca = NULL; 4871 rib_hca_t **hcap; 4872 4873 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); 4874 for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) { 4875 hca = *hcap; 4876 rw_enter(&hca->state_lock, RW_WRITER); 4877 if (hca->hca_hdl == hca_hdl) { 4878 /* 4879 * Mark as detached and remove from 4880 * hca list. 4881 */ 4882 hca->state = HCA_DETACHED; 4883 *hcap = hca->next; 4884 rib_stat->nhca_inited--; 4885 rib_mod.rdma_count--; 4886 rw_exit(&hca->state_lock); 4887 break; 4888 } 4889 rw_exit(&hca->state_lock); 4890 } 4891 rw_exit(&rib_stat->hcas_list_lock); 4892 4893 if (hca == NULL) 4894 return; 4895 ASSERT(hca->hca_hdl == hca_hdl); 4896 4897 /* 4898 * Stop all services on the HCA 4899 * Go through cl_conn_list and close all rc_channels 4900 * Go through svr_conn_list and close all rc_channels 4901 * Free connections whose c_ref has dropped to 0 4902 * Destroy all CQs 4903 * Deregister and released all buffer pool memory after all 4904 * connections are destroyed 4905 * Free the protection domain 4906 * ibt_close_hca() 4907 */ 4908 rib_stop_hca_services(hca); 4909 4910 kmem_free(hca, sizeof (*hca)); 4911 } 4912 4913 static void 4914 rib_server_side_cache_reclaim(void *argp) 4915 { 4916 cache_avl_struct_t *rcas; 4917 rib_lrc_entry_t *rb; 4918 rib_hca_t *hca = (rib_hca_t *)argp; 4919 4920 rw_enter(&hca->avl_rw_lock, RW_WRITER); 4921 rcas = avl_first(&hca->avl_tree); 4922 if (rcas != NULL) 4923 avl_remove(&hca->avl_tree, rcas); 4924 4925 while (rcas != NULL) { 4926 while (rcas->r.forw != &rcas->r) { 4927 rcas->elements--; 4928 rb = rcas->r.forw; 4929 remque(rb); 4930 if (rb->registered) 4931 (void) rib_deregistermem_via_hca(hca, 4932 rb->lrc_buf, rb->lrc_mhandle); 4933 4934 hca->cache_allocation -= rb->lrc_len; 4935 kmem_free(rb->lrc_buf, rb->lrc_len); 4936 kmem_free(rb, sizeof (rib_lrc_entry_t)); 4937 } 4938 mutex_destroy(&rcas->node_lock); 4939 kmem_cache_free(hca->server_side_cache, rcas); 4940 rcas = avl_first(&hca->avl_tree); 4941 if (rcas != NULL) 4942 avl_remove(&hca->avl_tree, rcas); 4943 } 4944 rw_exit(&hca->avl_rw_lock); 4945 } 4946 4947 static void 4948 rib_server_side_cache_cleanup(void *argp) 4949 { 4950 cache_avl_struct_t *rcas; 4951 rib_lrc_entry_t *rb; 4952 rib_hca_t *hca = (rib_hca_t *)argp; 4953 4954 mutex_enter(&hca->cache_allocation_lock); 4955 if (hca->cache_allocation < cache_limit) { 4956 mutex_exit(&hca->cache_allocation_lock); 4957 return; 4958 } 4959 mutex_exit(&hca->cache_allocation_lock); 4960 4961 rw_enter(&hca->avl_rw_lock, RW_WRITER); 4962 rcas = avl_last(&hca->avl_tree); 4963 if (rcas != NULL) 4964 avl_remove(&hca->avl_tree, rcas); 4965 4966 while (rcas != NULL) { 4967 while (rcas->r.forw != &rcas->r) { 4968 rcas->elements--; 4969 rb = rcas->r.forw; 4970 remque(rb); 4971 if (rb->registered) 4972 (void) rib_deregistermem_via_hca(hca, 4973 rb->lrc_buf, rb->lrc_mhandle); 4974 4975 
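			/*
			 * The entry has been unlinked from its per-size
			 * queue; adjust the global accounting before its
			 * memory is freed.
			 */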
hca->cache_allocation -= rb->lrc_len; 4976 4977 kmem_free(rb->lrc_buf, rb->lrc_len); 4978 kmem_free(rb, sizeof (rib_lrc_entry_t)); 4979 } 4980 mutex_destroy(&rcas->node_lock); 4981 if (hca->server_side_cache) { 4982 kmem_cache_free(hca->server_side_cache, rcas); 4983 } 4984 4985 if (hca->cache_allocation < cache_limit) { 4986 rw_exit(&hca->avl_rw_lock); 4987 return; 4988 } 4989 4990 rcas = avl_last(&hca->avl_tree); 4991 if (rcas != NULL) 4992 avl_remove(&hca->avl_tree, rcas); 4993 } 4994 rw_exit(&hca->avl_rw_lock); 4995 } 4996 4997 static int 4998 avl_compare(const void *t1, const void *t2) 4999 { 5000 if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len) 5001 return (0); 5002 5003 if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len) 5004 return (-1); 5005 5006 return (1); 5007 } 5008 5009 static void 5010 rib_destroy_cache(rib_hca_t *hca) 5011 { 5012 if (hca->avl_init) { 5013 rib_server_side_cache_reclaim((void *)hca); 5014 if (hca->server_side_cache) { 5015 kmem_cache_destroy(hca->server_side_cache); 5016 hca->server_side_cache = NULL; 5017 } 5018 avl_destroy(&hca->avl_tree); 5019 mutex_destroy(&hca->cache_allocation_lock); 5020 rw_destroy(&hca->avl_rw_lock); 5021 } 5022 hca->avl_init = FALSE; 5023 } 5024 5025 static void 5026 rib_force_cleanup(void *hca) 5027 { 5028 if (((rib_hca_t *)hca)->cleanup_helper != NULL) 5029 (void) ddi_taskq_dispatch( 5030 ((rib_hca_t *)hca)->cleanup_helper, 5031 rib_server_side_cache_cleanup, 5032 (void *)hca, DDI_NOSLEEP); 5033 } 5034 5035 static rib_lrc_entry_t * 5036 rib_get_cache_buf(CONN *conn, uint32_t len) 5037 { 5038 cache_avl_struct_t cas, *rcas; 5039 rib_hca_t *hca = (ctoqp(conn))->hca; 5040 rib_lrc_entry_t *reply_buf; 5041 avl_index_t where = NULL; 5042 uint64_t c_alloc = 0; 5043 5044 if (!hca->avl_init) 5045 goto error_alloc; 5046 5047 cas.len = len; 5048 5049 rw_enter(&hca->avl_rw_lock, RW_READER); 5050 5051 mutex_enter(&hca->cache_allocation_lock); 5052 c_alloc = hca->cache_allocation; 5053 mutex_exit(&hca->cache_allocation_lock); 5054 5055 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas, 5056 &where)) == NULL) { 5057 /* Am I above the cache limit */ 5058 if ((c_alloc + len) >= cache_limit) { 5059 rib_force_cleanup((void *)hca); 5060 rw_exit(&hca->avl_rw_lock); 5061 mutex_enter(&hca->cache_allocation_lock); 5062 hca->cache_misses_above_the_limit ++; 5063 mutex_exit(&hca->cache_allocation_lock); 5064 5065 /* Allocate and register the buffer directly */ 5066 goto error_alloc; 5067 } 5068 5069 rw_exit(&hca->avl_rw_lock); 5070 rw_enter(&hca->avl_rw_lock, RW_WRITER); 5071 5072 /* Recheck to make sure no other thread added the entry in */ 5073 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, 5074 &cas, &where)) == NULL) { 5075 /* Allocate an avl tree entry */ 5076 rcas = (cache_avl_struct_t *) 5077 kmem_cache_alloc(hca->server_side_cache, KM_SLEEP); 5078 5079 bzero(rcas, sizeof (cache_avl_struct_t)); 5080 rcas->elements = 0; 5081 rcas->r.forw = &rcas->r; 5082 rcas->r.back = &rcas->r; 5083 rcas->len = len; 5084 mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL); 5085 avl_insert(&hca->avl_tree, rcas, where); 5086 } 5087 } 5088 5089 mutex_enter(&rcas->node_lock); 5090 5091 if (rcas->r.forw != &rcas->r && rcas->elements > 0) { 5092 reply_buf = rcas->r.forw; 5093 remque(reply_buf); 5094 rcas->elements--; 5095 mutex_exit(&rcas->node_lock); 5096 rw_exit(&hca->avl_rw_lock); 5097 5098 mutex_enter(&hca->cache_allocation_lock); 5099 hca->cache_hits++; 5100 hca->cache_allocation -= len; 5101 
mutex_exit(&hca->cache_allocation_lock); 5102 } else { 5103 /* Am I above the cache limit */ 5104 mutex_exit(&rcas->node_lock); 5105 if ((c_alloc + len) >= cache_limit) { 5106 rib_force_cleanup((void *)hca); 5107 rw_exit(&hca->avl_rw_lock); 5108 5109 mutex_enter(&hca->cache_allocation_lock); 5110 hca->cache_misses_above_the_limit++; 5111 mutex_exit(&hca->cache_allocation_lock); 5112 /* Allocate and register the buffer directly */ 5113 goto error_alloc; 5114 } 5115 rw_exit(&hca->avl_rw_lock); 5116 mutex_enter(&hca->cache_allocation_lock); 5117 hca->cache_misses++; 5118 mutex_exit(&hca->cache_allocation_lock); 5119 /* Allocate a reply_buf entry */ 5120 reply_buf = (rib_lrc_entry_t *) 5121 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP); 5122 bzero(reply_buf, sizeof (rib_lrc_entry_t)); 5123 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP); 5124 reply_buf->lrc_len = len; 5125 reply_buf->registered = FALSE; 5126 reply_buf->avl_node = (void *)rcas; 5127 } 5128 5129 return (reply_buf); 5130 5131 error_alloc: 5132 reply_buf = (rib_lrc_entry_t *) 5133 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP); 5134 bzero(reply_buf, sizeof (rib_lrc_entry_t)); 5135 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP); 5136 reply_buf->lrc_len = len; 5137 reply_buf->registered = FALSE; 5138 reply_buf->avl_node = NULL; 5139 5140 return (reply_buf); 5141 } 5142 5143 /* 5144 * Return a pre-registered buffer back to the cache (without 5145 * deregistering it). 5146 */ 5147 5148 static void 5149 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf) 5150 { 5151 cache_avl_struct_t cas, *rcas; 5152 avl_index_t where = NULL; 5153 rib_hca_t *hca = (ctoqp(conn))->hca; 5154 5155 if (!hca->avl_init) 5156 goto error_free; 5157 5158 cas.len = reg_buf->lrc_len; 5159 rw_enter(&hca->avl_rw_lock, RW_READER); 5160 if ((rcas = (cache_avl_struct_t *) 5161 avl_find(&hca->avl_tree, &cas, &where)) == NULL) { 5162 rw_exit(&hca->avl_rw_lock); 5163 goto error_free; 5164 } else { 5165 cas.len = reg_buf->lrc_len; 5166 mutex_enter(&rcas->node_lock); 5167 insque(reg_buf, &rcas->r); 5168 rcas->elements++; 5169 mutex_exit(&rcas->node_lock); 5170 rw_exit(&hca->avl_rw_lock); 5171 mutex_enter(&hca->cache_allocation_lock); 5172 hca->cache_allocation += cas.len; 5173 mutex_exit(&hca->cache_allocation_lock); 5174 } 5175 5176 return; 5177 5178 error_free: 5179 5180 if (reg_buf->registered) 5181 (void) rib_deregistermem_via_hca(hca, 5182 reg_buf->lrc_buf, reg_buf->lrc_mhandle); 5183 kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len); 5184 kmem_free(reg_buf, sizeof (rib_lrc_entry_t)); 5185 } 5186 5187 static rdma_stat 5188 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf, 5189 uint_t buflen, struct mrc *buf_handle) 5190 { 5191 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 5192 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 5193 rdma_stat status; 5194 5195 5196 /* 5197 * Note: ALL buffer pools use the same memory type RDMARW.
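 * On success the returned struct mrc carries the IBTF registration:
 * mrc_linfo holds the local ibt_mr_hdl_t, mrc_lmr the lkey, and
 * mrc_rmr the rkey that the remote peer uses for RDMA access.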
5198 */ 5199 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 5200 if (status == RDMA_SUCCESS) { 5201 buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl; 5202 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 5203 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 5204 } else { 5205 buf_handle->mrc_linfo = NULL; 5206 buf_handle->mrc_lmr = 0; 5207 buf_handle->mrc_rmr = 0; 5208 } 5209 return (status); 5210 } 5211 5212 /* ARGSUSED */ 5213 static rdma_stat 5214 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf, 5215 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle) 5216 { 5217 5218 (void) rib_deregistermem_via_hca(hca, buf, buf_handle); 5219 return (RDMA_SUCCESS); 5220 } 5221 5222 /* ARGSUSED */ 5223 static rdma_stat 5224 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle) 5225 { 5226 5227 (void) ibt_deregister_mr(hca->hca_hdl, 5228 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 5229 return (RDMA_SUCCESS); 5230 } 5231 5232 /* 5233 * Check if the IP interface named by `lifrp' is RDMA-capable. 5234 */ 5235 static boolean_t 5236 rpcib_rdma_capable_interface(struct lifreq *lifrp) 5237 { 5238 char ifname[LIFNAMSIZ]; 5239 char *cp; 5240 5241 if (lifrp->lifr_type == IFT_IB) 5242 return (B_TRUE); 5243 5244 /* 5245 * Strip off the logical interface portion before getting 5246 * intimate with the name. 5247 */ 5248 (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); 5249 if ((cp = strchr(ifname, ':')) != NULL) 5250 *cp = '\0'; 5251 5252 return (strcmp("lo0", ifname) == 0); 5253 } 5254 5255 static int 5256 rpcib_do_ip_ioctl(int cmd, int len, void *arg) 5257 { 5258 vnode_t *kkvp, *vp; 5259 TIUSER *tiptr; 5260 struct strioctl iocb; 5261 k_sigset_t smask; 5262 int err = 0; 5263 5264 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) { 5265 if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE, 5266 &tiptr, CRED()) == 0) { 5267 vp = tiptr->fp->f_vnode; 5268 } else { 5269 VN_RELE(kkvp); 5270 return (EPROTO); 5271 } 5272 } else { 5273 return (EPROTO); 5274 } 5275 5276 iocb.ic_cmd = cmd; 5277 iocb.ic_timout = 0; 5278 iocb.ic_len = len; 5279 iocb.ic_dp = (caddr_t)arg; 5280 sigintr(&smask, 0); 5281 err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb); 5282 sigunintr(&smask); 5283 (void) t_kclose(tiptr, 0); 5284 VN_RELE(kkvp); 5285 return (err); 5286 } 5287 5288 /* 5289 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'. 5290 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes. 5291 */ 5292 static int 5293 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep) 5294 { 5295 int err; 5296 struct lifnum lifn; 5297 5298 bzero(&lifn, sizeof (struct lifnum)); 5299 lifn.lifn_family = AF_UNSPEC; 5300 5301 err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn); 5302 if (err != 0) 5303 return (err); 5304 5305 /* 5306 * Pad the interface count to account for additional interfaces that 5307 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 
5308 */ 5309 lifn.lifn_count += 4; 5310 5311 bzero(lifcp, sizeof (struct lifconf)); 5312 lifcp->lifc_family = AF_UNSPEC; 5313 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 5314 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 5315 5316 err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp); 5317 if (err != 0) { 5318 kmem_free(lifcp->lifc_buf, *bufsizep); 5319 return (err); 5320 } 5321 return (0); 5322 } 5323 5324 static boolean_t 5325 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6) 5326 { 5327 uint_t i, nifs; 5328 uint_t bufsize; 5329 struct lifconf lifc; 5330 struct lifreq *lifrp; 5331 struct sockaddr_in *sinp; 5332 struct sockaddr_in6 *sin6p; 5333 5334 bzero(addrs4, sizeof (rpcib_ipaddrs_t)); 5335 bzero(addrs6, sizeof (rpcib_ipaddrs_t)); 5336 5337 if (rpcib_do_lifconf(&lifc, &bufsize) != 0) 5338 return (B_FALSE); 5339 5340 if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) { 5341 kmem_free(lifc.lifc_buf, bufsize); 5342 return (B_FALSE); 5343 } 5344 5345 /* 5346 * Worst case is that all of the addresses are IB-capable and have 5347 * the same address family, so size our buffers accordingly. 5348 */ 5349 addrs4->ri_size = nifs * sizeof (struct sockaddr_in); 5350 addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP); 5351 addrs6->ri_size = nifs * sizeof (struct sockaddr_in6); 5352 addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP); 5353 5354 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { 5355 if (!rpcib_rdma_capable_interface(lifrp)) 5356 continue; 5357 5358 if (lifrp->lifr_addr.ss_family == AF_INET) { 5359 sinp = addrs4->ri_list; 5360 bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++], 5361 sizeof (struct sockaddr_in)); 5362 } else if (lifrp->lifr_addr.ss_family == AF_INET6) { 5363 sin6p = addrs6->ri_list; 5364 bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++], 5365 sizeof (struct sockaddr_in6)); 5366 } 5367 } 5368 5369 kmem_free(lifc.lifc_buf, bufsize); 5370 return (B_TRUE); 5371 } 5372 5373 /* ARGSUSED */ 5374 static int 5375 rpcib_cache_kstat_update(kstat_t *ksp, int rw) 5376 { 5377 rib_hca_t *hca; 5378 5379 if (KSTAT_WRITE == rw) { 5380 return (EACCES); 5381 } 5382 5383 rpcib_kstat.cache_limit.value.ui64 = 5384 (uint64_t)cache_limit; 5385 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 5386 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 5387 rpcib_kstat.cache_allocation.value.ui64 += 5388 (uint64_t)hca->cache_allocation; 5389 rpcib_kstat.cache_hits.value.ui64 += 5390 (uint64_t)hca->cache_hits; 5391 rpcib_kstat.cache_misses.value.ui64 += 5392 (uint64_t)hca->cache_misses; 5393 rpcib_kstat.cache_misses_above_the_limit.value.ui64 += 5394 (uint64_t)hca->cache_misses_above_the_limit; 5395 } 5396 rw_exit(&rib_stat->hcas_list_lock); 5397 return (0); 5398 } 5399
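
/*
 * For reference, the cache counters exported by rpcib_cache_kstat_update()
 * can be read from userland with libkstat.  A minimal consumer sketch
 * (hypothetical code, not shipped with this driver) would look like:
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "unix", 0, "rpcib_cache");
 *
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		kstat_named_t *kn = kstat_data_lookup(ksp, "cache_hits");
 *		if (kn != NULL)
 *			(void) printf("cache_hits=%llu\n",
 *			    (u_longlong_t)kn->value.ui64);
 *	}
 *	(void) kstat_close(kc);
 *
 * The same counters are also visible with "kstat -m unix -n rpcib_cache".
 */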