1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 2007, The Ohio State University. All rights reserved. 28 * 29 * Portions of this source code are developed by the team members of 30 * The Ohio State University's Network-Based Computing Laboratory (NBCL), 31 * headed by Professor Dhabaleswar K. (DK) Panda. 32 * 33 * Acknowledgements to contributions from developers: 34 * Ranjit Noronha: noronha@cse.ohio-state.edu 35 * Lei Chai : chail@cse.ohio-state.edu 36 * Weikuan Yu : yuw@cse.ohio-state.edu 37 * 38 */ 39 40 /* 41 * The rpcib plugin. Implements the interface for RDMATF's 42 * interaction with IBTF. 43 */ 44 45 #include <sys/param.h> 46 #include <sys/types.h> 47 #include <sys/user.h> 48 #include <sys/systm.h> 49 #include <sys/sysmacros.h> 50 #include <sys/proc.h> 51 #include <sys/socket.h> 52 #include <sys/file.h> 53 #include <sys/stream.h> 54 #include <sys/strsubr.h> 55 #include <sys/stropts.h> 56 #include <sys/errno.h> 57 #include <sys/kmem.h> 58 #include <sys/debug.h> 59 #include <sys/pathname.h> 60 #include <sys/kstat.h> 61 #include <sys/t_lock.h> 62 #include <sys/ddi.h> 63 #include <sys/cmn_err.h> 64 #include <sys/time.h> 65 #include <sys/isa_defs.h> 66 #include <sys/callb.h> 67 #include <sys/sunddi.h> 68 #include <sys/sunndi.h> 69 #include <sys/sdt.h> 70 #include <sys/ib/ibtl/ibti.h> 71 #include <rpc/rpc.h> 72 #include <rpc/ib.h> 73 #include <sys/modctl.h> 74 #include <sys/kstr.h> 75 #include <sys/sockio.h> 76 #include <sys/vnode.h> 77 #include <sys/tiuser.h> 78 #include <net/if.h> 79 #include <net/if_types.h> 80 #include <sys/cred.h> 81 #include <rpc/rpc_rdma.h> 82 #include <nfs/nfs.h> 83 #include <sys/atomic.h> 84 85 #define NFS_RDMA_PORT 20049 86 87 88 /* 89 * Convenience structures for connection management 90 */ 91 typedef struct rpcib_ipaddrs { 92 void *ri_list; /* pointer to list of addresses */ 93 uint_t ri_count; /* number of addresses in list */ 94 uint_t ri_size; /* size of ri_list in bytes */ 95 } rpcib_ipaddrs_t; 96 97 98 typedef struct rpcib_ping { 99 rib_hca_t *hca; 100 ibt_path_info_t path; 101 ibt_ip_addr_t srcip; 102 ibt_ip_addr_t dstip; 103 } rpcib_ping_t; 104 105 /* 106 * Prototype declarations for driver ops 107 */ 108 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t); 109 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t, 110 void *, void **); 111 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t); 112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *); 113 static int rpcib_do_ip_ioctl(int, int, void *); 114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *); 115 static int
rpcib_cache_kstat_update(kstat_t *, int); 116 static void rib_force_cleanup(void *); 117 static void rib_stop_hca_services(rib_hca_t *); 118 static void rib_attach_hca(void); 119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, 120 struct netbuf *d_svcaddr, CONN **conn); 121 122 struct { 123 kstat_named_t cache_limit; 124 kstat_named_t cache_allocation; 125 kstat_named_t cache_hits; 126 kstat_named_t cache_misses; 127 kstat_named_t cache_misses_above_the_limit; 128 } rpcib_kstat = { 129 {"cache_limit", KSTAT_DATA_UINT64 }, 130 {"cache_allocation", KSTAT_DATA_UINT64 }, 131 {"cache_hits", KSTAT_DATA_UINT64 }, 132 {"cache_misses", KSTAT_DATA_UINT64 }, 133 {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 }, 134 }; 135 136 /* rpcib cb_ops */ 137 static struct cb_ops rpcib_cbops = { 138 nulldev, /* open */ 139 nulldev, /* close */ 140 nodev, /* strategy */ 141 nodev, /* print */ 142 nodev, /* dump */ 143 nodev, /* read */ 144 nodev, /* write */ 145 nodev, /* ioctl */ 146 nodev, /* devmap */ 147 nodev, /* mmap */ 148 nodev, /* segmap */ 149 nochpoll, /* poll */ 150 ddi_prop_op, /* prop_op */ 151 NULL, /* stream */ 152 D_MP, /* cb_flag */ 153 CB_REV, /* rev */ 154 nodev, /* int (*cb_aread)() */ 155 nodev /* int (*cb_awrite)() */ 156 }; 157 158 /* 159 * Device options 160 */ 161 static struct dev_ops rpcib_ops = { 162 DEVO_REV, /* devo_rev, */ 163 0, /* refcnt */ 164 rpcib_getinfo, /* info */ 165 nulldev, /* identify */ 166 nulldev, /* probe */ 167 rpcib_attach, /* attach */ 168 rpcib_detach, /* detach */ 169 nodev, /* reset */ 170 &rpcib_cbops, /* driver ops - devctl interfaces */ 171 NULL, /* bus operations */ 172 NULL, /* power */ 173 ddi_quiesce_not_needed, /* quiesce */ 174 }; 175 176 /* 177 * Module linkage information. 178 */ 179 180 static struct modldrv rib_modldrv = { 181 &mod_driverops, /* Driver module */ 182 "RPCIB plugin driver", /* Driver name and version */ 183 &rpcib_ops, /* Driver ops */ 184 }; 185 186 static struct modlinkage rib_modlinkage = { 187 MODREV_1, 188 (void *)&rib_modldrv, 189 NULL 190 }; 191 192 typedef struct rib_lrc_entry { 193 struct rib_lrc_entry *forw; 194 struct rib_lrc_entry *back; 195 char *lrc_buf; 196 197 uint32_t lrc_len; 198 void *avl_node; 199 bool_t registered; 200 201 struct mrc lrc_mhandle; 202 bool_t lrc_on_freed_list; 203 } rib_lrc_entry_t; 204 205 typedef struct cache_struct { 206 rib_lrc_entry_t r; 207 uint32_t len; 208 uint32_t elements; 209 kmutex_t node_lock; 210 avl_node_t avl_link; 211 } cache_avl_struct_t; 212 213 uint64_t cache_limit = 100 * 1024 * 1024; 214 static uint64_t cache_watermark = 80 * 1024 * 1024; 215 static bool_t stats_enabled = FALSE; 216 217 static uint64_t max_unsignaled_rws = 5; 218 int nfs_rdma_port = NFS_RDMA_PORT; 219 220 #define RIBNETID_TCP "tcp" 221 #define RIBNETID_TCP6 "tcp6" 222 223 /* 224 * rib_stat: private data pointer used when registering 225 * with the IBTF. It is returned to the consumer 226 * in all callbacks. 227 */ 228 static rpcib_state_t *rib_stat = NULL; 229 230 #define RNR_RETRIES IBT_RNR_RETRY_1 231 #define MAX_PORTS 2 232 #define RDMA_DUMMY_WRID 0x4D3A1D4D3A1D 233 #define RDMA_CONN_REAP_RETRY 10 /* 10 secs */ 234 235 int preposted_rbufs = RDMA_BUFS_GRANT; 236 int send_threshold = 1; 237 238 /* 239 * Old cards with Tavor driver have limited memory footprint 240 * when booted in 32bit. The rib_max_rbufs tunable can be 241 * tuned for more buffers if needed. 
242 */ 243 244 #if !defined(_ELF64) && !defined(__sparc) 245 int rib_max_rbufs = MAX_BUFS; 246 #else 247 int rib_max_rbufs = 10 * MAX_BUFS; 248 #endif /* !(_ELF64) && !(__sparc) */ 249 250 int rib_conn_timeout = 60 * 12; /* 12 minutes */ 251 252 /* 253 * State of the plugin. 254 * ACCEPT = accepting new connections and requests. 255 * NO_ACCEPT = not accepting new connections and requests. 256 * This should eventually move to rpcib_state_t structure, since this 257 * will tell in which state the plugin is for a particular type of service 258 * like NFS, NLM or v4 Callback daemon. The plugin might be in accept 259 * state for one and in no_accept state for the other. 260 */ 261 int plugin_state; 262 kmutex_t plugin_state_lock; 263 264 ldi_ident_t rpcib_li; 265 266 /* 267 * RPCIB RDMATF operations 268 */ 269 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle); 270 static rdma_stat rib_disconnect(CONN *conn); 271 static void rib_listen(struct rdma_svc_data *rd); 272 static void rib_listen_stop(struct rdma_svc_data *rd); 273 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, 274 uint_t buflen, struct mrc *buf_handle); 275 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf, 276 struct mrc buf_handle); 277 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, 278 caddr_t buf, uint_t buflen, struct mrc *buf_handle); 279 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, 280 struct mrc buf_handle); 281 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, 282 uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, 283 void *lrc); 284 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf, 285 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *); 286 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, 287 caddr_t buf, int len, int cpu); 288 289 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf); 290 291 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf); 292 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *); 293 294 static void rib_rbuf_free(CONN *conn, int ptype, void *buf); 295 296 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid); 297 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid); 298 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid); 299 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid); 300 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl); 301 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid); 302 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait); 303 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait); 304 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *); 305 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *, 306 int addr_type, void *, CONN **); 307 static rdma_stat rib_conn_release(CONN *conn); 308 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int, 309 rpcib_ping_t *, CONN **); 310 static rdma_stat rib_getinfo(rdma_info_t *info); 311 312 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len); 313 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf); 314 static void rib_destroy_cache(rib_hca_t *hca); 315 static void rib_server_side_cache_reclaim(void *argp); 316 static int avl_compare(const void *t1, const void *t2); 317 318 static void rib_stop_services(rib_hca_t
*); 319 static void rib_close_channels(rib_conn_list_t *); 320 static void rib_conn_close(void *); 321 static void rib_recv_rele(rib_qp_t *); 322 static rdma_stat rib_conn_release_locked(CONN *conn); 323 324 /* 325 * RPCIB addressing operations 326 */ 327 328 /* 329 * RDMA operations the RPCIB module exports 330 */ 331 static rdmaops_t rib_ops = { 332 rib_reachable, 333 rib_conn_get, 334 rib_conn_release, 335 rib_listen, 336 rib_listen_stop, 337 rib_registermem, 338 rib_deregistermem, 339 rib_registermemsync, 340 rib_deregistermemsync, 341 rib_syncmem, 342 rib_reg_buf_alloc, 343 rib_reg_buf_free, 344 rib_send, 345 rib_send_resp, 346 rib_post_resp, 347 rib_post_resp_remove, 348 rib_post_recv, 349 rib_recv, 350 rib_read, 351 rib_write, 352 rib_getinfo, 353 }; 354 355 /* 356 * RDMATF RPCIB plugin details 357 */ 358 static rdma_mod_t rib_mod = { 359 "ibtf", /* api name */ 360 RDMATF_VERS_1, 361 0, 362 &rib_ops, /* rdma op vector for ibtf */ 363 }; 364 365 static rdma_stat rpcib_open_hcas(rpcib_state_t *); 366 static rdma_stat rib_qp_init(rib_qp_t *, int); 367 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *); 368 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *); 369 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *); 370 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *); 371 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num); 372 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t, 373 ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *); 374 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t, 375 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t); 376 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *); 377 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *, 378 rib_qp_t **); 379 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t, 380 rib_qp_t **); 381 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *); 382 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *); 383 static int rib_free_sendwait(struct send_wid *); 384 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid); 385 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd); 386 static void rdma_done_rem_list(rib_qp_t *); 387 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid); 388 389 static void rib_async_handler(void *, 390 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *); 391 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *); 392 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *); 393 static int rib_free_svc_recv(struct svc_recv *); 394 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t); 395 static void rib_free_wid(struct recv_wid *); 396 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *); 397 static void rib_detach_hca(ibt_hca_hdl_t); 398 static void rib_close_a_channel(CONN *); 399 static void rib_send_hold(rib_qp_t *); 400 static void rib_send_rele(rib_qp_t *); 401 402 /* 403 * Registration with IBTF as a consumer 404 */ 405 static struct ibt_clnt_modinfo_s rib_modinfo = { 406 IBTI_V_CURR, 407 IBT_GENERIC, 408 rib_async_handler, /* async event handler */ 409 NULL, /* Memory Region Handler */ 410 "nfs/ib" 411 }; 412 413 /* 414 * Global structure 415 */ 416 417 typedef struct rpcib_s { 418 dev_info_t *rpcib_dip; 419 kmutex_t rpcib_mutex; 420 } rpcib_t; 421 422 rpcib_t rpcib; 423 424 /* 425 * /etc/system controlled variable to control 426 * debugging in rpcib kernel
module. 427 * Set it to values greater than 1 to control 428 * the amount of debugging messages required. 429 */ 430 int rib_debug = 0; 431 432 int 433 _init(void) 434 { 435 int error; 436 437 error = mod_install((struct modlinkage *)&rib_modlinkage); 438 if (error != 0) { 439 /* 440 * Could not load module 441 */ 442 return (error); 443 } 444 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL); 445 return (0); 446 } 447 448 int 449 _fini() 450 { 451 int status; 452 453 /* 454 * Remove module 455 */ 456 if ((status = mod_remove(&rib_modlinkage)) != 0) { 457 return (status); 458 } 459 mutex_destroy(&plugin_state_lock); 460 return (0); 461 } 462 463 int 464 _info(struct modinfo *modinfop) 465 { 466 return (mod_info(&rib_modlinkage, modinfop)); 467 } 468 469 /* 470 * rpcib_getinfo() 471 * Given the device number, return the devinfo pointer or the 472 * instance number. 473 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach. 474 */ 475 476 /*ARGSUSED*/ 477 static int 478 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 479 { 480 int ret = DDI_SUCCESS; 481 482 switch (cmd) { 483 case DDI_INFO_DEVT2DEVINFO: 484 if (rpcib.rpcib_dip != NULL) 485 *result = rpcib.rpcib_dip; 486 else { 487 *result = NULL; 488 ret = DDI_FAILURE; 489 } 490 break; 491 492 case DDI_INFO_DEVT2INSTANCE: 493 *result = NULL; 494 break; 495 496 default: 497 ret = DDI_FAILURE; 498 } 499 return (ret); 500 } 501 502 static void 503 rpcib_free_hca_list() 504 { 505 rib_hca_t *hca, *hcap; 506 507 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); 508 hca = rib_stat->hcas_list; 509 rib_stat->hcas_list = NULL; 510 rw_exit(&rib_stat->hcas_list_lock); 511 while (hca != NULL) { 512 rw_enter(&hca->state_lock, RW_WRITER); 513 hcap = hca; 514 hca = hca->next; 515 rib_stat->nhca_inited--; 516 rib_mod.rdma_count--; 517 hcap->state = HCA_DETACHED; 518 rw_exit(&hcap->state_lock); 519 rib_stop_hca_services(hcap); 520 521 kmem_free(hcap, sizeof (*hcap)); 522 } 523 } 524 525 static rdma_stat 526 rpcib_free_service_list() 527 { 528 rib_service_t *service; 529 ibt_status_t ret; 530 531 rw_enter(&rib_stat->service_list_lock, RW_WRITER); 532 while (rib_stat->service_list != NULL) { 533 service = rib_stat->service_list; 534 ret = ibt_unbind_all_services(service->srv_hdl); 535 if (ret != IBT_SUCCESS) { 536 rw_exit(&rib_stat->service_list_lock); 537 #ifdef DEBUG 538 cmn_err(CE_NOTE, "rpcib_free_service_list: " 539 "ibt_unbind_all_services failed (%d)\n", (int)ret); 540 #endif 541 return (RDMA_FAILED); 542 } 543 ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl, 544 service->srv_hdl); 545 if (ret != IBT_SUCCESS) { 546 rw_exit(&rib_stat->service_list_lock); 547 #ifdef DEBUG 548 cmn_err(CE_NOTE, "rpcib_free_service_list: " 549 "ibt_deregister_service failed (%d)\n", (int)ret); 550 #endif 551 return (RDMA_FAILED); 552 } 553 rib_stat->service_list = service->next; 554 kmem_free(service, sizeof (rib_service_t)); 555 } 556 rw_exit(&rib_stat->service_list_lock); 557 558 return (RDMA_SUCCESS); 559 } 560 561 static int 562 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 563 { 564 ibt_status_t ibt_status; 565 rdma_stat r_status; 566 567 switch (cmd) { 568 case DDI_ATTACH: 569 break; 570 case DDI_RESUME: 571 return (DDI_SUCCESS); 572 default: 573 return (DDI_FAILURE); 574 } 575 576 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL); 577 578 mutex_enter(&rpcib.rpcib_mutex); 579 if (rpcib.rpcib_dip != NULL) { 580 mutex_exit(&rpcib.rpcib_mutex); 581 return (DDI_FAILURE); 582 } 583 rpcib.rpcib_dip = dip; 584
mutex_exit(&rpcib.rpcib_mutex); 585 /* 586 * Create the "rpcib" minor-node. 587 */ 588 if (ddi_create_minor_node(dip, 589 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { 590 /* Error message, no cmn_err as they print on console */ 591 return (DDI_FAILURE); 592 } 593 594 if (rib_stat == NULL) { 595 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); 596 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); 597 rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL); 598 mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL); 599 } 600 601 rib_stat->hca_count = ibt_get_hca_list(NULL); 602 if (rib_stat->hca_count < 1) { 603 mutex_destroy(&rib_stat->listen_lock); 604 rw_destroy(&rib_stat->hcas_list_lock); 605 mutex_destroy(&rib_stat->open_hca_lock); 606 kmem_free(rib_stat, sizeof (*rib_stat)); 607 rib_stat = NULL; 608 return (DDI_FAILURE); 609 } 610 611 ibt_status = ibt_attach(&rib_modinfo, dip, 612 (void *)rib_stat, &rib_stat->ibt_clnt_hdl); 613 614 if (ibt_status != IBT_SUCCESS) { 615 mutex_destroy(&rib_stat->listen_lock); 616 rw_destroy(&rib_stat->hcas_list_lock); 617 mutex_destroy(&rib_stat->open_hca_lock); 618 kmem_free(rib_stat, sizeof (*rib_stat)); 619 rib_stat = NULL; 620 return (DDI_FAILURE); 621 } 622 623 rib_stat->service_list = NULL; 624 rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL); 625 mutex_enter(&rib_stat->open_hca_lock); 626 if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) { 627 mutex_exit(&rib_stat->open_hca_lock); 628 goto open_fail; 629 } 630 mutex_exit(&rib_stat->open_hca_lock); 631 632 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 633 DDI_PROP_SUCCESS) { 634 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update " 635 "failed."); 636 goto register_fail; 637 } 638 639 /* 640 * Register with rdmatf 641 */ 642 r_status = rdma_register_mod(&rib_mod); 643 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { 644 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, " 645 "status = %d", r_status); 646 goto register_fail; 647 } 648 649 return (DDI_SUCCESS); 650 651 register_fail: 652 653 open_fail: 654 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 655 rpcib_free_hca_list(); 656 (void) rpcib_free_service_list(); 657 mutex_destroy(&rib_stat->listen_lock); 658 rw_destroy(&rib_stat->hcas_list_lock); 659 mutex_destroy(&rib_stat->open_hca_lock); 660 rw_destroy(&rib_stat->service_list_lock); 661 kmem_free(rib_stat, sizeof (*rib_stat)); 662 rib_stat = NULL; 663 return (DDI_FAILURE); 664 } 665 666 /*ARGSUSED*/ 667 static int 668 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 669 { 670 switch (cmd) { 671 672 case DDI_DETACH: 673 break; 674 675 case DDI_SUSPEND: 676 default: 677 return (DDI_FAILURE); 678 } 679 680 /* 681 * Detach the hca and free resources 682 */ 683 mutex_enter(&plugin_state_lock); 684 plugin_state = NO_ACCEPT; 685 mutex_exit(&plugin_state_lock); 686 687 if (rpcib_free_service_list() != RDMA_SUCCESS) 688 return (DDI_FAILURE); 689 rpcib_free_hca_list(); 690 691 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 692 mutex_destroy(&rib_stat->listen_lock); 693 rw_destroy(&rib_stat->hcas_list_lock); 694 mutex_destroy(&rib_stat->open_hca_lock); 695 rw_destroy(&rib_stat->service_list_lock); 696 697 kmem_free(rib_stat, sizeof (*rib_stat)); 698 rib_stat = NULL; 699 700 mutex_enter(&rpcib.rpcib_mutex); 701 rpcib.rpcib_dip = NULL; 702 mutex_exit(&rpcib.rpcib_mutex); 703 mutex_destroy(&rpcib.rpcib_mutex); 704 return (DDI_SUCCESS); 705 } 706 707 708 static void rib_rbufpool_free(rib_hca_t *, int); 709 static 
void rib_rbufpool_deregister(rib_hca_t *, int); 710 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype); 711 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); 712 static rdma_stat rib_rem_replylist(rib_qp_t *); 713 static int rib_remreply(rib_qp_t *, struct reply *); 714 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); 715 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); 716 717 718 /* 719 * One CQ pair per HCA 720 */ 721 static rdma_stat 722 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, 723 rib_cq_t **cqp) 724 { 725 rib_cq_t *cq; 726 ibt_cq_attr_t cq_attr; 727 uint32_t real_size; 728 ibt_status_t status; 729 rdma_stat error = RDMA_SUCCESS; 730 731 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); 732 cq->rib_hca = hca; 733 cq_attr.cq_size = cq_size; 734 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 735 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, 736 &real_size); 737 if (status != IBT_SUCCESS) { 738 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed," 739 " status=%d", status); 740 error = RDMA_FAILED; 741 goto fail; 742 } 743 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca); 744 745 /* 746 * Enable CQ callbacks. CQ Callbacks are single shot 747 * (e.g. you have to call ibt_enable_cq_notify() 748 * after each callback to get another one). 749 */ 750 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION); 751 if (status != IBT_SUCCESS) { 752 cmn_err(CE_WARN, "rib_create_cq: " 753 "enable_cq_notify failed, status %d", status); 754 error = RDMA_FAILED; 755 goto fail; 756 } 757 *cqp = cq; 758 759 return (error); 760 fail: 761 if (cq->rib_cq_hdl) 762 (void) ibt_free_cq(cq->rib_cq_hdl); 763 if (cq) 764 kmem_free(cq, sizeof (rib_cq_t)); 765 return (error); 766 } 767 768 /* 769 * rpcib_find_hca 770 * 771 * Caller should have already locked the hcas_lock before calling 772 * this function. 773 */ 774 static rib_hca_t * 775 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid) 776 { 777 rib_hca_t *hca = ribstat->hcas_list; 778 779 while (hca && hca->hca_guid != guid) 780 hca = hca->next; 781 782 return (hca); 783 } 784 785 static rdma_stat 786 rpcib_open_hcas(rpcib_state_t *ribstat) 787 { 788 rib_hca_t *hca; 789 ibt_status_t ibt_status; 790 rdma_stat status; 791 ibt_hca_portinfo_t *pinfop; 792 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; 793 uint_t size, cq_size; 794 int i; 795 kstat_t *ksp; 796 cache_avl_struct_t example_avl_node; 797 char rssc_name[32]; 798 int old_nhca_inited = ribstat->nhca_inited; 799 ib_guid_t *hca_guids; 800 801 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); 802 803 ribstat->hca_count = ibt_get_hca_list(&hca_guids); 804 if (ribstat->hca_count == 0) 805 return (RDMA_FAILED); 806 807 rw_enter(&ribstat->hcas_list_lock, RW_WRITER); 808 /* 809 * Open a hca and setup for RDMA 810 */ 811 for (i = 0; i < ribstat->hca_count; i++) { 812 if (rpcib_find_hca(ribstat, hca_guids[i])) 813 continue; 814 hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP); 815 816 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, 817 hca_guids[i], &hca->hca_hdl); 818 if (ibt_status != IBT_SUCCESS) { 819 kmem_free(hca, sizeof (rib_hca_t)); 820 continue; 821 } 822 hca->hca_guid = hca_guids[i]; 823 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; 824 hca->state = HCA_INITED; 825 826 /* 827 * query HCA info 828 */ 829 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs); 830 if (ibt_status != IBT_SUCCESS) { 831 goto fail1; 832 } 833 834 /* 835 * One PD (Protection Domain) per HCA. 
836 * A qp is allowed to access a memory region 837 * only when it's in the same PD as that of 838 * the memory region. 839 */ 840 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl); 841 if (ibt_status != IBT_SUCCESS) { 842 goto fail1; 843 } 844 845 /* 846 * query HCA ports 847 */ 848 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 849 0, &pinfop, &hca->hca_nports, &size); 850 if (ibt_status != IBT_SUCCESS) { 851 goto fail2; 852 } 853 hca->hca_ports = pinfop; 854 hca->hca_pinfosz = size; 855 pinfop = NULL; 856 857 cq_size = DEF_CQ_SIZE; /* default cq size */ 858 /* 859 * Create 2 pairs of cq's (1 pair for client 860 * and the other pair for server) on this hca. 861 * If number of qp's gets too large, then several 862 * cq's will be needed. 863 */ 864 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, 865 &hca->svc_rcq); 866 if (status != RDMA_SUCCESS) { 867 goto fail3; 868 } 869 870 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, 871 &hca->svc_scq); 872 if (status != RDMA_SUCCESS) { 873 goto fail3; 874 } 875 876 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, 877 &hca->clnt_rcq); 878 if (status != RDMA_SUCCESS) { 879 goto fail3; 880 } 881 882 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, 883 &hca->clnt_scq); 884 if (status != RDMA_SUCCESS) { 885 goto fail3; 886 } 887 888 /* 889 * Create buffer pools. 890 * Note rib_rbuf_create also allocates memory windows. 891 */ 892 hca->recv_pool = rib_rbufpool_create(hca, 893 RECV_BUFFER, rib_max_rbufs); 894 if (hca->recv_pool == NULL) { 895 goto fail3; 896 } 897 898 hca->send_pool = rib_rbufpool_create(hca, 899 SEND_BUFFER, rib_max_rbufs); 900 if (hca->send_pool == NULL) { 901 rib_rbufpool_destroy(hca, RECV_BUFFER); 902 goto fail3; 903 } 904 905 if (hca->server_side_cache == NULL) { 906 (void) sprintf(rssc_name, 907 "rib_srvr_cache_%llx", 908 (long long unsigned int) hca->hca_guid); 909 hca->server_side_cache = kmem_cache_create( 910 rssc_name, 911 sizeof (cache_avl_struct_t), 0, 912 NULL, 913 NULL, 914 rib_server_side_cache_reclaim, 915 hca, NULL, 0); 916 } 917 918 avl_create(&hca->avl_tree, 919 avl_compare, 920 sizeof (cache_avl_struct_t), 921 (uint_t)(uintptr_t)&example_avl_node.avl_link- 922 (uint_t)(uintptr_t)&example_avl_node); 923 924 rw_init(&hca->bound_services_lock, NULL, RW_DRIVER, 925 hca->iblock); 926 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); 927 rw_init(&hca->avl_rw_lock, 928 NULL, RW_DRIVER, hca->iblock); 929 mutex_init(&hca->cache_allocation_lock, 930 NULL, MUTEX_DRIVER, NULL); 931 hca->avl_init = TRUE; 932 933 /* Create kstats for the cache */ 934 ASSERT(INGLOBALZONE(curproc)); 935 936 if (!stats_enabled) { 937 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc", 938 KSTAT_TYPE_NAMED, 939 sizeof (rpcib_kstat) / sizeof (kstat_named_t), 940 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, 941 GLOBAL_ZONEID); 942 if (ksp) { 943 ksp->ks_data = (void *) &rpcib_kstat; 944 ksp->ks_update = rpcib_cache_kstat_update; 945 kstat_install(ksp); 946 stats_enabled = TRUE; 947 } 948 } 949 if (hca->cleanup_helper == NULL) { 950 char tq_name[sizeof (hca->hca_guid) * 2 + 1]; 951 952 (void) snprintf(tq_name, sizeof (tq_name), "%llX", 953 (unsigned long long int) hca->hca_guid); 954 hca->cleanup_helper = ddi_taskq_create(NULL, 955 tq_name, 1, TASKQ_DEFAULTPRI, 0); 956 } 957 958 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 959 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); 960 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, 961 hca->iblock); 962 
rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, 963 hca->iblock); 964 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); 965 hca->inuse = TRUE; 966 967 hca->next = ribstat->hcas_list; 968 ribstat->hcas_list = hca; 969 ribstat->nhca_inited++; 970 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 971 continue; 972 973 fail3: 974 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 975 fail2: 976 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 977 fail1: 978 (void) ibt_close_hca(hca->hca_hdl); 979 kmem_free(hca, sizeof (rib_hca_t)); 980 } 981 rw_exit(&ribstat->hcas_list_lock); 982 ibt_free_hca_list(hca_guids, ribstat->hca_count); 983 rib_mod.rdma_count = rib_stat->nhca_inited; 984 985 /* 986 * return success if at least one new hca has been configured. 987 */ 988 if (ribstat->nhca_inited != old_nhca_inited) 989 return (RDMA_SUCCESS); 990 else 991 return (RDMA_FAILED); 992 } 993 994 /* 995 * Callback routines 996 */ 997 998 /* 999 * SCQ handlers 1000 */ 1001 /* ARGSUSED */ 1002 static void 1003 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1004 { 1005 ibt_status_t ibt_status; 1006 ibt_wc_t wc; 1007 struct send_wid *wd; 1008 CONN *conn; 1009 rib_qp_t *qp; 1010 int i; 1011 1012 /* 1013 * Re-enable cq notify here to avoid missing any 1014 * completion queue notification. 1015 */ 1016 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1017 1018 ibt_status = IBT_SUCCESS; 1019 while (ibt_status != IBT_CQ_EMPTY) { 1020 bzero(&wc, sizeof (wc)); 1021 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1022 if (ibt_status != IBT_SUCCESS) 1023 return; 1024 1025 /* 1026 * Got a send completion 1027 */ 1028 if (wc.wc_id != RDMA_DUMMY_WRID) { 1029 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1030 qp = wd->qp; 1031 conn = qptoc(qp); 1032 1033 mutex_enter(&wd->sendwait_lock); 1034 switch (wc.wc_status) { 1035 case IBT_WC_SUCCESS: 1036 wd->status = RDMA_SUCCESS; 1037 break; 1038 default: 1039 /* 1040 * RC Send Q Error Code Local state Remote State 1041 * ==================== =========== ============ 1042 * IBT_WC_BAD_RESPONSE_ERR ERROR None 1043 * IBT_WC_LOCAL_LEN_ERR ERROR None 1044 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None 1045 * IBT_WC_LOCAL_PROTECT_ERR ERROR None 1046 * IBT_WC_MEM_WIN_BIND_ERR ERROR None 1047 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR 1048 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR 1049 * IBT_WC_REMOTE_OP_ERR ERROR ERROR 1050 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None 1051 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None 1052 * IBT_WC_WR_FLUSHED_ERR ERROR None 1053 */ 1054 /* 1055 * Channel in error state. Set connection to 1056 * ERROR and cleanup will happen either from 1057 * conn_release or from rib_conn_get 1058 */ 1059 wd->status = RDMA_FAILED; 1060 mutex_enter(&conn->c_lock); 1061 if (conn->c_state != C_DISCONN_PEND) 1062 conn->c_state = C_ERROR_CONN; 1063 mutex_exit(&conn->c_lock); 1064 break; 1065 } 1066 1067 if (wd->cv_sig == 1) { 1068 /* 1069 * Notify poster 1070 */ 1071 cv_signal(&wd->wait_cv); 1072 mutex_exit(&wd->sendwait_lock); 1073 } else { 1074 /* 1075 * Poster not waiting for notification. 
1076 * Free the send buffers and send_wid 1077 */ 1078 for (i = 0; i < wd->nsbufs; i++) { 1079 rib_rbuf_free(qptoc(wd->qp), 1080 SEND_BUFFER, 1081 (void *)(uintptr_t)wd->sbufaddr[i]); 1082 } 1083 1084 /* decrement the send ref count */ 1085 rib_send_rele(qp); 1086 1087 mutex_exit(&wd->sendwait_lock); 1088 (void) rib_free_sendwait(wd); 1089 } 1090 } 1091 } 1092 } 1093 1094 /* ARGSUSED */ 1095 static void 1096 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1097 { 1098 ibt_status_t ibt_status; 1099 ibt_wc_t wc; 1100 struct send_wid *wd; 1101 rib_qp_t *qp; 1102 CONN *conn; 1103 int i; 1104 1105 /* 1106 * Re-enable cq notify here to avoid missing any 1107 * completion queue notification. 1108 */ 1109 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1110 1111 ibt_status = IBT_SUCCESS; 1112 while (ibt_status != IBT_CQ_EMPTY) { 1113 bzero(&wc, sizeof (wc)); 1114 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1115 if (ibt_status != IBT_SUCCESS) 1116 return; 1117 1118 /* 1119 * Got a send completion 1120 */ 1121 if (wc.wc_id != RDMA_DUMMY_WRID) { 1122 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1123 qp = wd->qp; 1124 conn = qptoc(qp); 1125 mutex_enter(&wd->sendwait_lock); 1126 1127 switch (wc.wc_status) { 1128 case IBT_WC_SUCCESS: 1129 wd->status = RDMA_SUCCESS; 1130 break; 1131 default: 1132 /* 1133 * Channel in error state. Set connection to 1134 * ERROR and cleanup will happen either from 1135 * conn_release or conn timeout. 1136 */ 1137 wd->status = RDMA_FAILED; 1138 mutex_enter(&conn->c_lock); 1139 if (conn->c_state != C_DISCONN_PEND) 1140 conn->c_state = C_ERROR_CONN; 1141 mutex_exit(&conn->c_lock); 1142 break; 1143 } 1144 1145 if (wd->cv_sig == 1) { 1146 /* 1147 * Update completion status and notify poster 1148 */ 1149 cv_signal(&wd->wait_cv); 1150 mutex_exit(&wd->sendwait_lock); 1151 } else { 1152 /* 1153 * Poster not waiting for notification. 1154 * Free the send buffers and send_wid 1155 */ 1156 for (i = 0; i < wd->nsbufs; i++) { 1157 rib_rbuf_free(qptoc(wd->qp), 1158 SEND_BUFFER, 1159 (void *)(uintptr_t)wd->sbufaddr[i]); 1160 } 1161 1162 /* decrement the send ref count */ 1163 rib_send_rele(qp); 1164 1165 mutex_exit(&wd->sendwait_lock); 1166 (void) rib_free_sendwait(wd); 1167 } 1168 } 1169 } 1170 } 1171 1172 /* 1173 * RCQ handler 1174 */ 1175 /* ARGSUSED */ 1176 static void 1177 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1178 { 1179 rib_qp_t *qp; 1180 ibt_status_t ibt_status; 1181 ibt_wc_t wc; 1182 struct recv_wid *rwid; 1183 1184 /* 1185 * Re-enable cq notify here to avoid missing any 1186 * completion queue notification. 1187 */ 1188 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1189 1190 ibt_status = IBT_SUCCESS; 1191 while (ibt_status != IBT_CQ_EMPTY) { 1192 bzero(&wc, sizeof (wc)); 1193 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1194 if (ibt_status != IBT_SUCCESS) 1195 return; 1196 1197 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id; 1198 qp = rwid->qp; 1199 1200 if (wc.wc_status == IBT_WC_SUCCESS) { 1201 XDR inxdrs, *xdrs; 1202 uint_t xid, vers, op, find_xid = 0; 1203 struct reply *r; 1204 CONN *conn = qptoc(qp); 1205 uint32_t rdma_credit = 0; 1206 1207 xdrs = &inxdrs; 1208 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr, 1209 wc.wc_bytes_xfer, XDR_DECODE); 1210 /* 1211 * Treat xid as opaque (xid is the first entity 1212 * in the rpc rdma message). 1213 */ 1214 xid = *(uint32_t *)(uintptr_t)rwid->addr; 1215 1216 /* Skip xid and set the xdr position accordingly. 
*/ 1217 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1218 (void) xdr_u_int(xdrs, &vers); 1219 (void) xdr_u_int(xdrs, &rdma_credit); 1220 (void) xdr_u_int(xdrs, &op); 1221 XDR_DESTROY(xdrs); 1222 1223 if (vers != RPCRDMA_VERS) { 1224 /* 1225 * Invalid RPC/RDMA version. Cannot 1226 * interoperate. Set connection to 1227 * ERROR state and bail out. 1228 */ 1229 mutex_enter(&conn->c_lock); 1230 if (conn->c_state != C_DISCONN_PEND) 1231 conn->c_state = C_ERROR_CONN; 1232 mutex_exit(&conn->c_lock); 1233 rib_rbuf_free(conn, RECV_BUFFER, 1234 (void *)(uintptr_t)rwid->addr); 1235 rib_free_wid(rwid); 1236 rib_recv_rele(qp); 1237 continue; 1238 } 1239 1240 mutex_enter(&qp->replylist_lock); 1241 for (r = qp->replylist; r != NULL; r = r->next) { 1242 if (r->xid == xid) { 1243 find_xid = 1; 1244 switch (op) { 1245 case RDMA_MSG: 1246 case RDMA_NOMSG: 1247 case RDMA_MSGP: 1248 r->status = RDMA_SUCCESS; 1249 r->vaddr_cq = rwid->addr; 1250 r->bytes_xfer = 1251 wc.wc_bytes_xfer; 1252 cv_signal(&r->wait_cv); 1253 break; 1254 default: 1255 rib_rbuf_free(qptoc(qp), 1256 RECV_BUFFER, 1257 (void *)(uintptr_t) 1258 rwid->addr); 1259 break; 1260 } 1261 break; 1262 } 1263 } 1264 mutex_exit(&qp->replylist_lock); 1265 if (find_xid == 0) { 1266 /* RPC caller not waiting for reply */ 1267 1268 DTRACE_PROBE1(rpcib__i__nomatchxid1, 1269 int, xid); 1270 1271 rib_rbuf_free(qptoc(qp), RECV_BUFFER, 1272 (void *)(uintptr_t)rwid->addr); 1273 } 1274 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) { 1275 CONN *conn = qptoc(qp); 1276 1277 /* 1278 * Connection being flushed. Just free 1279 * the posted buffer 1280 */ 1281 rib_rbuf_free(conn, RECV_BUFFER, 1282 (void *)(uintptr_t)rwid->addr); 1283 } else { 1284 CONN *conn = qptoc(qp); 1285 /* 1286 * RC Recv Q Error Code Local state Remote State 1287 * ==================== =========== ============ 1288 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd 1289 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd 1290 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd 1291 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd 1292 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd 1293 * IBT_WC_WR_FLUSHED_ERR None None 1294 */ 1295 /* 1296 * Channel in error state. Set connection 1297 * in ERROR state. 1298 */ 1299 mutex_enter(&conn->c_lock); 1300 if (conn->c_state != C_DISCONN_PEND) 1301 conn->c_state = C_ERROR_CONN; 1302 mutex_exit(&conn->c_lock); 1303 rib_rbuf_free(conn, RECV_BUFFER, 1304 (void *)(uintptr_t)rwid->addr); 1305 } 1306 rib_free_wid(rwid); 1307 rib_recv_rele(qp); 1308 } 1309 } 1310 1311 /* Server side */ 1312 /* ARGSUSED */ 1313 static void 1314 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1315 { 1316 rdma_recv_data_t *rdp; 1317 rib_qp_t *qp; 1318 ibt_status_t ibt_status; 1319 ibt_wc_t wc; 1320 struct svc_recv *s_recvp; 1321 CONN *conn; 1322 mblk_t *mp; 1323 1324 /* 1325 * Re-enable cq notify here to avoid missing any 1326 * completion queue notification. 
1327 */ 1328 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1329 1330 ibt_status = IBT_SUCCESS; 1331 while (ibt_status != IBT_CQ_EMPTY) { 1332 bzero(&wc, sizeof (wc)); 1333 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1334 if (ibt_status != IBT_SUCCESS) 1335 return; 1336 1337 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id; 1338 qp = s_recvp->qp; 1339 conn = qptoc(qp); 1340 1341 if (wc.wc_status == IBT_WC_SUCCESS) { 1342 XDR inxdrs, *xdrs; 1343 uint_t xid, vers, op; 1344 uint32_t rdma_credit; 1345 1346 xdrs = &inxdrs; 1347 /* s_recvp->vaddr stores data */ 1348 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr, 1349 wc.wc_bytes_xfer, XDR_DECODE); 1350 1351 /* 1352 * Treat xid as opaque (xid is the first entity 1353 * in the rpc rdma message). 1354 */ 1355 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr; 1356 /* Skip xid and set the xdr position accordingly. */ 1357 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1358 if (!xdr_u_int(xdrs, &vers) || 1359 !xdr_u_int(xdrs, &rdma_credit) || 1360 !xdr_u_int(xdrs, &op)) { 1361 rib_rbuf_free(conn, RECV_BUFFER, 1362 (void *)(uintptr_t)s_recvp->vaddr); 1363 XDR_DESTROY(xdrs); 1364 rib_recv_rele(qp); 1365 (void) rib_free_svc_recv(s_recvp); 1366 continue; 1367 } 1368 XDR_DESTROY(xdrs); 1369 1370 if (vers != RPCRDMA_VERS) { 1371 /* 1372 * Invalid RPC/RDMA version. 1373 * Drop rpc rdma message. 1374 */ 1375 rib_rbuf_free(conn, RECV_BUFFER, 1376 (void *)(uintptr_t)s_recvp->vaddr); 1377 rib_recv_rele(qp); 1378 (void) rib_free_svc_recv(s_recvp); 1379 continue; 1380 } 1381 /* 1382 * Is this for RDMA_DONE? 1383 */ 1384 if (op == RDMA_DONE) { 1385 rib_rbuf_free(conn, RECV_BUFFER, 1386 (void *)(uintptr_t)s_recvp->vaddr); 1387 /* 1388 * Wake up the thread waiting on 1389 * a RDMA_DONE for xid 1390 */ 1391 mutex_enter(&qp->rdlist_lock); 1392 rdma_done_notify(qp, xid); 1393 mutex_exit(&qp->rdlist_lock); 1394 rib_recv_rele(qp); 1395 (void) rib_free_svc_recv(s_recvp); 1396 continue; 1397 } 1398 1399 mutex_enter(&plugin_state_lock); 1400 mutex_enter(&conn->c_lock); 1401 if ((plugin_state == ACCEPT) && 1402 (conn->c_state == C_CONNECTED)) { 1403 conn->c_ref++; 1404 mutex_exit(&conn->c_lock); 1405 while ((mp = allocb(sizeof (*rdp), BPRI_LO)) 1406 == NULL) 1407 (void) strwaitbuf( 1408 sizeof (*rdp), BPRI_LO); 1409 /* 1410 * Plugin is in accept state, hence the master 1411 * transport queue for this is still accepting 1412 * requests. Hence we can call svc_queuereq to 1413 * queue this received msg. 1414 */ 1415 rdp = (rdma_recv_data_t *)mp->b_rptr; 1416 rdp->conn = conn; 1417 rdp->rpcmsg.addr = 1418 (caddr_t)(uintptr_t)s_recvp->vaddr; 1419 rdp->rpcmsg.type = RECV_BUFFER; 1420 rdp->rpcmsg.len = wc.wc_bytes_xfer; 1421 rdp->status = wc.wc_status; 1422 mp->b_wptr += sizeof (*rdp); 1423 svc_queuereq((queue_t *)rib_stat->q, mp); 1424 mutex_exit(&plugin_state_lock); 1425 } else { 1426 /* 1427 * The master transport for this is going 1428 * away and the queue is not accepting any more 1429 * requests for krpc, so don't do anything, just 1430 * free the msg.
1431 */ 1432 mutex_exit(&conn->c_lock); 1433 mutex_exit(&plugin_state_lock); 1434 rib_rbuf_free(conn, RECV_BUFFER, 1435 (void *)(uintptr_t)s_recvp->vaddr); 1436 } 1437 } else { 1438 rib_rbuf_free(conn, RECV_BUFFER, 1439 (void *)(uintptr_t)s_recvp->vaddr); 1440 } 1441 rib_recv_rele(qp); 1442 (void) rib_free_svc_recv(s_recvp); 1443 } 1444 } 1445 1446 static void 1447 rib_attach_hca() 1448 { 1449 mutex_enter(&rib_stat->open_hca_lock); 1450 (void) rpcib_open_hcas(rib_stat); 1451 rib_listen(NULL); 1452 mutex_exit(&rib_stat->open_hca_lock); 1453 } 1454 1455 /* 1456 * Handles DR event of IBT_HCA_DETACH_EVENT. 1457 */ 1458 /* ARGSUSED */ 1459 static void 1460 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 1461 ibt_async_code_t code, ibt_async_event_t *event) 1462 { 1463 switch (code) { 1464 case IBT_HCA_ATTACH_EVENT: 1465 rib_attach_hca(); 1466 break; 1467 case IBT_HCA_DETACH_EVENT: 1468 rib_detach_hca(hca_hdl); 1469 #ifdef DEBUG 1470 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); 1471 #endif 1472 break; 1473 case IBT_EVENT_PORT_UP: 1474 /* 1475 * A port is up. We should call rib_listen() since there is 1476 * a chance that rib_listen() may have failed during 1477 * rib_attach_hca() because the port had not been up yet. 1478 */ 1479 rib_listen(NULL); 1480 #ifdef DEBUG 1481 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); 1482 #endif 1483 break; 1484 #ifdef DEBUG 1485 case IBT_EVENT_PATH_MIGRATED: 1486 cmn_err(CE_NOTE, "rib_async_handler(): " 1487 "IBT_EVENT_PATH_MIGRATED\n"); 1488 break; 1489 case IBT_EVENT_SQD: 1490 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n"); 1491 break; 1492 case IBT_EVENT_COM_EST: 1493 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n"); 1494 break; 1495 case IBT_ERROR_CATASTROPHIC_CHAN: 1496 cmn_err(CE_NOTE, "rib_async_handler(): " 1497 "IBT_ERROR_CATASTROPHIC_CHAN\n"); 1498 break; 1499 case IBT_ERROR_INVALID_REQUEST_CHAN: 1500 cmn_err(CE_NOTE, "rib_async_handler(): " 1501 "IBT_ERROR_INVALID_REQUEST_CHAN\n"); 1502 break; 1503 case IBT_ERROR_ACCESS_VIOLATION_CHAN: 1504 cmn_err(CE_NOTE, "rib_async_handler(): " 1505 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n"); 1506 break; 1507 case IBT_ERROR_PATH_MIGRATE_REQ: 1508 cmn_err(CE_NOTE, "rib_async_handler(): " 1509 "IBT_ERROR_PATH_MIGRATE_REQ\n"); 1510 break; 1511 case IBT_ERROR_CQ: 1512 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n"); 1513 break; 1514 case IBT_ERROR_PORT_DOWN: 1515 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); 1516 break; 1517 case IBT_ASYNC_OPAQUE1: 1518 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); 1519 break; 1520 case IBT_ASYNC_OPAQUE2: 1521 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n"); 1522 break; 1523 case IBT_ASYNC_OPAQUE3: 1524 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n"); 1525 break; 1526 case IBT_ASYNC_OPAQUE4: 1527 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n"); 1528 break; 1529 #endif 1530 default: 1531 break; 1532 } 1533 } 1534 1535 /* 1536 * Client's reachable function. 
1537 */ 1538 static rdma_stat 1539 rib_reachable(int addr_type, struct netbuf *raddr, void **handle) 1540 { 1541 rdma_stat status; 1542 rpcib_ping_t rpt; 1543 struct netbuf saddr; 1544 CONN *conn; 1545 1546 bzero(&saddr, sizeof (struct netbuf)); 1547 status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn); 1548 1549 if (status == RDMA_SUCCESS) { 1550 *handle = (void *)rpt.hca; 1551 /* release the reference */ 1552 (void) rib_conn_release(conn); 1553 return (RDMA_SUCCESS); 1554 } else { 1555 *handle = NULL; 1556 DTRACE_PROBE(rpcib__i__pingfailed); 1557 return (RDMA_FAILED); 1558 } 1559 } 1560 1561 /* Client side qp creation */ 1562 static rdma_stat 1563 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) 1564 { 1565 rib_qp_t *kqp = NULL; 1566 CONN *conn; 1567 rdma_clnt_cred_ctrl_t *cc_info; 1568 1569 ASSERT(qp != NULL); 1570 *qp = NULL; 1571 1572 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1573 conn = qptoc(kqp); 1574 kqp->hca = hca; 1575 kqp->rdmaconn.c_rdmamod = &rib_mod; 1576 kqp->rdmaconn.c_private = (caddr_t)kqp; 1577 1578 kqp->mode = RIB_CLIENT; 1579 kqp->chan_flags = IBT_BLOCKING; 1580 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); 1581 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); 1582 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; 1583 /* 1584 * Initialize 1585 */ 1586 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1587 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1588 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1589 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1590 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1591 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); 1592 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1593 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1594 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1595 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1596 /* 1597 * Initialize the client credit control 1598 * portion of the rdmaconn struct. 
1599 */ 1600 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT; 1601 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1602 cc_info->clnt_cc_granted_ops = 0; 1603 cc_info->clnt_cc_in_flight_ops = 0; 1604 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL); 1605 1606 *qp = kqp; 1607 return (RDMA_SUCCESS); 1608 } 1609 1610 /* Server side qp creation */ 1611 static rdma_stat 1612 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) 1613 { 1614 rib_qp_t *kqp = NULL; 1615 ibt_chan_sizes_t chan_sizes; 1616 ibt_rc_chan_alloc_args_t qp_attr; 1617 ibt_status_t ibt_status; 1618 rdma_srv_cred_ctrl_t *cc_info; 1619 1620 *qp = NULL; 1621 1622 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1623 kqp->hca = hca; 1624 kqp->port_num = port; 1625 kqp->rdmaconn.c_rdmamod = &rib_mod; 1626 kqp->rdmaconn.c_private = (caddr_t)kqp; 1627 1628 /* 1629 * Create the qp handle 1630 */ 1631 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1632 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; 1633 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; 1634 qp_attr.rc_pd = hca->pd_hdl; 1635 qp_attr.rc_hca_port_num = port; 1636 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1637 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1638 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1639 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1640 qp_attr.rc_clone_chan = NULL; 1641 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1642 qp_attr.rc_flags = IBT_WR_SIGNALED; 1643 1644 rw_enter(&hca->state_lock, RW_READER); 1645 if (hca->state != HCA_DETACHED) { 1646 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1647 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl, 1648 &chan_sizes); 1649 } else { 1650 rw_exit(&hca->state_lock); 1651 goto fail; 1652 } 1653 rw_exit(&hca->state_lock); 1654 1655 if (ibt_status != IBT_SUCCESS) { 1656 DTRACE_PROBE1(rpcib__i_svccreatechanfail, 1657 int, ibt_status); 1658 goto fail; 1659 } 1660 1661 kqp->mode = RIB_SERVER; 1662 kqp->chan_flags = IBT_BLOCKING; 1663 kqp->q = q; /* server ONLY */ 1664 1665 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1666 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1667 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1668 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1669 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1670 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1671 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1672 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1673 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1674 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1675 /* 1676 * Set the private data area to qp to be used in callbacks 1677 */ 1678 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); 1679 kqp->rdmaconn.c_state = C_CONNECTED; 1680 1681 /* 1682 * Initialize the server credit control 1683 * portion of the rdmaconn struct. 
1684 */ 1685 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV; 1686 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc; 1687 cc_info->srv_cc_buffers_granted = preposted_rbufs; 1688 cc_info->srv_cc_cur_buffers_used = 0; 1689 cc_info->srv_cc_posted = preposted_rbufs; 1690 1691 *qp = kqp; 1692 1693 return (RDMA_SUCCESS); 1694 fail: 1695 if (kqp) 1696 kmem_free(kqp, sizeof (rib_qp_t)); 1697 1698 return (RDMA_FAILED); 1699 } 1700 1701 /* ARGSUSED */ 1702 ibt_cm_status_t 1703 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event, 1704 ibt_cm_return_args_t *ret_args, void *priv_data, 1705 ibt_priv_data_len_t len) 1706 { 1707 rib_hca_t *hca; 1708 1709 hca = (rib_hca_t *)clnt_hdl; 1710 1711 switch (event->cm_type) { 1712 1713 /* got a connection close event */ 1714 case IBT_CM_EVENT_CONN_CLOSED: 1715 { 1716 CONN *conn; 1717 rib_qp_t *qp; 1718 1719 /* check reason why connection was closed */ 1720 switch (event->cm_event.closed) { 1721 case IBT_CM_CLOSED_DREP_RCVD: 1722 case IBT_CM_CLOSED_DREQ_TIMEOUT: 1723 case IBT_CM_CLOSED_DUP: 1724 case IBT_CM_CLOSED_ABORT: 1725 case IBT_CM_CLOSED_ALREADY: 1726 /* 1727 * These cases indicate the local end initiated 1728 * the closing of the channel. Nothing to do here. 1729 */ 1730 break; 1731 default: 1732 /* 1733 * Reason for CONN_CLOSED event must be one of 1734 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 1735 * or IBT_CM_CLOSED_STALE. These indicate cases where 1736 * the remote end is closing the channel. In these 1737 * cases free the channel and transition to error 1738 * state 1739 */ 1740 qp = ibt_get_chan_private(event->cm_channel); 1741 conn = qptoc(qp); 1742 mutex_enter(&conn->c_lock); 1743 if (conn->c_state == C_DISCONN_PEND) { 1744 mutex_exit(&conn->c_lock); 1745 break; 1746 } 1747 1748 conn->c_state = C_ERROR_CONN; 1749 1750 /* 1751 * Free the conn if c_ref is down to 0 already 1752 */ 1753 if (conn->c_ref == 0) { 1754 /* 1755 * Remove from list and free conn 1756 */ 1757 conn->c_state = C_DISCONN_PEND; 1758 mutex_exit(&conn->c_lock); 1759 rw_enter(&hca->state_lock, RW_READER); 1760 if (hca->state != HCA_DETACHED) 1761 (void) rib_disconnect_channel(conn, 1762 &hca->cl_conn_list); 1763 rw_exit(&hca->state_lock); 1764 } else { 1765 /* 1766 * conn will be freed when c_ref goes to 0. 1767 * Indicate to cleaning thread not to close 1768 * the connection, but just free the channel. 1769 */ 1770 conn->c_flags |= C_CLOSE_NOTNEEDED; 1771 mutex_exit(&conn->c_lock); 1772 } 1773 #ifdef DEBUG 1774 if (rib_debug) 1775 cmn_err(CE_NOTE, "rib_clnt_cm_handler: " 1776 "(CONN_CLOSED) channel disconnected"); 1777 #endif 1778 break; 1779 } 1780 break; 1781 } 1782 default: 1783 break; 1784 } 1785 return (IBT_CM_ACCEPT); 1786 } 1787 1788 /* 1789 * Connect to the server.
1790 */ 1791 rdma_stat 1792 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp) 1793 { 1794 ibt_chan_open_args_t chan_args; /* channel args */ 1795 ibt_chan_sizes_t chan_sizes; 1796 ibt_rc_chan_alloc_args_t qp_attr; 1797 ibt_status_t ibt_status; 1798 ibt_rc_returns_t ret_args; /* conn reject info */ 1799 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */ 1800 ibt_ip_cm_info_t ipcm_info; 1801 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ]; 1802 1803 1804 (void) bzero(&chan_args, sizeof (chan_args)); 1805 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1806 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t)); 1807 1808 ipcm_info.src_addr.family = rptp->srcip.family; 1809 switch (ipcm_info.src_addr.family) { 1810 case AF_INET: 1811 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr; 1812 break; 1813 case AF_INET6: 1814 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr; 1815 break; 1816 } 1817 1818 ipcm_info.dst_addr.family = rptp->srcip.family; 1819 switch (ipcm_info.dst_addr.family) { 1820 case AF_INET: 1821 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr; 1822 break; 1823 case AF_INET6: 1824 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr; 1825 break; 1826 } 1827 1828 ipcm_info.src_port = (in_port_t)nfs_rdma_port; 1829 1830 ibt_status = ibt_format_ip_private_data(&ipcm_info, 1831 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt); 1832 1833 if (ibt_status != IBT_SUCCESS) { 1834 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n"); 1835 return (-1); 1836 } 1837 1838 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num; 1839 /* Alloc a RC channel */ 1840 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; 1841 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; 1842 qp_attr.rc_pd = hca->pd_hdl; 1843 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1844 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1845 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1846 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1847 qp_attr.rc_clone_chan = NULL; 1848 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1849 qp_attr.rc_flags = IBT_WR_SIGNALED; 1850 1851 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port); 1852 chan_args.oc_path = &rptp->path; 1853 1854 chan_args.oc_cm_handler = rib_clnt_cm_handler; 1855 chan_args.oc_cm_clnt_private = (void *)hca; 1856 chan_args.oc_rdma_ra_out = 4; 1857 chan_args.oc_rdma_ra_in = 4; 1858 chan_args.oc_path_retry_cnt = 2; 1859 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; 1860 chan_args.oc_priv_data = cmp_ip_pvt; 1861 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ; 1862 1863 refresh: 1864 rw_enter(&hca->state_lock, RW_READER); 1865 if (hca->state != HCA_DETACHED) { 1866 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1867 IBT_ACHAN_NO_FLAGS, 1868 &qp_attr, &qp->qp_hdl, 1869 &chan_sizes); 1870 } else { 1871 rw_exit(&hca->state_lock); 1872 return (RDMA_FAILED); 1873 } 1874 rw_exit(&hca->state_lock); 1875 1876 if (ibt_status != IBT_SUCCESS) { 1877 DTRACE_PROBE1(rpcib__i_conntosrv, 1878 int, ibt_status); 1879 return (RDMA_FAILED); 1880 } 1881 1882 /* Connect to the Server */ 1883 (void) bzero(&ret_args, sizeof (ret_args)); 1884 mutex_enter(&qp->cb_lock); 1885 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS, 1886 IBT_BLOCKING, &chan_args, &ret_args); 1887 if (ibt_status != IBT_SUCCESS) { 1888 DTRACE_PROBE2(rpcib__i_openrctosrv, 1889 int, ibt_status, int, ret_args.rc_status); 1890 1891 (void) ibt_free_channel(qp->qp_hdl); 1892 qp->qp_hdl = NULL; 1893 mutex_exit(&qp->cb_lock); 1894 if (refresh-- && ibt_status == IBT_CM_FAILURE 
&& 1895 ret_args.rc_status == IBT_CM_CONN_STALE) { 1896 /* 1897 * Got IBT_CM_CONN_STALE probably because of stale 1898 * data on the passive end of a channel that existed 1899 * prior to reboot. Retry establishing a channel 1900 * REFRESH_ATTEMPTS times, during which time the 1901 * stale conditions on the server might clear up. 1902 */ 1903 goto refresh; 1904 } 1905 return (RDMA_FAILED); 1906 } 1907 mutex_exit(&qp->cb_lock); 1908 /* 1909 * Set the private data area to qp to be used in callbacks 1910 */ 1911 ibt_set_chan_private(qp->qp_hdl, (void *)qp); 1912 return (RDMA_SUCCESS); 1913 } 1914 1915 rdma_stat 1916 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) 1917 { 1918 uint_t i, addr_count; 1919 ibt_status_t ibt_status; 1920 uint8_t num_paths_p; 1921 ibt_ip_path_attr_t ipattr; 1922 ibt_path_ip_src_t srcip; 1923 rpcib_ipaddrs_t addrs4; 1924 rpcib_ipaddrs_t addrs6; 1925 struct sockaddr_in *sinp; 1926 struct sockaddr_in6 *sin6p; 1927 rdma_stat retval = RDMA_FAILED; 1928 rib_hca_t *hca; 1929 1930 if ((addr_type != AF_INET) && (addr_type != AF_INET6)) 1931 return (RDMA_INVAL); 1932 ASSERT(raddr->buf != NULL); 1933 1934 bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); 1935 1936 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || 1937 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { 1938 retval = RDMA_FAILED; 1939 goto done2; 1940 } 1941 1942 if (addr_type == AF_INET) { 1943 addr_count = addrs4.ri_count; 1944 sinp = (struct sockaddr_in *)raddr->buf; 1945 rptp->dstip.family = AF_INET; 1946 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr; 1947 sinp = addrs4.ri_list; 1948 } else { 1949 addr_count = addrs6.ri_count; 1950 sin6p = (struct sockaddr_in6 *)raddr->buf; 1951 rptp->dstip.family = AF_INET6; 1952 rptp->dstip.un.ip6addr = sin6p->sin6_addr; 1953 sin6p = addrs6.ri_list; 1954 } 1955 1956 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 1957 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 1958 rw_enter(&hca->state_lock, RW_READER); 1959 if (hca->state == HCA_DETACHED) { 1960 rw_exit(&hca->state_lock); 1961 continue; 1962 } 1963 1964 ipattr.ipa_dst_ip = &rptp->dstip; 1965 ipattr.ipa_hca_guid = hca->hca_guid; 1966 ipattr.ipa_ndst = 1; 1967 ipattr.ipa_max_paths = 1; 1968 ipattr.ipa_src_ip.family = rptp->dstip.family; 1969 for (i = 0; i < addr_count; i++) { 1970 num_paths_p = 0; 1971 if (addr_type == AF_INET) { 1972 ipattr.ipa_src_ip.un.ip4addr = 1973 sinp[i].sin_addr.s_addr; 1974 } else { 1975 ipattr.ipa_src_ip.un.ip6addr = 1976 sin6p[i].sin6_addr; 1977 } 1978 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1979 1980 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1981 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, 1982 &num_paths_p, &srcip); 1983 if (ibt_status == IBT_SUCCESS && 1984 num_paths_p != 0 && 1985 rptp->path.pi_hca_guid == hca->hca_guid) { 1986 rptp->hca = hca; 1987 rw_exit(&hca->state_lock); 1988 if (addr_type == AF_INET) { 1989 rptp->srcip.family = AF_INET; 1990 rptp->srcip.un.ip4addr = 1991 srcip.ip_primary.un.ip4addr; 1992 } else { 1993 rptp->srcip.family = AF_INET6; 1994 rptp->srcip.un.ip6addr = 1995 srcip.ip_primary.un.ip6addr; 1996 1997 } 1998 retval = RDMA_SUCCESS; 1999 goto done1; 2000 } 2001 } 2002 rw_exit(&hca->state_lock); 2003 } 2004 done1: 2005 rw_exit(&rib_stat->hcas_list_lock); 2006 done2: 2007 if (addrs4.ri_size > 0) 2008 kmem_free(addrs4.ri_list, addrs4.ri_size); 2009 if (addrs6.ri_size > 0) 2010 kmem_free(addrs6.ri_list, addrs6.ri_size); 2011 return (retval); 2012 } 2013 2014 /* 2015 * Close channel, remove from connection list and 2016 * free up 
resources allocated for that channel. 2017 */ 2018 rdma_stat 2019 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list) 2020 { 2021 rib_qp_t *qp = ctoqp(conn); 2022 rib_hca_t *hca; 2023 2024 mutex_enter(&conn->c_lock); 2025 if (conn->c_timeout != NULL) { 2026 mutex_exit(&conn->c_lock); 2027 (void) untimeout(conn->c_timeout); 2028 mutex_enter(&conn->c_lock); 2029 } 2030 2031 while (conn->c_flags & C_CLOSE_PENDING) { 2032 cv_wait(&conn->c_cv, &conn->c_lock); 2033 } 2034 mutex_exit(&conn->c_lock); 2035 2036 /* 2037 * c_ref == 0 and connection is in C_DISCONN_PEND 2038 */ 2039 hca = qp->hca; 2040 if (conn_list != NULL) 2041 (void) rib_rm_conn(conn, conn_list); 2042 2043 /* 2044 * There is only one case where we get here with 2045 * qp_hdl = NULL, which is during connection setup on 2046 * the client. In such a case there are no posted 2047 * send/recv buffers. 2048 */ 2049 if (qp->qp_hdl != NULL) { 2050 mutex_enter(&qp->posted_rbufs_lock); 2051 while (qp->n_posted_rbufs) 2052 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock); 2053 mutex_exit(&qp->posted_rbufs_lock); 2054 2055 mutex_enter(&qp->send_rbufs_lock); 2056 while (qp->n_send_rbufs) 2057 cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock); 2058 mutex_exit(&qp->send_rbufs_lock); 2059 2060 (void) ibt_free_channel(qp->qp_hdl); 2061 qp->qp_hdl = NULL; 2062 } 2063 2064 ASSERT(qp->rdlist == NULL); 2065 2066 if (qp->replylist != NULL) { 2067 (void) rib_rem_replylist(qp); 2068 } 2069 2070 cv_destroy(&qp->cb_conn_cv); 2071 cv_destroy(&qp->posted_rbufs_cv); 2072 cv_destroy(&qp->send_rbufs_cv); 2073 mutex_destroy(&qp->cb_lock); 2074 mutex_destroy(&qp->replylist_lock); 2075 mutex_destroy(&qp->posted_rbufs_lock); 2076 mutex_destroy(&qp->send_rbufs_lock); 2077 mutex_destroy(&qp->rdlist_lock); 2078 2079 cv_destroy(&conn->c_cv); 2080 mutex_destroy(&conn->c_lock); 2081 2082 if (conn->c_raddr.buf != NULL) { 2083 kmem_free(conn->c_raddr.buf, conn->c_raddr.len); 2084 } 2085 if (conn->c_laddr.buf != NULL) { 2086 kmem_free(conn->c_laddr.buf, conn->c_laddr.len); 2087 } 2088 if (conn->c_netid != NULL) { 2089 kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1)); 2090 } 2091 2092 /* 2093 * Credit control cleanup. 2094 */ 2095 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) { 2096 rdma_clnt_cred_ctrl_t *cc_info; 2097 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 2098 cv_destroy(&cc_info->clnt_cc_cv); 2099 } 2100 2101 kmem_free(qp, sizeof (rib_qp_t)); 2102 2103 /* 2104 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL, 2105 * then the hca is no longer being used. 2106 */ 2107 if (conn_list != NULL) { 2108 rw_enter(&hca->state_lock, RW_READER); 2109 if (hca->state == HCA_DETACHED) { 2110 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 2111 if (hca->srv_conn_list.conn_hd == NULL) { 2112 rw_enter(&hca->cl_conn_list.conn_lock, 2113 RW_READER); 2114 2115 if (hca->cl_conn_list.conn_hd == NULL) { 2116 mutex_enter(&hca->inuse_lock); 2117 hca->inuse = FALSE; 2118 cv_signal(&hca->cb_cv); 2119 mutex_exit(&hca->inuse_lock); 2120 } 2121 rw_exit(&hca->cl_conn_list.conn_lock); 2122 } 2123 rw_exit(&hca->srv_conn_list.conn_lock); 2124 } 2125 rw_exit(&hca->state_lock); 2126 } 2127 2128 return (RDMA_SUCCESS); 2129 } 2130 2131 /* 2132 * All sends are done under the protection of 2133 * the wdesc->sendwait_lock. n_send_rbufs count 2134 * is protected using the send_rbufs_lock. 
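 *
 * Every signaled send takes a hold via rib_send_hold() while the
 * wdesc->sendwait_lock is still held (see rib_send_and_wait(),
 * rib_write() and rib_read() below); the matching rib_send_rele()
 * runs once the completion is reaped, either in rib_sendwait() or
 * in the send completion (scq) handler.
 *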
2135 * lock ordering is: 2136 * sendwait_lock -> send_rbufs_lock 2137 */ 2138 2139 void 2140 rib_send_hold(rib_qp_t *qp) 2141 { 2142 mutex_enter(&qp->send_rbufs_lock); 2143 qp->n_send_rbufs++; 2144 mutex_exit(&qp->send_rbufs_lock); 2145 } 2146 2147 void 2148 rib_send_rele(rib_qp_t *qp) 2149 { 2150 mutex_enter(&qp->send_rbufs_lock); 2151 qp->n_send_rbufs--; 2152 if (qp->n_send_rbufs == 0) 2153 cv_signal(&qp->send_rbufs_cv); 2154 mutex_exit(&qp->send_rbufs_lock); 2155 } 2156 2157 void 2158 rib_recv_rele(rib_qp_t *qp) 2159 { 2160 mutex_enter(&qp->posted_rbufs_lock); 2161 qp->n_posted_rbufs--; 2162 if (qp->n_posted_rbufs == 0) 2163 cv_signal(&qp->posted_rbufs_cv); 2164 mutex_exit(&qp->posted_rbufs_lock); 2165 } 2166 2167 /* 2168 * Wait for send completion notification. Only on receiving a 2169 * notification be it a successful or error completion, free the 2170 * send_wid. 2171 */ 2172 static rdma_stat 2173 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 2174 { 2175 clock_t timout, cv_wait_ret; 2176 rdma_stat error = RDMA_SUCCESS; 2177 int i; 2178 2179 /* 2180 * Wait for send to complete 2181 */ 2182 ASSERT(wd != NULL); 2183 mutex_enter(&wd->sendwait_lock); 2184 if (wd->status == (uint_t)SEND_WAIT) { 2185 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 2186 ddi_get_lbolt(); 2187 2188 if (qp->mode == RIB_SERVER) { 2189 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, 2190 &wd->sendwait_lock, timout)) > 0 && 2191 wd->status == (uint_t)SEND_WAIT) 2192 ; 2193 switch (cv_wait_ret) { 2194 case -1: /* timeout */ 2195 DTRACE_PROBE(rpcib__i__srvsendwait__timeout); 2196 2197 wd->cv_sig = 0; /* no signal needed */ 2198 error = RDMA_TIMEDOUT; 2199 break; 2200 default: /* got send completion */ 2201 break; 2202 } 2203 } else { 2204 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, 2205 &wd->sendwait_lock, timout)) > 0 && 2206 wd->status == (uint_t)SEND_WAIT) 2207 ; 2208 switch (cv_wait_ret) { 2209 case -1: /* timeout */ 2210 DTRACE_PROBE(rpcib__i__clntsendwait__timeout); 2211 2212 wd->cv_sig = 0; /* no signal needed */ 2213 error = RDMA_TIMEDOUT; 2214 break; 2215 case 0: /* interrupted */ 2216 DTRACE_PROBE(rpcib__i__clntsendwait__intr); 2217 2218 wd->cv_sig = 0; /* no signal needed */ 2219 error = RDMA_INTR; 2220 break; 2221 default: /* got send completion */ 2222 break; 2223 } 2224 } 2225 } 2226 2227 if (wd->status != (uint_t)SEND_WAIT) { 2228 /* got send completion */ 2229 if (wd->status != RDMA_SUCCESS) { 2230 switch (wd->status) { 2231 case RDMA_CONNLOST: 2232 error = RDMA_CONNLOST; 2233 break; 2234 default: 2235 error = RDMA_FAILED; 2236 break; 2237 } 2238 } 2239 for (i = 0; i < wd->nsbufs; i++) { 2240 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2241 (void *)(uintptr_t)wd->sbufaddr[i]); 2242 } 2243 2244 rib_send_rele(qp); 2245 2246 mutex_exit(&wd->sendwait_lock); 2247 (void) rib_free_sendwait(wd); 2248 2249 } else { 2250 mutex_exit(&wd->sendwait_lock); 2251 } 2252 return (error); 2253 } 2254 2255 static struct send_wid * 2256 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) 2257 { 2258 struct send_wid *wd; 2259 2260 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); 2261 wd->xid = xid; 2262 wd->cv_sig = cv_sig; 2263 wd->qp = qp; 2264 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); 2265 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); 2266 wd->status = (uint_t)SEND_WAIT; 2267 2268 return (wd); 2269 } 2270 2271 static int 2272 rib_free_sendwait(struct send_wid *wdesc) 2273 { 2274 cv_destroy(&wdesc->wait_cv); 2275 mutex_destroy(&wdesc->sendwait_lock); 2276 kmem_free(wdesc, sizeof 
(*wdesc)); 2277 2278 return (0); 2279 } 2280 2281 static rdma_stat 2282 rib_rem_rep(rib_qp_t *qp, struct reply *rep) 2283 { 2284 mutex_enter(&qp->replylist_lock); 2285 if (rep != NULL) { 2286 (void) rib_remreply(qp, rep); 2287 mutex_exit(&qp->replylist_lock); 2288 return (RDMA_SUCCESS); 2289 } 2290 mutex_exit(&qp->replylist_lock); 2291 return (RDMA_FAILED); 2292 } 2293 2294 /* 2295 * Send buffers are freed here only in case of error in posting 2296 * on QP. If the post succeeded, the send buffers are freed upon 2297 * send completion in rib_sendwait() or in the scq_handler. 2298 */ 2299 rdma_stat 2300 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2301 int send_sig, int cv_sig, caddr_t *swid) 2302 { 2303 struct send_wid *wdesc; 2304 struct clist *clp; 2305 ibt_status_t ibt_status = IBT_SUCCESS; 2306 rdma_stat ret = RDMA_SUCCESS; 2307 ibt_send_wr_t tx_wr; 2308 int i, nds; 2309 ibt_wr_ds_t sgl[DSEG_MAX]; 2310 uint_t total_msg_size; 2311 rib_qp_t *qp; 2312 2313 qp = ctoqp(conn); 2314 2315 ASSERT(cl != NULL); 2316 2317 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2318 2319 nds = 0; 2320 total_msg_size = 0; 2321 clp = cl; 2322 while (clp != NULL) { 2323 if (nds >= DSEG_MAX) { 2324 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); 2325 return (RDMA_FAILED); 2326 } 2327 sgl[nds].ds_va = clp->w.c_saddr; 2328 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2329 sgl[nds].ds_len = clp->c_len; 2330 total_msg_size += clp->c_len; 2331 clp = clp->c_next; 2332 nds++; 2333 } 2334 2335 if (send_sig) { 2336 /* Set SEND_SIGNAL flag. */ 2337 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2338 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2339 *swid = (caddr_t)wdesc; 2340 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2341 mutex_enter(&wdesc->sendwait_lock); 2342 wdesc->nsbufs = nds; 2343 for (i = 0; i < nds; i++) { 2344 wdesc->sbufaddr[i] = sgl[i].ds_va; 2345 } 2346 } else { 2347 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2348 *swid = NULL; 2349 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2350 } 2351 2352 tx_wr.wr_opcode = IBT_WRC_SEND; 2353 tx_wr.wr_trans = IBT_RC_SRV; 2354 tx_wr.wr_nds = nds; 2355 tx_wr.wr_sgl = sgl; 2356 2357 mutex_enter(&conn->c_lock); 2358 if (conn->c_state == C_CONNECTED) { 2359 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2360 } 2361 if (conn->c_state != C_CONNECTED || 2362 ibt_status != IBT_SUCCESS) { 2363 if (conn->c_state != C_DISCONN_PEND) 2364 conn->c_state = C_ERROR_CONN; 2365 mutex_exit(&conn->c_lock); 2366 if (send_sig) { 2367 for (i = 0; i < nds; i++) { 2368 rib_rbuf_free(conn, SEND_BUFFER, 2369 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2370 } 2371 mutex_exit(&wdesc->sendwait_lock); 2372 (void) rib_free_sendwait(wdesc); 2373 } 2374 return (RDMA_CONNLOST); 2375 } 2376 2377 mutex_exit(&conn->c_lock); 2378 2379 if (send_sig) { 2380 rib_send_hold(qp); 2381 mutex_exit(&wdesc->sendwait_lock); 2382 if (cv_sig) { 2383 /* 2384 * cv_wait for send to complete. 2385 * We can fail due to a timeout or signal or 2386 * unsuccessful send. 2387 */ 2388 ret = rib_sendwait(qp, wdesc); 2389 2390 return (ret); 2391 } 2392 } 2393 2394 return (RDMA_SUCCESS); 2395 } 2396 2397 2398 rdma_stat 2399 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2400 { 2401 rdma_stat ret; 2402 caddr_t wd; 2403 2404 /* send-wait & cv_signal */ 2405 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); 2406 return (ret); 2407 } 2408 2409 /* 2410 * Deprecated/obsolete interface not used currently 2411 * but earlier used for READ-READ protocol. 2412 * Send RPC reply and wait for RDMA_DONE. 
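 *
 * The reply is posted signaled but without blocking on the send
 * completion cv (send_sig = 1, cv_sig = 0); instead an rdma_done
 * entry is queued for the xid and the thread waits up to
 * REPLY_WAIT_TIME seconds for the peer's RDMA_DONE to signal
 * rdma_done_cv. A timeout is reported as RDMA_TIMEDOUT.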
2413 */ 2414 rdma_stat 2415 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2416 { 2417 rdma_stat ret = RDMA_SUCCESS; 2418 struct rdma_done_list *rd; 2419 clock_t cv_wait_ret; 2420 caddr_t *wid = NULL; 2421 rib_qp_t *qp = ctoqp(conn); 2422 2423 mutex_enter(&qp->rdlist_lock); 2424 rd = rdma_done_add(qp, msgid); 2425 2426 /* No cv_signal (whether send-wait or no-send-wait) */ 2427 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); 2428 2429 if (ret != RDMA_SUCCESS) { 2430 rdma_done_rm(qp, rd); 2431 } else { 2432 /* 2433 * Wait for RDMA_DONE from remote end 2434 */ 2435 cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv, 2436 &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000), 2437 TR_CLOCK_TICK); 2438 2439 rdma_done_rm(qp, rd); 2440 2441 if (cv_wait_ret < 0) { 2442 ret = RDMA_TIMEDOUT; 2443 } 2444 } 2445 2446 mutex_exit(&qp->rdlist_lock); 2447 return (ret); 2448 } 2449 2450 static struct recv_wid * 2451 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2452 { 2453 struct recv_wid *rwid; 2454 2455 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2456 rwid->xid = msgid; 2457 rwid->addr = sgl->ds_va; 2458 rwid->qp = qp; 2459 2460 return (rwid); 2461 } 2462 2463 static void 2464 rib_free_wid(struct recv_wid *rwid) 2465 { 2466 kmem_free(rwid, sizeof (struct recv_wid)); 2467 } 2468 2469 rdma_stat 2470 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2471 { 2472 rib_qp_t *qp = ctoqp(conn); 2473 struct clist *clp = cl; 2474 struct reply *rep; 2475 struct recv_wid *rwid; 2476 int nds; 2477 ibt_wr_ds_t sgl[DSEG_MAX]; 2478 ibt_recv_wr_t recv_wr; 2479 rdma_stat ret; 2480 ibt_status_t ibt_status; 2481 2482 /* 2483 * rdma_clnt_postrecv uses RECV_BUFFER. 2484 */ 2485 2486 nds = 0; 2487 while (cl != NULL) { 2488 if (nds >= DSEG_MAX) { 2489 ret = RDMA_FAILED; 2490 goto done; 2491 } 2492 sgl[nds].ds_va = cl->w.c_saddr; 2493 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2494 sgl[nds].ds_len = cl->c_len; 2495 cl = cl->c_next; 2496 nds++; 2497 } 2498 2499 if (nds != 1) { 2500 ret = RDMA_FAILED; 2501 goto done; 2502 } 2503 2504 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2505 recv_wr.wr_nds = nds; 2506 recv_wr.wr_sgl = sgl; 2507 2508 rwid = rib_create_wid(qp, &sgl[0], msgid); 2509 if (rwid) { 2510 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; 2511 } else { 2512 ret = RDMA_NORESOURCE; 2513 goto done; 2514 } 2515 rep = rib_addreplylist(qp, msgid); 2516 if (!rep) { 2517 rib_free_wid(rwid); 2518 ret = RDMA_NORESOURCE; 2519 goto done; 2520 } 2521 2522 mutex_enter(&conn->c_lock); 2523 2524 if (conn->c_state == C_CONNECTED) { 2525 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2526 } 2527 2528 if (conn->c_state != C_CONNECTED || 2529 ibt_status != IBT_SUCCESS) { 2530 if (conn->c_state != C_DISCONN_PEND) 2531 conn->c_state = C_ERROR_CONN; 2532 mutex_exit(&conn->c_lock); 2533 rib_free_wid(rwid); 2534 (void) rib_rem_rep(qp, rep); 2535 ret = RDMA_CONNLOST; 2536 goto done; 2537 } 2538 2539 mutex_enter(&qp->posted_rbufs_lock); 2540 qp->n_posted_rbufs++; 2541 mutex_exit(&qp->posted_rbufs_lock); 2542 2543 mutex_exit(&conn->c_lock); 2544 return (RDMA_SUCCESS); 2545 2546 done: 2547 while (clp != NULL) { 2548 rib_rbuf_free(conn, RECV_BUFFER, 2549 (void *)(uintptr_t)clp->w.c_saddr3); 2550 clp = clp->c_next; 2551 } 2552 return (ret); 2553 } 2554 2555 rdma_stat 2556 rib_svc_post(CONN* conn, struct clist *cl) 2557 { 2558 rib_qp_t *qp = ctoqp(conn); 2559 struct svc_recv *s_recvp; 2560 int nds; 2561 ibt_wr_ds_t sgl[DSEG_MAX]; 2562 ibt_recv_wr_t recv_wr; 2563 ibt_status_t 
ibt_status; 2564 2565 nds = 0; 2566 while (cl != NULL) { 2567 if (nds >= DSEG_MAX) { 2568 return (RDMA_FAILED); 2569 } 2570 sgl[nds].ds_va = cl->w.c_saddr; 2571 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2572 sgl[nds].ds_len = cl->c_len; 2573 cl = cl->c_next; 2574 nds++; 2575 } 2576 2577 if (nds != 1) { 2578 rib_rbuf_free(conn, RECV_BUFFER, 2579 (caddr_t)(uintptr_t)sgl[0].ds_va); 2580 2581 return (RDMA_FAILED); 2582 } 2583 2584 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2585 recv_wr.wr_nds = nds; 2586 recv_wr.wr_sgl = sgl; 2587 2588 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2589 /* Use s_recvp's addr as wr id */ 2590 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; 2591 mutex_enter(&conn->c_lock); 2592 if (conn->c_state == C_CONNECTED) { 2593 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2594 } 2595 if (conn->c_state != C_CONNECTED || 2596 ibt_status != IBT_SUCCESS) { 2597 if (conn->c_state != C_DISCONN_PEND) 2598 conn->c_state = C_ERROR_CONN; 2599 mutex_exit(&conn->c_lock); 2600 rib_rbuf_free(conn, RECV_BUFFER, 2601 (caddr_t)(uintptr_t)sgl[0].ds_va); 2602 (void) rib_free_svc_recv(s_recvp); 2603 2604 return (RDMA_CONNLOST); 2605 } 2606 mutex_exit(&conn->c_lock); 2607 2608 return (RDMA_SUCCESS); 2609 } 2610 2611 /* Client */ 2612 rdma_stat 2613 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2614 { 2615 return (rib_clnt_post(conn, cl, msgid)); 2616 } 2617 2618 /* Client */ 2619 rdma_stat 2620 rib_post_resp_remove(CONN* conn, uint32_t msgid) 2621 { 2622 rib_qp_t *qp = ctoqp(conn); 2623 struct reply *rep; 2624 2625 mutex_enter(&qp->replylist_lock); 2626 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2627 if (rep->xid == msgid) { 2628 if (rep->vaddr_cq) { 2629 rib_rbuf_free(conn, RECV_BUFFER, 2630 (caddr_t)(uintptr_t)rep->vaddr_cq); 2631 } 2632 (void) rib_remreply(qp, rep); 2633 break; 2634 } 2635 } 2636 mutex_exit(&qp->replylist_lock); 2637 2638 return (RDMA_SUCCESS); 2639 } 2640 2641 /* Server */ 2642 rdma_stat 2643 rib_post_recv(CONN *conn, struct clist *cl) 2644 { 2645 rib_qp_t *qp = ctoqp(conn); 2646 2647 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2648 mutex_enter(&qp->posted_rbufs_lock); 2649 qp->n_posted_rbufs++; 2650 mutex_exit(&qp->posted_rbufs_lock); 2651 return (RDMA_SUCCESS); 2652 } 2653 return (RDMA_FAILED); 2654 } 2655 2656 /* 2657 * Client side only interface to "recv" the rpc reply buf 2658 * posted earlier by rib_post_resp(conn, cl, msgid). 2659 */ 2660 rdma_stat 2661 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2662 { 2663 struct reply *rep = NULL; 2664 clock_t timout, cv_wait_ret; 2665 rdma_stat ret = RDMA_SUCCESS; 2666 rib_qp_t *qp = ctoqp(conn); 2667 2668 /* 2669 * Find the reply structure for this msgid 2670 */ 2671 mutex_enter(&qp->replylist_lock); 2672 2673 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2674 if (rep->xid == msgid) 2675 break; 2676 } 2677 2678 if (rep != NULL) { 2679 /* 2680 * If message not yet received, wait. 
2681 */ 2682 if (rep->status == (uint_t)REPLY_WAIT) { 2683 timout = ddi_get_lbolt() + 2684 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2685 2686 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2687 &qp->replylist_lock, timout)) > 0 && 2688 rep->status == (uint_t)REPLY_WAIT) 2689 ; 2690 2691 switch (cv_wait_ret) { 2692 case -1: /* timeout */ 2693 ret = RDMA_TIMEDOUT; 2694 break; 2695 case 0: 2696 ret = RDMA_INTR; 2697 break; 2698 default: 2699 break; 2700 } 2701 } 2702 2703 if (rep->status == RDMA_SUCCESS) { 2704 struct clist *cl = NULL; 2705 2706 /* 2707 * Got message successfully 2708 */ 2709 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2710 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); 2711 *clp = cl; 2712 } else { 2713 if (rep->status != (uint_t)REPLY_WAIT) { 2714 /* 2715 * Got error in reply message. Free 2716 * recv buffer here. 2717 */ 2718 ret = rep->status; 2719 rib_rbuf_free(conn, RECV_BUFFER, 2720 (caddr_t)(uintptr_t)rep->vaddr_cq); 2721 } 2722 } 2723 (void) rib_remreply(qp, rep); 2724 } else { 2725 /* 2726 * No matching reply structure found for given msgid on the 2727 * reply wait list. 2728 */ 2729 ret = RDMA_INVAL; 2730 DTRACE_PROBE(rpcib__i__nomatchxid2); 2731 } 2732 2733 /* 2734 * Done. 2735 */ 2736 mutex_exit(&qp->replylist_lock); 2737 return (ret); 2738 } 2739 2740 /* 2741 * RDMA write a buffer to the remote address. 2742 */ 2743 rdma_stat 2744 rib_write(CONN *conn, struct clist *cl, int wait) 2745 { 2746 ibt_send_wr_t tx_wr; 2747 int cv_sig; 2748 ibt_wr_ds_t sgl[DSEG_MAX]; 2749 struct send_wid *wdesc; 2750 ibt_status_t ibt_status; 2751 rdma_stat ret = RDMA_SUCCESS; 2752 rib_qp_t *qp = ctoqp(conn); 2753 uint64_t n_writes = 0; 2754 2755 if (cl == NULL) { 2756 return (RDMA_FAILED); 2757 } 2758 2759 while ((cl != NULL)) { 2760 if (cl->c_len > 0) { 2761 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2762 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; 2763 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = 2764 cl->c_dmemhandle.mrc_rmr; /* rkey */ 2765 sgl[0].ds_va = cl->w.c_saddr; 2766 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2767 sgl[0].ds_len = cl->c_len; 2768 2769 if (wait) { 2770 cv_sig = 1; 2771 } else { 2772 if (n_writes > max_unsignaled_rws) { 2773 n_writes = 0; 2774 cv_sig = 1; 2775 } else { 2776 cv_sig = 0; 2777 } 2778 } 2779 2780 if (cv_sig) { 2781 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2782 wdesc = rib_init_sendwait(0, cv_sig, qp); 2783 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2784 mutex_enter(&wdesc->sendwait_lock); 2785 } else { 2786 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2787 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2788 } 2789 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2790 tx_wr.wr_trans = IBT_RC_SRV; 2791 tx_wr.wr_nds = 1; 2792 tx_wr.wr_sgl = sgl; 2793 2794 mutex_enter(&conn->c_lock); 2795 if (conn->c_state == C_CONNECTED) { 2796 ibt_status = 2797 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2798 } 2799 if (conn->c_state != C_CONNECTED || 2800 ibt_status != IBT_SUCCESS) { 2801 if (conn->c_state != C_DISCONN_PEND) 2802 conn->c_state = C_ERROR_CONN; 2803 mutex_exit(&conn->c_lock); 2804 if (cv_sig) { 2805 mutex_exit(&wdesc->sendwait_lock); 2806 (void) rib_free_sendwait(wdesc); 2807 } 2808 return (RDMA_CONNLOST); 2809 } 2810 2811 mutex_exit(&conn->c_lock); 2812 2813 /* 2814 * Wait for send to complete 2815 */ 2816 if (cv_sig) { 2817 2818 rib_send_hold(qp); 2819 mutex_exit(&wdesc->sendwait_lock); 2820 2821 ret = rib_sendwait(qp, wdesc); 2822 if (ret != 0) 2823 return (ret); 2824 } 2825 n_writes ++; 2826 } 2827 cl = cl->c_next; 2828 } 2829 return (RDMA_SUCCESS); 2830 } 2831 2832 /* 2833 
* RDMA Read a buffer from the remote address. 2834 */ 2835 rdma_stat 2836 rib_read(CONN *conn, struct clist *cl, int wait) 2837 { 2838 ibt_send_wr_t rx_wr; 2839 int cv_sig = 0; 2840 ibt_wr_ds_t sgl; 2841 struct send_wid *wdesc; 2842 ibt_status_t ibt_status = IBT_SUCCESS; 2843 rdma_stat ret = RDMA_SUCCESS; 2844 rib_qp_t *qp = ctoqp(conn); 2845 2846 if (cl == NULL) { 2847 return (RDMA_FAILED); 2848 } 2849 2850 while (cl != NULL) { 2851 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2852 /* 2853 * Remote address is at the head chunk item in list. 2854 */ 2855 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; 2856 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; 2857 2858 sgl.ds_va = cl->u.c_daddr; 2859 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2860 sgl.ds_len = cl->c_len; 2861 2862 /* 2863 * If there are multiple chunks to be read, and 2864 * wait is set, ask for signal only for the last chunk 2865 * and wait only on the last chunk. The completion of 2866 * RDMA_READ on last chunk ensures that reads on all 2867 * previous chunks are also completed. 2868 */ 2869 if (wait && (cl->c_next == NULL)) { 2870 cv_sig = 1; 2871 wdesc = rib_init_sendwait(0, cv_sig, qp); 2872 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2873 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2874 mutex_enter(&wdesc->sendwait_lock); 2875 } else { 2876 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2877 rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2878 } 2879 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2880 rx_wr.wr_trans = IBT_RC_SRV; 2881 rx_wr.wr_nds = 1; 2882 rx_wr.wr_sgl = &sgl; 2883 2884 mutex_enter(&conn->c_lock); 2885 if (conn->c_state == C_CONNECTED) { 2886 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2887 } 2888 if (conn->c_state != C_CONNECTED || 2889 ibt_status != IBT_SUCCESS) { 2890 if (conn->c_state != C_DISCONN_PEND) 2891 conn->c_state = C_ERROR_CONN; 2892 mutex_exit(&conn->c_lock); 2893 if (wait && (cl->c_next == NULL)) { 2894 mutex_exit(&wdesc->sendwait_lock); 2895 (void) rib_free_sendwait(wdesc); 2896 } 2897 return (RDMA_CONNLOST); 2898 } 2899 2900 mutex_exit(&conn->c_lock); 2901 2902 /* 2903 * Wait for send to complete if this is the 2904 * last item in the list. 2905 */ 2906 if (wait && cl->c_next == NULL) { 2907 rib_send_hold(qp); 2908 mutex_exit(&wdesc->sendwait_lock); 2909 2910 ret = rib_sendwait(qp, wdesc); 2911 2912 if (ret != 0) 2913 return (ret); 2914 } 2915 cl = cl->c_next; 2916 } 2917 return (RDMA_SUCCESS); 2918 } 2919 2920 /* 2921 * rib_srv_cm_handler() 2922 * Connection Manager callback to handle RC connection requests. 2923 */ 2924 /* ARGSUSED */ 2925 static ibt_cm_status_t 2926 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2927 ibt_cm_return_args_t *ret_args, void *priv_data, 2928 ibt_priv_data_len_t len) 2929 { 2930 queue_t *q; 2931 rib_qp_t *qp; 2932 rib_hca_t *hca; 2933 rdma_stat status = RDMA_SUCCESS; 2934 int i; 2935 struct clist cl; 2936 rdma_buf_t rdbuf = {0}; 2937 void *buf = NULL; 2938 CONN *conn; 2939 ibt_ip_cm_info_t ipinfo; 2940 struct sockaddr_in *s; 2941 struct sockaddr_in6 *s6; 2942 int sin_size = sizeof (struct sockaddr_in); 2943 int in_size = sizeof (struct in_addr); 2944 int sin6_size = sizeof (struct sockaddr_in6); 2945 2946 ASSERT(any != NULL); 2947 ASSERT(event != NULL); 2948 2949 hca = (rib_hca_t *)any; 2950 2951 /* got a connection request */ 2952 switch (event->cm_type) { 2953 case IBT_CM_EVENT_REQ_RCV: 2954 /* 2955 * If the plugin is in the NO_ACCEPT state, bail out. 
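 * Otherwise the REQ is accepted by delaying the CM with an MRA,
 * creating the server-side channel, pre-posting RECV buffers on it,
 * and extracting the client's IP private data to fill in the
 * connection's c_raddr, c_laddr and c_netid.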
2956 */ 2957 mutex_enter(&plugin_state_lock); 2958 if (plugin_state == NO_ACCEPT) { 2959 mutex_exit(&plugin_state_lock); 2960 return (IBT_CM_REJECT); 2961 } 2962 mutex_exit(&plugin_state_lock); 2963 2964 /* 2965 * Need to send a MRA MAD to CM so that it does not 2966 * timeout on us. 2967 */ 2968 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2969 event->cm_event.req.req_timeout * 8, NULL, 0); 2970 2971 mutex_enter(&rib_stat->open_hca_lock); 2972 q = rib_stat->q; 2973 mutex_exit(&rib_stat->open_hca_lock); 2974 2975 status = rib_svc_create_chan(hca, (caddr_t)q, 2976 event->cm_event.req.req_prim_hca_port, &qp); 2977 2978 if (status) { 2979 return (IBT_CM_REJECT); 2980 } 2981 2982 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2983 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 2984 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 2985 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2986 2987 /* 2988 * Pre-posts RECV buffers 2989 */ 2990 conn = qptoc(qp); 2991 for (i = 0; i < preposted_rbufs; i++) { 2992 bzero(&rdbuf, sizeof (rdbuf)); 2993 rdbuf.type = RECV_BUFFER; 2994 buf = rib_rbuf_alloc(conn, &rdbuf); 2995 if (buf == NULL) { 2996 /* 2997 * A connection is not established yet. 2998 * Just flush the channel. Buffers 2999 * posted till now will error out with 3000 * IBT_WC_WR_FLUSHED_ERR. 3001 */ 3002 (void) ibt_flush_channel(qp->qp_hdl); 3003 (void) rib_disconnect_channel(conn, NULL); 3004 return (IBT_CM_REJECT); 3005 } 3006 3007 bzero(&cl, sizeof (cl)); 3008 cl.w.c_saddr3 = (caddr_t)rdbuf.addr; 3009 cl.c_len = rdbuf.len; 3010 cl.c_smemhandle.mrc_lmr = 3011 rdbuf.handle.mrc_lmr; /* lkey */ 3012 cl.c_next = NULL; 3013 status = rib_post_recv(conn, &cl); 3014 if (status != RDMA_SUCCESS) { 3015 /* 3016 * A connection is not established yet. 3017 * Just flush the channel. Buffers 3018 * posted till now will error out with 3019 * IBT_WC_WR_FLUSHED_ERR. 
3020 */ 3021 (void) ibt_flush_channel(qp->qp_hdl); 3022 (void) rib_disconnect_channel(conn, NULL); 3023 return (IBT_CM_REJECT); 3024 } 3025 } 3026 (void) rib_add_connlist(conn, &hca->srv_conn_list); 3027 3028 /* 3029 * Get the address translation 3030 */ 3031 rw_enter(&hca->state_lock, RW_READER); 3032 if (hca->state == HCA_DETACHED) { 3033 rw_exit(&hca->state_lock); 3034 return (IBT_CM_REJECT); 3035 } 3036 rw_exit(&hca->state_lock); 3037 3038 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t)); 3039 3040 if (ibt_get_ip_data(event->cm_priv_data_len, 3041 event->cm_priv_data, 3042 &ipinfo) != IBT_SUCCESS) { 3043 3044 return (IBT_CM_REJECT); 3045 } 3046 3047 switch (ipinfo.src_addr.family) { 3048 case AF_INET: 3049 3050 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, 3051 KM_SLEEP); 3052 (void) strcpy(conn->c_netid, RIBNETID_TCP); 3053 3054 conn->c_raddr.maxlen = 3055 conn->c_raddr.len = sin_size; 3056 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3057 3058 s = (struct sockaddr_in *)conn->c_raddr.buf; 3059 s->sin_family = AF_INET; 3060 bcopy((void *)&ipinfo.src_addr.un.ip4addr, 3061 &s->sin_addr, in_size); 3062 3063 conn->c_laddr.maxlen = 3064 conn->c_laddr.len = sin_size; 3065 conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3066 3067 s = (struct sockaddr_in *)conn->c_laddr.buf; 3068 s->sin_family = AF_INET; 3069 bcopy((void *)&ipinfo.dst_addr.un.ip4addr, 3070 &s->sin_addr, in_size); 3071 3072 break; 3073 3074 case AF_INET6: 3075 3076 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, 3077 KM_SLEEP); 3078 (void) strcpy(conn->c_netid, RIBNETID_TCP6); 3079 3080 conn->c_raddr.maxlen = 3081 conn->c_raddr.len = sin6_size; 3082 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3083 3084 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 3085 s6->sin6_family = AF_INET6; 3086 bcopy((void *)&ipinfo.src_addr.un.ip6addr, 3087 &s6->sin6_addr, 3088 sizeof (struct in6_addr)); 3089 3090 conn->c_laddr.maxlen = 3091 conn->c_laddr.len = sin6_size; 3092 conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3093 3094 s6 = (struct sockaddr_in6 *)conn->c_laddr.buf; 3095 s6->sin6_family = AF_INET6; 3096 bcopy((void *)&ipinfo.dst_addr.un.ip6addr, 3097 &s6->sin6_addr, 3098 sizeof (struct in6_addr)); 3099 3100 break; 3101 3102 default: 3103 return (IBT_CM_REJECT); 3104 } 3105 3106 break; 3107 3108 case IBT_CM_EVENT_CONN_CLOSED: 3109 { 3110 CONN *conn; 3111 rib_qp_t *qp; 3112 3113 switch (event->cm_event.closed) { 3114 case IBT_CM_CLOSED_DREP_RCVD: 3115 case IBT_CM_CLOSED_DREQ_TIMEOUT: 3116 case IBT_CM_CLOSED_DUP: 3117 case IBT_CM_CLOSED_ABORT: 3118 case IBT_CM_CLOSED_ALREADY: 3119 /* 3120 * These cases indicate the local end initiated 3121 * the closing of the channel. Nothing to do here. 3122 */ 3123 break; 3124 default: 3125 /* 3126 * Reason for CONN_CLOSED event must be one of 3127 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 3128 * or IBT_CM_CLOSED_STALE. These indicate cases where 3129 * the remote end is closing the channel.
In these 3130 * cases free the channel and transition to error 3131 * state 3132 */ 3133 qp = ibt_get_chan_private(event->cm_channel); 3134 conn = qptoc(qp); 3135 mutex_enter(&conn->c_lock); 3136 if (conn->c_state == C_DISCONN_PEND) { 3137 mutex_exit(&conn->c_lock); 3138 break; 3139 } 3140 conn->c_state = C_ERROR_CONN; 3141 3142 /* 3143 * Free the conn if c_ref goes down to 0 3144 */ 3145 if (conn->c_ref == 0) { 3146 /* 3147 * Remove from list and free conn 3148 */ 3149 conn->c_state = C_DISCONN_PEND; 3150 mutex_exit(&conn->c_lock); 3151 (void) rib_disconnect_channel(conn, 3152 &hca->srv_conn_list); 3153 } else { 3154 /* 3155 * conn will be freed when c_ref goes to 0. 3156 * Indicate to cleaning thread not to close 3157 * the connection, but just free the channel. 3158 */ 3159 conn->c_flags |= C_CLOSE_NOTNEEDED; 3160 mutex_exit(&conn->c_lock); 3161 } 3162 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect); 3163 break; 3164 } 3165 break; 3166 } 3167 case IBT_CM_EVENT_CONN_EST: 3168 /* 3169 * RTU received, hence connection established. 3170 */ 3171 if (rib_debug > 1) 3172 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3173 "(CONN_EST) channel established"); 3174 break; 3175 3176 default: 3177 if (rib_debug > 2) { 3178 /* Let CM handle the following events. */ 3179 if (event->cm_type == IBT_CM_EVENT_REP_RCV) { 3180 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3181 "server recv'ed IBT_CM_EVENT_REP_RCV\n"); 3182 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) { 3183 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3184 "server recv'ed IBT_CM_EVENT_LAP_RCV\n"); 3185 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) { 3186 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3187 "server recv'ed IBT_CM_EVENT_MRA_RCV\n"); 3188 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) { 3189 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3190 "server recv'ed IBT_CM_EVENT_APR_RCV\n"); 3191 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) { 3192 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3193 "server recv'ed IBT_CM_EVENT_FAILURE\n"); 3194 } 3195 } 3196 return (IBT_CM_DEFAULT); 3197 } 3198 3199 /* accept all other CM messages (i.e. let the CM handle them) */ 3200 return (IBT_CM_ACCEPT); 3201 } 3202 3203 static rdma_stat 3204 rib_register_service(rib_hca_t *hca, int service_type, 3205 uint8_t protocol_num, in_port_t dst_port) 3206 { 3207 ibt_srv_desc_t sdesc; 3208 ibt_hca_portinfo_t *port_infop; 3209 ib_svc_id_t srv_id; 3210 ibt_srv_hdl_t srv_hdl; 3211 uint_t port_size; 3212 uint_t pki, i, num_ports, nbinds; 3213 ibt_status_t ibt_status; 3214 rib_service_t *service; 3215 ib_pkey_t pkey; 3216 3217 /* 3218 * Query all ports for the given HCA 3219 */ 3220 rw_enter(&hca->state_lock, RW_READER); 3221 if (hca->state != HCA_DETACHED) { 3222 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop, 3223 &num_ports, &port_size); 3224 rw_exit(&hca->state_lock); 3225 } else { 3226 rw_exit(&hca->state_lock); 3227 return (RDMA_FAILED); 3228 } 3229 if (ibt_status != IBT_SUCCESS) { 3230 return (RDMA_FAILED); 3231 } 3232 3233 DTRACE_PROBE1(rpcib__i__regservice_numports, 3234 int, num_ports); 3235 3236 for (i = 0; i < num_ports; i++) { 3237 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) { 3238 DTRACE_PROBE1(rpcib__i__regservice__portinactive, 3239 int, i+1); 3240 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) { 3241 DTRACE_PROBE1(rpcib__i__regservice__portactive, 3242 int, i+1); 3243 } 3244 } 3245 3246 /* 3247 * Get all the IP addresses on this system to register the 3248 * given "service type" on all DNS recognized IP addrs. 
3249 * Each service type such as NFS will have all the system's 3250 * IP addresses as its different names. For now the only 3251 * type of service we support in RPCIB is NFS. 3252 */ 3253 rw_enter(&rib_stat->service_list_lock, RW_WRITER); 3254 /* 3255 * Start registering and binding the service 3256 * on active ports on this HCA. 3257 */ 3258 nbinds = 0; 3259 for (service = rib_stat->service_list; 3260 service && (service->srv_type != service_type); 3261 service = service->next) 3262 ; 3263 3264 if (service == NULL) { 3265 /* 3266 * We use IP addresses as the service names for 3267 * service registration. Register each of them 3268 * with CM to obtain a svc_id and svc_hdl. We do not 3269 * register the service with the machine's loopback address. 3270 */ 3271 (void) bzero(&srv_id, sizeof (ib_svc_id_t)); 3272 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t)); 3273 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t)); 3274 sdesc.sd_handler = rib_srv_cm_handler; 3275 sdesc.sd_flags = 0; 3276 ibt_status = ibt_register_service(hca->ibt_clnt_hdl, 3277 &sdesc, ibt_get_ip_sid(protocol_num, dst_port), 3278 1, &srv_hdl, &srv_id); 3279 if ((ibt_status != IBT_SUCCESS) && 3280 (ibt_status != IBT_CM_SERVICE_EXISTS)) { 3281 rw_exit(&rib_stat->service_list_lock); 3282 DTRACE_PROBE1(rpcib__i__regservice__ibtres, 3283 int, ibt_status); 3284 ibt_free_portinfo(port_infop, port_size); 3285 return (RDMA_FAILED); 3286 } 3287 3288 /* 3289 * Allocate and prepare a service entry 3290 */ 3291 service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP); 3292 3293 service->srv_type = service_type; 3294 service->srv_hdl = srv_hdl; 3295 service->srv_id = srv_id; 3296 3297 service->next = rib_stat->service_list; 3298 rib_stat->service_list = service; 3299 DTRACE_PROBE1(rpcib__i__regservice__new__service, 3300 int, service->srv_type); 3301 } else { 3302 srv_hdl = service->srv_hdl; 3303 srv_id = service->srv_id; 3304 DTRACE_PROBE1(rpcib__i__regservice__existing__service, 3305 int, service->srv_type); 3306 } 3307 3308 for (i = 0; i < num_ports; i++) { 3309 ibt_sbind_hdl_t sbp; 3310 rib_hca_service_t *hca_srv; 3311 ib_gid_t gid; 3312 3313 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) 3314 continue; 3315 3316 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) { 3317 pkey = port_infop[i].p_pkey_tbl[pki]; 3318 3319 rw_enter(&hca->bound_services_lock, RW_READER); 3320 gid = port_infop[i].p_sgid_tbl[0]; 3321 for (hca_srv = hca->bound_services; hca_srv; 3322 hca_srv = hca_srv->next) { 3323 if ((hca_srv->srv_id == service->srv_id) && 3324 (hca_srv->gid.gid_prefix == 3325 gid.gid_prefix) && 3326 (hca_srv->gid.gid_guid == gid.gid_guid)) 3327 break; 3328 } 3329 rw_exit(&hca->bound_services_lock); 3330 if (hca_srv != NULL) { 3331 /* 3332 * port is already bound to the service 3333 */ 3334 DTRACE_PROBE1( 3335 rpcib__i__regservice__already__bound, 3336 int, i+1); 3337 nbinds++; 3338 continue; 3339 } 3340 3341 if ((pkey & IBSRM_HB) && 3342 (pkey != IB_PKEY_INVALID_FULL)) { 3343 3344 sbp = NULL; 3345 ibt_status = ibt_bind_service(srv_hdl, 3346 gid, NULL, hca, &sbp); 3347 3348 if (ibt_status == IBT_SUCCESS) { 3349 hca_srv = kmem_zalloc( 3350 sizeof (rib_hca_service_t), 3351 KM_SLEEP); 3352 hca_srv->srv_id = srv_id; 3353 hca_srv->gid = gid; 3354 hca_srv->sbind_hdl = sbp; 3355 3356 rw_enter(&hca->bound_services_lock, 3357 RW_WRITER); 3358 hca_srv->next = hca->bound_services; 3359 hca->bound_services = hca_srv; 3360 rw_exit(&hca->bound_services_lock); 3361 nbinds++; 3362 } 3363 3364 DTRACE_PROBE1(rpcib__i__regservice__bindres, 3365 int,
ibt_status); 3366 } 3367 } 3368 } 3369 rw_exit(&rib_stat->service_list_lock); 3370 3371 ibt_free_portinfo(port_infop, port_size); 3372 3373 if (nbinds == 0) { 3374 return (RDMA_FAILED); 3375 } else { 3376 /* 3377 * Put this plugin into accept state, since at least 3378 * one registration was successful. 3379 */ 3380 mutex_enter(&plugin_state_lock); 3381 plugin_state = ACCEPT; 3382 mutex_exit(&plugin_state_lock); 3383 return (RDMA_SUCCESS); 3384 } 3385 } 3386 3387 void 3388 rib_listen(struct rdma_svc_data *rd) 3389 { 3390 rdma_stat status; 3391 int n_listening = 0; 3392 rib_hca_t *hca; 3393 3394 mutex_enter(&rib_stat->listen_lock); 3395 /* 3396 * if rd parameter is NULL then it means that rib_stat->q is 3397 * already initialized by a call from RDMA and we just want to 3398 * add a newly attached HCA to the same listening state as other 3399 * HCAs. 3400 */ 3401 if (rd == NULL) { 3402 if (rib_stat->q == NULL) { 3403 mutex_exit(&rib_stat->listen_lock); 3404 return; 3405 } 3406 } else { 3407 rib_stat->q = &rd->q; 3408 } 3409 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 3410 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 3411 /* 3412 * First check if a hca is still attached 3413 */ 3414 rw_enter(&hca->state_lock, RW_READER); 3415 if (hca->state != HCA_INITED) { 3416 rw_exit(&hca->state_lock); 3417 continue; 3418 } 3419 rw_exit(&hca->state_lock); 3420 3421 /* 3422 * Right now the only service type is NFS. Hence 3423 * force feed this value. Ideally to communicate 3424 * the service type it should be passed down in 3425 * rdma_svc_data. 3426 */ 3427 status = rib_register_service(hca, NFS, 3428 IPPROTO_TCP, nfs_rdma_port); 3429 if (status == RDMA_SUCCESS) 3430 n_listening++; 3431 } 3432 rw_exit(&rib_stat->hcas_list_lock); 3433 3434 /* 3435 * Report whether a service is active on an HCA; on failure 3436 * check rd->err_code for a more detailed error. 3437 */ 3438 if (rd) { 3439 if (n_listening > 0) { 3440 rd->active = 1; 3441 rd->err_code = RDMA_SUCCESS; 3442 } else { 3443 rd->active = 0; 3444 rd->err_code = RDMA_FAILED; 3445 } 3446 } 3447 mutex_exit(&rib_stat->listen_lock); 3448 } 3449 3450 /* XXXX */ 3451 /* ARGSUSED */ 3452 static void 3453 rib_listen_stop(struct rdma_svc_data *svcdata) 3454 { 3455 rib_hca_t *hca; 3456 3457 mutex_enter(&rib_stat->listen_lock); 3458 /* 3459 * KRPC called the RDMATF to stop the listeners; this means we 3460 * stop passing incoming or received requests to the KRPC master 3461 * transport handle for RDMA-IB. This also means that the 3462 * master transport handle, responsible for us, is going away. 3463 */ 3464 mutex_enter(&plugin_state_lock); 3465 plugin_state = NO_ACCEPT; 3466 if (svcdata != NULL) 3467 svcdata->active = 0; 3468 mutex_exit(&plugin_state_lock); 3469 3470 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 3471 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 3472 /* 3473 * First check if a hca is still attached 3474 */ 3475 rw_enter(&hca->state_lock, RW_READER); 3476 if (hca->state == HCA_DETACHED) { 3477 rw_exit(&hca->state_lock); 3478 continue; 3479 } 3480 rib_close_channels(&hca->srv_conn_list); 3481 rib_stop_services(hca); 3482 rw_exit(&hca->state_lock); 3483 } 3484 rw_exit(&rib_stat->hcas_list_lock); 3485 3486 /* 3487 * Avoid rib_listen() using the stale q field. 3488 * This could happen if a port goes up after all services 3489 * are already unregistered. 3490 */ 3491 rib_stat->q = NULL; 3492 mutex_exit(&rib_stat->listen_lock); 3493 } 3494 3495 /* 3496 * Traverse the HCA's service list to unbind and deregister services.
3497 * For each bound service of HCA to be removed, first find the corresponding 3498 * service handle (srv_hdl) and then unbind the service by calling 3499 * ibt_unbind_service(). 3500 */ 3501 static void 3502 rib_stop_services(rib_hca_t *hca) 3503 { 3504 rib_hca_service_t *srv_list, *to_remove; 3505 3506 /* 3507 * unbind and deregister the services for this service type. 3508 * Right now there is only one service type. In future it will 3509 * be passed down to this function. 3510 */ 3511 rw_enter(&hca->bound_services_lock, RW_READER); 3512 srv_list = hca->bound_services; 3513 hca->bound_services = NULL; 3514 rw_exit(&hca->bound_services_lock); 3515 3516 while (srv_list != NULL) { 3517 rib_service_t *sc; 3518 3519 to_remove = srv_list; 3520 srv_list = to_remove->next; 3521 rw_enter(&rib_stat->service_list_lock, RW_READER); 3522 for (sc = rib_stat->service_list; 3523 sc && (sc->srv_id != to_remove->srv_id); 3524 sc = sc->next) 3525 ; 3526 /* 3527 * if sc is NULL then the service doesn't exist anymore, 3528 * probably just removed completely through rib_stat. 3529 */ 3530 if (sc != NULL) 3531 (void) ibt_unbind_service(sc->srv_hdl, 3532 to_remove->sbind_hdl); 3533 rw_exit(&rib_stat->service_list_lock); 3534 kmem_free(to_remove, sizeof (rib_hca_service_t)); 3535 } 3536 } 3537 3538 static struct svc_recv * 3539 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) 3540 { 3541 struct svc_recv *recvp; 3542 3543 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); 3544 recvp->vaddr = sgl->ds_va; 3545 recvp->qp = qp; 3546 recvp->bytes_xfer = 0; 3547 return (recvp); 3548 } 3549 3550 static int 3551 rib_free_svc_recv(struct svc_recv *recvp) 3552 { 3553 kmem_free(recvp, sizeof (*recvp)); 3554 3555 return (0); 3556 } 3557 3558 static struct reply * 3559 rib_addreplylist(rib_qp_t *qp, uint32_t msgid) 3560 { 3561 struct reply *rep; 3562 3563 3564 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); 3565 if (rep == NULL) { 3566 DTRACE_PROBE(rpcib__i__addrreply__nomem); 3567 return (NULL); 3568 } 3569 rep->xid = msgid; 3570 rep->vaddr_cq = NULL; 3571 rep->bytes_xfer = 0; 3572 rep->status = (uint_t)REPLY_WAIT; 3573 rep->prev = NULL; 3574 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); 3575 3576 mutex_enter(&qp->replylist_lock); 3577 if (qp->replylist) { 3578 rep->next = qp->replylist; 3579 qp->replylist->prev = rep; 3580 } 3581 qp->rep_list_size++; 3582 3583 DTRACE_PROBE1(rpcib__i__addrreply__listsize, 3584 int, qp->rep_list_size); 3585 3586 qp->replylist = rep; 3587 mutex_exit(&qp->replylist_lock); 3588 3589 return (rep); 3590 } 3591 3592 static rdma_stat 3593 rib_rem_replylist(rib_qp_t *qp) 3594 { 3595 struct reply *r, *n; 3596 3597 mutex_enter(&qp->replylist_lock); 3598 for (r = qp->replylist; r != NULL; r = n) { 3599 n = r->next; 3600 (void) rib_remreply(qp, r); 3601 } 3602 mutex_exit(&qp->replylist_lock); 3603 3604 return (RDMA_SUCCESS); 3605 } 3606 3607 static int 3608 rib_remreply(rib_qp_t *qp, struct reply *rep) 3609 { 3610 3611 ASSERT(MUTEX_HELD(&qp->replylist_lock)); 3612 if (rep->prev) { 3613 rep->prev->next = rep->next; 3614 } 3615 if (rep->next) { 3616 rep->next->prev = rep->prev; 3617 } 3618 if (qp->replylist == rep) 3619 qp->replylist = rep->next; 3620 3621 cv_destroy(&rep->wait_cv); 3622 qp->rep_list_size--; 3623 3624 DTRACE_PROBE1(rpcib__i__remreply__listsize, 3625 int, qp->rep_list_size); 3626 3627 kmem_free(rep, sizeof (*rep)); 3628 3629 return (0); 3630 } 3631 3632 rdma_stat 3633 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3634 struct mrc *buf_handle) 3635 { 
3636 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3637 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3638 rdma_stat status; 3639 rib_hca_t *hca = (ctoqp(conn))->hca; 3640 3641 /* 3642 * Note: ALL buffer pools use the same memory type RDMARW. 3643 */ 3644 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3645 if (status == RDMA_SUCCESS) { 3646 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3647 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3648 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3649 } else { 3650 buf_handle->mrc_linfo = NULL; 3651 buf_handle->mrc_lmr = 0; 3652 buf_handle->mrc_rmr = 0; 3653 } 3654 return (status); 3655 } 3656 3657 static rdma_stat 3658 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, 3659 ibt_mr_flags_t spec, 3660 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3661 { 3662 ibt_mr_attr_t mem_attr; 3663 ibt_status_t ibt_status; 3664 mem_attr.mr_vaddr = (uintptr_t)buf; 3665 mem_attr.mr_len = (ib_msglen_t)size; 3666 mem_attr.mr_as = (struct as *)(caddr_t)adsp; 3667 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3668 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3669 IBT_MR_ENABLE_WINDOW_BIND | spec; 3670 3671 rw_enter(&hca->state_lock, RW_READER); 3672 if (hca->state != HCA_DETACHED) { 3673 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3674 &mem_attr, mr_hdlp, mr_descp); 3675 rw_exit(&hca->state_lock); 3676 } else { 3677 rw_exit(&hca->state_lock); 3678 return (RDMA_FAILED); 3679 } 3680 3681 if (ibt_status != IBT_SUCCESS) { 3682 return (RDMA_FAILED); 3683 } 3684 return (RDMA_SUCCESS); 3685 } 3686 3687 rdma_stat 3688 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3689 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 3690 { 3691 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3692 rib_lrc_entry_t *l; 3693 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3694 rdma_stat status; 3695 rib_hca_t *hca = (ctoqp(conn))->hca; 3696 3697 /* 3698 * Non-coherent memory registration. 3699 */ 3700 l = (rib_lrc_entry_t *)lrc; 3701 if (l) { 3702 if (l->registered) { 3703 buf_handle->mrc_linfo = 3704 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3705 buf_handle->mrc_lmr = 3706 (uint32_t)l->lrc_mhandle.mrc_lmr; 3707 buf_handle->mrc_rmr = 3708 (uint32_t)l->lrc_mhandle.mrc_rmr; 3709 *sync_handle = (RIB_SYNCMEM_HANDLE) 3710 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3711 return (RDMA_SUCCESS); 3712 } else { 3713 /* Always register the whole buffer */ 3714 buf = (caddr_t)l->lrc_buf; 3715 buflen = l->lrc_len; 3716 } 3717 } 3718 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3719 3720 if (status == RDMA_SUCCESS) { 3721 if (l) { 3722 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; 3723 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey; 3724 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey; 3725 l->registered = TRUE; 3726 } 3727 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3728 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3729 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3730 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3731 } else { 3732 buf_handle->mrc_linfo = NULL; 3733 buf_handle->mrc_lmr = 0; 3734 buf_handle->mrc_rmr = 0; 3735 } 3736 return (status); 3737 } 3738 3739 /* ARGSUSED */ 3740 rdma_stat 3741 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3742 { 3743 rib_hca_t *hca = (ctoqp(conn))->hca; 3744 /* 3745 * Allow memory deregistration even if HCA is 3746 * getting detached. 
Need all outstanding 3747 * memory registrations to be deregistered 3748 * before HCA_DETACH_EVENT can be accepted. 3749 */ 3750 (void) ibt_deregister_mr(hca->hca_hdl, 3751 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 3752 return (RDMA_SUCCESS); 3753 } 3754 3755 /* ARGSUSED */ 3756 rdma_stat 3757 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3758 RIB_SYNCMEM_HANDLE sync_handle, void *lrc) 3759 { 3760 rib_lrc_entry_t *l; 3761 l = (rib_lrc_entry_t *)lrc; 3762 if (l) 3763 if (l->registered) 3764 return (RDMA_SUCCESS); 3765 3766 (void) rib_deregistermem(conn, buf, buf_handle); 3767 3768 return (RDMA_SUCCESS); 3769 } 3770 3771 /* ARGSUSED */ 3772 rdma_stat 3773 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3774 int len, int cpu) 3775 { 3776 ibt_status_t status; 3777 rib_hca_t *hca = (ctoqp(conn))->hca; 3778 ibt_mr_sync_t mr_segment; 3779 3780 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3781 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 3782 mr_segment.ms_len = (ib_memlen_t)len; 3783 if (cpu) { 3784 /* make incoming data visible to memory */ 3785 mr_segment.ms_flags = IBT_SYNC_WRITE; 3786 } else { 3787 /* make memory changes visible to IO */ 3788 mr_segment.ms_flags = IBT_SYNC_READ; 3789 } 3790 rw_enter(&hca->state_lock, RW_READER); 3791 if (hca->state != HCA_DETACHED) { 3792 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3793 rw_exit(&hca->state_lock); 3794 } else { 3795 rw_exit(&hca->state_lock); 3796 return (RDMA_FAILED); 3797 } 3798 3799 if (status == IBT_SUCCESS) 3800 return (RDMA_SUCCESS); 3801 else { 3802 return (RDMA_FAILED); 3803 } 3804 } 3805 3806 /* 3807 * XXXX ???? 3808 */ 3809 static rdma_stat 3810 rib_getinfo(rdma_info_t *info) 3811 { 3812 /* 3813 * XXXX Hack! 3814 */ 3815 info->addrlen = 16; 3816 info->mts = 1000000; 3817 info->mtu = 1000000; 3818 3819 return (RDMA_SUCCESS); 3820 } 3821 3822 rib_bufpool_t * 3823 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3824 { 3825 rib_bufpool_t *rbp = NULL; 3826 bufpool_t *bp = NULL; 3827 caddr_t buf; 3828 ibt_mr_attr_t mem_attr; 3829 ibt_status_t ibt_status; 3830 int i, j; 3831 3832 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3833 3834 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3835 num * sizeof (void *), KM_SLEEP); 3836 3837 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3838 bp->numelems = num; 3839 3840 3841 switch (ptype) { 3842 case SEND_BUFFER: 3843 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3844 bp->rsize = RPC_MSG_SZ; 3845 break; 3846 case RECV_BUFFER: 3847 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3848 bp->rsize = RPC_BUF_SIZE; 3849 break; 3850 default: 3851 goto fail; 3852 } 3853 3854 /* 3855 * Register the pool. 
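 * The pool is a single kmem area of num * rsize bytes; each of the
 * num buffers is registered with the HCA as its own memory region,
 * so the pool keeps num mr_hdl/mr_desc entries alongside the buffer
 * free list.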
3856 */ 3857 bp->bufsize = num * bp->rsize; 3858 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3859 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3860 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3861 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3862 sizeof (ibt_mr_desc_t), KM_SLEEP); 3863 rw_enter(&hca->state_lock, RW_READER); 3864 3865 if (hca->state == HCA_DETACHED) { 3866 rw_exit(&hca->state_lock); 3867 goto fail; 3868 } 3869 3870 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3871 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3872 mem_attr.mr_vaddr = (uintptr_t)buf; 3873 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3874 mem_attr.mr_as = NULL; 3875 ibt_status = ibt_register_mr(hca->hca_hdl, 3876 hca->pd_hdl, &mem_attr, 3877 &rbp->mr_hdl[i], 3878 &rbp->mr_desc[i]); 3879 if (ibt_status != IBT_SUCCESS) { 3880 for (j = 0; j < i; j++) { 3881 (void) ibt_deregister_mr(hca->hca_hdl, 3882 rbp->mr_hdl[j]); 3883 } 3884 rw_exit(&hca->state_lock); 3885 goto fail; 3886 } 3887 } 3888 rw_exit(&hca->state_lock); 3889 buf = (caddr_t)bp->buf; 3890 for (i = 0; i < num; i++, buf += bp->rsize) { 3891 bp->buflist[i] = (void *)buf; 3892 } 3893 bp->buffree = num - 1; /* no. of free buffers */ 3894 rbp->bpool = bp; 3895 3896 return (rbp); 3897 fail: 3898 if (bp) { 3899 if (bp->buf) 3900 kmem_free(bp->buf, bp->bufsize); 3901 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3902 } 3903 if (rbp) { 3904 if (rbp->mr_hdl) 3905 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3906 if (rbp->mr_desc) 3907 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3908 kmem_free(rbp, sizeof (rib_bufpool_t)); 3909 } 3910 return (NULL); 3911 } 3912 3913 static void 3914 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3915 { 3916 int i; 3917 rib_bufpool_t *rbp = NULL; 3918 bufpool_t *bp; 3919 3920 /* 3921 * Obtain pool address based on type of pool 3922 */ 3923 switch (ptype) { 3924 case SEND_BUFFER: 3925 rbp = hca->send_pool; 3926 break; 3927 case RECV_BUFFER: 3928 rbp = hca->recv_pool; 3929 break; 3930 default: 3931 return; 3932 } 3933 if (rbp == NULL) 3934 return; 3935 3936 bp = rbp->bpool; 3937 3938 /* 3939 * Deregister the pool memory and free it. 3940 */ 3941 for (i = 0; i < bp->numelems; i++) { 3942 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3943 } 3944 } 3945 3946 static void 3947 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3948 { 3949 3950 rib_bufpool_t *rbp = NULL; 3951 bufpool_t *bp; 3952 3953 /* 3954 * Obtain pool address based on type of pool 3955 */ 3956 switch (ptype) { 3957 case SEND_BUFFER: 3958 rbp = hca->send_pool; 3959 break; 3960 case RECV_BUFFER: 3961 rbp = hca->recv_pool; 3962 break; 3963 default: 3964 return; 3965 } 3966 if (rbp == NULL) 3967 return; 3968 3969 bp = rbp->bpool; 3970 3971 /* 3972 * Free the pool memory. 3973 */ 3974 if (rbp->mr_hdl) 3975 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 3976 3977 if (rbp->mr_desc) 3978 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 3979 if (bp->buf) 3980 kmem_free(bp->buf, bp->bufsize); 3981 mutex_destroy(&bp->buflock); 3982 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 3983 kmem_free(rbp, sizeof (rib_bufpool_t)); 3984 } 3985 3986 void 3987 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 3988 { 3989 /* 3990 * Deregister the pool memory and free it. 3991 */ 3992 rib_rbufpool_deregister(hca, ptype); 3993 rib_rbufpool_free(hca, ptype); 3994 } 3995 3996 /* 3997 * Fetch a buffer from the pool of type specified in rdbuf->type. 
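 *
 * A minimal caller sketch (illustrative only; the local variable
 * and error handling below are not taken from this file):
 *
 *	rdma_buf_t rdbuf;
 *
 *	bzero(&rdbuf, sizeof (rdbuf));
 *	rdbuf.type = SEND_BUFFER;
 *	if (rib_reg_buf_alloc(conn, &rdbuf) == RDMA_SUCCESS) {
 *		use rdbuf.addr, rdbuf.len and rdbuf.handle here, then
 *		rib_reg_buf_free(conn, &rdbuf);
 *	}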
3998 */ 3999 static rdma_stat 4000 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4001 { 4002 rib_lrc_entry_t *rlep; 4003 4004 if (rdbuf->type == RDMA_LONG_BUFFER) { 4005 rlep = rib_get_cache_buf(conn, rdbuf->len); 4006 rdbuf->rb_private = (caddr_t)rlep; 4007 rdbuf->addr = rlep->lrc_buf; 4008 rdbuf->handle = rlep->lrc_mhandle; 4009 return (RDMA_SUCCESS); 4010 } 4011 4012 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 4013 if (rdbuf->addr) { 4014 switch (rdbuf->type) { 4015 case SEND_BUFFER: 4016 rdbuf->len = RPC_MSG_SZ; /* 1K */ 4017 break; 4018 case RECV_BUFFER: 4019 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 4020 break; 4021 default: 4022 rdbuf->len = 0; 4023 } 4024 return (RDMA_SUCCESS); 4025 } else 4026 return (RDMA_FAILED); 4027 } 4028 4029 /* 4030 * Fetch a buffer of specified type. 4031 * Note that rdbuf->handle is mw's rkey. 4032 */ 4033 static void * 4034 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4035 { 4036 rib_qp_t *qp = ctoqp(conn); 4037 rib_hca_t *hca = qp->hca; 4038 rdma_btype ptype = rdbuf->type; 4039 void *buf; 4040 rib_bufpool_t *rbp = NULL; 4041 bufpool_t *bp; 4042 int i; 4043 4044 /* 4045 * Obtain pool address based on type of pool 4046 */ 4047 switch (ptype) { 4048 case SEND_BUFFER: 4049 rbp = hca->send_pool; 4050 break; 4051 case RECV_BUFFER: 4052 rbp = hca->recv_pool; 4053 break; 4054 default: 4055 return (NULL); 4056 } 4057 if (rbp == NULL) 4058 return (NULL); 4059 4060 bp = rbp->bpool; 4061 4062 mutex_enter(&bp->buflock); 4063 if (bp->buffree < 0) { 4064 mutex_exit(&bp->buflock); 4065 return (NULL); 4066 } 4067 4068 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */ 4069 buf = bp->buflist[bp->buffree]; 4070 rdbuf->addr = buf; 4071 rdbuf->len = bp->rsize; 4072 for (i = bp->numelems - 1; i >= 0; i--) { 4073 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 4074 rdbuf->handle.mrc_rmr = 4075 (uint32_t)rbp->mr_desc[i].md_rkey; 4076 rdbuf->handle.mrc_linfo = 4077 (uintptr_t)rbp->mr_hdl[i]; 4078 rdbuf->handle.mrc_lmr = 4079 (uint32_t)rbp->mr_desc[i].md_lkey; 4080 bp->buffree--; 4081 4082 mutex_exit(&bp->buflock); 4083 4084 return (buf); 4085 } 4086 } 4087 4088 mutex_exit(&bp->buflock); 4089 4090 return (NULL); 4091 } 4092 4093 static void 4094 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 4095 { 4096 4097 if (rdbuf->type == RDMA_LONG_BUFFER) { 4098 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private); 4099 rdbuf->rb_private = NULL; 4100 return; 4101 } 4102 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 4103 } 4104 4105 static void 4106 rib_rbuf_free(CONN *conn, int ptype, void *buf) 4107 { 4108 rib_qp_t *qp = ctoqp(conn); 4109 rib_hca_t *hca = qp->hca; 4110 rib_bufpool_t *rbp = NULL; 4111 bufpool_t *bp; 4112 4113 /* 4114 * Obtain pool address based on type of pool 4115 */ 4116 switch (ptype) { 4117 case SEND_BUFFER: 4118 rbp = hca->send_pool; 4119 break; 4120 case RECV_BUFFER: 4121 rbp = hca->recv_pool; 4122 break; 4123 default: 4124 return; 4125 } 4126 if (rbp == NULL) 4127 return; 4128 4129 bp = rbp->bpool; 4130 4131 mutex_enter(&bp->buflock); 4132 if (++bp->buffree >= bp->numelems) { 4133 /* 4134 * Should never happen 4135 */ 4136 bp->buffree--; 4137 } else { 4138 bp->buflist[bp->buffree] = buf; 4139 } 4140 mutex_exit(&bp->buflock); 4141 } 4142 4143 static rdma_stat 4144 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 4145 { 4146 rw_enter(&connlist->conn_lock, RW_WRITER); 4147 if (connlist->conn_hd) { 4148 cn->c_next = connlist->conn_hd; 4149 connlist->conn_hd->c_prev = cn; 4150 } 4151 connlist->conn_hd = cn; 4152 
rw_exit(&connlist->conn_lock); 4153 4154 return (RDMA_SUCCESS); 4155 } 4156 4157 static rdma_stat 4158 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) 4159 { 4160 rw_enter(&connlist->conn_lock, RW_WRITER); 4161 if (cn->c_prev) { 4162 cn->c_prev->c_next = cn->c_next; 4163 } 4164 if (cn->c_next) { 4165 cn->c_next->c_prev = cn->c_prev; 4166 } 4167 if (connlist->conn_hd == cn) 4168 connlist->conn_hd = cn->c_next; 4169 rw_exit(&connlist->conn_lock); 4170 4171 return (RDMA_SUCCESS); 4172 } 4173 4174 /* ARGSUSED */ 4175 static rdma_stat 4176 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, 4177 int addr_type, void *handle, CONN **conn) 4178 { 4179 rdma_stat status; 4180 rpcib_ping_t rpt; 4181 4182 status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn); 4183 return (status); 4184 } 4185 4186 /* 4187 * rib_find_hca_connection 4188 * 4189 * if there is an existing connection to the specified address then 4190 * it will be returned in conn, otherwise conn will be set to NULL. 4191 * Also cleans up any connection that is in error state. 4192 */ 4193 static int 4194 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, 4195 struct netbuf *d_svcaddr, CONN **conn) 4196 { 4197 CONN *cn; 4198 clock_t cv_stat, timout; 4199 4200 *conn = NULL; 4201 again: 4202 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4203 cn = hca->cl_conn_list.conn_hd; 4204 while (cn != NULL) { 4205 /* 4206 * First, clear up any connection in the ERROR state 4207 */ 4208 mutex_enter(&cn->c_lock); 4209 if (cn->c_state == C_ERROR_CONN) { 4210 if (cn->c_ref == 0) { 4211 /* 4212 * Remove connection from list and destroy it. 4213 */ 4214 cn->c_state = C_DISCONN_PEND; 4215 mutex_exit(&cn->c_lock); 4216 rw_exit(&hca->cl_conn_list.conn_lock); 4217 rib_conn_close((void *)cn); 4218 goto again; 4219 } 4220 mutex_exit(&cn->c_lock); 4221 cn = cn->c_next; 4222 continue; 4223 } 4224 if (cn->c_state == C_DISCONN_PEND) { 4225 mutex_exit(&cn->c_lock); 4226 cn = cn->c_next; 4227 continue; 4228 } 4229 4230 /* 4231 * source address is only checked for if there is one, 4232 * this is the case for retries. 4233 */ 4234 if ((cn->c_raddr.len == d_svcaddr->len) && 4235 (bcmp(d_svcaddr->buf, cn->c_raddr.buf, 4236 d_svcaddr->len) == 0) && 4237 ((s_svcaddr->len == 0) || 4238 ((cn->c_laddr.len == s_svcaddr->len) && 4239 (bcmp(s_svcaddr->buf, cn->c_laddr.buf, 4240 s_svcaddr->len) == 0)))) { 4241 /* 4242 * Our connection. Give up conn list lock 4243 * as we are done traversing the list. 4244 */ 4245 rw_exit(&hca->cl_conn_list.conn_lock); 4246 if (cn->c_state == C_CONNECTED) { 4247 cn->c_ref++; /* sharing a conn */ 4248 mutex_exit(&cn->c_lock); 4249 *conn = cn; 4250 return (RDMA_SUCCESS); 4251 } 4252 if (cn->c_state == C_CONN_PEND) { 4253 /* 4254 * Hold a reference to this conn before 4255 * we give up the lock. 
4256 */ 4257 cn->c_ref++; 4258 timout = ddi_get_lbolt() + 4259 drv_usectohz(CONN_WAIT_TIME * 1000000); 4260 while ((cv_stat = cv_timedwait_sig(&cn->c_cv, 4261 &cn->c_lock, timout)) > 0 && 4262 cn->c_state == C_CONN_PEND) 4263 ; 4264 if (cv_stat == 0) { 4265 (void) rib_conn_release_locked(cn); 4266 return (RDMA_INTR); 4267 } 4268 if (cv_stat < 0) { 4269 (void) rib_conn_release_locked(cn); 4270 return (RDMA_TIMEDOUT); 4271 } 4272 if (cn->c_state == C_CONNECTED) { 4273 *conn = cn; 4274 mutex_exit(&cn->c_lock); 4275 return (RDMA_SUCCESS); 4276 } else { 4277 (void) rib_conn_release_locked(cn); 4278 return (RDMA_TIMEDOUT); 4279 } 4280 } 4281 } 4282 mutex_exit(&cn->c_lock); 4283 cn = cn->c_next; 4284 } 4285 rw_exit(&hca->cl_conn_list.conn_lock); 4286 *conn = NULL; 4287 return (RDMA_FAILED); 4288 } 4289 4290 /* 4291 * Connection management. 4292 * IBTF does not support recycling of channels. So connections are only 4293 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or 4294 * C_DISCONN_PEND state. No C_IDLE state. 4295 * C_CONN_PEND state: Connection establishment in progress to the server. 4296 * C_CONNECTED state: A connection when created is in C_CONNECTED state. 4297 * It has an RC channel associated with it. ibt_post_send/recv are allowed 4298 * only in this state. 4299 * C_ERROR_CONN state: A connection transitions to this state when WRs on the 4300 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event 4301 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA. 4302 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when 4303 * c_ref drops to 0 (this indicates that RPC has no more references to this 4304 * connection), the connection should be destroyed. A connection transitions 4305 * into this state when it is being destroyed. 4306 */ 4307 /* ARGSUSED */ 4308 static rdma_stat 4309 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, 4310 int addr_type, rpcib_ping_t *rpt, CONN **conn) 4311 { 4312 CONN *cn; 4313 int status; 4314 rib_hca_t *hca; 4315 rib_qp_t *qp; 4316 int s_addr_len; 4317 char *s_addr_buf; 4318 4319 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 4320 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 4321 rw_enter(&hca->state_lock, RW_READER); 4322 if (hca->state != HCA_DETACHED) { 4323 status = rib_find_hca_connection(hca, s_svcaddr, 4324 d_svcaddr, conn); 4325 rw_exit(&hca->state_lock); 4326 if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) { 4327 rw_exit(&rib_stat->hcas_list_lock); 4328 return (status); 4329 } 4330 } else 4331 rw_exit(&hca->state_lock); 4332 } 4333 rw_exit(&rib_stat->hcas_list_lock); 4334 4335 /* 4336 * No existing connection found, establish a new connection. 4337 */ 4338 bzero(rpt, sizeof (rpcib_ping_t)); 4339 4340 status = rib_ping_srv(addr_type, d_svcaddr, rpt); 4341 if (status != RDMA_SUCCESS) { 4342 return (RDMA_FAILED); 4343 } 4344 hca = rpt->hca; 4345 4346 if (rpt->srcip.family == AF_INET) { 4347 s_addr_len = sizeof (rpt->srcip.un.ip4addr); 4348 s_addr_buf = (char *)&rpt->srcip.un.ip4addr; 4349 } else if (rpt->srcip.family == AF_INET6) { 4350 s_addr_len = sizeof (rpt->srcip.un.ip6addr); 4351 s_addr_buf = (char *)&rpt->srcip.un.ip6addr; 4352 } else { 4353 return (RDMA_FAILED); 4354 } 4355 4356 /* 4357 * Channel to server doesn't exist yet, create one. 
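 * The new connection starts out in C_CONN_PEND with a single reference
 * owned by this thread; it moves to C_CONNECTED on a successful
 * rib_conn_to_srv(), or to C_ERROR_CONN otherwise.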
4358 */ 4359 if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) { 4360 return (RDMA_FAILED); 4361 } 4362 cn = qptoc(qp); 4363 cn->c_state = C_CONN_PEND; 4364 cn->c_ref = 1; 4365 4366 cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP); 4367 bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len); 4368 cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len; 4369 4370 if (rpt->srcip.family == AF_INET) { 4371 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP); 4372 (void) strcpy(cn->c_netid, RIBNETID_TCP); 4373 } else { 4374 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP); 4375 (void) strcpy(cn->c_netid, RIBNETID_TCP6); 4376 } 4377 4378 /* 4379 * Add to conn list. 4380 * We had given up the READER lock. In the time since then, 4381 * another thread might have created the connection we are 4382 * trying here. But for now, that is quite all right - there 4383 * might be two connections between a pair of hosts instead 4384 * of one. If we really want to close that window, 4385 * then we would need to check the list after acquiring the 4386 * WRITER lock. 4387 */ 4388 (void) rib_add_connlist(cn, &hca->cl_conn_list); 4389 status = rib_conn_to_srv(hca, qp, rpt); 4390 mutex_enter(&cn->c_lock); 4391 4392 if (cn->c_flags & C_CLOSE_PENDING) { 4393 /* 4394 * This handles the case where the module or 4395 * HCA detaches while the connection is being 4396 * established. In such a case close the 4397 * connection immediately if this is the 4398 * only reference. 4399 */ 4400 if (cn->c_ref == 1) { 4401 cn->c_ref--; 4402 cn->c_state = C_DISCONN_PEND; 4403 mutex_exit(&cn->c_lock); 4404 rib_conn_close((void *)cn); 4405 return (RDMA_FAILED); 4406 } 4407 4408 /* 4409 * Connection to be closed later when c_ref = 0 4410 */ 4411 status = RDMA_FAILED; 4412 } 4413 4414 if (status == RDMA_SUCCESS) { 4415 cn->c_state = C_CONNECTED; 4416 *conn = cn; 4417 } else { 4418 cn->c_state = C_ERROR_CONN; 4419 cn->c_ref--; 4420 } 4421 cv_signal(&cn->c_cv); 4422 mutex_exit(&cn->c_lock); 4423 return (status); 4424 } 4425 4426 static void 4427 rib_conn_close(void *rarg) 4428 { 4429 CONN *conn = (CONN *)rarg; 4430 rib_qp_t *qp = ctoqp(conn); 4431 4432 mutex_enter(&conn->c_lock); 4433 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { 4434 4435 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); 4436 4437 /* 4438 * Live connection in CONNECTED state. 4439 */ 4440 if (conn->c_state == C_CONNECTED) { 4441 conn->c_state = C_ERROR_CONN; 4442 } 4443 mutex_exit(&conn->c_lock); 4444 4445 rib_close_a_channel(conn); 4446 4447 mutex_enter(&conn->c_lock); 4448 conn->c_flags &= ~C_CLOSE_PENDING; 4449 } 4450 4451 mutex_exit(&conn->c_lock); 4452 4453 if (qp->mode == RIB_SERVER) 4454 (void) rib_disconnect_channel(conn, 4455 &qp->hca->srv_conn_list); 4456 else 4457 (void) rib_disconnect_channel(conn, 4458 &qp->hca->cl_conn_list); 4459 } 4460 4461 static void 4462 rib_conn_timeout_call(void *carg) 4463 { 4464 time_t idle_time; 4465 CONN *conn = (CONN *)carg; 4466 rib_hca_t *hca = ctoqp(conn)->hca; 4467 int error; 4468 4469 mutex_enter(&conn->c_lock); 4470 if ((conn->c_ref > 0) || 4471 (conn->c_state == C_DISCONN_PEND)) { 4472 conn->c_timeout = NULL; 4473 mutex_exit(&conn->c_lock); 4474 return; 4475 } 4476 4477 idle_time = (gethrestime_sec() - conn->c_last_used); 4478 4479 if ((idle_time <= rib_conn_timeout) && 4480 (conn->c_state != C_ERROR_CONN)) { 4481 /* 4482 * There was activity after the last timeout. 4483 * Extend the conn life. Unless the conn is 4484 * already in error state.
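 * The timer is re-armed only for the remainder of the idle period
 * (rib_conn_timeout - idle_time seconds), not for a full interval.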
4485 */ 4486 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4487 SEC_TO_TICK(rib_conn_timeout - idle_time)); 4488 mutex_exit(&conn->c_lock); 4489 return; 4490 } 4491 4492 error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close, 4493 (void *)conn, DDI_NOSLEEP); 4494 4495 /* 4496 * If taskq dispatch fails above, then reset the timeout 4497 * to try again after 10 secs. 4498 */ 4499 4500 if (error != DDI_SUCCESS) { 4501 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4502 SEC_TO_TICK(RDMA_CONN_REAP_RETRY)); 4503 mutex_exit(&conn->c_lock); 4504 return; 4505 } 4506 4507 conn->c_state = C_DISCONN_PEND; 4508 mutex_exit(&conn->c_lock); 4509 } 4510 4511 static rdma_stat 4512 rib_conn_release(CONN *conn) 4513 { 4514 mutex_enter(&conn->c_lock); 4515 return (rib_conn_release_locked(conn)); 4516 } 4517 4518 /* 4519 * Expects conn->c_lock to be held on entry. 4520 * c_lock released on return 4521 */ 4522 static rdma_stat 4523 rib_conn_release_locked(CONN *conn) 4524 { 4525 conn->c_ref--; 4526 4527 conn->c_last_used = gethrestime_sec(); 4528 if (conn->c_ref > 0) { 4529 mutex_exit(&conn->c_lock); 4530 return (RDMA_SUCCESS); 4531 } 4532 4533 /* 4534 * If a conn is C_ERROR_CONN, close the channel. 4535 */ 4536 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) { 4537 conn->c_state = C_DISCONN_PEND; 4538 mutex_exit(&conn->c_lock); 4539 rib_conn_close((void *)conn); 4540 return (RDMA_SUCCESS); 4541 } 4542 4543 /* 4544 * c_ref == 0, set a timeout for conn release 4545 */ 4546 4547 if (conn->c_timeout == NULL) { 4548 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4549 SEC_TO_TICK(rib_conn_timeout)); 4550 } 4551 4552 mutex_exit(&conn->c_lock); 4553 return (RDMA_SUCCESS); 4554 } 4555 4556 /* 4557 * Add at front of list 4558 */ 4559 static struct rdma_done_list * 4560 rdma_done_add(rib_qp_t *qp, uint32_t xid) 4561 { 4562 struct rdma_done_list *rd; 4563 4564 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4565 4566 rd = kmem_alloc(sizeof (*rd), KM_SLEEP); 4567 rd->xid = xid; 4568 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL); 4569 4570 rd->prev = NULL; 4571 rd->next = qp->rdlist; 4572 if (qp->rdlist != NULL) 4573 qp->rdlist->prev = rd; 4574 qp->rdlist = rd; 4575 4576 return (rd); 4577 } 4578 4579 static void 4580 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd) 4581 { 4582 struct rdma_done_list *r; 4583 4584 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4585 4586 r = rd->next; 4587 if (r != NULL) { 4588 r->prev = rd->prev; 4589 } 4590 4591 r = rd->prev; 4592 if (r != NULL) { 4593 r->next = rd->next; 4594 } else { 4595 qp->rdlist = rd->next; 4596 } 4597 4598 cv_destroy(&rd->rdma_done_cv); 4599 kmem_free(rd, sizeof (*rd)); 4600 } 4601 4602 static void 4603 rdma_done_rem_list(rib_qp_t *qp) 4604 { 4605 struct rdma_done_list *r, *n; 4606 4607 mutex_enter(&qp->rdlist_lock); 4608 for (r = qp->rdlist; r != NULL; r = n) { 4609 n = r->next; 4610 rdma_done_rm(qp, r); 4611 } 4612 mutex_exit(&qp->rdlist_lock); 4613 } 4614 4615 static void 4616 rdma_done_notify(rib_qp_t *qp, uint32_t xid) 4617 { 4618 struct rdma_done_list *r = qp->rdlist; 4619 4620 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4621 4622 while (r) { 4623 if (r->xid == xid) { 4624 cv_signal(&r->rdma_done_cv); 4625 return; 4626 } else { 4627 r = r->next; 4628 } 4629 } 4630 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid, 4631 int, xid); 4632 } 4633 4634 /* 4635 * Expects conn->c_lock to be held by the caller. 
4636 */ 4637 4638 static void 4639 rib_close_a_channel(CONN *conn) 4640 { 4641 rib_qp_t *qp; 4642 qp = ctoqp(conn); 4643 4644 if (qp->qp_hdl == NULL) { 4645 /* channel already freed */ 4646 return; 4647 } 4648 4649 /* 4650 * Call ibt_close_rc_channel in blocking mode 4651 * with no callbacks. 4652 */ 4653 (void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS, 4654 NULL, 0, NULL, NULL, 0); 4655 } 4656 4657 /* 4658 * Goes through all connections and closes the channel 4659 * This will cause all the WRs on those channels to be 4660 * flushed. 4661 */ 4662 static void 4663 rib_close_channels(rib_conn_list_t *connlist) 4664 { 4665 CONN *conn, *tmp; 4666 4667 rw_enter(&connlist->conn_lock, RW_READER); 4668 conn = connlist->conn_hd; 4669 while (conn != NULL) { 4670 mutex_enter(&conn->c_lock); 4671 tmp = conn->c_next; 4672 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { 4673 4674 if (conn->c_state == C_CONN_PEND) { 4675 conn->c_flags |= C_CLOSE_PENDING; 4676 goto next; 4677 } 4678 4679 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); 4680 4681 /* 4682 * Live connection in CONNECTED state. 4683 */ 4684 if (conn->c_state == C_CONNECTED) 4685 conn->c_state = C_ERROR_CONN; 4686 mutex_exit(&conn->c_lock); 4687 4688 rib_close_a_channel(conn); 4689 4690 mutex_enter(&conn->c_lock); 4691 conn->c_flags &= ~C_CLOSE_PENDING; 4692 /* Signal a pending rib_disconnect_channel() */ 4693 cv_signal(&conn->c_cv); 4694 } 4695 next: 4696 mutex_exit(&conn->c_lock); 4697 conn = tmp; 4698 } 4699 rw_exit(&connlist->conn_lock); 4700 } 4701 4702 /* 4703 * Frees up all connections that are no longer being referenced 4704 */ 4705 static void 4706 rib_purge_connlist(rib_conn_list_t *connlist) 4707 { 4708 CONN *conn; 4709 4710 top: 4711 rw_enter(&connlist->conn_lock, RW_READER); 4712 conn = connlist->conn_hd; 4713 while (conn != NULL) { 4714 mutex_enter(&conn->c_lock); 4715 4716 /* 4717 * At this point connection is either in ERROR 4718 * or DISCONN_PEND state. If in DISCONN_PEND state 4719 * then some other thread is culling that connection. 4720 * If not and if c_ref is 0, then destroy the connection. 4721 */ 4722 if (conn->c_ref == 0 && 4723 conn->c_state != C_DISCONN_PEND) { 4724 /* 4725 * Cull the connection 4726 */ 4727 conn->c_state = C_DISCONN_PEND; 4728 mutex_exit(&conn->c_lock); 4729 rw_exit(&connlist->conn_lock); 4730 (void) rib_disconnect_channel(conn, connlist); 4731 goto top; 4732 } else { 4733 /* 4734 * conn disconnect already scheduled or will 4735 * happen from conn_release when c_ref drops to 0. 4736 */ 4737 mutex_exit(&conn->c_lock); 4738 } 4739 conn = conn->c_next; 4740 } 4741 rw_exit(&connlist->conn_lock); 4742 4743 /* 4744 * At this point, only connections with c_ref != 0 are on the list 4745 */ 4746 } 4747 4748 /* 4749 * Free all the HCA resources and close 4750 * the hca. 
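 * This frees the client and server CQ handles, destroys the send and
 * receive buffer pools and the server-side buffer cache, frees the
 * protection domain and finally closes the HCA handle.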
4751 */ 4752 4753 static void 4754 rib_free_hca(rib_hca_t *hca) 4755 { 4756 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl); 4757 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl); 4758 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl); 4759 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl); 4760 4761 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t)); 4762 kmem_free(hca->clnt_scq, sizeof (rib_cq_t)); 4763 kmem_free(hca->svc_rcq, sizeof (rib_cq_t)); 4764 kmem_free(hca->svc_scq, sizeof (rib_cq_t)); 4765 4766 rib_rbufpool_destroy(hca, RECV_BUFFER); 4767 rib_rbufpool_destroy(hca, SEND_BUFFER); 4768 rib_destroy_cache(hca); 4769 if (rib_mod.rdma_count == 0) 4770 (void) rdma_unregister_mod(&rib_mod); 4771 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 4772 (void) ibt_close_hca(hca->hca_hdl); 4773 hca->hca_hdl = NULL; 4774 } 4775 4776 4777 static void 4778 rib_stop_hca_services(rib_hca_t *hca) 4779 { 4780 rib_stop_services(hca); 4781 rib_close_channels(&hca->cl_conn_list); 4782 rib_close_channels(&hca->srv_conn_list); 4783 4784 rib_purge_connlist(&hca->cl_conn_list); 4785 rib_purge_connlist(&hca->srv_conn_list); 4786 4787 if ((rib_stat->hcas_list == NULL) && stats_enabled) { 4788 kstat_delete_byname_zone("unix", 0, "rpcib_cache", 4789 GLOBAL_ZONEID); 4790 stats_enabled = FALSE; 4791 } 4792 4793 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 4794 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4795 if (hca->srv_conn_list.conn_hd == NULL && 4796 hca->cl_conn_list.conn_hd == NULL) { 4797 /* 4798 * conn_lists are NULL, so destroy 4799 * buffers, close hca and be done. 4800 */ 4801 rib_free_hca(hca); 4802 } 4803 rw_exit(&hca->cl_conn_list.conn_lock); 4804 rw_exit(&hca->srv_conn_list.conn_lock); 4805 4806 if (hca->hca_hdl != NULL) { 4807 mutex_enter(&hca->inuse_lock); 4808 while (hca->inuse) 4809 cv_wait(&hca->cb_cv, &hca->inuse_lock); 4810 mutex_exit(&hca->inuse_lock); 4811 4812 rib_free_hca(hca); 4813 } 4814 rw_destroy(&hca->bound_services_lock); 4815 4816 if (hca->cleanup_helper != NULL) { 4817 ddi_taskq_destroy(hca->cleanup_helper); 4818 hca->cleanup_helper = NULL; 4819 } 4820 } 4821 4822 /* 4823 * Cleans and closes up all uses of the HCA 4824 */ 4825 static void 4826 rib_detach_hca(ibt_hca_hdl_t hca_hdl) 4827 { 4828 rib_hca_t *hca = NULL; 4829 rib_hca_t **hcap; 4830 4831 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); 4832 for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) { 4833 hca = *hcap; 4834 rw_enter(&hca->state_lock, RW_WRITER); 4835 if (hca->hca_hdl == hca_hdl) { 4836 /* 4837 * Mark as detached and remove from 4838 * hca list. 
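 * The rib_hca_t itself is freed only after rib_stop_hca_services()
 * below has stopped its services and torn down its connections.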
4839 */ 4840 hca->state = HCA_DETACHED; 4841 *hcap = hca->next; 4842 rib_stat->nhca_inited--; 4843 rib_mod.rdma_count--; 4844 rw_exit(&hca->state_lock); 4845 break; 4846 } 4847 rw_exit(&hca->state_lock); 4848 } 4849 rw_exit(&rib_stat->hcas_list_lock); 4850 4851 if (hca == NULL) 4852 return; 4853 ASSERT(hca->hca_hdl == hca_hdl); 4854 4855 /* 4856 * Stop all services on the HCA 4857 * Go through cl_conn_list and close all rc_channels 4858 * Go through svr_conn_list and close all rc_channels 4859 * Free connections whose c_ref has dropped to 0 4860 * Destroy all CQs 4861 * Deregister and released all buffer pool memory after all 4862 * connections are destroyed 4863 * Free the protection domain 4864 * ibt_close_hca() 4865 */ 4866 rib_stop_hca_services(hca); 4867 4868 kmem_free(hca, sizeof (*hca)); 4869 } 4870 4871 static void 4872 rib_server_side_cache_reclaim(void *argp) 4873 { 4874 cache_avl_struct_t *rcas; 4875 rib_lrc_entry_t *rb; 4876 rib_hca_t *hca = (rib_hca_t *)argp; 4877 4878 rw_enter(&hca->avl_rw_lock, RW_WRITER); 4879 rcas = avl_first(&hca->avl_tree); 4880 if (rcas != NULL) 4881 avl_remove(&hca->avl_tree, rcas); 4882 4883 while (rcas != NULL) { 4884 while (rcas->r.forw != &rcas->r) { 4885 rcas->elements--; 4886 rb = rcas->r.forw; 4887 remque(rb); 4888 if (rb->registered) 4889 (void) rib_deregistermem_via_hca(hca, 4890 rb->lrc_buf, rb->lrc_mhandle); 4891 4892 hca->cache_allocation -= rb->lrc_len; 4893 kmem_free(rb->lrc_buf, rb->lrc_len); 4894 kmem_free(rb, sizeof (rib_lrc_entry_t)); 4895 } 4896 mutex_destroy(&rcas->node_lock); 4897 kmem_cache_free(hca->server_side_cache, rcas); 4898 rcas = avl_first(&hca->avl_tree); 4899 if (rcas != NULL) 4900 avl_remove(&hca->avl_tree, rcas); 4901 } 4902 rw_exit(&hca->avl_rw_lock); 4903 } 4904 4905 static void 4906 rib_server_side_cache_cleanup(void *argp) 4907 { 4908 cache_avl_struct_t *rcas; 4909 rib_lrc_entry_t *rb; 4910 rib_hca_t *hca = (rib_hca_t *)argp; 4911 4912 mutex_enter(&hca->cache_allocation_lock); 4913 if (hca->cache_allocation < cache_limit) { 4914 mutex_exit(&hca->cache_allocation_lock); 4915 return; 4916 } 4917 mutex_exit(&hca->cache_allocation_lock); 4918 4919 rw_enter(&hca->avl_rw_lock, RW_WRITER); 4920 rcas = avl_last(&hca->avl_tree); 4921 if (rcas != NULL) 4922 avl_remove(&hca->avl_tree, rcas); 4923 4924 while (rcas != NULL) { 4925 while (rcas->r.forw != &rcas->r) { 4926 rcas->elements--; 4927 rb = rcas->r.forw; 4928 remque(rb); 4929 if (rb->registered) 4930 (void) rib_deregistermem_via_hca(hca, 4931 rb->lrc_buf, rb->lrc_mhandle); 4932 4933 hca->cache_allocation -= rb->lrc_len; 4934 4935 kmem_free(rb->lrc_buf, rb->lrc_len); 4936 kmem_free(rb, sizeof (rib_lrc_entry_t)); 4937 } 4938 mutex_destroy(&rcas->node_lock); 4939 if (hca->server_side_cache) { 4940 kmem_cache_free(hca->server_side_cache, rcas); 4941 } 4942 4943 if (hca->cache_allocation < cache_limit) { 4944 rw_exit(&hca->avl_rw_lock); 4945 return; 4946 } 4947 4948 rcas = avl_last(&hca->avl_tree); 4949 if (rcas != NULL) 4950 avl_remove(&hca->avl_tree, rcas); 4951 } 4952 rw_exit(&hca->avl_rw_lock); 4953 } 4954 4955 static int 4956 avl_compare(const void *t1, const void *t2) 4957 { 4958 if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len) 4959 return (0); 4960 4961 if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len) 4962 return (-1); 4963 4964 return (1); 4965 } 4966 4967 static void 4968 rib_destroy_cache(rib_hca_t *hca) 4969 { 4970 if (hca->avl_init) { 4971 rib_server_side_cache_reclaim((void *)hca); 4972 if (hca->server_side_cache) { 
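/* the reclaim above returned every cached entry, so the kmem cache can be destroyed safely */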
4973 kmem_cache_destroy(hca->server_side_cache); 4974 hca->server_side_cache = NULL; 4975 } 4976 avl_destroy(&hca->avl_tree); 4977 mutex_destroy(&hca->cache_allocation_lock); 4978 rw_destroy(&hca->avl_rw_lock); 4979 } 4980 hca->avl_init = FALSE; 4981 } 4982 4983 static void 4984 rib_force_cleanup(void *hca) 4985 { 4986 if (((rib_hca_t *)hca)->cleanup_helper != NULL) 4987 (void) ddi_taskq_dispatch( 4988 ((rib_hca_t *)hca)->cleanup_helper, 4989 rib_server_side_cache_cleanup, 4990 (void *)hca, DDI_NOSLEEP); 4991 } 4992 4993 static rib_lrc_entry_t * 4994 rib_get_cache_buf(CONN *conn, uint32_t len) 4995 { 4996 cache_avl_struct_t cas, *rcas; 4997 rib_hca_t *hca = (ctoqp(conn))->hca; 4998 rib_lrc_entry_t *reply_buf; 4999 avl_index_t where = NULL; 5000 uint64_t c_alloc = 0; 5001 5002 if (!hca->avl_init) 5003 goto error_alloc; 5004 5005 cas.len = len; 5006 5007 rw_enter(&hca->avl_rw_lock, RW_READER); 5008 5009 mutex_enter(&hca->cache_allocation_lock); 5010 c_alloc = hca->cache_allocation; 5011 mutex_exit(&hca->cache_allocation_lock); 5012 5013 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas, 5014 &where)) == NULL) { 5015 /* Am I above the cache limit */ 5016 if ((c_alloc + len) >= cache_limit) { 5017 rib_force_cleanup((void *)hca); 5018 rw_exit(&hca->avl_rw_lock); 5019 mutex_enter(&hca->cache_allocation_lock); 5020 hca->cache_misses_above_the_limit ++; 5021 mutex_exit(&hca->cache_allocation_lock); 5022 5023 /* Allocate and register the buffer directly */ 5024 goto error_alloc; 5025 } 5026 5027 rw_exit(&hca->avl_rw_lock); 5028 rw_enter(&hca->avl_rw_lock, RW_WRITER); 5029 5030 /* Recheck to make sure no other thread added the entry in */ 5031 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, 5032 &cas, &where)) == NULL) { 5033 /* Allocate an avl tree entry */ 5034 rcas = (cache_avl_struct_t *) 5035 kmem_cache_alloc(hca->server_side_cache, KM_SLEEP); 5036 5037 bzero(rcas, sizeof (cache_avl_struct_t)); 5038 rcas->elements = 0; 5039 rcas->r.forw = &rcas->r; 5040 rcas->r.back = &rcas->r; 5041 rcas->len = len; 5042 mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL); 5043 avl_insert(&hca->avl_tree, rcas, where); 5044 } 5045 } 5046 5047 mutex_enter(&rcas->node_lock); 5048 5049 if (rcas->r.forw != &rcas->r && rcas->elements > 0) { 5050 reply_buf = rcas->r.forw; 5051 remque(reply_buf); 5052 rcas->elements--; 5053 mutex_exit(&rcas->node_lock); 5054 rw_exit(&hca->avl_rw_lock); 5055 5056 mutex_enter(&hca->cache_allocation_lock); 5057 hca->cache_hits++; 5058 hca->cache_allocation -= len; 5059 mutex_exit(&hca->cache_allocation_lock); 5060 } else { 5061 /* Am I above the cache limit */ 5062 mutex_exit(&rcas->node_lock); 5063 if ((c_alloc + len) >= cache_limit) { 5064 rib_force_cleanup((void *)hca); 5065 rw_exit(&hca->avl_rw_lock); 5066 5067 mutex_enter(&hca->cache_allocation_lock); 5068 hca->cache_misses_above_the_limit++; 5069 mutex_exit(&hca->cache_allocation_lock); 5070 /* Allocate and register the buffer directly */ 5071 goto error_alloc; 5072 } 5073 rw_exit(&hca->avl_rw_lock); 5074 mutex_enter(&hca->cache_allocation_lock); 5075 hca->cache_misses++; 5076 mutex_exit(&hca->cache_allocation_lock); 5077 /* Allocate a reply_buf entry */ 5078 reply_buf = (rib_lrc_entry_t *) 5079 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP); 5080 bzero(reply_buf, sizeof (rib_lrc_entry_t)); 5081 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP); 5082 reply_buf->lrc_len = len; 5083 reply_buf->registered = FALSE; 5084 reply_buf->avl_node = (void *)rcas; 5085 } 5086 5087 return (reply_buf); 5088 5089 
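/*
 * Fall-back path, taken when the cache is not initialized or the cache
 * limit has been exceeded: hand back a freshly allocated buffer that is
 * not yet registered and bypasses the AVL cache.
 */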
error_alloc: 5090 reply_buf = (rib_lrc_entry_t *) 5091 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP); 5092 bzero(reply_buf, sizeof (rib_lrc_entry_t)); 5093 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP); 5094 reply_buf->lrc_len = len; 5095 reply_buf->registered = FALSE; 5096 reply_buf->avl_node = NULL; 5097 5098 return (reply_buf); 5099 } 5100 5101 /* 5102 * Return a pre-registered buffer back to the cache (without 5103 * unregistering it). 5104 */ 5105 5106 static void 5107 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf) 5108 { 5109 cache_avl_struct_t cas, *rcas; 5110 avl_index_t where = NULL; 5111 rib_hca_t *hca = (ctoqp(conn))->hca; 5112 5113 if (!hca->avl_init) 5114 goto error_free; 5115 5116 cas.len = reg_buf->lrc_len; 5117 rw_enter(&hca->avl_rw_lock, RW_READER); 5118 if ((rcas = (cache_avl_struct_t *) 5119 avl_find(&hca->avl_tree, &cas, &where)) == NULL) { 5120 rw_exit(&hca->avl_rw_lock); 5121 goto error_free; 5122 } else { 5123 cas.len = reg_buf->lrc_len; 5124 mutex_enter(&rcas->node_lock); 5125 insque(reg_buf, &rcas->r); 5126 rcas->elements ++; 5127 mutex_exit(&rcas->node_lock); 5128 rw_exit(&hca->avl_rw_lock); 5129 mutex_enter(&hca->cache_allocation_lock); 5130 hca->cache_allocation += cas.len; 5131 mutex_exit(&hca->cache_allocation_lock); 5132 } 5133 5134 return; 5135 5136 error_free: 5137 5138 if (reg_buf->registered) 5139 (void) rib_deregistermem_via_hca(hca, 5140 reg_buf->lrc_buf, reg_buf->lrc_mhandle); 5141 kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len); 5142 kmem_free(reg_buf, sizeof (rib_lrc_entry_t)); 5143 } 5144 5145 static rdma_stat 5146 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf, 5147 uint_t buflen, struct mrc *buf_handle) 5148 { 5149 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 5150 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 5151 rdma_stat status; 5152 5153 5154 /* 5155 * Note: ALL buffer pools use the same memory type RDMARW. 5156 */ 5157 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 5158 if (status == RDMA_SUCCESS) { 5159 buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl; 5160 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 5161 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 5162 } else { 5163 buf_handle->mrc_linfo = NULL; 5164 buf_handle->mrc_lmr = 0; 5165 buf_handle->mrc_rmr = 0; 5166 } 5167 return (status); 5168 } 5169 5170 /* ARGSUSED */ 5171 static rdma_stat 5172 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf, 5173 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle) 5174 { 5175 5176 (void) rib_deregistermem_via_hca(hca, buf, buf_handle); 5177 return (RDMA_SUCCESS); 5178 } 5179 5180 /* ARGSUSED */ 5181 static rdma_stat 5182 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle) 5183 { 5184 5185 (void) ibt_deregister_mr(hca->hca_hdl, 5186 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 5187 return (RDMA_SUCCESS); 5188 } 5189 5190 /* 5191 * Check if the IP interface named by `lifrp' is RDMA-capable. 5192 */ 5193 static boolean_t 5194 rpcib_rdma_capable_interface(struct lifreq *lifrp) 5195 { 5196 char ifname[LIFNAMSIZ]; 5197 char *cp; 5198 5199 if (lifrp->lifr_type == IFT_IB) 5200 return (B_TRUE); 5201 5202 /* 5203 * Strip off the logical interface portion before getting 5204 * intimate with the name.
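 * e.g. a logical name such as "lo0:1" is reduced to "lo0" before
 * the comparison below.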
5205 */ 5206 (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); 5207 if ((cp = strchr(ifname, ':')) != NULL) 5208 *cp = '\0'; 5209 5210 return (strcmp("lo0", ifname) == 0); 5211 } 5212 5213 static int 5214 rpcib_do_ip_ioctl(int cmd, int len, void *arg) 5215 { 5216 vnode_t *kkvp, *vp; 5217 TIUSER *tiptr; 5218 struct strioctl iocb; 5219 k_sigset_t smask; 5220 int err = 0; 5221 5222 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) { 5223 if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE, 5224 &tiptr, CRED()) == 0) { 5225 vp = tiptr->fp->f_vnode; 5226 } else { 5227 VN_RELE(kkvp); 5228 return (EPROTO); 5229 } 5230 } else { 5231 return (EPROTO); 5232 } 5233 5234 iocb.ic_cmd = cmd; 5235 iocb.ic_timout = 0; 5236 iocb.ic_len = len; 5237 iocb.ic_dp = (caddr_t)arg; 5238 sigintr(&smask, 0); 5239 err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb); 5240 sigunintr(&smask); 5241 (void) t_kclose(tiptr, 0); 5242 VN_RELE(kkvp); 5243 return (err); 5244 } 5245 5246 /* 5247 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'. 5248 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes. 5249 */ 5250 static int 5251 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep) 5252 { 5253 int err; 5254 struct lifnum lifn; 5255 5256 bzero(&lifn, sizeof (struct lifnum)); 5257 lifn.lifn_family = AF_UNSPEC; 5258 5259 err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn); 5260 if (err != 0) 5261 return (err); 5262 5263 /* 5264 * Pad the interface count to account for additional interfaces that 5265 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 5266 */ 5267 lifn.lifn_count += 4; 5268 5269 bzero(lifcp, sizeof (struct lifconf)); 5270 lifcp->lifc_family = AF_UNSPEC; 5271 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 5272 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 5273 5274 err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp); 5275 if (err != 0) { 5276 kmem_free(lifcp->lifc_buf, *bufsizep); 5277 return (err); 5278 } 5279 return (0); 5280 } 5281 5282 static boolean_t 5283 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6) 5284 { 5285 uint_t i, nifs; 5286 uint_t bufsize; 5287 struct lifconf lifc; 5288 struct lifreq *lifrp; 5289 struct sockaddr_in *sinp; 5290 struct sockaddr_in6 *sin6p; 5291 5292 bzero(addrs4, sizeof (rpcib_ipaddrs_t)); 5293 bzero(addrs6, sizeof (rpcib_ipaddrs_t)); 5294 5295 if (rpcib_do_lifconf(&lifc, &bufsize) != 0) 5296 return (B_FALSE); 5297 5298 if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) { 5299 kmem_free(lifc.lifc_buf, bufsize); 5300 return (B_FALSE); 5301 } 5302 5303 /* 5304 * Worst case is that all of the addresses are IB-capable and have 5305 * the same address family, so size our buffers accordingly. 
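 * i.e. each list is sized to hold one sockaddr for every interface
 * reported by SIOCGLIFCONF.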
5306 */ 5307 addrs4->ri_size = nifs * sizeof (struct sockaddr_in); 5308 addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP); 5309 addrs6->ri_size = nifs * sizeof (struct sockaddr_in6); 5310 addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP); 5311 5312 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { 5313 if (!rpcib_rdma_capable_interface(lifrp)) 5314 continue; 5315 5316 if (lifrp->lifr_addr.ss_family == AF_INET) { 5317 sinp = addrs4->ri_list; 5318 bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++], 5319 sizeof (struct sockaddr_in)); 5320 } else if (lifrp->lifr_addr.ss_family == AF_INET6) { 5321 sin6p = addrs6->ri_list; 5322 bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++], 5323 sizeof (struct sockaddr_in6)); 5324 } 5325 } 5326 5327 kmem_free(lifc.lifc_buf, bufsize); 5328 return (B_TRUE); 5329 } 5330 5331 /* ARGSUSED */ 5332 static int 5333 rpcib_cache_kstat_update(kstat_t *ksp, int rw) 5334 { 5335 rib_hca_t *hca; 5336 5337 if (KSTAT_WRITE == rw) { 5338 return (EACCES); 5339 } 5340 5341 rpcib_kstat.cache_limit.value.ui64 = 5342 (uint64_t)cache_limit; 5343 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 5344 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 5345 rpcib_kstat.cache_allocation.value.ui64 += 5346 (uint64_t)hca->cache_allocation; 5347 rpcib_kstat.cache_hits.value.ui64 += 5348 (uint64_t)hca->cache_hits; 5349 rpcib_kstat.cache_misses.value.ui64 += 5350 (uint64_t)hca->cache_misses; 5351 rpcib_kstat.cache_misses_above_the_limit.value.ui64 += 5352 (uint64_t)hca->cache_misses_above_the_limit; 5353 } 5354 rw_exit(&rib_stat->hcas_list_lock); 5355 return (0); 5356 } 5357
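/*
 * Note: the aggregated cache counters above can be inspected from user
 * space with kstat(1M); an illustrative invocation is:
 *	kstat -m unix -i 0 -n rpcib_cache
 */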