1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 2007, The Ohio State University. All rights reserved. 28 * 29 * Portions of this source code were developed by the team members of 30 * The Ohio State University's Network-Based Computing Laboratory (NBCL), 31 * headed by Professor Dhabaleswar K. (DK) Panda. 32 * 33 * Acknowledgements to contributions from developers: 34 * Ranjit Noronha: noronha@cse.ohio-state.edu 35 * Lei Chai : chail@cse.ohio-state.edu 36 * Weikuan Yu : yuw@cse.ohio-state.edu 37 * 38 */ 39 40 /* 41 * The rpcib plugin. Implements the interface for RDMATF's 42 * interaction with IBTF. 43 */ 44 45 #include <sys/param.h> 46 #include <sys/types.h> 47 #include <sys/user.h> 48 #include <sys/systm.h> 49 #include <sys/sysmacros.h> 50 #include <sys/proc.h> 51 #include <sys/socket.h> 52 #include <sys/file.h> 53 #include <sys/stream.h> 54 #include <sys/strsubr.h> 55 #include <sys/stropts.h> 56 #include <sys/errno.h> 57 #include <sys/kmem.h> 58 #include <sys/debug.h> 59 #include <sys/pathname.h> 60 #include <sys/kstat.h> 61 #include <sys/t_lock.h> 62 #include <sys/ddi.h> 63 #include <sys/cmn_err.h> 64 #include <sys/time.h> 65 #include <sys/isa_defs.h> 66 #include <sys/callb.h> 67 #include <sys/sunddi.h> 68 #include <sys/sunndi.h> 69 #include <sys/sdt.h> 70 #include <sys/ib/ibtl/ibti.h> 71 #include <rpc/rpc.h> 72 #include <rpc/ib.h> 73 #include <sys/modctl.h> 74 #include <sys/kstr.h> 75 #include <sys/sockio.h> 76 #include <sys/vnode.h> 77 #include <sys/tiuser.h> 78 #include <net/if.h> 79 #include <net/if_types.h> 80 #include <sys/cred.h> 81 #include <rpc/rpc_rdma.h> 82 #include <nfs/nfs.h> 83 #include <sys/atomic.h> 84 85 #define NFS_RDMA_PORT 20049 86 87 88 /* 89 * Convenience structures for connection management 90 */ 91 typedef struct rpcib_ipaddrs { 92 void *ri_list; /* pointer to list of addresses */ 93 uint_t ri_count; /* number of addresses in list */ 94 uint_t ri_size; /* size of ri_list in bytes */ 95 } rpcib_ipaddrs_t; 96 97 98 typedef struct rpcib_ping { 99 rib_hca_t *hca; 100 ibt_path_info_t path; 101 ibt_ip_addr_t srcip; 102 ibt_ip_addr_t dstip; 103 } rpcib_ping_t; 104 105 /* 106 * Prototype declarations for driver ops 107 */ 108 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t); 109 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t, 110 void *, void **); 111 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t); 112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *); 113 static int rpcib_do_ip_ioctl(int, int, void *); 114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *); 115 static int
rpcib_cache_kstat_update(kstat_t *, int); 116 static void rib_force_cleanup(void *); 117 static void rib_stop_hca_services(rib_hca_t *); 118 static void rib_attach_hca(void); 119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, 120 struct netbuf *d_svcaddr, CONN **conn); 121 122 struct { 123 kstat_named_t cache_limit; 124 kstat_named_t cache_allocation; 125 kstat_named_t cache_hits; 126 kstat_named_t cache_misses; 127 kstat_named_t cache_misses_above_the_limit; 128 } rpcib_kstat = { 129 {"cache_limit", KSTAT_DATA_UINT64 }, 130 {"cache_allocation", KSTAT_DATA_UINT64 }, 131 {"cache_hits", KSTAT_DATA_UINT64 }, 132 {"cache_misses", KSTAT_DATA_UINT64 }, 133 {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 }, 134 }; 135 136 /* rpcib cb_ops */ 137 static struct cb_ops rpcib_cbops = { 138 nulldev, /* open */ 139 nulldev, /* close */ 140 nodev, /* strategy */ 141 nodev, /* print */ 142 nodev, /* dump */ 143 nodev, /* read */ 144 nodev, /* write */ 145 nodev, /* ioctl */ 146 nodev, /* devmap */ 147 nodev, /* mmap */ 148 nodev, /* segmap */ 149 nochpoll, /* poll */ 150 ddi_prop_op, /* prop_op */ 151 NULL, /* stream */ 152 D_MP, /* cb_flag */ 153 CB_REV, /* rev */ 154 nodev, /* int (*cb_aread)() */ 155 nodev /* int (*cb_awrite)() */ 156 }; 157 158 /* 159 * Device options 160 */ 161 static struct dev_ops rpcib_ops = { 162 DEVO_REV, /* devo_rev, */ 163 0, /* refcnt */ 164 rpcib_getinfo, /* info */ 165 nulldev, /* identify */ 166 nulldev, /* probe */ 167 rpcib_attach, /* attach */ 168 rpcib_detach, /* detach */ 169 nodev, /* reset */ 170 &rpcib_cbops, /* driver ops - devctl interfaces */ 171 NULL, /* bus operations */ 172 NULL, /* power */ 173 ddi_quiesce_not_needed, /* quiesce */ 174 }; 175 176 /* 177 * Module linkage information. 178 */ 179 180 static struct modldrv rib_modldrv = { 181 &mod_driverops, /* Driver module */ 182 "RPCIB plugin driver", /* Driver name and version */ 183 &rpcib_ops, /* Driver ops */ 184 }; 185 186 static struct modlinkage rib_modlinkage = { 187 MODREV_1, 188 (void *)&rib_modldrv, 189 NULL 190 }; 191 192 typedef struct rib_lrc_entry { 193 struct rib_lrc_entry *forw; 194 struct rib_lrc_entry *back; 195 char *lrc_buf; 196 197 uint32_t lrc_len; 198 void *avl_node; 199 bool_t registered; 200 201 struct mrc lrc_mhandle; 202 bool_t lrc_on_freed_list; 203 } rib_lrc_entry_t; 204 205 typedef struct cache_struct { 206 rib_lrc_entry_t r; 207 uint32_t len; 208 uint32_t elements; 209 kmutex_t node_lock; 210 avl_node_t avl_link; 211 } cache_avl_struct_t; 212 213 uint64_t cache_limit = 100 * 1024 * 1024; 214 static uint64_t cache_watermark = 80 * 1024 * 1024; 215 static bool_t stats_enabled = FALSE; 216 217 static uint64_t max_unsignaled_rws = 5; 218 int nfs_rdma_port = NFS_RDMA_PORT; 219 220 #define RIBNETID_TCP "tcp" 221 #define RIBNETID_TCP6 "tcp6" 222 223 /* 224 * rib_stat: private data pointer used when registering 225 * with the IBTF. It is returned to the consumer 226 * in all callbacks. 227 */ 228 static rpcib_state_t *rib_stat = NULL; 229 230 #define RNR_RETRIES IBT_RNR_RETRY_1 231 #define MAX_PORTS 2 232 #define RDMA_DUMMY_WRID 0x4D3A1D4D3A1D 233 #define RDMA_CONN_REAP_RETRY 10 /* 10 secs */ 234 235 int preposted_rbufs = RDMA_BUFS_GRANT; 236 int send_threshold = 1; 237 238 /* 239 * Old cards with Tavor driver have limited memory footprint 240 * when booted in 32bit. The rib_max_rbufs tunable can be 241 * tuned for more buffers if needed. 
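 * For example (illustrative value), the limit can be raised by adding the line "set rpcib:rib_max_rbufs = 0x400" to /etc/system.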
242 */ 243 244 #if !defined(_ELF64) && !defined(__sparc) 245 int rib_max_rbufs = MAX_BUFS; 246 #else 247 int rib_max_rbufs = 10 * MAX_BUFS; 248 #endif /* !(_ELF64) && !(__sparc) */ 249 250 int rib_conn_timeout = 60 * 12; /* 12 minutes */ 251 252 /* 253 * State of the plugin. 254 * ACCEPT = accepting new connections and requests. 255 * NO_ACCEPT = not accepting new connections and requests. 256 * This should eventually move to the rpcib_state_t structure, since this 257 * will tell which state the plugin is in for a particular type of service 258 * like NFS, NLM or the v4 Callback daemon. The plugin might be in accept 259 * state for one and in no_accept state for the other. 260 */ 261 int plugin_state; 262 kmutex_t plugin_state_lock; 263 264 ldi_ident_t rpcib_li; 265 266 /* 267 * RPCIB RDMATF operations 268 */ 269 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle); 270 static rdma_stat rib_disconnect(CONN *conn); 271 static void rib_listen(struct rdma_svc_data *rd); 272 static void rib_listen_stop(struct rdma_svc_data *rd); 273 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, 274 uint_t buflen, struct mrc *buf_handle); 275 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf, 276 struct mrc buf_handle); 277 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, 278 caddr_t buf, uint_t buflen, struct mrc *buf_handle); 279 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, 280 struct mrc buf_handle); 281 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, 282 uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, 283 void *lrc); 284 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf, 285 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *); 286 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, 287 caddr_t buf, int len, int cpu); 288 289 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf); 290 291 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf); 292 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *); 293 294 static void rib_rbuf_free(CONN *conn, int ptype, void *buf); 295 296 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid); 297 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid); 298 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid); 299 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid); 300 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl); 301 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid); 302 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait); 303 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait); 304 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *); 305 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *, 306 int addr_type, void *, CONN **); 307 static rdma_stat rib_conn_release(CONN *conn); 308 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int, 309 rpcib_ping_t *, CONN **); 310 static rdma_stat rib_getinfo(rdma_info_t *info); 311 312 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len); 313 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf); 314 static void rib_destroy_cache(rib_hca_t *hca); 315 static void rib_server_side_cache_reclaim(void *argp); 316 static int avl_compare(const void *t1, const void *t2); 317 318 static void rib_stop_services(rib_hca_t
*); 319 static void rib_close_channels(rib_conn_list_t *); 320 static void rib_conn_close(void *); 321 322 /* 323 * RPCIB addressing operations 324 */ 325 326 /* 327 * RDMA operations the RPCIB module exports 328 */ 329 static rdmaops_t rib_ops = { 330 rib_reachable, 331 rib_conn_get, 332 rib_conn_release, 333 rib_listen, 334 rib_listen_stop, 335 rib_registermem, 336 rib_deregistermem, 337 rib_registermemsync, 338 rib_deregistermemsync, 339 rib_syncmem, 340 rib_reg_buf_alloc, 341 rib_reg_buf_free, 342 rib_send, 343 rib_send_resp, 344 rib_post_resp, 345 rib_post_resp_remove, 346 rib_post_recv, 347 rib_recv, 348 rib_read, 349 rib_write, 350 rib_getinfo, 351 }; 352 353 /* 354 * RDMATF RPCIB plugin details 355 */ 356 static rdma_mod_t rib_mod = { 357 "ibtf", /* api name */ 358 RDMATF_VERS_1, 359 0, 360 &rib_ops, /* rdma op vector for ibtf */ 361 }; 362 363 static rdma_stat rpcib_open_hcas(rpcib_state_t *); 364 static rdma_stat rib_qp_init(rib_qp_t *, int); 365 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *); 366 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *); 367 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *); 368 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *); 369 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num); 370 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t, 371 ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *); 372 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t, 373 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t); 374 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *); 375 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *, 376 rib_qp_t **); 377 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t, 378 rib_qp_t **); 379 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *); 380 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *); 381 static int rib_free_sendwait(struct send_wid *); 382 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid); 383 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd); 384 static void rdma_done_rem_list(rib_qp_t *); 385 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid); 386 387 static void rib_async_handler(void *, 388 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *); 389 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *); 390 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *); 391 static int rib_free_svc_recv(struct svc_recv *); 392 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t); 393 static void rib_free_wid(struct recv_wid *); 394 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *); 395 static void rib_detach_hca(rib_hca_t *); 396 static void rib_close_a_channel(CONN *); 397 static void rib_send_hold(rib_qp_t *); 398 static void rib_send_rele(rib_qp_t *); 399 400 /* 401 * Registration with IBTF as a consumer 402 */ 403 static struct ibt_clnt_modinfo_s rib_modinfo = { 404 IBTI_V_CURR, 405 IBT_GENERIC, 406 rib_async_handler, /* async event handler */ 407 NULL, /* Memory Region Handler */ 408 "nfs/ib" 409 }; 410 411 /* 412 * Global structure 413 */ 414 415 typedef struct rpcib_s { 416 dev_info_t *rpcib_dip; 417 kmutex_t rpcib_mutex; 418 } rpcib_t; 419 420 rpcib_t rpcib; 421 422 /* 423 * /etc/system controlled variable to control 424 * debugging in the rpcib kernel module. 425 * Set it to values greater than 1 to control 426 * the amount of debugging messages required.
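 * For example (illustrative): add "set rpcib:rib_debug = 2" to /etc/system and reboot, or patch rib_debug with mdb -kw on a running system.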
427 */ 428 int rib_debug = 0; 429 430 int 431 _init(void) 432 { 433 int error; 434 435 error = mod_install((struct modlinkage *)&rib_modlinkage); 436 if (error != 0) { 437 /* 438 * Could not load module 439 */ 440 return (error); 441 } 442 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL); 443 return (0); 444 } 445 446 int 447 _fini() 448 { 449 int status; 450 451 /* 452 * Remove module 453 */ 454 if ((status = mod_remove(&rib_modlinkage)) != 0) { 455 return (status); 456 } 457 mutex_destroy(&plugin_state_lock); 458 return (0); 459 } 460 461 int 462 _info(struct modinfo *modinfop) 463 { 464 return (mod_info(&rib_modlinkage, modinfop)); 465 } 466 467 /* 468 * rpcib_getinfo() 469 * Given the device number, return the devinfo pointer or the 470 * instance number. 471 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach. 472 */ 473 474 /*ARGSUSED*/ 475 static int 476 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 477 { 478 int ret = DDI_SUCCESS; 479 480 switch (cmd) { 481 case DDI_INFO_DEVT2DEVINFO: 482 if (rpcib.rpcib_dip != NULL) 483 *result = rpcib.rpcib_dip; 484 else { 485 *result = NULL; 486 ret = DDI_FAILURE; 487 } 488 break; 489 490 case DDI_INFO_DEVT2INSTANCE: 491 *result = NULL; 492 break; 493 494 default: 495 ret = DDI_FAILURE; 496 } 497 return (ret); 498 } 499 500 static void 501 rpcib_free_hca_list() 502 { 503 rib_hca_t *hca, *hcap; 504 505 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); 506 hca = rib_stat->hcas_list; 507 rib_stat->hcas_list = NULL; 508 rw_exit(&rib_stat->hcas_list_lock); 509 while (hca != NULL) { 510 rw_enter(&hca->state_lock, RW_WRITER); 511 hcap = hca; 512 hca = hca->next; 513 rib_stat->nhca_inited--; 514 rib_mod.rdma_count--; 515 hcap->state = HCA_DETACHED; 516 rw_exit(&hcap->state_lock); 517 rib_stop_hca_services(hcap); 518 519 kmem_free(hcap, sizeof (*hcap)); 520 } 521 } 522 523 static rdma_stat 524 rpcib_free_service_list() 525 { 526 rib_service_t *service; 527 ibt_status_t ret; 528 529 rw_enter(&rib_stat->service_list_lock, RW_WRITER); 530 while (rib_stat->service_list != NULL) { 531 service = rib_stat->service_list; 532 ret = ibt_unbind_all_services(service->srv_hdl); 533 if (ret != IBT_SUCCESS) { 534 rw_exit(&rib_stat->service_list_lock); 535 #ifdef DEBUG 536 cmn_err(CE_NOTE, "rpcib_free_service_list: " 537 "ibt_unbind_all_services failed (%d)\n", (int)ret); 538 #endif 539 return (RDMA_FAILED); 540 } 541 ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl, 542 service->srv_hdl); 543 if (ret != IBT_SUCCESS) { 544 rw_exit(&rib_stat->service_list_lock); 545 #ifdef DEBUG 546 cmn_err(CE_NOTE, "rpcib_free_service_list: " 547 "ibt_deregister_service failed (%d)\n", (int)ret); 548 #endif 549 return (RDMA_FAILED); 550 } 551 rib_stat->service_list = service->next; 552 kmem_free(service, sizeof (rib_service_t)); 553 } 554 rw_exit(&rib_stat->service_list_lock); 555 556 return (RDMA_SUCCESS); 557 } 558 559 static int 560 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 561 { 562 ibt_status_t ibt_status; 563 rdma_stat r_status; 564 565 switch (cmd) { 566 case DDI_ATTACH: 567 break; 568 case DDI_RESUME: 569 return (DDI_SUCCESS); 570 default: 571 return (DDI_FAILURE); 572 } 573 574 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL); 575 576 mutex_enter(&rpcib.rpcib_mutex); 577 if (rpcib.rpcib_dip != NULL) { 578 mutex_exit(&rpcib.rpcib_mutex); 579 return (DDI_FAILURE); 580 } 581 rpcib.rpcib_dip = dip; 582 mutex_exit(&rpcib.rpcib_mutex); 583 /* 584 * Create the "rpcib" minor-node. 
585 */ 586 if (ddi_create_minor_node(dip, 587 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { 588 /* Error message, no cmn_err as they print on console */ 589 return (DDI_FAILURE); 590 } 591 592 if (rib_stat == NULL) { 593 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); 594 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); 595 rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL); 596 mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL); 597 } 598 599 rib_stat->hca_count = ibt_get_hca_list(NULL); 600 if (rib_stat->hca_count < 1) { 601 mutex_destroy(&rib_stat->listen_lock); 602 rw_destroy(&rib_stat->hcas_list_lock); 603 mutex_destroy(&rib_stat->open_hca_lock); 604 kmem_free(rib_stat, sizeof (*rib_stat)); 605 rib_stat = NULL; 606 return (DDI_FAILURE); 607 } 608 609 ibt_status = ibt_attach(&rib_modinfo, dip, 610 (void *)rib_stat, &rib_stat->ibt_clnt_hdl); 611 612 if (ibt_status != IBT_SUCCESS) { 613 mutex_destroy(&rib_stat->listen_lock); 614 rw_destroy(&rib_stat->hcas_list_lock); 615 mutex_destroy(&rib_stat->open_hca_lock); 616 kmem_free(rib_stat, sizeof (*rib_stat)); 617 rib_stat = NULL; 618 return (DDI_FAILURE); 619 } 620 621 rib_stat->service_list = NULL; 622 rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL); 623 mutex_enter(&rib_stat->open_hca_lock); 624 if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) { 625 mutex_exit(&rib_stat->open_hca_lock); 626 goto open_fail; 627 } 628 mutex_exit(&rib_stat->open_hca_lock); 629 630 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 631 DDI_PROP_SUCCESS) { 632 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update " 633 "failed."); 634 goto register_fail; 635 } 636 637 /* 638 * Register with rdmatf 639 */ 640 r_status = rdma_register_mod(&rib_mod); 641 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { 642 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, " 643 "status = %d", r_status); 644 goto register_fail; 645 } 646 647 return (DDI_SUCCESS); 648 649 register_fail: 650 651 open_fail: 652 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 653 rpcib_free_hca_list(); 654 (void) rpcib_free_service_list(); 655 mutex_destroy(&rib_stat->listen_lock); 656 rw_destroy(&rib_stat->hcas_list_lock); 657 mutex_destroy(&rib_stat->open_hca_lock); 658 rw_destroy(&rib_stat->service_list_lock); 659 kmem_free(rib_stat, sizeof (*rib_stat)); 660 rib_stat = NULL; 661 return (DDI_FAILURE); 662 } 663 664 /*ARGSUSED*/ 665 static int 666 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 667 { 668 switch (cmd) { 669 670 case DDI_DETACH: 671 break; 672 673 case DDI_SUSPEND: 674 default: 675 return (DDI_FAILURE); 676 } 677 678 /* 679 * Detach the hca and free resources 680 */ 681 mutex_enter(&plugin_state_lock); 682 plugin_state = NO_ACCEPT; 683 mutex_exit(&plugin_state_lock); 684 685 if (rpcib_free_service_list() != RDMA_SUCCESS) 686 return (DDI_FAILURE); 687 rpcib_free_hca_list(); 688 689 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 690 mutex_destroy(&rib_stat->listen_lock); 691 rw_destroy(&rib_stat->hcas_list_lock); 692 mutex_destroy(&rib_stat->open_hca_lock); 693 rw_destroy(&rib_stat->service_list_lock); 694 695 kmem_free(rib_stat, sizeof (*rib_stat)); 696 rib_stat = NULL; 697 698 mutex_enter(&rpcib.rpcib_mutex); 699 rpcib.rpcib_dip = NULL; 700 mutex_exit(&rpcib.rpcib_mutex); 701 mutex_destroy(&rpcib.rpcib_mutex); 702 return (DDI_SUCCESS); 703 } 704 705 706 static void rib_rbufpool_free(rib_hca_t *, int); 707 static void rib_rbufpool_deregister(rib_hca_t *, int); 708 static void 
rib_rbufpool_destroy(rib_hca_t *hca, int ptype); 709 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); 710 static rdma_stat rib_rem_replylist(rib_qp_t *); 711 static int rib_remreply(rib_qp_t *, struct reply *); 712 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); 713 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); 714 715 716 /* 717 * One CQ pair per HCA 718 */ 719 static rdma_stat 720 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, 721 rib_cq_t **cqp) 722 { 723 rib_cq_t *cq; 724 ibt_cq_attr_t cq_attr; 725 uint32_t real_size; 726 ibt_status_t status; 727 rdma_stat error = RDMA_SUCCESS; 728 729 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); 730 cq->rib_hca = hca; 731 cq_attr.cq_size = cq_size; 732 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 733 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, 734 &real_size); 735 if (status != IBT_SUCCESS) { 736 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed," 737 " status=%d", status); 738 error = RDMA_FAILED; 739 goto fail; 740 } 741 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca); 742 743 /* 744 * Enable CQ callbacks. CQ Callbacks are single shot 745 * (e.g. you have to call ibt_enable_cq_notify() 746 * after each callback to get another one). 747 */ 748 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION); 749 if (status != IBT_SUCCESS) { 750 cmn_err(CE_WARN, "rib_create_cq: " 751 "enable_cq_notify failed, status %d", status); 752 error = RDMA_FAILED; 753 goto fail; 754 } 755 *cqp = cq; 756 757 return (error); 758 fail: 759 if (cq->rib_cq_hdl) 760 (void) ibt_free_cq(cq->rib_cq_hdl); 761 if (cq) 762 kmem_free(cq, sizeof (rib_cq_t)); 763 return (error); 764 } 765 766 /* 767 * rpcib_find_hca 768 * 769 * Caller should have already locked the hcas_lock before calling 770 * this function. 771 */ 772 static rib_hca_t * 773 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid) 774 { 775 rib_hca_t *hca = ribstat->hcas_list; 776 777 while (hca && hca->hca_guid != guid) 778 hca = hca->next; 779 780 return (hca); 781 } 782 783 static rdma_stat 784 rpcib_open_hcas(rpcib_state_t *ribstat) 785 { 786 rib_hca_t *hca; 787 ibt_status_t ibt_status; 788 rdma_stat status; 789 ibt_hca_portinfo_t *pinfop; 790 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; 791 uint_t size, cq_size; 792 int i; 793 kstat_t *ksp; 794 cache_avl_struct_t example_avl_node; 795 char rssc_name[32]; 796 int old_nhca_inited = ribstat->nhca_inited; 797 ib_guid_t *hca_guids; 798 799 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); 800 801 ribstat->hca_count = ibt_get_hca_list(&hca_guids); 802 if (ribstat->hca_count == 0) 803 return (RDMA_FAILED); 804 805 rw_enter(&ribstat->hcas_list_lock, RW_WRITER); 806 /* 807 * Open a hca and setup for RDMA 808 */ 809 for (i = 0; i < ribstat->hca_count; i++) { 810 if (rpcib_find_hca(ribstat, hca_guids[i])) 811 continue; 812 hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP); 813 814 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, 815 hca_guids[i], &hca->hca_hdl); 816 if (ibt_status != IBT_SUCCESS) { 817 kmem_free(hca, sizeof (rib_hca_t)); 818 continue; 819 } 820 hca->hca_guid = hca_guids[i]; 821 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; 822 hca->state = HCA_INITED; 823 824 /* 825 * query HCA info 826 */ 827 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs); 828 if (ibt_status != IBT_SUCCESS) { 829 goto fail1; 830 } 831 832 /* 833 * One PD (Protection Domain) per HCA. 
834 * A qp is allowed to access a memory region 835 * only when it's in the same PD as that of 836 * the memory region. 837 */ 838 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl); 839 if (ibt_status != IBT_SUCCESS) { 840 goto fail1; 841 } 842 843 /* 844 * query HCA ports 845 */ 846 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 847 0, &pinfop, &hca->hca_nports, &size); 848 if (ibt_status != IBT_SUCCESS) { 849 goto fail2; 850 } 851 hca->hca_ports = pinfop; 852 hca->hca_pinfosz = size; 853 pinfop = NULL; 854 855 cq_size = DEF_CQ_SIZE; /* default cq size */ 856 /* 857 * Create 2 pairs of cq's (1 pair for client 858 * and the other pair for server) on this hca. 859 * If number of qp's gets too large, then several 860 * cq's will be needed. 861 */ 862 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, 863 &hca->svc_rcq); 864 if (status != RDMA_SUCCESS) { 865 goto fail3; 866 } 867 868 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, 869 &hca->svc_scq); 870 if (status != RDMA_SUCCESS) { 871 goto fail3; 872 } 873 874 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, 875 &hca->clnt_rcq); 876 if (status != RDMA_SUCCESS) { 877 goto fail3; 878 } 879 880 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, 881 &hca->clnt_scq); 882 if (status != RDMA_SUCCESS) { 883 goto fail3; 884 } 885 886 /* 887 * Create buffer pools. 888 * Note rib_rbuf_create also allocates memory windows. 889 */ 890 hca->recv_pool = rib_rbufpool_create(hca, 891 RECV_BUFFER, rib_max_rbufs); 892 if (hca->recv_pool == NULL) { 893 goto fail3; 894 } 895 896 hca->send_pool = rib_rbufpool_create(hca, 897 SEND_BUFFER, rib_max_rbufs); 898 if (hca->send_pool == NULL) { 899 rib_rbufpool_destroy(hca, RECV_BUFFER); 900 goto fail3; 901 } 902 903 if (hca->server_side_cache == NULL) { 904 (void) sprintf(rssc_name, 905 "rib_srvr_cache_%llx", 906 (long long unsigned int) hca->hca_guid); 907 hca->server_side_cache = kmem_cache_create( 908 rssc_name, 909 sizeof (cache_avl_struct_t), 0, 910 NULL, 911 NULL, 912 rib_server_side_cache_reclaim, 913 hca, NULL, 0); 914 } 915 916 avl_create(&hca->avl_tree, 917 avl_compare, 918 sizeof (cache_avl_struct_t), 919 (uint_t)(uintptr_t)&example_avl_node.avl_link- 920 (uint_t)(uintptr_t)&example_avl_node); 921 922 rw_init(&hca->bound_services_lock, NULL, RW_DRIVER, 923 hca->iblock); 924 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); 925 rw_init(&hca->avl_rw_lock, 926 NULL, RW_DRIVER, hca->iblock); 927 mutex_init(&hca->cache_allocation_lock, 928 NULL, MUTEX_DRIVER, NULL); 929 hca->avl_init = TRUE; 930 931 /* Create kstats for the cache */ 932 ASSERT(INGLOBALZONE(curproc)); 933 934 if (!stats_enabled) { 935 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc", 936 KSTAT_TYPE_NAMED, 937 sizeof (rpcib_kstat) / sizeof (kstat_named_t), 938 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, 939 GLOBAL_ZONEID); 940 if (ksp) { 941 ksp->ks_data = (void *) &rpcib_kstat; 942 ksp->ks_update = rpcib_cache_kstat_update; 943 kstat_install(ksp); 944 stats_enabled = TRUE; 945 } 946 } 947 if (hca->cleanup_helper == NULL) { 948 char tq_name[sizeof (hca->hca_guid) * 2 + 1]; 949 950 (void) snprintf(tq_name, sizeof (tq_name), "%llX", 951 (unsigned long long int) hca->hca_guid); 952 hca->cleanup_helper = ddi_taskq_create(NULL, 953 tq_name, 1, TASKQ_DEFAULTPRI, 0); 954 } 955 956 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 957 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); 958 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, 959 hca->iblock); 960 
rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, 961 hca->iblock); 962 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); 963 hca->inuse = TRUE; 964 965 hca->next = ribstat->hcas_list; 966 ribstat->hcas_list = hca; 967 ribstat->nhca_inited++; 968 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 969 continue; 970 971 fail3: 972 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 973 fail2: 974 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 975 fail1: 976 (void) ibt_close_hca(hca->hca_hdl); 977 kmem_free(hca, sizeof (rib_hca_t)); 978 } 979 rw_exit(&ribstat->hcas_list_lock); 980 ibt_free_hca_list(hca_guids, ribstat->hca_count); 981 rib_mod.rdma_count = rib_stat->nhca_inited; 982 983 /* 984 * return success if at least one new hca has been configured. 985 */ 986 if (ribstat->nhca_inited != old_nhca_inited) 987 return (RDMA_SUCCESS); 988 else 989 return (RDMA_FAILED); 990 } 991 992 /* 993 * Callback routines 994 */ 995 996 /* 997 * SCQ handlers 998 */ 999 /* ARGSUSED */ 1000 static void 1001 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1002 { 1003 ibt_status_t ibt_status; 1004 ibt_wc_t wc; 1005 struct send_wid *wd; 1006 CONN *conn; 1007 rib_qp_t *qp; 1008 int i; 1009 1010 /* 1011 * Re-enable cq notify here to avoid missing any 1012 * completion queue notification. 1013 */ 1014 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1015 1016 ibt_status = IBT_SUCCESS; 1017 while (ibt_status != IBT_CQ_EMPTY) { 1018 bzero(&wc, sizeof (wc)); 1019 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1020 if (ibt_status != IBT_SUCCESS) 1021 return; 1022 1023 /* 1024 * Got a send completion 1025 */ 1026 if (wc.wc_id != RDMA_DUMMY_WRID) { 1027 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1028 qp = wd->qp; 1029 conn = qptoc(qp); 1030 1031 mutex_enter(&wd->sendwait_lock); 1032 switch (wc.wc_status) { 1033 case IBT_WC_SUCCESS: 1034 wd->status = RDMA_SUCCESS; 1035 break; 1036 default: 1037 /* 1038 * RC Send Q Error Code Local state Remote State 1039 * ==================== =========== ============ 1040 * IBT_WC_BAD_RESPONSE_ERR ERROR None 1041 * IBT_WC_LOCAL_LEN_ERR ERROR None 1042 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None 1043 * IBT_WC_LOCAL_PROTECT_ERR ERROR None 1044 * IBT_WC_MEM_WIN_BIND_ERR ERROR None 1045 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR 1046 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR 1047 * IBT_WC_REMOTE_OP_ERR ERROR ERROR 1048 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None 1049 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None 1050 * IBT_WC_WR_FLUSHED_ERR ERROR None 1051 */ 1052 /* 1053 * Channel in error state. Set connection to 1054 * ERROR and cleanup will happen either from 1055 * conn_release or from rib_conn_get 1056 */ 1057 wd->status = RDMA_FAILED; 1058 mutex_enter(&conn->c_lock); 1059 if (conn->c_state != C_DISCONN_PEND) 1060 conn->c_state = C_ERROR_CONN; 1061 mutex_exit(&conn->c_lock); 1062 break; 1063 } 1064 1065 if (wd->cv_sig == 1) { 1066 /* 1067 * Notify poster 1068 */ 1069 cv_signal(&wd->wait_cv); 1070 mutex_exit(&wd->sendwait_lock); 1071 } else { 1072 /* 1073 * Poster not waiting for notification. 
1074 * Free the send buffers and send_wid 1075 */ 1076 for (i = 0; i < wd->nsbufs; i++) { 1077 rib_rbuf_free(qptoc(wd->qp), 1078 SEND_BUFFER, 1079 (void *)(uintptr_t)wd->sbufaddr[i]); 1080 } 1081 1082 /* decrement the send ref count */ 1083 rib_send_rele(qp); 1084 1085 mutex_exit(&wd->sendwait_lock); 1086 (void) rib_free_sendwait(wd); 1087 } 1088 } 1089 } 1090 } 1091 1092 /* ARGSUSED */ 1093 static void 1094 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1095 { 1096 ibt_status_t ibt_status; 1097 ibt_wc_t wc; 1098 struct send_wid *wd; 1099 rib_qp_t *qp; 1100 CONN *conn; 1101 int i; 1102 1103 /* 1104 * Re-enable cq notify here to avoid missing any 1105 * completion queue notification. 1106 */ 1107 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1108 1109 ibt_status = IBT_SUCCESS; 1110 while (ibt_status != IBT_CQ_EMPTY) { 1111 bzero(&wc, sizeof (wc)); 1112 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1113 if (ibt_status != IBT_SUCCESS) 1114 return; 1115 1116 /* 1117 * Got a send completion 1118 */ 1119 if (wc.wc_id != RDMA_DUMMY_WRID) { 1120 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1121 qp = wd->qp; 1122 conn = qptoc(qp); 1123 mutex_enter(&wd->sendwait_lock); 1124 1125 switch (wc.wc_status) { 1126 case IBT_WC_SUCCESS: 1127 wd->status = RDMA_SUCCESS; 1128 break; 1129 default: 1130 /* 1131 * Channel in error state. Set connection to 1132 * ERROR and cleanup will happen either from 1133 * conn_release or conn timeout. 1134 */ 1135 wd->status = RDMA_FAILED; 1136 mutex_enter(&conn->c_lock); 1137 if (conn->c_state != C_DISCONN_PEND) 1138 conn->c_state = C_ERROR_CONN; 1139 mutex_exit(&conn->c_lock); 1140 break; 1141 } 1142 1143 if (wd->cv_sig == 1) { 1144 /* 1145 * Update completion status and notify poster 1146 */ 1147 cv_signal(&wd->wait_cv); 1148 mutex_exit(&wd->sendwait_lock); 1149 } else { 1150 /* 1151 * Poster not waiting for notification. 1152 * Free the send buffers and send_wid 1153 */ 1154 for (i = 0; i < wd->nsbufs; i++) { 1155 rib_rbuf_free(qptoc(wd->qp), 1156 SEND_BUFFER, 1157 (void *)(uintptr_t)wd->sbufaddr[i]); 1158 } 1159 1160 /* decrement the send ref count */ 1161 rib_send_rele(qp); 1162 1163 mutex_exit(&wd->sendwait_lock); 1164 (void) rib_free_sendwait(wd); 1165 } 1166 } 1167 } 1168 } 1169 1170 /* 1171 * RCQ handler 1172 */ 1173 /* ARGSUSED */ 1174 static void 1175 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1176 { 1177 rib_qp_t *qp; 1178 ibt_status_t ibt_status; 1179 ibt_wc_t wc; 1180 struct recv_wid *rwid; 1181 1182 /* 1183 * Re-enable cq notify here to avoid missing any 1184 * completion queue notification. 1185 */ 1186 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1187 1188 ibt_status = IBT_SUCCESS; 1189 while (ibt_status != IBT_CQ_EMPTY) { 1190 bzero(&wc, sizeof (wc)); 1191 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1192 if (ibt_status != IBT_SUCCESS) 1193 return; 1194 1195 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id; 1196 qp = rwid->qp; 1197 if (wc.wc_status == IBT_WC_SUCCESS) { 1198 XDR inxdrs, *xdrs; 1199 uint_t xid, vers, op, find_xid = 0; 1200 struct reply *r; 1201 CONN *conn = qptoc(qp); 1202 uint32_t rdma_credit = 0; 1203 1204 xdrs = &inxdrs; 1205 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr, 1206 wc.wc_bytes_xfer, XDR_DECODE); 1207 /* 1208 * Treat xid as opaque (xid is the first entity 1209 * in the rpc rdma message). 1210 */ 1211 xid = *(uint32_t *)(uintptr_t)rwid->addr; 1212 1213 /* Skip xid and set the xdr position accordingly. 
*/ 1214 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1215 (void) xdr_u_int(xdrs, &vers); 1216 (void) xdr_u_int(xdrs, &rdma_credit); 1217 (void) xdr_u_int(xdrs, &op); 1218 XDR_DESTROY(xdrs); 1219 1220 if (vers != RPCRDMA_VERS) { 1221 /* 1222 * Invalid RPC/RDMA version. Cannot 1223 * interoperate. Set connection to 1224 * ERROR state and bail out. 1225 */ 1226 mutex_enter(&conn->c_lock); 1227 if (conn->c_state != C_DISCONN_PEND) 1228 conn->c_state = C_ERROR_CONN; 1229 mutex_exit(&conn->c_lock); 1230 rib_rbuf_free(conn, RECV_BUFFER, 1231 (void *)(uintptr_t)rwid->addr); 1232 rib_free_wid(rwid); 1233 continue; 1234 } 1235 1236 mutex_enter(&qp->replylist_lock); 1237 for (r = qp->replylist; r != NULL; r = r->next) { 1238 if (r->xid == xid) { 1239 find_xid = 1; 1240 switch (op) { 1241 case RDMA_MSG: 1242 case RDMA_NOMSG: 1243 case RDMA_MSGP: 1244 r->status = RDMA_SUCCESS; 1245 r->vaddr_cq = rwid->addr; 1246 r->bytes_xfer = 1247 wc.wc_bytes_xfer; 1248 cv_signal(&r->wait_cv); 1249 break; 1250 default: 1251 rib_rbuf_free(qptoc(qp), 1252 RECV_BUFFER, 1253 (void *)(uintptr_t) 1254 rwid->addr); 1255 break; 1256 } 1257 break; 1258 } 1259 } 1260 mutex_exit(&qp->replylist_lock); 1261 if (find_xid == 0) { 1262 /* RPC caller not waiting for reply */ 1263 1264 DTRACE_PROBE1(rpcib__i__nomatchxid1, 1265 int, xid); 1266 1267 rib_rbuf_free(qptoc(qp), RECV_BUFFER, 1268 (void *)(uintptr_t)rwid->addr); 1269 } 1270 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) { 1271 CONN *conn = qptoc(qp); 1272 1273 /* 1274 * Connection being flushed. Just free 1275 * the posted buffer 1276 */ 1277 rib_rbuf_free(conn, RECV_BUFFER, 1278 (void *)(uintptr_t)rwid->addr); 1279 } else { 1280 CONN *conn = qptoc(qp); 1281 /* 1282 * RC Recv Q Error Code Local state Remote State 1283 * ==================== =========== ============ 1284 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd 1285 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd 1286 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd 1287 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd 1288 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd 1289 * IBT_WC_WR_FLUSHED_ERR None None 1290 */ 1291 /* 1292 * Channel in error state. Set connection 1293 * in ERROR state. 1294 */ 1295 mutex_enter(&conn->c_lock); 1296 if (conn->c_state != C_DISCONN_PEND) 1297 conn->c_state = C_ERROR_CONN; 1298 mutex_exit(&conn->c_lock); 1299 rib_rbuf_free(conn, RECV_BUFFER, 1300 (void *)(uintptr_t)rwid->addr); 1301 } 1302 rib_free_wid(rwid); 1303 } 1304 } 1305 1306 /* Server side */ 1307 /* ARGSUSED */ 1308 static void 1309 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1310 { 1311 rdma_recv_data_t *rdp; 1312 rib_qp_t *qp; 1313 ibt_status_t ibt_status; 1314 ibt_wc_t wc; 1315 struct svc_recv *s_recvp; 1316 CONN *conn; 1317 mblk_t *mp; 1318 1319 /* 1320 * Re-enable cq notify here to avoid missing any 1321 * completion queue notification. 
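 * (CQ notify callbacks are single shot -- see rib_create_cq() -- so the CQ is re-armed with ibt_enable_cq_notify() before being drained with ibt_poll_cq().)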
1322 */ 1323 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1324 1325 ibt_status = IBT_SUCCESS; 1326 while (ibt_status != IBT_CQ_EMPTY) { 1327 bzero(&wc, sizeof (wc)); 1328 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1329 if (ibt_status != IBT_SUCCESS) 1330 return; 1331 1332 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id; 1333 qp = s_recvp->qp; 1334 conn = qptoc(qp); 1335 mutex_enter(&qp->posted_rbufs_lock); 1336 qp->n_posted_rbufs--; 1337 if (qp->n_posted_rbufs == 0) 1338 cv_signal(&qp->posted_rbufs_cv); 1339 mutex_exit(&qp->posted_rbufs_lock); 1340 1341 if (wc.wc_status == IBT_WC_SUCCESS) { 1342 XDR inxdrs, *xdrs; 1343 uint_t xid, vers, op; 1344 uint32_t rdma_credit; 1345 1346 xdrs = &inxdrs; 1347 /* s_recvp->vaddr stores data */ 1348 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr, 1349 wc.wc_bytes_xfer, XDR_DECODE); 1350 1351 /* 1352 * Treat xid as opaque (xid is the first entity 1353 * in the rpc rdma message). 1354 */ 1355 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr; 1356 /* Skip xid and set the xdr position accordingly. */ 1357 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1358 if (!xdr_u_int(xdrs, &vers) || 1359 !xdr_u_int(xdrs, &rdma_credit) || 1360 !xdr_u_int(xdrs, &op)) { 1361 rib_rbuf_free(conn, RECV_BUFFER, 1362 (void *)(uintptr_t)s_recvp->vaddr); 1363 XDR_DESTROY(xdrs); 1364 (void) rib_free_svc_recv(s_recvp); 1365 continue; 1366 } 1367 XDR_DESTROY(xdrs); 1368 1369 if (vers != RPCRDMA_VERS) { 1370 /* 1371 * Invalid RPC/RDMA version. 1372 * Drop rpc rdma message. 1373 */ 1374 rib_rbuf_free(conn, RECV_BUFFER, 1375 (void *)(uintptr_t)s_recvp->vaddr); 1376 (void) rib_free_svc_recv(s_recvp); 1377 continue; 1378 } 1379 /* 1380 * Is this for RDMA_DONE? 1381 */ 1382 if (op == RDMA_DONE) { 1383 rib_rbuf_free(conn, RECV_BUFFER, 1384 (void *)(uintptr_t)s_recvp->vaddr); 1385 /* 1386 * Wake up the thread waiting on 1387 * a RDMA_DONE for xid 1388 */ 1389 mutex_enter(&qp->rdlist_lock); 1390 rdma_done_notify(qp, xid); 1391 mutex_exit(&qp->rdlist_lock); 1392 (void) rib_free_svc_recv(s_recvp); 1393 continue; 1394 } 1395 1396 mutex_enter(&plugin_state_lock); 1397 if (plugin_state == ACCEPT) { 1398 while ((mp = allocb(sizeof (*rdp), BPRI_LO)) 1399 == NULL) 1400 (void) strwaitbuf( 1401 sizeof (*rdp), BPRI_LO); 1402 /* 1403 * Plugin is in accept state, hence the master 1404 * transport queue for this is still accepting 1405 * requests. We can therefore call svc_queuereq to 1406 * queue this received msg. 1407 */ 1408 rdp = (rdma_recv_data_t *)mp->b_rptr; 1409 rdp->conn = conn; 1410 rdp->rpcmsg.addr = 1411 (caddr_t)(uintptr_t)s_recvp->vaddr; 1412 rdp->rpcmsg.type = RECV_BUFFER; 1413 rdp->rpcmsg.len = wc.wc_bytes_xfer; 1414 rdp->status = wc.wc_status; 1415 mutex_enter(&conn->c_lock); 1416 conn->c_ref++; 1417 mutex_exit(&conn->c_lock); 1418 mp->b_wptr += sizeof (*rdp); 1419 svc_queuereq((queue_t *)rib_stat->q, mp); 1420 mutex_exit(&plugin_state_lock); 1421 } else { 1422 /* 1423 * The master transport for this is going 1424 * away and the queue is not accepting any more 1425 * requests for krpc, so don't do anything, just 1426 * free the msg.
1427 */ 1428 mutex_exit(&plugin_state_lock); 1429 rib_rbuf_free(conn, RECV_BUFFER, 1430 (void *)(uintptr_t)s_recvp->vaddr); 1431 } 1432 } else { 1433 rib_rbuf_free(conn, RECV_BUFFER, 1434 (void *)(uintptr_t)s_recvp->vaddr); 1435 } 1436 (void) rib_free_svc_recv(s_recvp); 1437 } 1438 } 1439 1440 static void 1441 rib_attach_hca() 1442 { 1443 mutex_enter(&rib_stat->open_hca_lock); 1444 (void) rpcib_open_hcas(rib_stat); 1445 rib_listen(NULL); 1446 mutex_exit(&rib_stat->open_hca_lock); 1447 } 1448 1449 /* 1450 * Handles DR event of IBT_HCA_DETACH_EVENT. 1451 */ 1452 /* ARGSUSED */ 1453 static void 1454 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 1455 ibt_async_code_t code, ibt_async_event_t *event) 1456 { 1457 switch (code) { 1458 case IBT_HCA_ATTACH_EVENT: 1459 rib_attach_hca(); 1460 break; 1461 case IBT_HCA_DETACH_EVENT: 1462 { 1463 rib_hca_t *hca; 1464 1465 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 1466 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 1467 rw_enter(&hca->state_lock, RW_READER); 1468 if ((hca->state != HCA_DETACHED) && 1469 (hca->hca_hdl == hca_hdl)) { 1470 rw_exit(&hca->state_lock); 1471 break; 1472 } 1473 rw_exit(&hca->state_lock); 1474 } 1475 rw_exit(&rib_stat->hcas_list_lock); 1476 1477 if (hca == NULL) 1478 return; 1479 ASSERT(hca->hca_hdl == hca_hdl); 1480 rib_detach_hca(hca); 1481 #ifdef DEBUG 1482 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); 1483 #endif 1484 break; 1485 } 1486 case IBT_EVENT_PORT_UP: 1487 /* 1488 * A port is up. We should call rib_listen() since there is 1489 * a chance that rib_listen() may have failed during 1490 * rib_attach_hca() because the port had not been up yet. 1491 */ 1492 rib_listen(NULL); 1493 #ifdef DEBUG 1494 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); 1495 #endif 1496 break; 1497 #ifdef DEBUG 1498 case IBT_EVENT_PATH_MIGRATED: 1499 cmn_err(CE_NOTE, "rib_async_handler(): " 1500 "IBT_EVENT_PATH_MIGRATED\n"); 1501 break; 1502 case IBT_EVENT_SQD: 1503 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n"); 1504 break; 1505 case IBT_EVENT_COM_EST: 1506 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n"); 1507 break; 1508 case IBT_ERROR_CATASTROPHIC_CHAN: 1509 cmn_err(CE_NOTE, "rib_async_handler(): " 1510 "IBT_ERROR_CATASTROPHIC_CHAN\n"); 1511 break; 1512 case IBT_ERROR_INVALID_REQUEST_CHAN: 1513 cmn_err(CE_NOTE, "rib_async_handler(): " 1514 "IBT_ERROR_INVALID_REQUEST_CHAN\n"); 1515 break; 1516 case IBT_ERROR_ACCESS_VIOLATION_CHAN: 1517 cmn_err(CE_NOTE, "rib_async_handler(): " 1518 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n"); 1519 break; 1520 case IBT_ERROR_PATH_MIGRATE_REQ: 1521 cmn_err(CE_NOTE, "rib_async_handler(): " 1522 "IBT_ERROR_PATH_MIGRATE_REQ\n"); 1523 break; 1524 case IBT_ERROR_CQ: 1525 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n"); 1526 break; 1527 case IBT_ERROR_PORT_DOWN: 1528 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); 1529 break; 1530 case IBT_ASYNC_OPAQUE1: 1531 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); 1532 break; 1533 case IBT_ASYNC_OPAQUE2: 1534 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n"); 1535 break; 1536 case IBT_ASYNC_OPAQUE3: 1537 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n"); 1538 break; 1539 case IBT_ASYNC_OPAQUE4: 1540 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n"); 1541 break; 1542 #endif 1543 default: 1544 break; 1545 } 1546 } 1547 1548 /* 1549 * Client's reachable function. 
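 * Determines whether the server at raddr is reachable over RDMA by setting up a connection with rib_connect(); on success the HCA is returned through *handle and the connection reference is released.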
1550 */ 1551 static rdma_stat 1552 rib_reachable(int addr_type, struct netbuf *raddr, void **handle) 1553 { 1554 rdma_stat status; 1555 rpcib_ping_t rpt; 1556 struct netbuf saddr; 1557 CONN *conn; 1558 1559 bzero(&saddr, sizeof (struct netbuf)); 1560 status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn); 1561 1562 if (status == RDMA_SUCCESS) { 1563 *handle = (void *)rpt.hca; 1564 /* release the reference */ 1565 (void) rib_conn_release(conn); 1566 return (RDMA_SUCCESS); 1567 } else { 1568 *handle = NULL; 1569 DTRACE_PROBE(rpcib__i__pingfailed); 1570 return (RDMA_FAILED); 1571 } 1572 } 1573 1574 /* Client side qp creation */ 1575 static rdma_stat 1576 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) 1577 { 1578 rib_qp_t *kqp = NULL; 1579 CONN *conn; 1580 rdma_clnt_cred_ctrl_t *cc_info; 1581 1582 ASSERT(qp != NULL); 1583 *qp = NULL; 1584 1585 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1586 conn = qptoc(kqp); 1587 kqp->hca = hca; 1588 kqp->rdmaconn.c_rdmamod = &rib_mod; 1589 kqp->rdmaconn.c_private = (caddr_t)kqp; 1590 1591 kqp->mode = RIB_CLIENT; 1592 kqp->chan_flags = IBT_BLOCKING; 1593 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); 1594 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); 1595 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; 1596 /* 1597 * Initialize 1598 */ 1599 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1600 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1601 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1602 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1603 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1604 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); 1605 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1606 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1607 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1608 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1609 /* 1610 * Initialize the client credit control 1611 * portion of the rdmaconn struct. 
1612 */ 1613 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT; 1614 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1615 cc_info->clnt_cc_granted_ops = 0; 1616 cc_info->clnt_cc_in_flight_ops = 0; 1617 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL); 1618 1619 *qp = kqp; 1620 return (RDMA_SUCCESS); 1621 } 1622 1623 /* Server side qp creation */ 1624 static rdma_stat 1625 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) 1626 { 1627 rib_qp_t *kqp = NULL; 1628 ibt_chan_sizes_t chan_sizes; 1629 ibt_rc_chan_alloc_args_t qp_attr; 1630 ibt_status_t ibt_status; 1631 rdma_srv_cred_ctrl_t *cc_info; 1632 1633 *qp = NULL; 1634 1635 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1636 kqp->hca = hca; 1637 kqp->port_num = port; 1638 kqp->rdmaconn.c_rdmamod = &rib_mod; 1639 kqp->rdmaconn.c_private = (caddr_t)kqp; 1640 1641 /* 1642 * Create the qp handle 1643 */ 1644 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1645 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; 1646 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; 1647 qp_attr.rc_pd = hca->pd_hdl; 1648 qp_attr.rc_hca_port_num = port; 1649 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1650 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1651 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1652 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1653 qp_attr.rc_clone_chan = NULL; 1654 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1655 qp_attr.rc_flags = IBT_WR_SIGNALED; 1656 1657 rw_enter(&hca->state_lock, RW_READER); 1658 if (hca->state != HCA_DETACHED) { 1659 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1660 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl, 1661 &chan_sizes); 1662 } else { 1663 rw_exit(&hca->state_lock); 1664 goto fail; 1665 } 1666 rw_exit(&hca->state_lock); 1667 1668 if (ibt_status != IBT_SUCCESS) { 1669 DTRACE_PROBE1(rpcib__i_svccreatechanfail, 1670 int, ibt_status); 1671 goto fail; 1672 } 1673 1674 kqp->mode = RIB_SERVER; 1675 kqp->chan_flags = IBT_BLOCKING; 1676 kqp->q = q; /* server ONLY */ 1677 1678 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1679 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1680 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1681 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1682 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1683 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1684 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1685 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1686 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1687 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1688 /* 1689 * Set the private data area to qp to be used in callbacks 1690 */ 1691 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); 1692 kqp->rdmaconn.c_state = C_CONNECTED; 1693 1694 /* 1695 * Initialize the server credit control 1696 * portion of the rdmaconn struct. 
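 * The server starts out having granted preposted_rbufs (RDMA_BUFS_GRANT) receive buffers to the client, none of which are in use yet.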
1697 */ 1698 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV; 1699 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc; 1700 cc_info->srv_cc_buffers_granted = preposted_rbufs; 1701 cc_info->srv_cc_cur_buffers_used = 0; 1702 cc_info->srv_cc_posted = preposted_rbufs; 1703 1704 *qp = kqp; 1705 1706 return (RDMA_SUCCESS); 1707 fail: 1708 if (kqp) 1709 kmem_free(kqp, sizeof (rib_qp_t)); 1710 1711 return (RDMA_FAILED); 1712 } 1713 1714 /* ARGSUSED */ 1715 ibt_cm_status_t 1716 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event, 1717 ibt_cm_return_args_t *ret_args, void *priv_data, 1718 ibt_priv_data_len_t len) 1719 { 1720 rib_hca_t *hca; 1721 1722 hca = (rib_hca_t *)clnt_hdl; 1723 1724 switch (event->cm_type) { 1725 1726 /* got a connection close event */ 1727 case IBT_CM_EVENT_CONN_CLOSED: 1728 { 1729 CONN *conn; 1730 rib_qp_t *qp; 1731 1732 /* check reason why connection was closed */ 1733 switch (event->cm_event.closed) { 1734 case IBT_CM_CLOSED_DREP_RCVD: 1735 case IBT_CM_CLOSED_DREQ_TIMEOUT: 1736 case IBT_CM_CLOSED_DUP: 1737 case IBT_CM_CLOSED_ABORT: 1738 case IBT_CM_CLOSED_ALREADY: 1739 /* 1740 * These cases indicate the local end initiated 1741 * the closing of the channel. Nothing to do here. 1742 */ 1743 break; 1744 default: 1745 /* 1746 * Reason for CONN_CLOSED event must be one of 1747 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 1748 * or IBT_CM_CLOSED_STALE. These indicate cases where 1749 * the remote end is closing the channel. In these 1750 * cases free the channel and transition to the error 1751 * state. 1752 */ 1753 qp = ibt_get_chan_private(event->cm_channel); 1754 conn = qptoc(qp); 1755 mutex_enter(&conn->c_lock); 1756 if (conn->c_state == C_DISCONN_PEND) { 1757 mutex_exit(&conn->c_lock); 1758 break; 1759 } 1760 1761 conn->c_state = C_ERROR_CONN; 1762 1763 /* 1764 * Free the conn if c_ref is down to 0 already 1765 */ 1766 if (conn->c_ref == 0) { 1767 /* 1768 * Remove from list and free conn 1769 */ 1770 conn->c_state = C_DISCONN_PEND; 1771 mutex_exit(&conn->c_lock); 1772 rw_enter(&hca->state_lock, RW_READER); 1773 if (hca->state != HCA_DETACHED) 1774 (void) rib_disconnect_channel(conn, 1775 &hca->cl_conn_list); 1776 rw_exit(&hca->state_lock); 1777 } else { 1778 /* 1779 * conn will be freed when c_ref goes to 0. 1780 * Indicate to cleaning thread not to close 1781 * the connection, but just free the channel. 1782 */ 1783 conn->c_flags |= C_CLOSE_NOTNEEDED; 1784 mutex_exit(&conn->c_lock); 1785 } 1786 #ifdef DEBUG 1787 if (rib_debug) 1788 cmn_err(CE_NOTE, "rib_clnt_cm_handler: " 1789 "(CONN_CLOSED) channel disconnected"); 1790 #endif 1791 break; 1792 } 1793 break; 1794 } 1795 default: 1796 break; 1797 } 1798 return (IBT_CM_ACCEPT); 1799 } 1800 1801 /* 1802 * Connect to the server.
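 * rib_conn_to_srv() encodes the source and destination IP addresses into the CM private data, allocates an RC channel on the client CQs, and opens it to the IP service id for nfs_rdma_port, retrying up to REFRESH_ATTEMPTS times if the passive side reports a stale connection.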
1803 */ 1804 rdma_stat 1805 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp) 1806 { 1807 ibt_chan_open_args_t chan_args; /* channel args */ 1808 ibt_chan_sizes_t chan_sizes; 1809 ibt_rc_chan_alloc_args_t qp_attr; 1810 ibt_status_t ibt_status; 1811 ibt_rc_returns_t ret_args; /* conn reject info */ 1812 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */ 1813 ibt_ip_cm_info_t ipcm_info; 1814 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ]; 1815 1816 1817 (void) bzero(&chan_args, sizeof (chan_args)); 1818 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1819 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t)); 1820 1821 ipcm_info.src_addr.family = rptp->srcip.family; 1822 switch (ipcm_info.src_addr.family) { 1823 case AF_INET: 1824 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr; 1825 break; 1826 case AF_INET6: 1827 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr; 1828 break; 1829 } 1830 1831 ipcm_info.dst_addr.family = rptp->srcip.family; 1832 switch (ipcm_info.dst_addr.family) { 1833 case AF_INET: 1834 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr; 1835 break; 1836 case AF_INET6: 1837 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr; 1838 break; 1839 } 1840 1841 ipcm_info.src_port = (in_port_t)nfs_rdma_port; 1842 1843 ibt_status = ibt_format_ip_private_data(&ipcm_info, 1844 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt); 1845 1846 if (ibt_status != IBT_SUCCESS) { 1847 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n"); 1848 return (-1); 1849 } 1850 1851 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num; 1852 /* Alloc a RC channel */ 1853 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; 1854 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; 1855 qp_attr.rc_pd = hca->pd_hdl; 1856 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1857 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1858 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1859 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1860 qp_attr.rc_clone_chan = NULL; 1861 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1862 qp_attr.rc_flags = IBT_WR_SIGNALED; 1863 1864 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port); 1865 chan_args.oc_path = &rptp->path; 1866 1867 chan_args.oc_cm_handler = rib_clnt_cm_handler; 1868 chan_args.oc_cm_clnt_private = (void *)hca; 1869 chan_args.oc_rdma_ra_out = 4; 1870 chan_args.oc_rdma_ra_in = 4; 1871 chan_args.oc_path_retry_cnt = 2; 1872 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; 1873 chan_args.oc_priv_data = cmp_ip_pvt; 1874 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ; 1875 1876 refresh: 1877 rw_enter(&hca->state_lock, RW_READER); 1878 if (hca->state != HCA_DETACHED) { 1879 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1880 IBT_ACHAN_NO_FLAGS, 1881 &qp_attr, &qp->qp_hdl, 1882 &chan_sizes); 1883 } else { 1884 rw_exit(&hca->state_lock); 1885 return (RDMA_FAILED); 1886 } 1887 rw_exit(&hca->state_lock); 1888 1889 if (ibt_status != IBT_SUCCESS) { 1890 DTRACE_PROBE1(rpcib__i_conntosrv, 1891 int, ibt_status); 1892 return (RDMA_FAILED); 1893 } 1894 1895 /* Connect to the Server */ 1896 (void) bzero(&ret_args, sizeof (ret_args)); 1897 mutex_enter(&qp->cb_lock); 1898 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS, 1899 IBT_BLOCKING, &chan_args, &ret_args); 1900 if (ibt_status != IBT_SUCCESS) { 1901 DTRACE_PROBE2(rpcib__i_openrctosrv, 1902 int, ibt_status, int, ret_args.rc_status); 1903 1904 (void) ibt_free_channel(qp->qp_hdl); 1905 qp->qp_hdl = NULL; 1906 mutex_exit(&qp->cb_lock); 1907 if (refresh-- && ibt_status == IBT_CM_FAILURE 
&& 1908 ret_args.rc_status == IBT_CM_CONN_STALE) { 1909 /* 1910 * Got IBT_CM_CONN_STALE probably because of stale 1911 * data on the passive end of a channel that existed 1912 * prior to reboot. Retry establishing a channel 1913 * REFRESH_ATTEMPTS times, during which time the 1914 * stale conditions on the server might clear up. 1915 */ 1916 goto refresh; 1917 } 1918 return (RDMA_FAILED); 1919 } 1920 mutex_exit(&qp->cb_lock); 1921 /* 1922 * Set the private data area to qp to be used in callbacks 1923 */ 1924 ibt_set_chan_private(qp->qp_hdl, (void *)qp); 1925 return (RDMA_SUCCESS); 1926 } 1927 1928 rdma_stat 1929 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) 1930 { 1931 uint_t i, addr_count; 1932 ibt_status_t ibt_status; 1933 uint8_t num_paths_p; 1934 ibt_ip_path_attr_t ipattr; 1935 ibt_path_ip_src_t srcip; 1936 rpcib_ipaddrs_t addrs4; 1937 rpcib_ipaddrs_t addrs6; 1938 struct sockaddr_in *sinp; 1939 struct sockaddr_in6 *sin6p; 1940 rdma_stat retval = RDMA_FAILED; 1941 rib_hca_t *hca; 1942 1943 if ((addr_type != AF_INET) && (addr_type != AF_INET6)) 1944 return (RDMA_INVAL); 1945 ASSERT(raddr->buf != NULL); 1946 1947 bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); 1948 1949 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || 1950 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { 1951 retval = RDMA_FAILED; 1952 goto done2; 1953 } 1954 1955 if (addr_type == AF_INET) { 1956 addr_count = addrs4.ri_count; 1957 sinp = (struct sockaddr_in *)raddr->buf; 1958 rptp->dstip.family = AF_INET; 1959 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr; 1960 sinp = addrs4.ri_list; 1961 } else { 1962 addr_count = addrs6.ri_count; 1963 sin6p = (struct sockaddr_in6 *)raddr->buf; 1964 rptp->dstip.family = AF_INET6; 1965 rptp->dstip.un.ip6addr = sin6p->sin6_addr; 1966 sin6p = addrs6.ri_list; 1967 } 1968 1969 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 1970 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 1971 rw_enter(&hca->state_lock, RW_READER); 1972 if (hca->state == HCA_DETACHED) { 1973 rw_exit(&hca->state_lock); 1974 continue; 1975 } 1976 1977 ipattr.ipa_dst_ip = &rptp->dstip; 1978 ipattr.ipa_hca_guid = hca->hca_guid; 1979 ipattr.ipa_ndst = 1; 1980 ipattr.ipa_max_paths = 1; 1981 ipattr.ipa_src_ip.family = rptp->dstip.family; 1982 for (i = 0; i < addr_count; i++) { 1983 num_paths_p = 0; 1984 if (addr_type == AF_INET) { 1985 ipattr.ipa_src_ip.un.ip4addr = 1986 sinp[i].sin_addr.s_addr; 1987 } else { 1988 ipattr.ipa_src_ip.un.ip6addr = 1989 sin6p[i].sin6_addr; 1990 } 1991 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1992 1993 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1994 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, 1995 &num_paths_p, &srcip); 1996 if (ibt_status == IBT_SUCCESS && 1997 num_paths_p != 0 && 1998 rptp->path.pi_hca_guid == hca->hca_guid) { 1999 rptp->hca = hca; 2000 rw_exit(&hca->state_lock); 2001 if (addr_type == AF_INET) { 2002 rptp->srcip.family = AF_INET; 2003 rptp->srcip.un.ip4addr = 2004 srcip.ip_primary.un.ip4addr; 2005 } else { 2006 rptp->srcip.family = AF_INET6; 2007 rptp->srcip.un.ip6addr = 2008 srcip.ip_primary.un.ip6addr; 2009 2010 } 2011 retval = RDMA_SUCCESS; 2012 goto done1; 2013 } 2014 } 2015 rw_exit(&hca->state_lock); 2016 } 2017 done1: 2018 rw_exit(&rib_stat->hcas_list_lock); 2019 done2: 2020 if (addrs4.ri_size > 0) 2021 kmem_free(addrs4.ri_list, addrs4.ri_size); 2022 if (addrs6.ri_size > 0) 2023 kmem_free(addrs6.ri_list, addrs6.ri_size); 2024 return (retval); 2025 } 2026 2027 /* 2028 * Close channel, remove from connection list and 2029 * free up 
resources allocated for that channel. 2030 */ 2031 rdma_stat 2032 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list) 2033 { 2034 rib_qp_t *qp = ctoqp(conn); 2035 rib_hca_t *hca; 2036 2037 mutex_enter(&conn->c_lock); 2038 if (conn->c_timeout != NULL) { 2039 mutex_exit(&conn->c_lock); 2040 (void) untimeout(conn->c_timeout); 2041 mutex_enter(&conn->c_lock); 2042 } 2043 2044 while (conn->c_flags & C_CLOSE_PENDING) { 2045 cv_wait(&conn->c_cv, &conn->c_lock); 2046 } 2047 mutex_exit(&conn->c_lock); 2048 2049 /* 2050 * c_ref == 0 and connection is in C_DISCONN_PEND 2051 */ 2052 hca = qp->hca; 2053 if (conn_list != NULL) 2054 (void) rib_rm_conn(conn, conn_list); 2055 2056 /* 2057 * There is only one case where we get here with 2058 * qp_hdl = NULL, which is during connection setup on 2059 * the client. In such a case there are no posted 2060 * send/recv buffers. 2061 */ 2062 if (qp->qp_hdl != NULL) { 2063 mutex_enter(&qp->posted_rbufs_lock); 2064 while (qp->n_posted_rbufs) 2065 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock); 2066 mutex_exit(&qp->posted_rbufs_lock); 2067 2068 mutex_enter(&qp->send_rbufs_lock); 2069 while (qp->n_send_rbufs) 2070 cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock); 2071 mutex_exit(&qp->send_rbufs_lock); 2072 2073 (void) ibt_free_channel(qp->qp_hdl); 2074 qp->qp_hdl = NULL; 2075 } 2076 2077 ASSERT(qp->rdlist == NULL); 2078 2079 if (qp->replylist != NULL) { 2080 (void) rib_rem_replylist(qp); 2081 } 2082 2083 cv_destroy(&qp->cb_conn_cv); 2084 cv_destroy(&qp->posted_rbufs_cv); 2085 cv_destroy(&qp->send_rbufs_cv); 2086 mutex_destroy(&qp->cb_lock); 2087 mutex_destroy(&qp->replylist_lock); 2088 mutex_destroy(&qp->posted_rbufs_lock); 2089 mutex_destroy(&qp->send_rbufs_lock); 2090 mutex_destroy(&qp->rdlist_lock); 2091 2092 cv_destroy(&conn->c_cv); 2093 mutex_destroy(&conn->c_lock); 2094 2095 if (conn->c_raddr.buf != NULL) { 2096 kmem_free(conn->c_raddr.buf, conn->c_raddr.len); 2097 } 2098 if (conn->c_laddr.buf != NULL) { 2099 kmem_free(conn->c_laddr.buf, conn->c_laddr.len); 2100 } 2101 if (conn->c_netid != NULL) { 2102 kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1)); 2103 } 2104 2105 /* 2106 * Credit control cleanup. 2107 */ 2108 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) { 2109 rdma_clnt_cred_ctrl_t *cc_info; 2110 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 2111 cv_destroy(&cc_info->clnt_cc_cv); 2112 } 2113 2114 kmem_free(qp, sizeof (rib_qp_t)); 2115 2116 /* 2117 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL, 2118 * then the hca is no longer being used. 2119 */ 2120 if (conn_list != NULL) { 2121 rw_enter(&hca->state_lock, RW_READER); 2122 if (hca->state == HCA_DETACHED) { 2123 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 2124 if (hca->srv_conn_list.conn_hd == NULL) { 2125 rw_enter(&hca->cl_conn_list.conn_lock, 2126 RW_READER); 2127 2128 if (hca->cl_conn_list.conn_hd == NULL) { 2129 mutex_enter(&hca->inuse_lock); 2130 hca->inuse = FALSE; 2131 cv_signal(&hca->cb_cv); 2132 mutex_exit(&hca->inuse_lock); 2133 } 2134 rw_exit(&hca->cl_conn_list.conn_lock); 2135 } 2136 rw_exit(&hca->srv_conn_list.conn_lock); 2137 } 2138 rw_exit(&hca->state_lock); 2139 } 2140 2141 return (RDMA_SUCCESS); 2142 } 2143 2144 /* 2145 * All sends are done under the protection of 2146 * the wdesc->sendwait_lock. n_send_rbufs count 2147 * is protected using the send_rbufs_lock. 
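 *
 * The two helpers below are always used in pairs around a signaled
 * send; a rough sketch of the pattern (rib_send_and_wait() and the
 * send completion handler contain the real code):
 *
 *	mutex_enter(&wdesc->sendwait_lock);
 *	...post the signaled work request...
 *	rib_send_hold(qp);		count the outstanding send
 *	mutex_exit(&wdesc->sendwait_lock);
 *	...
 *	rib_send_rele(qp);		on completion, from rib_sendwait()
 *					or the scq handler; when the count
 *					drops to zero this wakes any thread
 *					waiting in rib_disconnect_channel()
 *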
2148 * lock ordering is: 2149 * sendwait_lock -> send_rbufs_lock 2150 */ 2151 2152 void 2153 rib_send_hold(rib_qp_t *qp) 2154 { 2155 mutex_enter(&qp->send_rbufs_lock); 2156 qp->n_send_rbufs++; 2157 mutex_exit(&qp->send_rbufs_lock); 2158 } 2159 2160 void 2161 rib_send_rele(rib_qp_t *qp) 2162 { 2163 mutex_enter(&qp->send_rbufs_lock); 2164 qp->n_send_rbufs--; 2165 if (qp->n_send_rbufs == 0) 2166 cv_signal(&qp->send_rbufs_cv); 2167 mutex_exit(&qp->send_rbufs_lock); 2168 } 2169 2170 /* 2171 * Wait for send completion notification. Only on receiving a 2172 * notification be it a successful or error completion, free the 2173 * send_wid. 2174 */ 2175 static rdma_stat 2176 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 2177 { 2178 clock_t timout, cv_wait_ret; 2179 rdma_stat error = RDMA_SUCCESS; 2180 int i; 2181 2182 /* 2183 * Wait for send to complete 2184 */ 2185 ASSERT(wd != NULL); 2186 mutex_enter(&wd->sendwait_lock); 2187 if (wd->status == (uint_t)SEND_WAIT) { 2188 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 2189 ddi_get_lbolt(); 2190 2191 if (qp->mode == RIB_SERVER) { 2192 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, 2193 &wd->sendwait_lock, timout)) > 0 && 2194 wd->status == (uint_t)SEND_WAIT) 2195 ; 2196 switch (cv_wait_ret) { 2197 case -1: /* timeout */ 2198 DTRACE_PROBE(rpcib__i__srvsendwait__timeout); 2199 2200 wd->cv_sig = 0; /* no signal needed */ 2201 error = RDMA_TIMEDOUT; 2202 break; 2203 default: /* got send completion */ 2204 break; 2205 } 2206 } else { 2207 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, 2208 &wd->sendwait_lock, timout)) > 0 && 2209 wd->status == (uint_t)SEND_WAIT) 2210 ; 2211 switch (cv_wait_ret) { 2212 case -1: /* timeout */ 2213 DTRACE_PROBE(rpcib__i__clntsendwait__timeout); 2214 2215 wd->cv_sig = 0; /* no signal needed */ 2216 error = RDMA_TIMEDOUT; 2217 break; 2218 case 0: /* interrupted */ 2219 DTRACE_PROBE(rpcib__i__clntsendwait__intr); 2220 2221 wd->cv_sig = 0; /* no signal needed */ 2222 error = RDMA_INTR; 2223 break; 2224 default: /* got send completion */ 2225 break; 2226 } 2227 } 2228 } 2229 2230 if (wd->status != (uint_t)SEND_WAIT) { 2231 /* got send completion */ 2232 if (wd->status != RDMA_SUCCESS) { 2233 switch (wd->status) { 2234 case RDMA_CONNLOST: 2235 error = RDMA_CONNLOST; 2236 break; 2237 default: 2238 error = RDMA_FAILED; 2239 break; 2240 } 2241 } 2242 for (i = 0; i < wd->nsbufs; i++) { 2243 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2244 (void *)(uintptr_t)wd->sbufaddr[i]); 2245 } 2246 2247 rib_send_rele(qp); 2248 2249 mutex_exit(&wd->sendwait_lock); 2250 (void) rib_free_sendwait(wd); 2251 2252 } else { 2253 mutex_exit(&wd->sendwait_lock); 2254 } 2255 return (error); 2256 } 2257 2258 static struct send_wid * 2259 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) 2260 { 2261 struct send_wid *wd; 2262 2263 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); 2264 wd->xid = xid; 2265 wd->cv_sig = cv_sig; 2266 wd->qp = qp; 2267 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); 2268 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); 2269 wd->status = (uint_t)SEND_WAIT; 2270 2271 return (wd); 2272 } 2273 2274 static int 2275 rib_free_sendwait(struct send_wid *wdesc) 2276 { 2277 cv_destroy(&wdesc->wait_cv); 2278 mutex_destroy(&wdesc->sendwait_lock); 2279 kmem_free(wdesc, sizeof (*wdesc)); 2280 2281 return (0); 2282 } 2283 2284 static rdma_stat 2285 rib_rem_rep(rib_qp_t *qp, struct reply *rep) 2286 { 2287 mutex_enter(&qp->replylist_lock); 2288 if (rep != NULL) { 2289 (void) rib_remreply(qp, rep); 2290 
mutex_exit(&qp->replylist_lock); 2291 return (RDMA_SUCCESS); 2292 } 2293 mutex_exit(&qp->replylist_lock); 2294 return (RDMA_FAILED); 2295 } 2296 2297 /* 2298 * Send buffers are freed here only in case of error in posting 2299 * on QP. If the post succeeded, the send buffers are freed upon 2300 * send completion in rib_sendwait() or in the scq_handler. 2301 */ 2302 rdma_stat 2303 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2304 int send_sig, int cv_sig, caddr_t *swid) 2305 { 2306 struct send_wid *wdesc; 2307 struct clist *clp; 2308 ibt_status_t ibt_status = IBT_SUCCESS; 2309 rdma_stat ret = RDMA_SUCCESS; 2310 ibt_send_wr_t tx_wr; 2311 int i, nds; 2312 ibt_wr_ds_t sgl[DSEG_MAX]; 2313 uint_t total_msg_size; 2314 rib_qp_t *qp; 2315 2316 qp = ctoqp(conn); 2317 2318 ASSERT(cl != NULL); 2319 2320 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2321 2322 nds = 0; 2323 total_msg_size = 0; 2324 clp = cl; 2325 while (clp != NULL) { 2326 if (nds >= DSEG_MAX) { 2327 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); 2328 return (RDMA_FAILED); 2329 } 2330 sgl[nds].ds_va = clp->w.c_saddr; 2331 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2332 sgl[nds].ds_len = clp->c_len; 2333 total_msg_size += clp->c_len; 2334 clp = clp->c_next; 2335 nds++; 2336 } 2337 2338 if (send_sig) { 2339 /* Set SEND_SIGNAL flag. */ 2340 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2341 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2342 *swid = (caddr_t)wdesc; 2343 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2344 mutex_enter(&wdesc->sendwait_lock); 2345 wdesc->nsbufs = nds; 2346 for (i = 0; i < nds; i++) { 2347 wdesc->sbufaddr[i] = sgl[i].ds_va; 2348 } 2349 } else { 2350 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2351 *swid = NULL; 2352 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2353 } 2354 2355 tx_wr.wr_opcode = IBT_WRC_SEND; 2356 tx_wr.wr_trans = IBT_RC_SRV; 2357 tx_wr.wr_nds = nds; 2358 tx_wr.wr_sgl = sgl; 2359 2360 mutex_enter(&conn->c_lock); 2361 if (conn->c_state == C_CONNECTED) { 2362 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2363 } 2364 if (conn->c_state != C_CONNECTED || 2365 ibt_status != IBT_SUCCESS) { 2366 if (conn->c_state != C_DISCONN_PEND) 2367 conn->c_state = C_ERROR_CONN; 2368 mutex_exit(&conn->c_lock); 2369 if (send_sig) { 2370 for (i = 0; i < nds; i++) { 2371 rib_rbuf_free(conn, SEND_BUFFER, 2372 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2373 } 2374 mutex_exit(&wdesc->sendwait_lock); 2375 (void) rib_free_sendwait(wdesc); 2376 } 2377 return (RDMA_CONNLOST); 2378 } 2379 2380 mutex_exit(&conn->c_lock); 2381 2382 if (send_sig) { 2383 rib_send_hold(qp); 2384 mutex_exit(&wdesc->sendwait_lock); 2385 if (cv_sig) { 2386 /* 2387 * cv_wait for send to complete. 2388 * We can fail due to a timeout or signal or 2389 * unsuccessful send. 2390 */ 2391 ret = rib_sendwait(qp, wdesc); 2392 2393 return (ret); 2394 } 2395 } 2396 2397 return (RDMA_SUCCESS); 2398 } 2399 2400 2401 rdma_stat 2402 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2403 { 2404 rdma_stat ret; 2405 caddr_t wd; 2406 2407 /* send-wait & cv_signal */ 2408 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); 2409 return (ret); 2410 } 2411 2412 /* 2413 * Deprecated/obsolete interface not used currently 2414 * but earlier used for READ-READ protocol. 2415 * Send RPC reply and wait for RDMA_DONE. 
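 *
 * The mechanism, in outline: an rdma_done entry keyed by msgid is
 * added under rdlist_lock before the reply is sent; after a
 * successful send the caller blocks on that entry's condition
 * variable for up to REPLY_WAIT_TIME seconds waiting for the peer's
 * RDMA_DONE, and the entry is removed again whether or not the
 * notification arrived.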
2416 */ 2417 rdma_stat 2418 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2419 { 2420 rdma_stat ret = RDMA_SUCCESS; 2421 struct rdma_done_list *rd; 2422 clock_t timout, cv_wait_ret; 2423 caddr_t *wid = NULL; 2424 rib_qp_t *qp = ctoqp(conn); 2425 2426 mutex_enter(&qp->rdlist_lock); 2427 rd = rdma_done_add(qp, msgid); 2428 2429 /* No cv_signal (whether send-wait or no-send-wait) */ 2430 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); 2431 2432 if (ret != RDMA_SUCCESS) { 2433 rdma_done_rm(qp, rd); 2434 } else { 2435 /* 2436 * Wait for RDMA_DONE from remote end 2437 */ 2438 timout = 2439 drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt(); 2440 cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, 2441 &qp->rdlist_lock, 2442 timout); 2443 2444 rdma_done_rm(qp, rd); 2445 2446 if (cv_wait_ret < 0) { 2447 ret = RDMA_TIMEDOUT; 2448 } 2449 } 2450 2451 mutex_exit(&qp->rdlist_lock); 2452 return (ret); 2453 } 2454 2455 static struct recv_wid * 2456 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2457 { 2458 struct recv_wid *rwid; 2459 2460 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2461 rwid->xid = msgid; 2462 rwid->addr = sgl->ds_va; 2463 rwid->qp = qp; 2464 2465 return (rwid); 2466 } 2467 2468 static void 2469 rib_free_wid(struct recv_wid *rwid) 2470 { 2471 kmem_free(rwid, sizeof (struct recv_wid)); 2472 } 2473 2474 rdma_stat 2475 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2476 { 2477 rib_qp_t *qp = ctoqp(conn); 2478 struct clist *clp = cl; 2479 struct reply *rep; 2480 struct recv_wid *rwid; 2481 int nds; 2482 ibt_wr_ds_t sgl[DSEG_MAX]; 2483 ibt_recv_wr_t recv_wr; 2484 rdma_stat ret; 2485 ibt_status_t ibt_status; 2486 2487 /* 2488 * rdma_clnt_postrecv uses RECV_BUFFER. 2489 */ 2490 2491 nds = 0; 2492 while (cl != NULL) { 2493 if (nds >= DSEG_MAX) { 2494 ret = RDMA_FAILED; 2495 goto done; 2496 } 2497 sgl[nds].ds_va = cl->w.c_saddr; 2498 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2499 sgl[nds].ds_len = cl->c_len; 2500 cl = cl->c_next; 2501 nds++; 2502 } 2503 2504 if (nds != 1) { 2505 ret = RDMA_FAILED; 2506 goto done; 2507 } 2508 2509 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2510 recv_wr.wr_nds = nds; 2511 recv_wr.wr_sgl = sgl; 2512 2513 rwid = rib_create_wid(qp, &sgl[0], msgid); 2514 if (rwid) { 2515 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; 2516 } else { 2517 ret = RDMA_NORESOURCE; 2518 goto done; 2519 } 2520 rep = rib_addreplylist(qp, msgid); 2521 if (!rep) { 2522 rib_free_wid(rwid); 2523 ret = RDMA_NORESOURCE; 2524 goto done; 2525 } 2526 2527 mutex_enter(&conn->c_lock); 2528 2529 if (conn->c_state == C_CONNECTED) { 2530 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2531 } 2532 2533 if (conn->c_state != C_CONNECTED || 2534 ibt_status != IBT_SUCCESS) { 2535 if (conn->c_state != C_DISCONN_PEND) 2536 conn->c_state = C_ERROR_CONN; 2537 mutex_exit(&conn->c_lock); 2538 rib_free_wid(rwid); 2539 (void) rib_rem_rep(qp, rep); 2540 ret = RDMA_CONNLOST; 2541 goto done; 2542 } 2543 mutex_exit(&conn->c_lock); 2544 return (RDMA_SUCCESS); 2545 2546 done: 2547 while (clp != NULL) { 2548 rib_rbuf_free(conn, RECV_BUFFER, 2549 (void *)(uintptr_t)clp->w.c_saddr3); 2550 clp = clp->c_next; 2551 } 2552 return (ret); 2553 } 2554 2555 rdma_stat 2556 rib_svc_post(CONN* conn, struct clist *cl) 2557 { 2558 rib_qp_t *qp = ctoqp(conn); 2559 struct svc_recv *s_recvp; 2560 int nds; 2561 ibt_wr_ds_t sgl[DSEG_MAX]; 2562 ibt_recv_wr_t recv_wr; 2563 ibt_status_t ibt_status; 2564 2565 nds = 0; 2566 while (cl != NULL) { 2567 if (nds >= DSEG_MAX) { 
2568 return (RDMA_FAILED); 2569 } 2570 sgl[nds].ds_va = cl->w.c_saddr; 2571 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2572 sgl[nds].ds_len = cl->c_len; 2573 cl = cl->c_next; 2574 nds++; 2575 } 2576 2577 if (nds != 1) { 2578 rib_rbuf_free(conn, RECV_BUFFER, 2579 (caddr_t)(uintptr_t)sgl[0].ds_va); 2580 2581 return (RDMA_FAILED); 2582 } 2583 2584 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2585 recv_wr.wr_nds = nds; 2586 recv_wr.wr_sgl = sgl; 2587 2588 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2589 /* Use s_recvp's addr as wr id */ 2590 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; 2591 mutex_enter(&conn->c_lock); 2592 if (conn->c_state == C_CONNECTED) { 2593 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2594 } 2595 if (conn->c_state != C_CONNECTED || 2596 ibt_status != IBT_SUCCESS) { 2597 if (conn->c_state != C_DISCONN_PEND) 2598 conn->c_state = C_ERROR_CONN; 2599 mutex_exit(&conn->c_lock); 2600 rib_rbuf_free(conn, RECV_BUFFER, 2601 (caddr_t)(uintptr_t)sgl[0].ds_va); 2602 (void) rib_free_svc_recv(s_recvp); 2603 2604 return (RDMA_CONNLOST); 2605 } 2606 mutex_exit(&conn->c_lock); 2607 2608 return (RDMA_SUCCESS); 2609 } 2610 2611 /* Client */ 2612 rdma_stat 2613 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2614 { 2615 2616 return (rib_clnt_post(conn, cl, msgid)); 2617 } 2618 2619 /* Client */ 2620 rdma_stat 2621 rib_post_resp_remove(CONN* conn, uint32_t msgid) 2622 { 2623 rib_qp_t *qp = ctoqp(conn); 2624 struct reply *rep; 2625 2626 mutex_enter(&qp->replylist_lock); 2627 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2628 if (rep->xid == msgid) { 2629 if (rep->vaddr_cq) { 2630 rib_rbuf_free(conn, RECV_BUFFER, 2631 (caddr_t)(uintptr_t)rep->vaddr_cq); 2632 } 2633 (void) rib_remreply(qp, rep); 2634 break; 2635 } 2636 } 2637 mutex_exit(&qp->replylist_lock); 2638 2639 return (RDMA_SUCCESS); 2640 } 2641 2642 /* Server */ 2643 rdma_stat 2644 rib_post_recv(CONN *conn, struct clist *cl) 2645 { 2646 rib_qp_t *qp = ctoqp(conn); 2647 2648 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2649 mutex_enter(&qp->posted_rbufs_lock); 2650 qp->n_posted_rbufs++; 2651 mutex_exit(&qp->posted_rbufs_lock); 2652 return (RDMA_SUCCESS); 2653 } 2654 return (RDMA_FAILED); 2655 } 2656 2657 /* 2658 * Client side only interface to "recv" the rpc reply buf 2659 * posted earlier by rib_post_resp(conn, cl, msgid). 2660 */ 2661 rdma_stat 2662 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2663 { 2664 struct reply *rep = NULL; 2665 clock_t timout, cv_wait_ret; 2666 rdma_stat ret = RDMA_SUCCESS; 2667 rib_qp_t *qp = ctoqp(conn); 2668 2669 /* 2670 * Find the reply structure for this msgid 2671 */ 2672 mutex_enter(&qp->replylist_lock); 2673 2674 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2675 if (rep->xid == msgid) 2676 break; 2677 } 2678 2679 if (rep != NULL) { 2680 /* 2681 * If message not yet received, wait. 
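 * The wait is bounded by REPLY_WAIT_TIME and is interruptible:
 * a timeout returns RDMA_TIMEDOUT, a signal returns RDMA_INTR,
 * and otherwise the reply has arrived. On success the received
 * buffer is handed back to the caller as a single-element clist;
 * on a completion error the buffer is freed here instead.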
2682 */ 2683 if (rep->status == (uint_t)REPLY_WAIT) { 2684 timout = ddi_get_lbolt() + 2685 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2686 2687 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2688 &qp->replylist_lock, timout)) > 0 && 2689 rep->status == (uint_t)REPLY_WAIT) 2690 ; 2691 2692 switch (cv_wait_ret) { 2693 case -1: /* timeout */ 2694 ret = RDMA_TIMEDOUT; 2695 break; 2696 case 0: 2697 ret = RDMA_INTR; 2698 break; 2699 default: 2700 break; 2701 } 2702 } 2703 2704 if (rep->status == RDMA_SUCCESS) { 2705 struct clist *cl = NULL; 2706 2707 /* 2708 * Got message successfully 2709 */ 2710 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2711 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); 2712 *clp = cl; 2713 } else { 2714 if (rep->status != (uint_t)REPLY_WAIT) { 2715 /* 2716 * Got error in reply message. Free 2717 * recv buffer here. 2718 */ 2719 ret = rep->status; 2720 rib_rbuf_free(conn, RECV_BUFFER, 2721 (caddr_t)(uintptr_t)rep->vaddr_cq); 2722 } 2723 } 2724 (void) rib_remreply(qp, rep); 2725 } else { 2726 /* 2727 * No matching reply structure found for given msgid on the 2728 * reply wait list. 2729 */ 2730 ret = RDMA_INVAL; 2731 DTRACE_PROBE(rpcib__i__nomatchxid2); 2732 } 2733 2734 /* 2735 * Done. 2736 */ 2737 mutex_exit(&qp->replylist_lock); 2738 return (ret); 2739 } 2740 2741 /* 2742 * RDMA write a buffer to the remote address. 2743 */ 2744 rdma_stat 2745 rib_write(CONN *conn, struct clist *cl, int wait) 2746 { 2747 ibt_send_wr_t tx_wr; 2748 int cv_sig; 2749 ibt_wr_ds_t sgl[DSEG_MAX]; 2750 struct send_wid *wdesc; 2751 ibt_status_t ibt_status; 2752 rdma_stat ret = RDMA_SUCCESS; 2753 rib_qp_t *qp = ctoqp(conn); 2754 uint64_t n_writes = 0; 2755 2756 if (cl == NULL) { 2757 return (RDMA_FAILED); 2758 } 2759 2760 while ((cl != NULL)) { 2761 if (cl->c_len > 0) { 2762 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2763 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; 2764 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = 2765 cl->c_dmemhandle.mrc_rmr; /* rkey */ 2766 sgl[0].ds_va = cl->w.c_saddr; 2767 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2768 sgl[0].ds_len = cl->c_len; 2769 2770 if (wait) { 2771 cv_sig = 1; 2772 } else { 2773 if (n_writes > max_unsignaled_rws) { 2774 n_writes = 0; 2775 cv_sig = 1; 2776 } else { 2777 cv_sig = 0; 2778 } 2779 } 2780 2781 if (cv_sig) { 2782 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2783 wdesc = rib_init_sendwait(0, cv_sig, qp); 2784 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2785 mutex_enter(&wdesc->sendwait_lock); 2786 } else { 2787 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2788 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2789 } 2790 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2791 tx_wr.wr_trans = IBT_RC_SRV; 2792 tx_wr.wr_nds = 1; 2793 tx_wr.wr_sgl = sgl; 2794 2795 mutex_enter(&conn->c_lock); 2796 if (conn->c_state == C_CONNECTED) { 2797 ibt_status = 2798 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2799 } 2800 if (conn->c_state != C_CONNECTED || 2801 ibt_status != IBT_SUCCESS) { 2802 if (conn->c_state != C_DISCONN_PEND) 2803 conn->c_state = C_ERROR_CONN; 2804 mutex_exit(&conn->c_lock); 2805 if (cv_sig) { 2806 mutex_exit(&wdesc->sendwait_lock); 2807 (void) rib_free_sendwait(wdesc); 2808 } 2809 return (RDMA_CONNLOST); 2810 } 2811 2812 mutex_exit(&conn->c_lock); 2813 2814 /* 2815 * Wait for send to complete 2816 */ 2817 if (cv_sig) { 2818 2819 rib_send_hold(qp); 2820 mutex_exit(&wdesc->sendwait_lock); 2821 2822 ret = rib_sendwait(qp, wdesc); 2823 if (ret != 0) 2824 return (ret); 2825 } 2826 n_writes ++; 2827 } 2828 cl = cl->c_next; 2829 } 2830 return (RDMA_SUCCESS); 2831 } 2832 2833 /* 2834 
* RDMA Read a buffer from the remote address. 2835 */ 2836 rdma_stat 2837 rib_read(CONN *conn, struct clist *cl, int wait) 2838 { 2839 ibt_send_wr_t rx_wr; 2840 int cv_sig = 0; 2841 ibt_wr_ds_t sgl; 2842 struct send_wid *wdesc; 2843 ibt_status_t ibt_status = IBT_SUCCESS; 2844 rdma_stat ret = RDMA_SUCCESS; 2845 rib_qp_t *qp = ctoqp(conn); 2846 2847 if (cl == NULL) { 2848 return (RDMA_FAILED); 2849 } 2850 2851 while (cl != NULL) { 2852 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2853 /* 2854 * Remote address is at the head chunk item in list. 2855 */ 2856 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; 2857 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; 2858 2859 sgl.ds_va = cl->u.c_daddr; 2860 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2861 sgl.ds_len = cl->c_len; 2862 2863 /* 2864 * If there are multiple chunks to be read, and 2865 * wait is set, ask for signal only for the last chunk 2866 * and wait only on the last chunk. The completion of 2867 * RDMA_READ on last chunk ensures that reads on all 2868 * previous chunks are also completed. 2869 */ 2870 if (wait && (cl->c_next == NULL)) { 2871 cv_sig = 1; 2872 wdesc = rib_init_sendwait(0, cv_sig, qp); 2873 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2874 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2875 mutex_enter(&wdesc->sendwait_lock); 2876 } else { 2877 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2878 rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2879 } 2880 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2881 rx_wr.wr_trans = IBT_RC_SRV; 2882 rx_wr.wr_nds = 1; 2883 rx_wr.wr_sgl = &sgl; 2884 2885 mutex_enter(&conn->c_lock); 2886 if (conn->c_state == C_CONNECTED) { 2887 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2888 } 2889 if (conn->c_state != C_CONNECTED || 2890 ibt_status != IBT_SUCCESS) { 2891 if (conn->c_state != C_DISCONN_PEND) 2892 conn->c_state = C_ERROR_CONN; 2893 mutex_exit(&conn->c_lock); 2894 if (wait && (cl->c_next == NULL)) { 2895 mutex_exit(&wdesc->sendwait_lock); 2896 (void) rib_free_sendwait(wdesc); 2897 } 2898 return (RDMA_CONNLOST); 2899 } 2900 2901 mutex_exit(&conn->c_lock); 2902 2903 /* 2904 * Wait for send to complete if this is the 2905 * last item in the list. 2906 */ 2907 if (wait && cl->c_next == NULL) { 2908 rib_send_hold(qp); 2909 mutex_exit(&wdesc->sendwait_lock); 2910 2911 ret = rib_sendwait(qp, wdesc); 2912 2913 if (ret != 0) 2914 return (ret); 2915 } 2916 cl = cl->c_next; 2917 } 2918 return (RDMA_SUCCESS); 2919 } 2920 2921 /* 2922 * rib_srv_cm_handler() 2923 * Connection Manager callback to handle RC connection requests. 2924 */ 2925 /* ARGSUSED */ 2926 static ibt_cm_status_t 2927 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2928 ibt_cm_return_args_t *ret_args, void *priv_data, 2929 ibt_priv_data_len_t len) 2930 { 2931 queue_t *q; 2932 rib_qp_t *qp; 2933 rib_hca_t *hca; 2934 rdma_stat status = RDMA_SUCCESS; 2935 int i; 2936 struct clist cl; 2937 rdma_buf_t rdbuf = {0}; 2938 void *buf = NULL; 2939 CONN *conn; 2940 ibt_ip_cm_info_t ipinfo; 2941 struct sockaddr_in *s; 2942 struct sockaddr_in6 *s6; 2943 int sin_size = sizeof (struct sockaddr_in); 2944 int in_size = sizeof (struct in_addr); 2945 int sin6_size = sizeof (struct sockaddr_in6); 2946 2947 ASSERT(any != NULL); 2948 ASSERT(event != NULL); 2949 2950 hca = (rib_hca_t *)any; 2951 2952 /* got a connection request */ 2953 switch (event->cm_type) { 2954 case IBT_CM_EVENT_REQ_RCV: 2955 /* 2956 * If the plugin is in the NO_ACCEPT state, bail out. 
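 * NO_ACCEPT is set by rib_listen_stop() when KRPC tears down the
 * master transport handle, so rejecting the request here keeps new
 * server-side channels from being created during that teardown.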
2957 */ 2958 mutex_enter(&plugin_state_lock); 2959 if (plugin_state == NO_ACCEPT) { 2960 mutex_exit(&plugin_state_lock); 2961 return (IBT_CM_REJECT); 2962 } 2963 mutex_exit(&plugin_state_lock); 2964 2965 /* 2966 * Need to send a MRA MAD to CM so that it does not 2967 * timeout on us. 2968 */ 2969 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2970 event->cm_event.req.req_timeout * 8, NULL, 0); 2971 2972 mutex_enter(&rib_stat->open_hca_lock); 2973 q = rib_stat->q; 2974 mutex_exit(&rib_stat->open_hca_lock); 2975 2976 status = rib_svc_create_chan(hca, (caddr_t)q, 2977 event->cm_event.req.req_prim_hca_port, &qp); 2978 2979 if (status) { 2980 return (IBT_CM_REJECT); 2981 } 2982 2983 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2984 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 2985 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 2986 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2987 2988 /* 2989 * Pre-posts RECV buffers 2990 */ 2991 conn = qptoc(qp); 2992 for (i = 0; i < preposted_rbufs; i++) { 2993 bzero(&rdbuf, sizeof (rdbuf)); 2994 rdbuf.type = RECV_BUFFER; 2995 buf = rib_rbuf_alloc(conn, &rdbuf); 2996 if (buf == NULL) { 2997 /* 2998 * A connection is not established yet. 2999 * Just flush the channel. Buffers 3000 * posted till now will error out with 3001 * IBT_WC_WR_FLUSHED_ERR. 3002 */ 3003 (void) ibt_flush_channel(qp->qp_hdl); 3004 (void) rib_disconnect_channel(conn, NULL); 3005 return (IBT_CM_REJECT); 3006 } 3007 3008 bzero(&cl, sizeof (cl)); 3009 cl.w.c_saddr3 = (caddr_t)rdbuf.addr; 3010 cl.c_len = rdbuf.len; 3011 cl.c_smemhandle.mrc_lmr = 3012 rdbuf.handle.mrc_lmr; /* lkey */ 3013 cl.c_next = NULL; 3014 status = rib_post_recv(conn, &cl); 3015 if (status != RDMA_SUCCESS) { 3016 /* 3017 * A connection is not established yet. 3018 * Just flush the channel. Buffers 3019 * posted till now will error out with 3020 * IBT_WC_WR_FLUSHED_ERR. 
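 *
 * rib_disconnect_channel() is passed a NULL conn_list here because
 * this connection has not been added to hca->srv_conn_list yet;
 * that only happens after every buffer has been posted successfully.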
3021 */ 3022 (void) ibt_flush_channel(qp->qp_hdl); 3023 (void) rib_disconnect_channel(conn, NULL); 3024 return (IBT_CM_REJECT); 3025 } 3026 } 3027 (void) rib_add_connlist(conn, &hca->srv_conn_list); 3028 3029 /* 3030 * Get the address translation 3031 */ 3032 rw_enter(&hca->state_lock, RW_READER); 3033 if (hca->state == HCA_DETACHED) { 3034 rw_exit(&hca->state_lock); 3035 return (IBT_CM_REJECT); 3036 } 3037 rw_exit(&hca->state_lock); 3038 3039 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t)); 3040 3041 if (ibt_get_ip_data(event->cm_priv_data_len, 3042 event->cm_priv_data, 3043 &ipinfo) != IBT_SUCCESS) { 3044 3045 return (IBT_CM_REJECT); 3046 } 3047 3048 switch (ipinfo.src_addr.family) { 3049 case AF_INET: 3050 3051 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, 3052 KM_SLEEP); 3053 (void) strcpy(conn->c_netid, RIBNETID_TCP); 3054 3055 conn->c_raddr.maxlen = 3056 conn->c_raddr.len = sin_size; 3057 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3058 3059 s = (struct sockaddr_in *)conn->c_raddr.buf; 3060 s->sin_family = AF_INET; 3061 bcopy((void *)&ipinfo.src_addr.un.ip4addr, 3062 &s->sin_addr, in_size); 3063 3064 conn->c_laddr.maxlen = 3065 conn->c_laddr.len = sin_size; 3066 conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3067 3068 s = (struct sockaddr_in *)conn->c_laddr.buf; 3069 s->sin_family = AF_INET; 3070 bcopy((void *)&ipinfo.dst_addr.un.ip4addr, 3071 &s->sin_addr, in_size); 3072 3073 break; 3074 3075 case AF_INET6: 3076 3077 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, 3078 KM_SLEEP); 3079 (void) strcpy(conn->c_netid, RIBNETID_TCP6); 3080 3081 conn->c_raddr.maxlen = 3082 conn->c_raddr.len = sin6_size; 3083 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3084 3085 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 3086 s6->sin6_family = AF_INET6; 3087 bcopy((void *)&ipinfo.src_addr.un.ip6addr, 3088 &s6->sin6_addr, 3089 sizeof (struct in6_addr)); 3090 3091 conn->c_laddr.maxlen = 3092 conn->c_laddr.len = sin6_size; 3093 conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3094 3095 s6 = (struct sockaddr_in6 *)conn->c_laddr.buf; 3096 s6->sin6_family = AF_INET6; 3097 bcopy((void *)&ipinfo.dst_addr.un.ip6addr, 3098 &s6->sin6_addr, 3099 sizeof (struct in6_addr)); 3100 3101 break; 3102 3103 default: 3104 return (IBT_CM_REJECT); 3105 } 3106 3107 break; 3108 3109 case IBT_CM_EVENT_CONN_CLOSED: 3110 { 3111 CONN *conn; 3112 rib_qp_t *qp; 3113 3114 switch (event->cm_event.closed) { 3115 case IBT_CM_CLOSED_DREP_RCVD: 3116 case IBT_CM_CLOSED_DREQ_TIMEOUT: 3117 case IBT_CM_CLOSED_DUP: 3118 case IBT_CM_CLOSED_ABORT: 3119 case IBT_CM_CLOSED_ALREADY: 3120 /* 3121 * These cases indicate the local end initiated 3122 * the closing of the channel. Nothing to do here. 3123 */ 3124 break; 3125 default: 3126 /* 3127 * Reason for CONN_CLOSED event must be one of 3128 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 3129 * or IBT_CM_CLOSED_STALE. These indicate cases were 3130 * the remote end is closing the channel. 
In these 3131 * cases free the channel and transition to error 3132 * state 3133 */ 3134 qp = ibt_get_chan_private(event->cm_channel); 3135 conn = qptoc(qp); 3136 mutex_enter(&conn->c_lock); 3137 if (conn->c_state == C_DISCONN_PEND) { 3138 mutex_exit(&conn->c_lock); 3139 break; 3140 } 3141 conn->c_state = C_ERROR_CONN; 3142 3143 /* 3144 * Free the conn if c_ref goes down to 0 3145 */ 3146 if (conn->c_ref == 0) { 3147 /* 3148 * Remove from list and free conn 3149 */ 3150 conn->c_state = C_DISCONN_PEND; 3151 mutex_exit(&conn->c_lock); 3152 (void) rib_disconnect_channel(conn, 3153 &hca->srv_conn_list); 3154 } else { 3155 /* 3156 * conn will be freed when c_ref goes to 0. 3157 * Indicate to cleaning thread not to close 3158 * the connection, but just free the channel. 3159 */ 3160 conn->c_flags |= C_CLOSE_NOTNEEDED; 3161 mutex_exit(&conn->c_lock); 3162 } 3163 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect); 3164 break; 3165 } 3166 break; 3167 } 3168 case IBT_CM_EVENT_CONN_EST: 3169 /* 3170 * RTU received, hence connection established. 3171 */ 3172 if (rib_debug > 1) 3173 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3174 "(CONN_EST) channel established"); 3175 break; 3176 3177 default: 3178 if (rib_debug > 2) { 3179 /* Let CM handle the following events. */ 3180 if (event->cm_type == IBT_CM_EVENT_REP_RCV) { 3181 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3182 "server recv'ed IBT_CM_EVENT_REP_RCV\n"); 3183 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) { 3184 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3185 "server recv'ed IBT_CM_EVENT_LAP_RCV\n"); 3186 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) { 3187 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3188 "server recv'ed IBT_CM_EVENT_MRA_RCV\n"); 3189 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) { 3190 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3191 "server recv'ed IBT_CM_EVENT_APR_RCV\n"); 3192 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) { 3193 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3194 "server recv'ed IBT_CM_EVENT_FAILURE\n"); 3195 } 3196 } 3197 return (IBT_CM_DEFAULT); 3198 } 3199 3200 /* accept all other CM messages (i.e. let the CM handle them) */ 3201 return (IBT_CM_ACCEPT); 3202 } 3203 3204 static rdma_stat 3205 rib_register_service(rib_hca_t *hca, int service_type, 3206 uint8_t protocol_num, in_port_t dst_port) 3207 { 3208 ibt_srv_desc_t sdesc; 3209 ibt_hca_portinfo_t *port_infop; 3210 ib_svc_id_t srv_id; 3211 ibt_srv_hdl_t srv_hdl; 3212 uint_t port_size; 3213 uint_t pki, i, num_ports, nbinds; 3214 ibt_status_t ibt_status; 3215 rib_service_t *service; 3216 ib_pkey_t pkey; 3217 3218 /* 3219 * Query all ports for the given HCA 3220 */ 3221 rw_enter(&hca->state_lock, RW_READER); 3222 if (hca->state != HCA_DETACHED) { 3223 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop, 3224 &num_ports, &port_size); 3225 rw_exit(&hca->state_lock); 3226 } else { 3227 rw_exit(&hca->state_lock); 3228 return (RDMA_FAILED); 3229 } 3230 if (ibt_status != IBT_SUCCESS) { 3231 return (RDMA_FAILED); 3232 } 3233 3234 DTRACE_PROBE1(rpcib__i__regservice_numports, 3235 int, num_ports); 3236 3237 for (i = 0; i < num_ports; i++) { 3238 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) { 3239 DTRACE_PROBE1(rpcib__i__regservice__portinactive, 3240 int, i+1); 3241 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) { 3242 DTRACE_PROBE1(rpcib__i__regservice__portactive, 3243 int, i+1); 3244 } 3245 } 3246 3247 /* 3248 * Get all the IP addresses on this system to register the 3249 * given "service type" on all DNS recognized IP addrs. 
3250 * Each service type such as NFS will have all the systems 3251 * IP addresses as its different names. For now the only 3252 * type of service we support in RPCIB is NFS. 3253 */ 3254 rw_enter(&rib_stat->service_list_lock, RW_WRITER); 3255 /* 3256 * Start registering and binding service to active 3257 * on active ports on this HCA. 3258 */ 3259 nbinds = 0; 3260 for (service = rib_stat->service_list; 3261 service && (service->srv_type != service_type); 3262 service = service->next) 3263 ; 3264 3265 if (service == NULL) { 3266 /* 3267 * We use IP addresses as the service names for 3268 * service registration. Register each of them 3269 * with CM to obtain a svc_id and svc_hdl. We do not 3270 * register the service with machine's loopback address. 3271 */ 3272 (void) bzero(&srv_id, sizeof (ib_svc_id_t)); 3273 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t)); 3274 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t)); 3275 sdesc.sd_handler = rib_srv_cm_handler; 3276 sdesc.sd_flags = 0; 3277 ibt_status = ibt_register_service(hca->ibt_clnt_hdl, 3278 &sdesc, ibt_get_ip_sid(protocol_num, dst_port), 3279 1, &srv_hdl, &srv_id); 3280 if ((ibt_status != IBT_SUCCESS) && 3281 (ibt_status != IBT_CM_SERVICE_EXISTS)) { 3282 rw_exit(&rib_stat->service_list_lock); 3283 DTRACE_PROBE1(rpcib__i__regservice__ibtres, 3284 int, ibt_status); 3285 ibt_free_portinfo(port_infop, port_size); 3286 return (RDMA_FAILED); 3287 } 3288 3289 /* 3290 * Allocate and prepare a service entry 3291 */ 3292 service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP); 3293 3294 service->srv_type = service_type; 3295 service->srv_hdl = srv_hdl; 3296 service->srv_id = srv_id; 3297 3298 service->next = rib_stat->service_list; 3299 rib_stat->service_list = service; 3300 DTRACE_PROBE1(rpcib__i__regservice__new__service, 3301 int, service->srv_type); 3302 } else { 3303 srv_hdl = service->srv_hdl; 3304 srv_id = service->srv_id; 3305 DTRACE_PROBE1(rpcib__i__regservice__existing__service, 3306 int, service->srv_type); 3307 } 3308 3309 for (i = 0; i < num_ports; i++) { 3310 ibt_sbind_hdl_t sbp; 3311 rib_hca_service_t *hca_srv; 3312 ib_gid_t gid; 3313 3314 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) 3315 continue; 3316 3317 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) { 3318 pkey = port_infop[i].p_pkey_tbl[pki]; 3319 3320 rw_enter(&hca->bound_services_lock, RW_READER); 3321 gid = port_infop[i].p_sgid_tbl[0]; 3322 for (hca_srv = hca->bound_services; hca_srv; 3323 hca_srv = hca_srv->next) { 3324 if ((hca_srv->srv_id == service->srv_id) && 3325 (hca_srv->gid.gid_prefix == 3326 gid.gid_prefix) && 3327 (hca_srv->gid.gid_guid == gid.gid_guid)) 3328 break; 3329 } 3330 rw_exit(&hca->bound_services_lock); 3331 if (hca_srv != NULL) { 3332 /* 3333 * port is alreay bound the the service 3334 */ 3335 DTRACE_PROBE1( 3336 rpcib__i__regservice__already__bound, 3337 int, i+1); 3338 nbinds++; 3339 continue; 3340 } 3341 3342 if ((pkey & IBSRM_HB) && 3343 (pkey != IB_PKEY_INVALID_FULL)) { 3344 3345 sbp = NULL; 3346 ibt_status = ibt_bind_service(srv_hdl, 3347 gid, NULL, hca, &sbp); 3348 3349 if (ibt_status == IBT_SUCCESS) { 3350 hca_srv = kmem_zalloc( 3351 sizeof (rib_hca_service_t), 3352 KM_SLEEP); 3353 hca_srv->srv_id = srv_id; 3354 hca_srv->gid = gid; 3355 hca_srv->sbind_hdl = sbp; 3356 3357 rw_enter(&hca->bound_services_lock, 3358 RW_WRITER); 3359 hca_srv->next = hca->bound_services; 3360 hca->bound_services = hca_srv; 3361 rw_exit(&hca->bound_services_lock); 3362 nbinds++; 3363 } 3364 3365 DTRACE_PROBE1(rpcib__i__regservice__bindres, 3366 int, 
ibt_status); 3367 } 3368 } 3369 } 3370 rw_exit(&rib_stat->service_list_lock); 3371 3372 ibt_free_portinfo(port_infop, port_size); 3373 3374 if (nbinds == 0) { 3375 return (RDMA_FAILED); 3376 } else { 3377 /* 3378 * Put this plugin into accept state, since atleast 3379 * one registration was successful. 3380 */ 3381 mutex_enter(&plugin_state_lock); 3382 plugin_state = ACCEPT; 3383 mutex_exit(&plugin_state_lock); 3384 return (RDMA_SUCCESS); 3385 } 3386 } 3387 3388 void 3389 rib_listen(struct rdma_svc_data *rd) 3390 { 3391 rdma_stat status; 3392 int n_listening = 0; 3393 rib_hca_t *hca; 3394 3395 mutex_enter(&rib_stat->listen_lock); 3396 /* 3397 * if rd parameter is NULL then it means that rib_stat->q is 3398 * already initialized by a call from RDMA and we just want to 3399 * add a newly attached HCA to the same listening state as other 3400 * HCAs. 3401 */ 3402 if (rd == NULL) { 3403 if (rib_stat->q == NULL) { 3404 mutex_exit(&rib_stat->listen_lock); 3405 return; 3406 } 3407 } else { 3408 rib_stat->q = &rd->q; 3409 } 3410 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 3411 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 3412 /* 3413 * First check if a hca is still attached 3414 */ 3415 rw_enter(&hca->state_lock, RW_READER); 3416 if (hca->state != HCA_INITED) { 3417 rw_exit(&hca->state_lock); 3418 continue; 3419 } 3420 rw_exit(&hca->state_lock); 3421 3422 /* 3423 * Right now the only service type is NFS. Hence 3424 * force feed this value. Ideally to communicate 3425 * the service type it should be passed down in 3426 * rdma_svc_data. 3427 */ 3428 status = rib_register_service(hca, NFS, 3429 IPPROTO_TCP, nfs_rdma_port); 3430 if (status == RDMA_SUCCESS) 3431 n_listening++; 3432 } 3433 rw_exit(&rib_stat->hcas_list_lock); 3434 3435 /* 3436 * Service active on an HCA, check rd->err_code for more 3437 * explainable errors. 3438 */ 3439 if (rd) { 3440 if (n_listening > 0) { 3441 rd->active = 1; 3442 rd->err_code = RDMA_SUCCESS; 3443 } else { 3444 rd->active = 0; 3445 rd->err_code = RDMA_FAILED; 3446 } 3447 } 3448 mutex_exit(&rib_stat->listen_lock); 3449 } 3450 3451 /* XXXX */ 3452 /* ARGSUSED */ 3453 static void 3454 rib_listen_stop(struct rdma_svc_data *svcdata) 3455 { 3456 rib_hca_t *hca; 3457 3458 mutex_enter(&rib_stat->listen_lock); 3459 /* 3460 * KRPC called the RDMATF to stop the listeners, this means 3461 * stop sending incomming or recieved requests to KRPC master 3462 * transport handle for RDMA-IB. This is also means that the 3463 * master transport handle, responsible for us, is going away. 3464 */ 3465 mutex_enter(&plugin_state_lock); 3466 plugin_state = NO_ACCEPT; 3467 if (svcdata != NULL) 3468 svcdata->active = 0; 3469 mutex_exit(&plugin_state_lock); 3470 3471 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 3472 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 3473 /* 3474 * First check if a hca is still attached 3475 */ 3476 rw_enter(&hca->state_lock, RW_READER); 3477 if (hca->state == HCA_DETACHED) { 3478 rw_exit(&hca->state_lock); 3479 continue; 3480 } 3481 rib_close_channels(&hca->srv_conn_list); 3482 rib_stop_services(hca); 3483 rw_exit(&hca->state_lock); 3484 } 3485 rw_exit(&rib_stat->hcas_list_lock); 3486 3487 /* 3488 * Avoid rib_listen() using the stale q field. 3489 * This could happen if a port goes up after all services 3490 * are already unregistered. 3491 */ 3492 rib_stat->q = NULL; 3493 mutex_exit(&rib_stat->listen_lock); 3494 } 3495 3496 /* 3497 * Traverse the HCA's service list to unbind and deregister services. 
3498 * For each bound service of HCA to be removed, first find the corresponding 3499 * service handle (srv_hdl) and then unbind the service by calling 3500 * ibt_unbind_service(). 3501 */ 3502 static void 3503 rib_stop_services(rib_hca_t *hca) 3504 { 3505 rib_hca_service_t *srv_list, *to_remove; 3506 3507 /* 3508 * unbind and deregister the services for this service type. 3509 * Right now there is only one service type. In future it will 3510 * be passed down to this function. 3511 */ 3512 rw_enter(&hca->bound_services_lock, RW_READER); 3513 srv_list = hca->bound_services; 3514 hca->bound_services = NULL; 3515 rw_exit(&hca->bound_services_lock); 3516 3517 while (srv_list != NULL) { 3518 rib_service_t *sc; 3519 3520 to_remove = srv_list; 3521 srv_list = to_remove->next; 3522 rw_enter(&rib_stat->service_list_lock, RW_READER); 3523 for (sc = rib_stat->service_list; 3524 sc && (sc->srv_id != to_remove->srv_id); 3525 sc = sc->next) 3526 ; 3527 /* 3528 * if sc is NULL then the service doesn't exist anymore, 3529 * probably just removed completely through rib_stat. 3530 */ 3531 if (sc != NULL) 3532 (void) ibt_unbind_service(sc->srv_hdl, 3533 to_remove->sbind_hdl); 3534 rw_exit(&rib_stat->service_list_lock); 3535 kmem_free(to_remove, sizeof (rib_hca_service_t)); 3536 } 3537 } 3538 3539 static struct svc_recv * 3540 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) 3541 { 3542 struct svc_recv *recvp; 3543 3544 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); 3545 recvp->vaddr = sgl->ds_va; 3546 recvp->qp = qp; 3547 recvp->bytes_xfer = 0; 3548 return (recvp); 3549 } 3550 3551 static int 3552 rib_free_svc_recv(struct svc_recv *recvp) 3553 { 3554 kmem_free(recvp, sizeof (*recvp)); 3555 3556 return (0); 3557 } 3558 3559 static struct reply * 3560 rib_addreplylist(rib_qp_t *qp, uint32_t msgid) 3561 { 3562 struct reply *rep; 3563 3564 3565 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); 3566 if (rep == NULL) { 3567 DTRACE_PROBE(rpcib__i__addrreply__nomem); 3568 return (NULL); 3569 } 3570 rep->xid = msgid; 3571 rep->vaddr_cq = NULL; 3572 rep->bytes_xfer = 0; 3573 rep->status = (uint_t)REPLY_WAIT; 3574 rep->prev = NULL; 3575 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); 3576 3577 mutex_enter(&qp->replylist_lock); 3578 if (qp->replylist) { 3579 rep->next = qp->replylist; 3580 qp->replylist->prev = rep; 3581 } 3582 qp->rep_list_size++; 3583 3584 DTRACE_PROBE1(rpcib__i__addrreply__listsize, 3585 int, qp->rep_list_size); 3586 3587 qp->replylist = rep; 3588 mutex_exit(&qp->replylist_lock); 3589 3590 return (rep); 3591 } 3592 3593 static rdma_stat 3594 rib_rem_replylist(rib_qp_t *qp) 3595 { 3596 struct reply *r, *n; 3597 3598 mutex_enter(&qp->replylist_lock); 3599 for (r = qp->replylist; r != NULL; r = n) { 3600 n = r->next; 3601 (void) rib_remreply(qp, r); 3602 } 3603 mutex_exit(&qp->replylist_lock); 3604 3605 return (RDMA_SUCCESS); 3606 } 3607 3608 static int 3609 rib_remreply(rib_qp_t *qp, struct reply *rep) 3610 { 3611 3612 ASSERT(MUTEX_HELD(&qp->replylist_lock)); 3613 if (rep->prev) { 3614 rep->prev->next = rep->next; 3615 } 3616 if (rep->next) { 3617 rep->next->prev = rep->prev; 3618 } 3619 if (qp->replylist == rep) 3620 qp->replylist = rep->next; 3621 3622 cv_destroy(&rep->wait_cv); 3623 qp->rep_list_size--; 3624 3625 DTRACE_PROBE1(rpcib__i__remreply__listsize, 3626 int, qp->rep_list_size); 3627 3628 kmem_free(rep, sizeof (*rep)); 3629 3630 return (0); 3631 } 3632 3633 rdma_stat 3634 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3635 struct mrc *buf_handle) 3636 { 
3637 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3638 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3639 rdma_stat status; 3640 rib_hca_t *hca = (ctoqp(conn))->hca; 3641 3642 /* 3643 * Note: ALL buffer pools use the same memory type RDMARW. 3644 */ 3645 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3646 if (status == RDMA_SUCCESS) { 3647 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3648 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3649 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3650 } else { 3651 buf_handle->mrc_linfo = NULL; 3652 buf_handle->mrc_lmr = 0; 3653 buf_handle->mrc_rmr = 0; 3654 } 3655 return (status); 3656 } 3657 3658 static rdma_stat 3659 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, 3660 ibt_mr_flags_t spec, 3661 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3662 { 3663 ibt_mr_attr_t mem_attr; 3664 ibt_status_t ibt_status; 3665 mem_attr.mr_vaddr = (uintptr_t)buf; 3666 mem_attr.mr_len = (ib_msglen_t)size; 3667 mem_attr.mr_as = (struct as *)(caddr_t)adsp; 3668 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3669 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3670 IBT_MR_ENABLE_WINDOW_BIND | spec; 3671 3672 rw_enter(&hca->state_lock, RW_READER); 3673 if (hca->state != HCA_DETACHED) { 3674 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3675 &mem_attr, mr_hdlp, mr_descp); 3676 rw_exit(&hca->state_lock); 3677 } else { 3678 rw_exit(&hca->state_lock); 3679 return (RDMA_FAILED); 3680 } 3681 3682 if (ibt_status != IBT_SUCCESS) { 3683 return (RDMA_FAILED); 3684 } 3685 return (RDMA_SUCCESS); 3686 } 3687 3688 rdma_stat 3689 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3690 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 3691 { 3692 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3693 rib_lrc_entry_t *l; 3694 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3695 rdma_stat status; 3696 rib_hca_t *hca = (ctoqp(conn))->hca; 3697 3698 /* 3699 * Non-coherent memory registration. 3700 */ 3701 l = (rib_lrc_entry_t *)lrc; 3702 if (l) { 3703 if (l->registered) { 3704 buf_handle->mrc_linfo = 3705 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3706 buf_handle->mrc_lmr = 3707 (uint32_t)l->lrc_mhandle.mrc_lmr; 3708 buf_handle->mrc_rmr = 3709 (uint32_t)l->lrc_mhandle.mrc_rmr; 3710 *sync_handle = (RIB_SYNCMEM_HANDLE) 3711 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3712 return (RDMA_SUCCESS); 3713 } else { 3714 /* Always register the whole buffer */ 3715 buf = (caddr_t)l->lrc_buf; 3716 buflen = l->lrc_len; 3717 } 3718 } 3719 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3720 3721 if (status == RDMA_SUCCESS) { 3722 if (l) { 3723 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; 3724 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey; 3725 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey; 3726 l->registered = TRUE; 3727 } 3728 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3729 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3730 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3731 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3732 } else { 3733 buf_handle->mrc_linfo = NULL; 3734 buf_handle->mrc_lmr = 0; 3735 buf_handle->mrc_rmr = 0; 3736 } 3737 return (status); 3738 } 3739 3740 /* ARGSUSED */ 3741 rdma_stat 3742 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3743 { 3744 rib_hca_t *hca = (ctoqp(conn))->hca; 3745 /* 3746 * Allow memory deregistration even if HCA is 3747 * getting detached. 
Need all outstanding 3748 * memory registrations to be deregistered 3749 * before HCA_DETACH_EVENT can be accepted. 3750 */ 3751 (void) ibt_deregister_mr(hca->hca_hdl, 3752 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 3753 return (RDMA_SUCCESS); 3754 } 3755 3756 /* ARGSUSED */ 3757 rdma_stat 3758 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3759 RIB_SYNCMEM_HANDLE sync_handle, void *lrc) 3760 { 3761 rib_lrc_entry_t *l; 3762 l = (rib_lrc_entry_t *)lrc; 3763 if (l) 3764 if (l->registered) 3765 return (RDMA_SUCCESS); 3766 3767 (void) rib_deregistermem(conn, buf, buf_handle); 3768 3769 return (RDMA_SUCCESS); 3770 } 3771 3772 /* ARGSUSED */ 3773 rdma_stat 3774 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3775 int len, int cpu) 3776 { 3777 ibt_status_t status; 3778 rib_hca_t *hca = (ctoqp(conn))->hca; 3779 ibt_mr_sync_t mr_segment; 3780 3781 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3782 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 3783 mr_segment.ms_len = (ib_memlen_t)len; 3784 if (cpu) { 3785 /* make incoming data visible to memory */ 3786 mr_segment.ms_flags = IBT_SYNC_WRITE; 3787 } else { 3788 /* make memory changes visible to IO */ 3789 mr_segment.ms_flags = IBT_SYNC_READ; 3790 } 3791 rw_enter(&hca->state_lock, RW_READER); 3792 if (hca->state != HCA_DETACHED) { 3793 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3794 rw_exit(&hca->state_lock); 3795 } else { 3796 rw_exit(&hca->state_lock); 3797 return (RDMA_FAILED); 3798 } 3799 3800 if (status == IBT_SUCCESS) 3801 return (RDMA_SUCCESS); 3802 else { 3803 return (RDMA_FAILED); 3804 } 3805 } 3806 3807 /* 3808 * XXXX ???? 3809 */ 3810 static rdma_stat 3811 rib_getinfo(rdma_info_t *info) 3812 { 3813 /* 3814 * XXXX Hack! 3815 */ 3816 info->addrlen = 16; 3817 info->mts = 1000000; 3818 info->mtu = 1000000; 3819 3820 return (RDMA_SUCCESS); 3821 } 3822 3823 rib_bufpool_t * 3824 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3825 { 3826 rib_bufpool_t *rbp = NULL; 3827 bufpool_t *bp = NULL; 3828 caddr_t buf; 3829 ibt_mr_attr_t mem_attr; 3830 ibt_status_t ibt_status; 3831 int i, j; 3832 3833 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3834 3835 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3836 num * sizeof (void *), KM_SLEEP); 3837 3838 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3839 bp->numelems = num; 3840 3841 3842 switch (ptype) { 3843 case SEND_BUFFER: 3844 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3845 bp->rsize = RPC_MSG_SZ; 3846 break; 3847 case RECV_BUFFER: 3848 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3849 bp->rsize = RPC_BUF_SIZE; 3850 break; 3851 default: 3852 goto fail; 3853 } 3854 3855 /* 3856 * Register the pool. 
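 * Each of the num slots (rsize bytes apiece) is registered as its
 * own memory region, so every buffer carries its own lkey/rkey.
 * If any registration fails, the regions registered so far are
 * deregistered and the pool build fails.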
3857 */ 3858 bp->bufsize = num * bp->rsize; 3859 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3860 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3861 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3862 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3863 sizeof (ibt_mr_desc_t), KM_SLEEP); 3864 rw_enter(&hca->state_lock, RW_READER); 3865 3866 if (hca->state == HCA_DETACHED) { 3867 rw_exit(&hca->state_lock); 3868 goto fail; 3869 } 3870 3871 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3872 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3873 mem_attr.mr_vaddr = (uintptr_t)buf; 3874 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3875 mem_attr.mr_as = NULL; 3876 ibt_status = ibt_register_mr(hca->hca_hdl, 3877 hca->pd_hdl, &mem_attr, 3878 &rbp->mr_hdl[i], 3879 &rbp->mr_desc[i]); 3880 if (ibt_status != IBT_SUCCESS) { 3881 for (j = 0; j < i; j++) { 3882 (void) ibt_deregister_mr(hca->hca_hdl, 3883 rbp->mr_hdl[j]); 3884 } 3885 rw_exit(&hca->state_lock); 3886 goto fail; 3887 } 3888 } 3889 rw_exit(&hca->state_lock); 3890 buf = (caddr_t)bp->buf; 3891 for (i = 0; i < num; i++, buf += bp->rsize) { 3892 bp->buflist[i] = (void *)buf; 3893 } 3894 bp->buffree = num - 1; /* no. of free buffers */ 3895 rbp->bpool = bp; 3896 3897 return (rbp); 3898 fail: 3899 if (bp) { 3900 if (bp->buf) 3901 kmem_free(bp->buf, bp->bufsize); 3902 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3903 } 3904 if (rbp) { 3905 if (rbp->mr_hdl) 3906 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3907 if (rbp->mr_desc) 3908 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3909 kmem_free(rbp, sizeof (rib_bufpool_t)); 3910 } 3911 return (NULL); 3912 } 3913 3914 static void 3915 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3916 { 3917 int i; 3918 rib_bufpool_t *rbp = NULL; 3919 bufpool_t *bp; 3920 3921 /* 3922 * Obtain pool address based on type of pool 3923 */ 3924 switch (ptype) { 3925 case SEND_BUFFER: 3926 rbp = hca->send_pool; 3927 break; 3928 case RECV_BUFFER: 3929 rbp = hca->recv_pool; 3930 break; 3931 default: 3932 return; 3933 } 3934 if (rbp == NULL) 3935 return; 3936 3937 bp = rbp->bpool; 3938 3939 /* 3940 * Deregister the pool memory and free it. 3941 */ 3942 for (i = 0; i < bp->numelems; i++) { 3943 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3944 } 3945 } 3946 3947 static void 3948 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3949 { 3950 3951 rib_bufpool_t *rbp = NULL; 3952 bufpool_t *bp; 3953 3954 /* 3955 * Obtain pool address based on type of pool 3956 */ 3957 switch (ptype) { 3958 case SEND_BUFFER: 3959 rbp = hca->send_pool; 3960 break; 3961 case RECV_BUFFER: 3962 rbp = hca->recv_pool; 3963 break; 3964 default: 3965 return; 3966 } 3967 if (rbp == NULL) 3968 return; 3969 3970 bp = rbp->bpool; 3971 3972 /* 3973 * Free the pool memory. 3974 */ 3975 if (rbp->mr_hdl) 3976 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 3977 3978 if (rbp->mr_desc) 3979 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 3980 if (bp->buf) 3981 kmem_free(bp->buf, bp->bufsize); 3982 mutex_destroy(&bp->buflock); 3983 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 3984 kmem_free(rbp, sizeof (rib_bufpool_t)); 3985 } 3986 3987 void 3988 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 3989 { 3990 /* 3991 * Deregister the pool memory and free it. 3992 */ 3993 rib_rbufpool_deregister(hca, ptype); 3994 rib_rbufpool_free(hca, ptype); 3995 } 3996 3997 /* 3998 * Fetch a buffer from the pool of type specified in rdbuf->type. 
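 *
 * A minimal sketch of typical use, assuming a connected CONN
 * (illustrative only; callers normally reach this through the
 * RDMA transport operations rather than directly):
 *
 *	rdma_buf_t rbuf;
 *
 *	bzero(&rbuf, sizeof (rbuf));
 *	rbuf.type = SEND_BUFFER;
 *	if (rib_reg_buf_alloc(conn, &rbuf) != RDMA_SUCCESS)
 *		return (RDMA_FAILED);
 *	...build the message at rbuf.addr, at most rbuf.len bytes...
 *	rib_reg_buf_free(conn, &rbuf);	only if the buffer was never
 *					handed to a posted work request
 *
 * RDMA_LONG_BUFFER requests are satisfied from the long reply cache
 * rather than from the fixed-size send/recv pools.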
3999 */ 4000 static rdma_stat 4001 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4002 { 4003 rib_lrc_entry_t *rlep; 4004 4005 if (rdbuf->type == RDMA_LONG_BUFFER) { 4006 rlep = rib_get_cache_buf(conn, rdbuf->len); 4007 rdbuf->rb_private = (caddr_t)rlep; 4008 rdbuf->addr = rlep->lrc_buf; 4009 rdbuf->handle = rlep->lrc_mhandle; 4010 return (RDMA_SUCCESS); 4011 } 4012 4013 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 4014 if (rdbuf->addr) { 4015 switch (rdbuf->type) { 4016 case SEND_BUFFER: 4017 rdbuf->len = RPC_MSG_SZ; /* 1K */ 4018 break; 4019 case RECV_BUFFER: 4020 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 4021 break; 4022 default: 4023 rdbuf->len = 0; 4024 } 4025 return (RDMA_SUCCESS); 4026 } else 4027 return (RDMA_FAILED); 4028 } 4029 4030 /* 4031 * Fetch a buffer of specified type. 4032 * Note that rdbuf->handle is mw's rkey. 4033 */ 4034 static void * 4035 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4036 { 4037 rib_qp_t *qp = ctoqp(conn); 4038 rib_hca_t *hca = qp->hca; 4039 rdma_btype ptype = rdbuf->type; 4040 void *buf; 4041 rib_bufpool_t *rbp = NULL; 4042 bufpool_t *bp; 4043 int i; 4044 4045 /* 4046 * Obtain pool address based on type of pool 4047 */ 4048 switch (ptype) { 4049 case SEND_BUFFER: 4050 rbp = hca->send_pool; 4051 break; 4052 case RECV_BUFFER: 4053 rbp = hca->recv_pool; 4054 break; 4055 default: 4056 return (NULL); 4057 } 4058 if (rbp == NULL) 4059 return (NULL); 4060 4061 bp = rbp->bpool; 4062 4063 mutex_enter(&bp->buflock); 4064 if (bp->buffree < 0) { 4065 mutex_exit(&bp->buflock); 4066 return (NULL); 4067 } 4068 4069 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */ 4070 buf = bp->buflist[bp->buffree]; 4071 rdbuf->addr = buf; 4072 rdbuf->len = bp->rsize; 4073 for (i = bp->numelems - 1; i >= 0; i--) { 4074 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 4075 rdbuf->handle.mrc_rmr = 4076 (uint32_t)rbp->mr_desc[i].md_rkey; 4077 rdbuf->handle.mrc_linfo = 4078 (uintptr_t)rbp->mr_hdl[i]; 4079 rdbuf->handle.mrc_lmr = 4080 (uint32_t)rbp->mr_desc[i].md_lkey; 4081 bp->buffree--; 4082 4083 mutex_exit(&bp->buflock); 4084 4085 return (buf); 4086 } 4087 } 4088 4089 mutex_exit(&bp->buflock); 4090 4091 return (NULL); 4092 } 4093 4094 static void 4095 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 4096 { 4097 4098 if (rdbuf->type == RDMA_LONG_BUFFER) { 4099 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private); 4100 rdbuf->rb_private = NULL; 4101 return; 4102 } 4103 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 4104 } 4105 4106 static void 4107 rib_rbuf_free(CONN *conn, int ptype, void *buf) 4108 { 4109 rib_qp_t *qp = ctoqp(conn); 4110 rib_hca_t *hca = qp->hca; 4111 rib_bufpool_t *rbp = NULL; 4112 bufpool_t *bp; 4113 4114 /* 4115 * Obtain pool address based on type of pool 4116 */ 4117 switch (ptype) { 4118 case SEND_BUFFER: 4119 rbp = hca->send_pool; 4120 break; 4121 case RECV_BUFFER: 4122 rbp = hca->recv_pool; 4123 break; 4124 default: 4125 return; 4126 } 4127 if (rbp == NULL) 4128 return; 4129 4130 bp = rbp->bpool; 4131 4132 mutex_enter(&bp->buflock); 4133 if (++bp->buffree >= bp->numelems) { 4134 /* 4135 * Should never happen 4136 */ 4137 bp->buffree--; 4138 } else { 4139 bp->buflist[bp->buffree] = buf; 4140 } 4141 mutex_exit(&bp->buflock); 4142 } 4143 4144 static rdma_stat 4145 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 4146 { 4147 rw_enter(&connlist->conn_lock, RW_WRITER); 4148 if (connlist->conn_hd) { 4149 cn->c_next = connlist->conn_hd; 4150 connlist->conn_hd->c_prev = cn; 4151 } 4152 connlist->conn_hd = cn; 4153 
rw_exit(&connlist->conn_lock); 4154 4155 return (RDMA_SUCCESS); 4156 } 4157 4158 static rdma_stat 4159 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) 4160 { 4161 rw_enter(&connlist->conn_lock, RW_WRITER); 4162 if (cn->c_prev) { 4163 cn->c_prev->c_next = cn->c_next; 4164 } 4165 if (cn->c_next) { 4166 cn->c_next->c_prev = cn->c_prev; 4167 } 4168 if (connlist->conn_hd == cn) 4169 connlist->conn_hd = cn->c_next; 4170 rw_exit(&connlist->conn_lock); 4171 4172 return (RDMA_SUCCESS); 4173 } 4174 4175 /* ARGSUSED */ 4176 static rdma_stat 4177 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, 4178 int addr_type, void *handle, CONN **conn) 4179 { 4180 rdma_stat status; 4181 rpcib_ping_t rpt; 4182 4183 status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn); 4184 return (status); 4185 } 4186 4187 /* 4188 * rib_find_hca_connection 4189 * 4190 * if there is an existing connection to the specified address then 4191 * it will be returned in conn, otherwise conn will be set to NULL. 4192 * Also cleans up any connection that is in error state. 4193 */ 4194 static int 4195 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, 4196 struct netbuf *d_svcaddr, CONN **conn) 4197 { 4198 CONN *cn; 4199 clock_t cv_stat, timout; 4200 4201 *conn = NULL; 4202 again: 4203 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4204 cn = hca->cl_conn_list.conn_hd; 4205 while (cn != NULL) { 4206 /* 4207 * First, clear up any connection in the ERROR state 4208 */ 4209 mutex_enter(&cn->c_lock); 4210 if (cn->c_state == C_ERROR_CONN) { 4211 if (cn->c_ref == 0) { 4212 /* 4213 * Remove connection from list and destroy it. 4214 */ 4215 cn->c_state = C_DISCONN_PEND; 4216 mutex_exit(&cn->c_lock); 4217 rw_exit(&hca->cl_conn_list.conn_lock); 4218 rib_conn_close((void *)cn); 4219 goto again; 4220 } 4221 mutex_exit(&cn->c_lock); 4222 cn = cn->c_next; 4223 continue; 4224 } 4225 if (cn->c_state == C_DISCONN_PEND) { 4226 mutex_exit(&cn->c_lock); 4227 cn = cn->c_next; 4228 continue; 4229 } 4230 4231 /* 4232 * source address is only checked for if there is one, 4233 * this is the case for retries. 4234 */ 4235 if ((cn->c_raddr.len == d_svcaddr->len) && 4236 (bcmp(d_svcaddr->buf, cn->c_raddr.buf, 4237 d_svcaddr->len) == 0) && 4238 ((s_svcaddr->len == 0) || 4239 ((cn->c_laddr.len == s_svcaddr->len) && 4240 (bcmp(s_svcaddr->buf, cn->c_laddr.buf, 4241 s_svcaddr->len) == 0)))) { 4242 /* 4243 * Our connection. Give up conn list lock 4244 * as we are done traversing the list. 4245 */ 4246 rw_exit(&hca->cl_conn_list.conn_lock); 4247 if (cn->c_state == C_CONNECTED) { 4248 cn->c_ref++; /* sharing a conn */ 4249 mutex_exit(&cn->c_lock); 4250 *conn = cn; 4251 return (RDMA_SUCCESS); 4252 } 4253 if (cn->c_state == C_CONN_PEND) { 4254 /* 4255 * Hold a reference to this conn before 4256 * we give up the lock. 
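 *
 * The wait below is bounded by CONN_WAIT_TIME and is interruptible:
 * a signal returns RDMA_INTR and a timeout returns RDMA_TIMEDOUT,
 * and in either failure case (including the connection ending up in
 * any state other than C_CONNECTED) the extra reference is dropped
 * again before returning.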
4257 */ 4258 cn->c_ref++; 4259 timout = ddi_get_lbolt() + 4260 drv_usectohz(CONN_WAIT_TIME * 1000000); 4261 while ((cv_stat = cv_timedwait_sig(&cn->c_cv, 4262 &cn->c_lock, timout)) > 0 && 4263 cn->c_state == C_CONN_PEND) 4264 ; 4265 if (cv_stat == 0) { 4266 cn->c_ref--; 4267 mutex_exit(&cn->c_lock); 4268 return (RDMA_INTR); 4269 } 4270 if (cv_stat < 0) { 4271 cn->c_ref--; 4272 mutex_exit(&cn->c_lock); 4273 return (RDMA_TIMEDOUT); 4274 } 4275 if (cn->c_state == C_CONNECTED) { 4276 *conn = cn; 4277 mutex_exit(&cn->c_lock); 4278 return (RDMA_SUCCESS); 4279 } else { 4280 cn->c_ref--; 4281 mutex_exit(&cn->c_lock); 4282 return (RDMA_TIMEDOUT); 4283 } 4284 } 4285 } 4286 mutex_exit(&cn->c_lock); 4287 cn = cn->c_next; 4288 } 4289 rw_exit(&hca->cl_conn_list.conn_lock); 4290 *conn = NULL; 4291 return (RDMA_FAILED); 4292 } 4293 4294 /* 4295 * Connection management. 4296 * IBTF does not support recycling of channels. So connections are only 4297 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or 4298 * C_DISCONN_PEND state. No C_IDLE state. 4299 * C_CONN_PEND state: Connection establishment in progress to the server. 4300 * C_CONNECTED state: A connection when created is in C_CONNECTED state. 4301 * It has an RC channel associated with it. ibt_post_send/recv are allowed 4302 * only in this state. 4303 * C_ERROR_CONN state: A connection transitions to this state when WRs on the 4304 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event 4305 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA. 4306 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when 4307 * c_ref drops to 0 (this indicates that RPC has no more references to this 4308 * connection), the connection should be destroyed. A connection transitions 4309 * into this state when it is being destroyed. 4310 */ 4311 /* ARGSUSED */ 4312 static rdma_stat 4313 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, 4314 int addr_type, rpcib_ping_t *rpt, CONN **conn) 4315 { 4316 CONN *cn; 4317 int status; 4318 rib_hca_t *hca; 4319 rib_qp_t *qp; 4320 int s_addr_len; 4321 char *s_addr_buf; 4322 4323 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 4324 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 4325 rw_enter(&hca->state_lock, RW_READER); 4326 if (hca->state != HCA_DETACHED) { 4327 status = rib_find_hca_connection(hca, s_svcaddr, 4328 d_svcaddr, conn); 4329 rw_exit(&hca->state_lock); 4330 if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) { 4331 rw_exit(&rib_stat->hcas_list_lock); 4332 return (status); 4333 } 4334 } else 4335 rw_exit(&hca->state_lock); 4336 } 4337 rw_exit(&rib_stat->hcas_list_lock); 4338 4339 /* 4340 * No existing connection found, establish a new connection. 4341 */ 4342 bzero(rpt, sizeof (rpcib_ping_t)); 4343 4344 status = rib_ping_srv(addr_type, d_svcaddr, rpt); 4345 if (status != RDMA_SUCCESS) { 4346 return (RDMA_FAILED); 4347 } 4348 hca = rpt->hca; 4349 4350 if (rpt->srcip.family == AF_INET) { 4351 s_addr_len = sizeof (rpt->srcip.un.ip4addr); 4352 s_addr_buf = (char *)&rpt->srcip.un.ip4addr; 4353 } else if (rpt->srcip.family == AF_INET6) { 4354 s_addr_len = sizeof (rpt->srcip.un.ip6addr); 4355 s_addr_buf = (char *)&rpt->srcip.un.ip6addr; 4356 } else { 4357 return (RDMA_FAILED); 4358 } 4359 4360 /* 4361 * Channel to server doesn't exist yet, create one. 
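	 * The new connection starts in C_CONN_PEND with a single reference
	 * held by this thread.  Once it is added to cl_conn_list below,
	 * other threads may find it and wait on c_cv in
	 * rib_find_hca_connection() until rib_conn_to_srv() moves it to
	 * C_CONNECTED or C_ERROR_CONN and the condition variable is
	 * broadcast.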
4362 */ 4363 if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) { 4364 return (RDMA_FAILED); 4365 } 4366 cn = qptoc(qp); 4367 cn->c_state = C_CONN_PEND; 4368 cn->c_ref = 1; 4369 4370 cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP); 4371 bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len); 4372 cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len; 4373 4374 if (rpt->srcip.family == AF_INET) { 4375 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP); 4376 (void) strcpy(cn->c_netid, RIBNETID_TCP); 4377 } else { 4378 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP); 4379 (void) strcpy(cn->c_netid, RIBNETID_TCP6); 4380 } 4381 4382 /* 4383 * Add to conn list. 4384 * We had given up the READER lock. In the time since then, 4385 * another thread might have created the connection we are 4386 * trying here. But for now, that is quiet alright - there 4387 * might be two connections between a pair of hosts instead 4388 * of one. If we really want to close that window, 4389 * then need to check the list after acquiring the 4390 * WRITER lock. 4391 */ 4392 (void) rib_add_connlist(cn, &hca->cl_conn_list); 4393 status = rib_conn_to_srv(hca, qp, rpt); 4394 mutex_enter(&cn->c_lock); 4395 if (status == RDMA_SUCCESS) { 4396 cn->c_state = C_CONNECTED; 4397 *conn = cn; 4398 } else { 4399 cn->c_state = C_ERROR_CONN; 4400 cn->c_ref--; 4401 } 4402 cv_broadcast(&cn->c_cv); 4403 mutex_exit(&cn->c_lock); 4404 return (status); 4405 } 4406 4407 static void 4408 rib_conn_close(void *rarg) 4409 { 4410 CONN *conn = (CONN *)rarg; 4411 rib_qp_t *qp = ctoqp(conn); 4412 4413 mutex_enter(&conn->c_lock); 4414 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { 4415 4416 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); 4417 /* 4418 * Live connection in CONNECTED state. 4419 */ 4420 if (conn->c_state == C_CONNECTED) { 4421 conn->c_state = C_ERROR_CONN; 4422 } 4423 mutex_exit(&conn->c_lock); 4424 4425 rib_close_a_channel(conn); 4426 4427 mutex_enter(&conn->c_lock); 4428 conn->c_flags &= ~C_CLOSE_PENDING; 4429 cv_signal(&conn->c_cv); 4430 } 4431 4432 mutex_exit(&conn->c_lock); 4433 4434 if (qp->mode == RIB_SERVER) 4435 (void) rib_disconnect_channel(conn, 4436 &qp->hca->srv_conn_list); 4437 else 4438 (void) rib_disconnect_channel(conn, 4439 &qp->hca->cl_conn_list); 4440 } 4441 4442 static void 4443 rib_conn_timeout_call(void *carg) 4444 { 4445 time_t idle_time; 4446 CONN *conn = (CONN *)carg; 4447 rib_hca_t *hca = ctoqp(conn)->hca; 4448 int error; 4449 4450 mutex_enter(&conn->c_lock); 4451 if ((conn->c_ref > 0) || 4452 (conn->c_state == C_DISCONN_PEND)) { 4453 conn->c_timeout = NULL; 4454 mutex_exit(&conn->c_lock); 4455 return; 4456 } 4457 4458 idle_time = (gethrestime_sec() - conn->c_last_used); 4459 4460 if ((idle_time <= rib_conn_timeout) && 4461 (conn->c_state != C_ERROR_CONN)) { 4462 /* 4463 * There was activity after the last timeout. 4464 * Extend the conn life. Unless the conn is 4465 * already in error state. 4466 */ 4467 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4468 SEC_TO_TICK(rib_conn_timeout - idle_time)); 4469 mutex_exit(&conn->c_lock); 4470 return; 4471 } 4472 4473 error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close, 4474 (void *)conn, DDI_NOSLEEP); 4475 4476 /* 4477 * If taskq dispatch fails above, then reset the timeout 4478 * to try again after 10 secs. 
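	 * On a successful dispatch the connection is marked C_DISCONN_PEND
	 * below and rib_conn_close() completes the teardown from taskq
	 * context.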
4479 */ 4480 4481 if (error != DDI_SUCCESS) { 4482 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4483 SEC_TO_TICK(RDMA_CONN_REAP_RETRY)); 4484 mutex_exit(&conn->c_lock); 4485 return; 4486 } 4487 4488 conn->c_state = C_DISCONN_PEND; 4489 mutex_exit(&conn->c_lock); 4490 } 4491 4492 static rdma_stat 4493 rib_conn_release(CONN *conn) 4494 { 4495 4496 mutex_enter(&conn->c_lock); 4497 conn->c_ref--; 4498 4499 conn->c_last_used = gethrestime_sec(); 4500 if (conn->c_ref > 0) { 4501 mutex_exit(&conn->c_lock); 4502 return (RDMA_SUCCESS); 4503 } 4504 4505 /* 4506 * If a conn is C_ERROR_CONN, close the channel. 4507 */ 4508 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) { 4509 conn->c_state = C_DISCONN_PEND; 4510 mutex_exit(&conn->c_lock); 4511 rib_conn_close((void *)conn); 4512 return (RDMA_SUCCESS); 4513 } 4514 4515 /* 4516 * c_ref == 0, set a timeout for conn release 4517 */ 4518 4519 if (conn->c_timeout == NULL) { 4520 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4521 SEC_TO_TICK(rib_conn_timeout)); 4522 } 4523 4524 mutex_exit(&conn->c_lock); 4525 return (RDMA_SUCCESS); 4526 } 4527 4528 /* 4529 * Add at front of list 4530 */ 4531 static struct rdma_done_list * 4532 rdma_done_add(rib_qp_t *qp, uint32_t xid) 4533 { 4534 struct rdma_done_list *rd; 4535 4536 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4537 4538 rd = kmem_alloc(sizeof (*rd), KM_SLEEP); 4539 rd->xid = xid; 4540 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL); 4541 4542 rd->prev = NULL; 4543 rd->next = qp->rdlist; 4544 if (qp->rdlist != NULL) 4545 qp->rdlist->prev = rd; 4546 qp->rdlist = rd; 4547 4548 return (rd); 4549 } 4550 4551 static void 4552 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd) 4553 { 4554 struct rdma_done_list *r; 4555 4556 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4557 4558 r = rd->next; 4559 if (r != NULL) { 4560 r->prev = rd->prev; 4561 } 4562 4563 r = rd->prev; 4564 if (r != NULL) { 4565 r->next = rd->next; 4566 } else { 4567 qp->rdlist = rd->next; 4568 } 4569 4570 cv_destroy(&rd->rdma_done_cv); 4571 kmem_free(rd, sizeof (*rd)); 4572 } 4573 4574 static void 4575 rdma_done_rem_list(rib_qp_t *qp) 4576 { 4577 struct rdma_done_list *r, *n; 4578 4579 mutex_enter(&qp->rdlist_lock); 4580 for (r = qp->rdlist; r != NULL; r = n) { 4581 n = r->next; 4582 rdma_done_rm(qp, r); 4583 } 4584 mutex_exit(&qp->rdlist_lock); 4585 } 4586 4587 static void 4588 rdma_done_notify(rib_qp_t *qp, uint32_t xid) 4589 { 4590 struct rdma_done_list *r = qp->rdlist; 4591 4592 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4593 4594 while (r) { 4595 if (r->xid == xid) { 4596 cv_signal(&r->rdma_done_cv); 4597 return; 4598 } else { 4599 r = r->next; 4600 } 4601 } 4602 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid, 4603 int, xid); 4604 } 4605 4606 /* 4607 * Expects conn->c_lock to be held by the caller. 4608 */ 4609 4610 static void 4611 rib_close_a_channel(CONN *conn) 4612 { 4613 rib_qp_t *qp; 4614 qp = ctoqp(conn); 4615 4616 if (qp->qp_hdl == NULL) { 4617 /* channel already freed */ 4618 return; 4619 } 4620 4621 /* 4622 * Call ibt_close_rc_channel in blocking mode 4623 * with no callbacks. 4624 */ 4625 (void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS, 4626 NULL, 0, NULL, NULL, 0); 4627 } 4628 4629 /* 4630 * Goes through all connections and closes the channel 4631 * This will cause all the WRs on those channels to be 4632 * flushed. 
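 * Flushed work requests complete with an error status on the CQs, which
 * allows the completion handlers to reclaim buffers still posted on
 * those channels.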
4633 */ 4634 static void 4635 rib_close_channels(rib_conn_list_t *connlist) 4636 { 4637 CONN *conn, *tmp; 4638 4639 rw_enter(&connlist->conn_lock, RW_READER); 4640 conn = connlist->conn_hd; 4641 while (conn != NULL) { 4642 mutex_enter(&conn->c_lock); 4643 tmp = conn->c_next; 4644 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { 4645 4646 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); 4647 4648 /* 4649 * Live connection in CONNECTED state. 4650 */ 4651 if (conn->c_state == C_CONNECTED) 4652 conn->c_state = C_ERROR_CONN; 4653 mutex_exit(&conn->c_lock); 4654 4655 rib_close_a_channel(conn); 4656 4657 mutex_enter(&conn->c_lock); 4658 conn->c_flags &= ~C_CLOSE_PENDING; 4659 /* Signal a pending rib_disconnect_channel() */ 4660 cv_signal(&conn->c_cv); 4661 } 4662 mutex_exit(&conn->c_lock); 4663 conn = tmp; 4664 } 4665 rw_exit(&connlist->conn_lock); 4666 } 4667 4668 /* 4669 * Frees up all connections that are no longer being referenced 4670 */ 4671 static void 4672 rib_purge_connlist(rib_conn_list_t *connlist) 4673 { 4674 CONN *conn; 4675 4676 top: 4677 rw_enter(&connlist->conn_lock, RW_READER); 4678 conn = connlist->conn_hd; 4679 while (conn != NULL) { 4680 mutex_enter(&conn->c_lock); 4681 4682 /* 4683 * At this point connection is either in ERROR 4684 * or DISCONN_PEND state. If in DISCONN_PEND state 4685 * then some other thread is culling that connection. 4686 * If not and if c_ref is 0, then destroy the connection. 4687 */ 4688 if (conn->c_ref == 0 && 4689 conn->c_state != C_DISCONN_PEND) { 4690 /* 4691 * Cull the connection 4692 */ 4693 conn->c_state = C_DISCONN_PEND; 4694 mutex_exit(&conn->c_lock); 4695 rw_exit(&connlist->conn_lock); 4696 (void) rib_disconnect_channel(conn, connlist); 4697 goto top; 4698 } else { 4699 /* 4700 * conn disconnect already scheduled or will 4701 * happen from conn_release when c_ref drops to 0. 4702 */ 4703 mutex_exit(&conn->c_lock); 4704 } 4705 conn = conn->c_next; 4706 } 4707 rw_exit(&connlist->conn_lock); 4708 4709 /* 4710 * At this point, only connections with c_ref != 0 are on the list 4711 */ 4712 } 4713 4714 /* 4715 * Free all the HCA resources and close 4716 * the hca. 
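 * This frees the client and server CQs, destroys the send and recv
 * buffer pools and the server-side buffer cache, frees the protection
 * domain and finally closes the HCA handle.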
4717 */ 4718 4719 static void 4720 rib_free_hca(rib_hca_t *hca) 4721 { 4722 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl); 4723 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl); 4724 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl); 4725 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl); 4726 4727 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t)); 4728 kmem_free(hca->clnt_scq, sizeof (rib_cq_t)); 4729 kmem_free(hca->svc_rcq, sizeof (rib_cq_t)); 4730 kmem_free(hca->svc_scq, sizeof (rib_cq_t)); 4731 4732 rib_rbufpool_destroy(hca, RECV_BUFFER); 4733 rib_rbufpool_destroy(hca, SEND_BUFFER); 4734 rib_destroy_cache(hca); 4735 if (rib_mod.rdma_count == 0) 4736 rdma_unregister_mod(&rib_mod); 4737 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 4738 (void) ibt_close_hca(hca->hca_hdl); 4739 hca->hca_hdl = NULL; 4740 } 4741 4742 4743 static void 4744 rib_stop_hca_services(rib_hca_t *hca) 4745 { 4746 rib_stop_services(hca); 4747 rib_close_channels(&hca->cl_conn_list); 4748 rib_close_channels(&hca->srv_conn_list); 4749 4750 rib_purge_connlist(&hca->cl_conn_list); 4751 rib_purge_connlist(&hca->srv_conn_list); 4752 4753 if ((rib_stat->hcas_list == NULL) && stats_enabled) { 4754 kstat_delete_byname_zone("unix", 0, "rpcib_cache", 4755 GLOBAL_ZONEID); 4756 stats_enabled = FALSE; 4757 } 4758 4759 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 4760 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4761 if (hca->srv_conn_list.conn_hd == NULL && 4762 hca->cl_conn_list.conn_hd == NULL) { 4763 /* 4764 * conn_lists are NULL, so destroy 4765 * buffers, close hca and be done. 4766 */ 4767 rib_free_hca(hca); 4768 } 4769 rw_exit(&hca->cl_conn_list.conn_lock); 4770 rw_exit(&hca->srv_conn_list.conn_lock); 4771 4772 if (hca->hca_hdl != NULL) { 4773 mutex_enter(&hca->inuse_lock); 4774 while (hca->inuse) 4775 cv_wait(&hca->cb_cv, &hca->inuse_lock); 4776 mutex_exit(&hca->inuse_lock); 4777 4778 rib_free_hca(hca); 4779 } 4780 rw_destroy(&hca->bound_services_lock); 4781 4782 if (hca->cleanup_helper != NULL) { 4783 ddi_taskq_destroy(hca->cleanup_helper); 4784 hca->cleanup_helper = NULL; 4785 } 4786 } 4787 4788 /* 4789 * Cleans and closes up all uses of the HCA 4790 */ 4791 static void 4792 rib_detach_hca(rib_hca_t *hca) 4793 { 4794 rib_hca_t **hcap; 4795 4796 /* 4797 * Stop all services on the HCA 4798 * Go through cl_conn_list and close all rc_channels 4799 * Go through svr_conn_list and close all rc_channels 4800 * Free connections whose c_ref has dropped to 0 4801 * Destroy all CQs 4802 * Deregister and released all buffer pool memory after all 4803 * connections are destroyed 4804 * Free the protection domain 4805 * ibt_close_hca() 4806 */ 4807 rw_enter(&hca->state_lock, RW_WRITER); 4808 if (hca->state == HCA_DETACHED) { 4809 rw_exit(&hca->state_lock); 4810 return; 4811 } 4812 4813 hca->state = HCA_DETACHED; 4814 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); 4815 for (hcap = &rib_stat->hcas_list; *hcap && (*hcap != hca); 4816 hcap = &(*hcap)->next) 4817 ; 4818 ASSERT(*hcap == hca); 4819 *hcap = hca->next; 4820 rib_stat->nhca_inited--; 4821 rib_mod.rdma_count--; 4822 rw_exit(&rib_stat->hcas_list_lock); 4823 rw_exit(&hca->state_lock); 4824 4825 rib_stop_hca_services(hca); 4826 4827 kmem_free(hca, sizeof (*hca)); 4828 } 4829 4830 static void 4831 rib_server_side_cache_reclaim(void *argp) 4832 { 4833 cache_avl_struct_t *rcas; 4834 rib_lrc_entry_t *rb; 4835 rib_hca_t *hca = (rib_hca_t *)argp; 4836 4837 rw_enter(&hca->avl_rw_lock, RW_WRITER); 4838 rcas = avl_first(&hca->avl_tree); 4839 if (rcas != NULL) 4840 avl_remove(&hca->avl_tree, rcas); 
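	/*
	 * For each size bucket pulled off the AVL tree, walk its circular
	 * list of cached buffers, deregister any registered memory and
	 * free the buffers, then free the bucket itself.
	 */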
4841 4842 while (rcas != NULL) { 4843 while (rcas->r.forw != &rcas->r) { 4844 rcas->elements--; 4845 rb = rcas->r.forw; 4846 remque(rb); 4847 if (rb->registered) 4848 (void) rib_deregistermem_via_hca(hca, 4849 rb->lrc_buf, rb->lrc_mhandle); 4850 4851 hca->cache_allocation -= rb->lrc_len; 4852 kmem_free(rb->lrc_buf, rb->lrc_len); 4853 kmem_free(rb, sizeof (rib_lrc_entry_t)); 4854 } 4855 mutex_destroy(&rcas->node_lock); 4856 kmem_cache_free(hca->server_side_cache, rcas); 4857 rcas = avl_first(&hca->avl_tree); 4858 if (rcas != NULL) 4859 avl_remove(&hca->avl_tree, rcas); 4860 } 4861 rw_exit(&hca->avl_rw_lock); 4862 } 4863 4864 static void 4865 rib_server_side_cache_cleanup(void *argp) 4866 { 4867 cache_avl_struct_t *rcas; 4868 rib_lrc_entry_t *rb; 4869 rib_hca_t *hca = (rib_hca_t *)argp; 4870 4871 mutex_enter(&hca->cache_allocation_lock); 4872 if (hca->cache_allocation < cache_limit) { 4873 mutex_exit(&hca->cache_allocation_lock); 4874 return; 4875 } 4876 mutex_exit(&hca->cache_allocation_lock); 4877 4878 rw_enter(&hca->avl_rw_lock, RW_WRITER); 4879 rcas = avl_last(&hca->avl_tree); 4880 if (rcas != NULL) 4881 avl_remove(&hca->avl_tree, rcas); 4882 4883 while (rcas != NULL) { 4884 while (rcas->r.forw != &rcas->r) { 4885 rcas->elements--; 4886 rb = rcas->r.forw; 4887 remque(rb); 4888 if (rb->registered) 4889 (void) rib_deregistermem_via_hca(hca, 4890 rb->lrc_buf, rb->lrc_mhandle); 4891 4892 hca->cache_allocation -= rb->lrc_len; 4893 4894 kmem_free(rb->lrc_buf, rb->lrc_len); 4895 kmem_free(rb, sizeof (rib_lrc_entry_t)); 4896 } 4897 mutex_destroy(&rcas->node_lock); 4898 if (hca->server_side_cache) { 4899 kmem_cache_free(hca->server_side_cache, rcas); 4900 } 4901 4902 if (hca->cache_allocation < cache_limit) { 4903 rw_exit(&hca->avl_rw_lock); 4904 return; 4905 } 4906 4907 rcas = avl_last(&hca->avl_tree); 4908 if (rcas != NULL) 4909 avl_remove(&hca->avl_tree, rcas); 4910 } 4911 rw_exit(&hca->avl_rw_lock); 4912 } 4913 4914 static int 4915 avl_compare(const void *t1, const void *t2) 4916 { 4917 if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len) 4918 return (0); 4919 4920 if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len) 4921 return (-1); 4922 4923 return (1); 4924 } 4925 4926 static void 4927 rib_destroy_cache(rib_hca_t *hca) 4928 { 4929 if (hca->avl_init) { 4930 rib_server_side_cache_reclaim((void *)hca); 4931 if (hca->server_side_cache) { 4932 kmem_cache_destroy(hca->server_side_cache); 4933 hca->server_side_cache = NULL; 4934 } 4935 avl_destroy(&hca->avl_tree); 4936 mutex_destroy(&hca->cache_allocation_lock); 4937 rw_destroy(&hca->avl_rw_lock); 4938 } 4939 hca->avl_init = FALSE; 4940 } 4941 4942 static void 4943 rib_force_cleanup(void *hca) 4944 { 4945 if (((rib_hca_t *)hca)->cleanup_helper != NULL) 4946 (void) ddi_taskq_dispatch( 4947 ((rib_hca_t *)hca)->cleanup_helper, 4948 rib_server_side_cache_cleanup, 4949 (void *)hca, DDI_NOSLEEP); 4950 } 4951 4952 static rib_lrc_entry_t * 4953 rib_get_cache_buf(CONN *conn, uint32_t len) 4954 { 4955 cache_avl_struct_t cas, *rcas; 4956 rib_hca_t *hca = (ctoqp(conn))->hca; 4957 rib_lrc_entry_t *reply_buf; 4958 avl_index_t where = NULL; 4959 uint64_t c_alloc = 0; 4960 4961 if (!hca->avl_init) 4962 goto error_alloc; 4963 4964 cas.len = len; 4965 4966 rw_enter(&hca->avl_rw_lock, RW_READER); 4967 4968 mutex_enter(&hca->cache_allocation_lock); 4969 c_alloc = hca->cache_allocation; 4970 mutex_exit(&hca->cache_allocation_lock); 4971 4972 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas, 4973 &where)) == NULL) { 
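		/*
		 * No bucket of this size exists yet.  The AVL tree is only
		 * held as READER here, so (unless we are over the limit)
		 * drop the lock, reacquire it as WRITER and re-check before
		 * inserting a new bucket.
		 */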
		/* Am I above the cache limit? */
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);
			mutex_enter(&hca->cache_allocation_lock);
			hca->cache_misses_above_the_limit++;
			mutex_exit(&hca->cache_allocation_lock);

			/* Allocate and register the buffer directly */
			goto error_alloc;
		}

		rw_exit(&hca->avl_rw_lock);
		rw_enter(&hca->avl_rw_lock, RW_WRITER);

		/* Recheck to make sure no other thread added the entry in */
		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
		    &cas, &where)) == NULL) {
			/* Allocate an avl tree entry */
			rcas = (cache_avl_struct_t *)
			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);

			bzero(rcas, sizeof (cache_avl_struct_t));
			rcas->elements = 0;
			rcas->r.forw = &rcas->r;
			rcas->r.back = &rcas->r;
			rcas->len = len;
			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
			avl_insert(&hca->avl_tree, rcas, where);
		}
	}

	mutex_enter(&rcas->node_lock);

	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
		reply_buf = rcas->r.forw;
		remque(reply_buf);
		rcas->elements--;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);

		mutex_enter(&hca->cache_allocation_lock);
		hca->cache_hits++;
		hca->cache_allocation -= len;
		mutex_exit(&hca->cache_allocation_lock);
	} else {
		/* Am I above the cache limit? */
		mutex_exit(&rcas->node_lock);
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);

			mutex_enter(&hca->cache_allocation_lock);
			hca->cache_misses_above_the_limit++;
			mutex_exit(&hca->cache_allocation_lock);
			/* Allocate and register the buffer directly */
			goto error_alloc;
		}
		rw_exit(&hca->avl_rw_lock);
		mutex_enter(&hca->cache_allocation_lock);
		hca->cache_misses++;
		mutex_exit(&hca->cache_allocation_lock);
		/* Allocate a reply_buf entry */
		reply_buf = (rib_lrc_entry_t *)
		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
		bzero(reply_buf, sizeof (rib_lrc_entry_t));
		reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
		reply_buf->lrc_len = len;
		reply_buf->registered = FALSE;
		reply_buf->avl_node = (void *)rcas;
	}

	return (reply_buf);

error_alloc:
	reply_buf = (rib_lrc_entry_t *)
	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
	bzero(reply_buf, sizeof (rib_lrc_entry_t));
	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
	reply_buf->lrc_len = len;
	reply_buf->registered = FALSE;
	reply_buf->avl_node = NULL;

	return (reply_buf);
}

/*
 * Return a pre-registered buffer back to the cache (without
 * unregistering it).
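 * If the buffer's size bucket has since disappeared from the AVL tree,
 * the buffer is deregistered and freed instead of being cached.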
5063 */ 5064 5065 static void 5066 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf) 5067 { 5068 cache_avl_struct_t cas, *rcas; 5069 avl_index_t where = NULL; 5070 rib_hca_t *hca = (ctoqp(conn))->hca; 5071 5072 if (!hca->avl_init) 5073 goto error_free; 5074 5075 cas.len = reg_buf->lrc_len; 5076 rw_enter(&hca->avl_rw_lock, RW_READER); 5077 if ((rcas = (cache_avl_struct_t *) 5078 avl_find(&hca->avl_tree, &cas, &where)) == NULL) { 5079 rw_exit(&hca->avl_rw_lock); 5080 goto error_free; 5081 } else { 5082 cas.len = reg_buf->lrc_len; 5083 mutex_enter(&rcas->node_lock); 5084 insque(reg_buf, &rcas->r); 5085 rcas->elements ++; 5086 mutex_exit(&rcas->node_lock); 5087 rw_exit(&hca->avl_rw_lock); 5088 mutex_enter(&hca->cache_allocation_lock); 5089 hca->cache_allocation += cas.len; 5090 mutex_exit(&hca->cache_allocation_lock); 5091 } 5092 5093 return; 5094 5095 error_free: 5096 5097 if (reg_buf->registered) 5098 (void) rib_deregistermem_via_hca(hca, 5099 reg_buf->lrc_buf, reg_buf->lrc_mhandle); 5100 kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len); 5101 kmem_free(reg_buf, sizeof (rib_lrc_entry_t)); 5102 } 5103 5104 static rdma_stat 5105 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf, 5106 uint_t buflen, struct mrc *buf_handle) 5107 { 5108 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 5109 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 5110 rdma_stat status; 5111 5112 5113 /* 5114 * Note: ALL buffer pools use the same memory type RDMARW. 5115 */ 5116 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 5117 if (status == RDMA_SUCCESS) { 5118 buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl; 5119 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 5120 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 5121 } else { 5122 buf_handle->mrc_linfo = NULL; 5123 buf_handle->mrc_lmr = 0; 5124 buf_handle->mrc_rmr = 0; 5125 } 5126 return (status); 5127 } 5128 5129 /* ARGSUSED */ 5130 static rdma_stat 5131 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf, 5132 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle) 5133 { 5134 5135 (void) rib_deregistermem_via_hca(hca, buf, buf_handle); 5136 return (RDMA_SUCCESS); 5137 } 5138 5139 /* ARGSUSED */ 5140 static rdma_stat 5141 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle) 5142 { 5143 5144 (void) ibt_deregister_mr(hca->hca_hdl, 5145 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 5146 return (RDMA_SUCCESS); 5147 } 5148 5149 /* 5150 * Check if the IP interface named by `lifrp' is RDMA-capable. 5151 */ 5152 static boolean_t 5153 rpcib_rdma_capable_interface(struct lifreq *lifrp) 5154 { 5155 char ifname[LIFNAMSIZ]; 5156 char *cp; 5157 5158 if (lifrp->lifr_type == IFT_IB) 5159 return (B_TRUE); 5160 5161 /* 5162 * Strip off the logical interface portion before getting 5163 * intimate with the name. 
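 * For example, a logical interface name such as "hme0:1" is reduced
 * to "hme0" before the comparison below.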
5164 */ 5165 (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); 5166 if ((cp = strchr(ifname, ':')) != NULL) 5167 *cp = '\0'; 5168 5169 return (strcmp("lo0", ifname) == 0); 5170 } 5171 5172 static int 5173 rpcib_do_ip_ioctl(int cmd, int len, void *arg) 5174 { 5175 vnode_t *kvp, *vp; 5176 TIUSER *tiptr; 5177 struct strioctl iocb; 5178 k_sigset_t smask; 5179 int err = 0; 5180 5181 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) { 5182 if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE, 5183 &tiptr, CRED()) == 0) { 5184 vp = tiptr->fp->f_vnode; 5185 } else { 5186 VN_RELE(kvp); 5187 return (EPROTO); 5188 } 5189 } else { 5190 return (EPROTO); 5191 } 5192 5193 iocb.ic_cmd = cmd; 5194 iocb.ic_timout = 0; 5195 iocb.ic_len = len; 5196 iocb.ic_dp = (caddr_t)arg; 5197 sigintr(&smask, 0); 5198 err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb); 5199 sigunintr(&smask); 5200 (void) t_kclose(tiptr, 0); 5201 VN_RELE(kvp); 5202 return (err); 5203 } 5204 5205 /* 5206 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'. 5207 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes. 5208 */ 5209 static int 5210 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep) 5211 { 5212 int err; 5213 struct lifnum lifn; 5214 5215 bzero(&lifn, sizeof (struct lifnum)); 5216 lifn.lifn_family = AF_UNSPEC; 5217 5218 err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn); 5219 if (err != 0) 5220 return (err); 5221 5222 /* 5223 * Pad the interface count to account for additional interfaces that 5224 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 5225 */ 5226 lifn.lifn_count += 4; 5227 5228 bzero(lifcp, sizeof (struct lifconf)); 5229 lifcp->lifc_family = AF_UNSPEC; 5230 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 5231 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 5232 5233 err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp); 5234 if (err != 0) { 5235 kmem_free(lifcp->lifc_buf, *bufsizep); 5236 return (err); 5237 } 5238 return (0); 5239 } 5240 5241 static boolean_t 5242 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6) 5243 { 5244 uint_t i, nifs; 5245 uint_t bufsize; 5246 struct lifconf lifc; 5247 struct lifreq *lifrp; 5248 struct sockaddr_in *sinp; 5249 struct sockaddr_in6 *sin6p; 5250 5251 bzero(addrs4, sizeof (rpcib_ipaddrs_t)); 5252 bzero(addrs6, sizeof (rpcib_ipaddrs_t)); 5253 5254 if (rpcib_do_lifconf(&lifc, &bufsize) != 0) 5255 return (B_FALSE); 5256 5257 if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) { 5258 kmem_free(lifc.lifc_buf, bufsize); 5259 return (B_FALSE); 5260 } 5261 5262 /* 5263 * Worst case is that all of the addresses are IB-capable and have 5264 * the same address family, so size our buffers accordingly. 
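 * ri_count tracks how many entries actually get filled in (possibly
 * fewer than allocated), while ri_size records the allocated size so
 * the lists can be freed later.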
 */
	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);

	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
		if (!rpcib_rdma_capable_interface(lifrp))
			continue;

		if (lifrp->lifr_addr.ss_family == AF_INET) {
			sinp = addrs4->ri_list;
			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
			    sizeof (struct sockaddr_in));
		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
			sin6p = addrs6->ri_list;
			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
			    sizeof (struct sockaddr_in6));
		}
	}

	kmem_free(lifc.lifc_buf, bufsize);
	return (B_TRUE);
}

/* ARGSUSED */
static int
rpcib_cache_kstat_update(kstat_t *ksp, int rw)
{
	rib_hca_t *hca;

	if (KSTAT_WRITE == rw) {
		return (EACCES);
	}

	rpcib_kstat.cache_limit.value.ui64 =
	    (uint64_t)cache_limit;
	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
		rpcib_kstat.cache_allocation.value.ui64 +=
		    (uint64_t)hca->cache_allocation;
		rpcib_kstat.cache_hits.value.ui64 +=
		    (uint64_t)hca->cache_hits;
		rpcib_kstat.cache_misses.value.ui64 +=
		    (uint64_t)hca->cache_misses;
		rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
		    (uint64_t)hca->cache_misses_above_the_limit;
	}
	rw_exit(&rib_stat->hcas_list_lock);
	return (0);
}
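
/*
 * Illustrative sketch only (not part of the driver): how a transport
 * consumer is expected to pair the connection and buffer interfaces
 * implemented above.  In practice these static routines are reached
 * through the rdma_mod operations vector registered with RDMATF; the
 * direct calls, the AF_INET addr_type and the netbuf variables below
 * are assumptions made for the sake of the example.
 *
 *	struct netbuf saddr, daddr;	(filled in by the caller)
 *	rdma_buf_t rbuf;
 *	CONN *conn;
 *
 *	if (rib_conn_get(&saddr, &daddr, AF_INET, NULL, &conn) !=
 *	    RDMA_SUCCESS)
 *		return (RDMA_FAILED);
 *
 *	bzero(&rbuf, sizeof (rbuf));
 *	rbuf.type = SEND_BUFFER;
 *	if (rib_reg_buf_alloc(conn, &rbuf) == RDMA_SUCCESS) {
 *		(build the RPC call in rbuf.addr, post it, then ...)
 *		rib_reg_buf_free(conn, &rbuf);
 *	}
 *
 *	(void) rib_conn_release(conn);
 *
 * rib_conn_release() drops c_ref; once it reaches zero the connection
 * is reaped later by the idle-timeout path (rib_conn_timeout_call).
 */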