1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Copyright (c) 2007, The Ohio State University. All rights reserved.
28 *
29 * Portions of this source code are developed by the team members of
30 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31 * headed by Professor Dhabaleswar K. (DK) Panda.
32 *
33 * Acknowledgements for contributions from developers:
34 * Ranjit Noronha: noronha@cse.ohio-state.edu
35 * Lei Chai : chail@cse.ohio-state.edu
36 * Weikuan Yu : yuw@cse.ohio-state.edu
37 *
38 */
39
40 /*
41 * The rpcib plugin. Implements the interface for RDMATF's
42 * interaction with IBTF.
43 */
44
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/user.h>
48 #include <sys/systm.h>
49 #include <sys/sysmacros.h>
50 #include <sys/proc.h>
51 #include <sys/socket.h>
52 #include <sys/file.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/stropts.h>
56 #include <sys/errno.h>
57 #include <sys/kmem.h>
58 #include <sys/debug.h>
59 #include <sys/pathname.h>
60 #include <sys/kstat.h>
61 #include <sys/t_lock.h>
62 #include <sys/ddi.h>
63 #include <sys/cmn_err.h>
64 #include <sys/time.h>
65 #include <sys/isa_defs.h>
66 #include <sys/callb.h>
67 #include <sys/sunddi.h>
68 #include <sys/sunndi.h>
69 #include <sys/sdt.h>
70 #include <sys/ib/ibtl/ibti.h>
71 #include <rpc/rpc.h>
72 #include <rpc/ib.h>
73 #include <sys/modctl.h>
74 #include <sys/kstr.h>
75 #include <sys/sockio.h>
76 #include <sys/vnode.h>
77 #include <sys/tiuser.h>
78 #include <net/if.h>
79 #include <net/if_types.h>
80 #include <sys/cred.h>
81 #include <rpc/rpc_rdma.h>
82 #include <nfs/nfs.h>
83 #include <sys/atomic.h>
84
85 #define NFS_RDMA_PORT 20049
86
87
88 /*
89 * Convenience structures for connection management
90 */
91 typedef struct rpcib_ipaddrs {
92 void *ri_list; /* pointer to list of addresses */
93 uint_t ri_count; /* number of addresses in list */
94 uint_t ri_size; /* size of ri_list in bytes */
95 } rpcib_ipaddrs_t;
96
97
98 typedef struct rpcib_ping {
99 rib_hca_t *hca;
100 ibt_path_info_t path;
101 ibt_ip_addr_t srcip;
102 ibt_ip_addr_t dstip;
103 } rpcib_ping_t;
104
105 /*
106 * Prototype declarations for driver ops
107 */
108 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
109 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
110 void *, void **);
111 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
113 static int rpcib_do_ip_ioctl(int, int, void *);
114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
115 static int
rpcib_cache_kstat_update(kstat_t *, int); 116 static void rib_force_cleanup(void *); 117 static void rib_stop_hca_services(rib_hca_t *); 118 static void rib_attach_hca(void); 119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, 120 struct netbuf *d_svcaddr, CONN **conn); 121 122 struct { 123 kstat_named_t cache_limit; 124 kstat_named_t cache_allocation; 125 kstat_named_t cache_hits; 126 kstat_named_t cache_misses; 127 kstat_named_t cache_misses_above_the_limit; 128 } rpcib_kstat = { 129 {"cache_limit", KSTAT_DATA_UINT64 }, 130 {"cache_allocation", KSTAT_DATA_UINT64 }, 131 {"cache_hits", KSTAT_DATA_UINT64 }, 132 {"cache_misses", KSTAT_DATA_UINT64 }, 133 {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 }, 134 }; 135 136 /* rpcib cb_ops */ 137 static struct cb_ops rpcib_cbops = { 138 nulldev, /* open */ 139 nulldev, /* close */ 140 nodev, /* strategy */ 141 nodev, /* print */ 142 nodev, /* dump */ 143 nodev, /* read */ 144 nodev, /* write */ 145 nodev, /* ioctl */ 146 nodev, /* devmap */ 147 nodev, /* mmap */ 148 nodev, /* segmap */ 149 nochpoll, /* poll */ 150 ddi_prop_op, /* prop_op */ 151 NULL, /* stream */ 152 D_MP, /* cb_flag */ 153 CB_REV, /* rev */ 154 nodev, /* int (*cb_aread)() */ 155 nodev /* int (*cb_awrite)() */ 156 }; 157 158 /* 159 * Device options 160 */ 161 static struct dev_ops rpcib_ops = { 162 DEVO_REV, /* devo_rev, */ 163 0, /* refcnt */ 164 rpcib_getinfo, /* info */ 165 nulldev, /* identify */ 166 nulldev, /* probe */ 167 rpcib_attach, /* attach */ 168 rpcib_detach, /* detach */ 169 nodev, /* reset */ 170 &rpcib_cbops, /* driver ops - devctl interfaces */ 171 NULL, /* bus operations */ 172 NULL, /* power */ 173 ddi_quiesce_not_needed, /* quiesce */ 174 }; 175 176 /* 177 * Module linkage information. 178 */ 179 180 static struct modldrv rib_modldrv = { 181 &mod_driverops, /* Driver module */ 182 "RPCIB plugin driver", /* Driver name and version */ 183 &rpcib_ops, /* Driver ops */ 184 }; 185 186 static struct modlinkage rib_modlinkage = { 187 MODREV_1, 188 (void *)&rib_modldrv, 189 NULL 190 }; 191 192 typedef struct rib_lrc_entry { 193 struct rib_lrc_entry *forw; 194 struct rib_lrc_entry *back; 195 char *lrc_buf; 196 197 uint32_t lrc_len; 198 void *avl_node; 199 bool_t registered; 200 201 struct mrc lrc_mhandle; 202 bool_t lrc_on_freed_list; 203 } rib_lrc_entry_t; 204 205 typedef struct cache_struct { 206 rib_lrc_entry_t r; 207 uint32_t len; 208 uint32_t elements; 209 kmutex_t node_lock; 210 avl_node_t avl_link; 211 } cache_avl_struct_t; 212 213 uint64_t cache_limit = 100 * 1024 * 1024; 214 static uint64_t cache_watermark = 80 * 1024 * 1024; 215 static bool_t stats_enabled = FALSE; 216 217 static uint64_t max_unsignaled_rws = 5; 218 int nfs_rdma_port = NFS_RDMA_PORT; 219 220 /* 221 * rib_stat: private data pointer used when registering 222 * with the IBTF. It is returned to the consumer 223 * in all callbacks. 224 */ 225 static rpcib_state_t *rib_stat = NULL; 226 227 #define RNR_RETRIES IBT_RNR_RETRY_1 228 #define MAX_PORTS 2 229 #define RDMA_DUMMY_WRID 0x4D3A1D4D3A1D 230 #define RDMA_CONN_REAP_RETRY 10 /* 10 secs */ 231 232 int preposted_rbufs = RDMA_BUFS_GRANT; 233 int send_threshold = 1; 234 235 /* 236 * Old cards with Tavor driver have limited memory footprint 237 * when booted in 32bit. The rib_max_rbufs tunable can be 238 * tuned for more buffers if needed. 
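 * As an illustration only (assuming the module installs under the
 * name rpcib), the tunable can be overridden from /etc/system:
 *
 *	set rpcib:rib_max_rbufs = 4096
 *
 * The value shown is arbitrary and only demonstrates the syntax.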
239 */
240
241 #if !defined(_ELF64) && !defined(__sparc)
242 int rib_max_rbufs = MAX_BUFS;
243 #else
244 int rib_max_rbufs = 10 * MAX_BUFS;
245 #endif /* !(_ELF64) && !(__sparc) */
246
247 int rib_conn_timeout = 60 * 12; /* 12 minutes */
248
249 /*
250 * State of the plugin.
251 * ACCEPT = accepting new connections and requests.
252 * NO_ACCEPT = not accepting new connections and requests.
253 * This should eventually move to the rpcib_state_t structure, since this
254 * will tell which state the plugin is in for a particular type of service
255 * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
256 * state for one and in no_accept state for the other.
257 */
258 int plugin_state;
259 kmutex_t plugin_state_lock;
260
261 ldi_ident_t rpcib_li;
262
263 /*
264 * RPCIB RDMATF operations
265 */
266 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
267 static rdma_stat rib_disconnect(CONN *conn);
268 static void rib_listen(struct rdma_svc_data *rd);
269 static void rib_listen_stop(struct rdma_svc_data *rd);
270 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf,
271 uint_t buflen, struct mrc *buf_handle);
272 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
273 struct mrc buf_handle);
274 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
275 caddr_t buf, uint_t buflen, struct mrc *buf_handle);
276 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
277 struct mrc buf_handle);
278 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf,
279 uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
280 void *lrc);
281 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
282 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
283 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
284 caddr_t buf, int len, int cpu);
285
286 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
287
288 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
289 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
290
291 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
292
293 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
294 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
295 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
296 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
297 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
298 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
299 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
300 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
301 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
302 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
303 int addr_type, void *, CONN **);
304 static rdma_stat rib_conn_release(CONN *conn);
305 static rdma_stat rib_getinfo(rdma_info_t *info);
306
307 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
308 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
309 static void rib_destroy_cache(rib_hca_t *hca);
310 static void rib_server_side_cache_reclaim(void *argp);
311 static int avl_compare(const void *t1, const void *t2);
312
313 static void rib_stop_services(rib_hca_t *);
314 static void rib_close_channels(rib_conn_list_t *);
315 static void rib_conn_close(void *);
316
317 /*
318 * RPCIB addressing operations
319 */
320
321 /*
322 * RDMA operations the RPCIB module exports
323 */
324 static rdmaops_t rib_ops = {
325 rib_reachable,
326 rib_conn_get,
327 rib_conn_release,
328 rib_listen,
329 rib_listen_stop,
330 rib_registermem,
331 rib_deregistermem,
332 rib_registermemsync,
333 rib_deregistermemsync,
334 rib_syncmem,
335 rib_reg_buf_alloc,
336 rib_reg_buf_free,
337 rib_send,
338 rib_send_resp,
339 rib_post_resp,
340 rib_post_resp_remove,
341 rib_post_recv,
342 rib_recv,
343 rib_read,
344 rib_write,
345 rib_getinfo,
346 };
347
348 /*
349 * RDMATF RPCIB plugin details
350 */
351 static rdma_mod_t rib_mod = {
352 "ibtf", /* api name */
353 RDMATF_VERS_1,
354 0,
355 &rib_ops, /* rdma op vector for ibtf */
356 };
357
358 static rdma_stat rpcib_open_hcas(rpcib_state_t *);
359 static rdma_stat rib_qp_init(rib_qp_t *, int);
360 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
361 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
362 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
363 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
364 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
365 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
366 ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
367 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
368 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
369 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
370 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
371 rib_qp_t **);
372 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
373 rib_qp_t **);
374 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
375 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
376 static int rib_free_sendwait(struct send_wid *);
377 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
378 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
379 static void rdma_done_rem_list(rib_qp_t *);
380 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
381
382 static void rib_async_handler(void *,
383 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
384 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
385 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
386 static int rib_free_svc_recv(struct svc_recv *);
387 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
388 static void rib_free_wid(struct recv_wid *);
389 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
390 static void rib_detach_hca(rib_hca_t *);
391 static void rib_close_a_channel(CONN *);
392 static void rib_send_hold(rib_qp_t *);
393 static void rib_send_rele(rib_qp_t *);
394
395 /*
396 * Registration with IBTF as a consumer
397 */
398 static struct ibt_clnt_modinfo_s rib_modinfo = {
399 IBTI_V_CURR,
400 IBT_GENERIC,
401 rib_async_handler, /* async event handler */
402 NULL, /* Memory Region Handler */
403 "nfs/ib"
404 };
405
406 /*
407 * Global structure
408 */
409
410 typedef struct rpcib_s {
411 dev_info_t *rpcib_dip;
412 kmutex_t rpcib_mutex;
413 } rpcib_t;
414
415 rpcib_t rpcib;
416
417 /*
418 * /etc/system controlled variable to control
419 * debugging in rpcib kernel module.
420 * Set it to a value greater than 1 to increase
421 * the amount of debugging output.
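 * For example (illustration only, using the same /etc/system syntax
 * as the other rpcib tunables):
 *
 *	set rpcib:rib_debug = 1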
422 */ 423 int rib_debug = 0; 424 425 int 426 _init(void) 427 { 428 int error; 429 430 error = mod_install((struct modlinkage *)&rib_modlinkage); 431 if (error != 0) { 432 /* 433 * Could not load module 434 */ 435 return (error); 436 } 437 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL); 438 return (0); 439 } 440 441 int 442 _fini() 443 { 444 int status; 445 446 /* 447 * Remove module 448 */ 449 if ((status = mod_remove(&rib_modlinkage)) != 0) { 450 return (status); 451 } 452 mutex_destroy(&plugin_state_lock); 453 return (0); 454 } 455 456 int 457 _info(struct modinfo *modinfop) 458 { 459 return (mod_info(&rib_modlinkage, modinfop)); 460 } 461 462 /* 463 * rpcib_getinfo() 464 * Given the device number, return the devinfo pointer or the 465 * instance number. 466 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach. 467 */ 468 469 /*ARGSUSED*/ 470 static int 471 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 472 { 473 int ret = DDI_SUCCESS; 474 475 switch (cmd) { 476 case DDI_INFO_DEVT2DEVINFO: 477 if (rpcib.rpcib_dip != NULL) 478 *result = rpcib.rpcib_dip; 479 else { 480 *result = NULL; 481 ret = DDI_FAILURE; 482 } 483 break; 484 485 case DDI_INFO_DEVT2INSTANCE: 486 *result = NULL; 487 break; 488 489 default: 490 ret = DDI_FAILURE; 491 } 492 return (ret); 493 } 494 495 static void 496 rpcib_free_hca_list() 497 { 498 rib_hca_t *hca, *hcap; 499 500 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); 501 hca = rib_stat->hcas_list; 502 rib_stat->hcas_list = NULL; 503 rw_exit(&rib_stat->hcas_list_lock); 504 while (hca != NULL) { 505 rw_enter(&hca->state_lock, RW_WRITER); 506 hcap = hca; 507 hca = hca->next; 508 rib_stat->nhca_inited--; 509 rib_mod.rdma_count--; 510 hcap->state = HCA_DETACHED; 511 rw_exit(&hcap->state_lock); 512 rib_stop_hca_services(hcap); 513 514 kmem_free(hcap, sizeof (*hcap)); 515 } 516 } 517 518 static rdma_stat 519 rpcib_free_service_list() 520 { 521 rib_service_t *service; 522 ibt_status_t ret; 523 524 rw_enter(&rib_stat->service_list_lock, RW_WRITER); 525 while (rib_stat->service_list != NULL) { 526 service = rib_stat->service_list; 527 ret = ibt_unbind_all_services(service->srv_hdl); 528 if (ret != IBT_SUCCESS) { 529 rw_exit(&rib_stat->service_list_lock); 530 #ifdef DEBUG 531 cmn_err(CE_NOTE, "rpcib_free_service_list: " 532 "ibt_unbind_all_services failed (%d)\n", (int)ret); 533 #endif 534 return (RDMA_FAILED); 535 } 536 ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl, 537 service->srv_hdl); 538 if (ret != IBT_SUCCESS) { 539 rw_exit(&rib_stat->service_list_lock); 540 #ifdef DEBUG 541 cmn_err(CE_NOTE, "rpcib_free_service_list: " 542 "ibt_deregister_service failed (%d)\n", (int)ret); 543 #endif 544 return (RDMA_FAILED); 545 } 546 rib_stat->service_list = service->next; 547 kmem_free(service, sizeof (rib_service_t)); 548 } 549 rw_exit(&rib_stat->service_list_lock); 550 551 return (RDMA_SUCCESS); 552 } 553 554 static int 555 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 556 { 557 ibt_status_t ibt_status; 558 rdma_stat r_status; 559 560 switch (cmd) { 561 case DDI_ATTACH: 562 break; 563 case DDI_RESUME: 564 return (DDI_SUCCESS); 565 default: 566 return (DDI_FAILURE); 567 } 568 569 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL); 570 571 mutex_enter(&rpcib.rpcib_mutex); 572 if (rpcib.rpcib_dip != NULL) { 573 mutex_exit(&rpcib.rpcib_mutex); 574 return (DDI_FAILURE); 575 } 576 rpcib.rpcib_dip = dip; 577 mutex_exit(&rpcib.rpcib_mutex); 578 /* 579 * Create the "rpcib" minor-node. 
580 */ 581 if (ddi_create_minor_node(dip, 582 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { 583 /* Error message, no cmn_err as they print on console */ 584 return (DDI_FAILURE); 585 } 586 587 if (rib_stat == NULL) { 588 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); 589 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); 590 rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL); 591 mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL); 592 } 593 594 rib_stat->hca_count = ibt_get_hca_list(NULL); 595 if (rib_stat->hca_count < 1) { 596 mutex_destroy(&rib_stat->listen_lock); 597 rw_destroy(&rib_stat->hcas_list_lock); 598 mutex_destroy(&rib_stat->open_hca_lock); 599 kmem_free(rib_stat, sizeof (*rib_stat)); 600 rib_stat = NULL; 601 return (DDI_FAILURE); 602 } 603 604 ibt_status = ibt_attach(&rib_modinfo, dip, 605 (void *)rib_stat, &rib_stat->ibt_clnt_hdl); 606 607 if (ibt_status != IBT_SUCCESS) { 608 mutex_destroy(&rib_stat->listen_lock); 609 rw_destroy(&rib_stat->hcas_list_lock); 610 mutex_destroy(&rib_stat->open_hca_lock); 611 kmem_free(rib_stat, sizeof (*rib_stat)); 612 rib_stat = NULL; 613 return (DDI_FAILURE); 614 } 615 616 rib_stat->service_list = NULL; 617 rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL); 618 mutex_enter(&rib_stat->open_hca_lock); 619 if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) { 620 mutex_exit(&rib_stat->open_hca_lock); 621 goto open_fail; 622 } 623 mutex_exit(&rib_stat->open_hca_lock); 624 625 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 626 DDI_PROP_SUCCESS) { 627 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update " 628 "failed."); 629 goto register_fail; 630 } 631 632 /* 633 * Register with rdmatf 634 */ 635 r_status = rdma_register_mod(&rib_mod); 636 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { 637 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, " 638 "status = %d", r_status); 639 goto register_fail; 640 } 641 642 return (DDI_SUCCESS); 643 644 register_fail: 645 646 open_fail: 647 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 648 rpcib_free_hca_list(); 649 (void) rpcib_free_service_list(); 650 mutex_destroy(&rib_stat->listen_lock); 651 rw_destroy(&rib_stat->hcas_list_lock); 652 mutex_destroy(&rib_stat->open_hca_lock); 653 rw_destroy(&rib_stat->service_list_lock); 654 kmem_free(rib_stat, sizeof (*rib_stat)); 655 rib_stat = NULL; 656 return (DDI_FAILURE); 657 } 658 659 /*ARGSUSED*/ 660 static int 661 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 662 { 663 switch (cmd) { 664 665 case DDI_DETACH: 666 break; 667 668 case DDI_SUSPEND: 669 default: 670 return (DDI_FAILURE); 671 } 672 673 /* 674 * Detach the hca and free resources 675 */ 676 mutex_enter(&plugin_state_lock); 677 plugin_state = NO_ACCEPT; 678 mutex_exit(&plugin_state_lock); 679 680 if (rpcib_free_service_list() != RDMA_SUCCESS) 681 return (DDI_FAILURE); 682 rpcib_free_hca_list(); 683 684 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 685 mutex_destroy(&rib_stat->listen_lock); 686 rw_destroy(&rib_stat->hcas_list_lock); 687 mutex_destroy(&rib_stat->open_hca_lock); 688 rw_destroy(&rib_stat->service_list_lock); 689 690 kmem_free(rib_stat, sizeof (*rib_stat)); 691 rib_stat = NULL; 692 693 mutex_enter(&rpcib.rpcib_mutex); 694 rpcib.rpcib_dip = NULL; 695 mutex_exit(&rpcib.rpcib_mutex); 696 mutex_destroy(&rpcib.rpcib_mutex); 697 return (DDI_SUCCESS); 698 } 699 700 701 static void rib_rbufpool_free(rib_hca_t *, int); 702 static void rib_rbufpool_deregister(rib_hca_t *, int); 703 static void 
rib_rbufpool_destroy(rib_hca_t *hca, int ptype); 704 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); 705 static rdma_stat rib_rem_replylist(rib_qp_t *); 706 static int rib_remreply(rib_qp_t *, struct reply *); 707 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); 708 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); 709 710 711 /* 712 * One CQ pair per HCA 713 */ 714 static rdma_stat 715 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, 716 rib_cq_t **cqp) 717 { 718 rib_cq_t *cq; 719 ibt_cq_attr_t cq_attr; 720 uint32_t real_size; 721 ibt_status_t status; 722 rdma_stat error = RDMA_SUCCESS; 723 724 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); 725 cq->rib_hca = hca; 726 cq_attr.cq_size = cq_size; 727 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 728 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, 729 &real_size); 730 if (status != IBT_SUCCESS) { 731 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed," 732 " status=%d", status); 733 error = RDMA_FAILED; 734 goto fail; 735 } 736 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca); 737 738 /* 739 * Enable CQ callbacks. CQ Callbacks are single shot 740 * (e.g. you have to call ibt_enable_cq_notify() 741 * after each callback to get another one). 742 */ 743 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION); 744 if (status != IBT_SUCCESS) { 745 cmn_err(CE_WARN, "rib_create_cq: " 746 "enable_cq_notify failed, status %d", status); 747 error = RDMA_FAILED; 748 goto fail; 749 } 750 *cqp = cq; 751 752 return (error); 753 fail: 754 if (cq->rib_cq_hdl) 755 (void) ibt_free_cq(cq->rib_cq_hdl); 756 if (cq) 757 kmem_free(cq, sizeof (rib_cq_t)); 758 return (error); 759 } 760 761 /* 762 * rpcib_find_hca 763 * 764 * Caller should have already locked the hcas_lock before calling 765 * this function. 766 */ 767 static rib_hca_t * 768 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid) 769 { 770 rib_hca_t *hca = ribstat->hcas_list; 771 772 while (hca && hca->hca_guid != guid) 773 hca = hca->next; 774 775 return (hca); 776 } 777 778 static rdma_stat 779 rpcib_open_hcas(rpcib_state_t *ribstat) 780 { 781 rib_hca_t *hca; 782 ibt_status_t ibt_status; 783 rdma_stat status; 784 ibt_hca_portinfo_t *pinfop; 785 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; 786 uint_t size, cq_size; 787 int i; 788 kstat_t *ksp; 789 cache_avl_struct_t example_avl_node; 790 char rssc_name[32]; 791 int old_nhca_inited = ribstat->nhca_inited; 792 ib_guid_t *hca_guids; 793 794 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); 795 796 ribstat->hca_count = ibt_get_hca_list(&hca_guids); 797 if (ribstat->hca_count == 0) 798 return (RDMA_FAILED); 799 800 rw_enter(&ribstat->hcas_list_lock, RW_WRITER); 801 /* 802 * Open a hca and setup for RDMA 803 */ 804 for (i = 0; i < ribstat->hca_count; i++) { 805 if (rpcib_find_hca(ribstat, hca_guids[i])) 806 continue; 807 hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP); 808 809 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, 810 hca_guids[i], &hca->hca_hdl); 811 if (ibt_status != IBT_SUCCESS) { 812 kmem_free(hca, sizeof (rib_hca_t)); 813 continue; 814 } 815 hca->hca_guid = hca_guids[i]; 816 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; 817 hca->state = HCA_INITED; 818 819 /* 820 * query HCA info 821 */ 822 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs); 823 if (ibt_status != IBT_SUCCESS) { 824 goto fail1; 825 } 826 827 /* 828 * One PD (Protection Domain) per HCA. 
829 * A qp is allowed to access a memory region 830 * only when it's in the same PD as that of 831 * the memory region. 832 */ 833 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl); 834 if (ibt_status != IBT_SUCCESS) { 835 goto fail1; 836 } 837 838 /* 839 * query HCA ports 840 */ 841 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 842 0, &pinfop, &hca->hca_nports, &size); 843 if (ibt_status != IBT_SUCCESS) { 844 goto fail2; 845 } 846 hca->hca_ports = pinfop; 847 hca->hca_pinfosz = size; 848 pinfop = NULL; 849 850 cq_size = DEF_CQ_SIZE; /* default cq size */ 851 /* 852 * Create 2 pairs of cq's (1 pair for client 853 * and the other pair for server) on this hca. 854 * If number of qp's gets too large, then several 855 * cq's will be needed. 856 */ 857 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, 858 &hca->svc_rcq); 859 if (status != RDMA_SUCCESS) { 860 goto fail3; 861 } 862 863 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, 864 &hca->svc_scq); 865 if (status != RDMA_SUCCESS) { 866 goto fail3; 867 } 868 869 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, 870 &hca->clnt_rcq); 871 if (status != RDMA_SUCCESS) { 872 goto fail3; 873 } 874 875 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, 876 &hca->clnt_scq); 877 if (status != RDMA_SUCCESS) { 878 goto fail3; 879 } 880 881 /* 882 * Create buffer pools. 883 * Note rib_rbuf_create also allocates memory windows. 884 */ 885 hca->recv_pool = rib_rbufpool_create(hca, 886 RECV_BUFFER, rib_max_rbufs); 887 if (hca->recv_pool == NULL) { 888 goto fail3; 889 } 890 891 hca->send_pool = rib_rbufpool_create(hca, 892 SEND_BUFFER, rib_max_rbufs); 893 if (hca->send_pool == NULL) { 894 rib_rbufpool_destroy(hca, RECV_BUFFER); 895 goto fail3; 896 } 897 898 if (hca->server_side_cache == NULL) { 899 (void) sprintf(rssc_name, 900 "rib_srvr_cache_%llx", 901 (long long unsigned int) hca->hca_guid); 902 hca->server_side_cache = kmem_cache_create( 903 rssc_name, 904 sizeof (cache_avl_struct_t), 0, 905 NULL, 906 NULL, 907 rib_server_side_cache_reclaim, 908 hca, NULL, 0); 909 } 910 911 avl_create(&hca->avl_tree, 912 avl_compare, 913 sizeof (cache_avl_struct_t), 914 (uint_t)(uintptr_t)&example_avl_node.avl_link- 915 (uint_t)(uintptr_t)&example_avl_node); 916 917 rw_init(&hca->bound_services_lock, NULL, RW_DRIVER, 918 hca->iblock); 919 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); 920 rw_init(&hca->avl_rw_lock, 921 NULL, RW_DRIVER, hca->iblock); 922 mutex_init(&hca->cache_allocation_lock, 923 NULL, MUTEX_DRIVER, NULL); 924 hca->avl_init = TRUE; 925 926 /* Create kstats for the cache */ 927 ASSERT(INGLOBALZONE(curproc)); 928 929 if (!stats_enabled) { 930 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc", 931 KSTAT_TYPE_NAMED, 932 sizeof (rpcib_kstat) / sizeof (kstat_named_t), 933 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, 934 GLOBAL_ZONEID); 935 if (ksp) { 936 ksp->ks_data = (void *) &rpcib_kstat; 937 ksp->ks_update = rpcib_cache_kstat_update; 938 kstat_install(ksp); 939 stats_enabled = TRUE; 940 } 941 } 942 if (hca->cleanup_helper == NULL) { 943 char tq_name[sizeof (hca->hca_guid) * 2 + 1]; 944 945 (void) snprintf(tq_name, sizeof (tq_name), "%llX", 946 (unsigned long long int) hca->hca_guid); 947 hca->cleanup_helper = ddi_taskq_create(NULL, 948 tq_name, 1, TASKQ_DEFAULTPRI, 0); 949 } 950 951 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 952 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); 953 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, 954 hca->iblock); 955 
rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, 956 hca->iblock); 957 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); 958 hca->inuse = TRUE; 959 960 hca->next = ribstat->hcas_list; 961 ribstat->hcas_list = hca; 962 ribstat->nhca_inited++; 963 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 964 continue; 965 966 fail3: 967 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 968 fail2: 969 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 970 fail1: 971 (void) ibt_close_hca(hca->hca_hdl); 972 kmem_free(hca, sizeof (rib_hca_t)); 973 } 974 rw_exit(&ribstat->hcas_list_lock); 975 ibt_free_hca_list(hca_guids, ribstat->hca_count); 976 rib_mod.rdma_count = rib_stat->nhca_inited; 977 978 /* 979 * return success if at least one new hca has been configured. 980 */ 981 if (ribstat->nhca_inited != old_nhca_inited) 982 return (RDMA_SUCCESS); 983 else 984 return (RDMA_FAILED); 985 } 986 987 /* 988 * Callback routines 989 */ 990 991 /* 992 * SCQ handlers 993 */ 994 /* ARGSUSED */ 995 static void 996 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 997 { 998 ibt_status_t ibt_status; 999 ibt_wc_t wc; 1000 struct send_wid *wd; 1001 CONN *conn; 1002 rib_qp_t *qp; 1003 int i; 1004 1005 /* 1006 * Re-enable cq notify here to avoid missing any 1007 * completion queue notification. 1008 */ 1009 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1010 1011 ibt_status = IBT_SUCCESS; 1012 while (ibt_status != IBT_CQ_EMPTY) { 1013 bzero(&wc, sizeof (wc)); 1014 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1015 if (ibt_status != IBT_SUCCESS) 1016 return; 1017 1018 /* 1019 * Got a send completion 1020 */ 1021 if (wc.wc_id != RDMA_DUMMY_WRID) { 1022 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1023 qp = wd->qp; 1024 conn = qptoc(qp); 1025 1026 mutex_enter(&wd->sendwait_lock); 1027 switch (wc.wc_status) { 1028 case IBT_WC_SUCCESS: 1029 wd->status = RDMA_SUCCESS; 1030 break; 1031 default: 1032 /* 1033 * RC Send Q Error Code Local state Remote State 1034 * ==================== =========== ============ 1035 * IBT_WC_BAD_RESPONSE_ERR ERROR None 1036 * IBT_WC_LOCAL_LEN_ERR ERROR None 1037 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None 1038 * IBT_WC_LOCAL_PROTECT_ERR ERROR None 1039 * IBT_WC_MEM_WIN_BIND_ERR ERROR None 1040 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR 1041 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR 1042 * IBT_WC_REMOTE_OP_ERR ERROR ERROR 1043 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None 1044 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None 1045 * IBT_WC_WR_FLUSHED_ERR ERROR None 1046 */ 1047 /* 1048 * Channel in error state. Set connection to 1049 * ERROR and cleanup will happen either from 1050 * conn_release or from rib_conn_get 1051 */ 1052 wd->status = RDMA_FAILED; 1053 mutex_enter(&conn->c_lock); 1054 if (conn->c_state != C_DISCONN_PEND) 1055 conn->c_state = C_ERROR_CONN; 1056 mutex_exit(&conn->c_lock); 1057 break; 1058 } 1059 1060 if (wd->cv_sig == 1) { 1061 /* 1062 * Notify poster 1063 */ 1064 cv_signal(&wd->wait_cv); 1065 mutex_exit(&wd->sendwait_lock); 1066 } else { 1067 /* 1068 * Poster not waiting for notification. 
1069 * Free the send buffers and send_wid 1070 */ 1071 for (i = 0; i < wd->nsbufs; i++) { 1072 rib_rbuf_free(qptoc(wd->qp), 1073 SEND_BUFFER, 1074 (void *)(uintptr_t)wd->sbufaddr[i]); 1075 } 1076 1077 /* decrement the send ref count */ 1078 rib_send_rele(qp); 1079 1080 mutex_exit(&wd->sendwait_lock); 1081 (void) rib_free_sendwait(wd); 1082 } 1083 } 1084 } 1085 } 1086 1087 /* ARGSUSED */ 1088 static void 1089 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1090 { 1091 ibt_status_t ibt_status; 1092 ibt_wc_t wc; 1093 struct send_wid *wd; 1094 rib_qp_t *qp; 1095 CONN *conn; 1096 int i; 1097 1098 /* 1099 * Re-enable cq notify here to avoid missing any 1100 * completion queue notification. 1101 */ 1102 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1103 1104 ibt_status = IBT_SUCCESS; 1105 while (ibt_status != IBT_CQ_EMPTY) { 1106 bzero(&wc, sizeof (wc)); 1107 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1108 if (ibt_status != IBT_SUCCESS) 1109 return; 1110 1111 /* 1112 * Got a send completion 1113 */ 1114 if (wc.wc_id != RDMA_DUMMY_WRID) { 1115 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1116 qp = wd->qp; 1117 conn = qptoc(qp); 1118 mutex_enter(&wd->sendwait_lock); 1119 1120 switch (wc.wc_status) { 1121 case IBT_WC_SUCCESS: 1122 wd->status = RDMA_SUCCESS; 1123 break; 1124 default: 1125 /* 1126 * Channel in error state. Set connection to 1127 * ERROR and cleanup will happen either from 1128 * conn_release or conn timeout. 1129 */ 1130 wd->status = RDMA_FAILED; 1131 mutex_enter(&conn->c_lock); 1132 if (conn->c_state != C_DISCONN_PEND) 1133 conn->c_state = C_ERROR_CONN; 1134 mutex_exit(&conn->c_lock); 1135 break; 1136 } 1137 1138 if (wd->cv_sig == 1) { 1139 /* 1140 * Update completion status and notify poster 1141 */ 1142 cv_signal(&wd->wait_cv); 1143 mutex_exit(&wd->sendwait_lock); 1144 } else { 1145 /* 1146 * Poster not waiting for notification. 1147 * Free the send buffers and send_wid 1148 */ 1149 for (i = 0; i < wd->nsbufs; i++) { 1150 rib_rbuf_free(qptoc(wd->qp), 1151 SEND_BUFFER, 1152 (void *)(uintptr_t)wd->sbufaddr[i]); 1153 } 1154 1155 /* decrement the send ref count */ 1156 rib_send_rele(qp); 1157 1158 mutex_exit(&wd->sendwait_lock); 1159 (void) rib_free_sendwait(wd); 1160 } 1161 } 1162 } 1163 } 1164 1165 /* 1166 * RCQ handler 1167 */ 1168 /* ARGSUSED */ 1169 static void 1170 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1171 { 1172 rib_qp_t *qp; 1173 ibt_status_t ibt_status; 1174 ibt_wc_t wc; 1175 struct recv_wid *rwid; 1176 1177 /* 1178 * Re-enable cq notify here to avoid missing any 1179 * completion queue notification. 1180 */ 1181 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1182 1183 ibt_status = IBT_SUCCESS; 1184 while (ibt_status != IBT_CQ_EMPTY) { 1185 bzero(&wc, sizeof (wc)); 1186 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1187 if (ibt_status != IBT_SUCCESS) 1188 return; 1189 1190 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id; 1191 qp = rwid->qp; 1192 if (wc.wc_status == IBT_WC_SUCCESS) { 1193 XDR inxdrs, *xdrs; 1194 uint_t xid, vers, op, find_xid = 0; 1195 struct reply *r; 1196 CONN *conn = qptoc(qp); 1197 uint32_t rdma_credit = 0; 1198 1199 xdrs = &inxdrs; 1200 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr, 1201 wc.wc_bytes_xfer, XDR_DECODE); 1202 /* 1203 * Treat xid as opaque (xid is the first entity 1204 * in the rpc rdma message). 1205 */ 1206 xid = *(uint32_t *)(uintptr_t)rwid->addr; 1207 1208 /* Skip xid and set the xdr position accordingly. 
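 * The fixed transport header decoded here consists of four 32-bit
 * XDR words: xid, vers, rdma_credit and op. The raw xid word was
 * already read above, so decoding resumes at the second word.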
*/ 1209 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1210 (void) xdr_u_int(xdrs, &vers); 1211 (void) xdr_u_int(xdrs, &rdma_credit); 1212 (void) xdr_u_int(xdrs, &op); 1213 XDR_DESTROY(xdrs); 1214 1215 if (vers != RPCRDMA_VERS) { 1216 /* 1217 * Invalid RPC/RDMA version. Cannot 1218 * interoperate. Set connection to 1219 * ERROR state and bail out. 1220 */ 1221 mutex_enter(&conn->c_lock); 1222 if (conn->c_state != C_DISCONN_PEND) 1223 conn->c_state = C_ERROR_CONN; 1224 mutex_exit(&conn->c_lock); 1225 rib_rbuf_free(conn, RECV_BUFFER, 1226 (void *)(uintptr_t)rwid->addr); 1227 rib_free_wid(rwid); 1228 continue; 1229 } 1230 1231 mutex_enter(&qp->replylist_lock); 1232 for (r = qp->replylist; r != NULL; r = r->next) { 1233 if (r->xid == xid) { 1234 find_xid = 1; 1235 switch (op) { 1236 case RDMA_MSG: 1237 case RDMA_NOMSG: 1238 case RDMA_MSGP: 1239 r->status = RDMA_SUCCESS; 1240 r->vaddr_cq = rwid->addr; 1241 r->bytes_xfer = 1242 wc.wc_bytes_xfer; 1243 cv_signal(&r->wait_cv); 1244 break; 1245 default: 1246 rib_rbuf_free(qptoc(qp), 1247 RECV_BUFFER, 1248 (void *)(uintptr_t) 1249 rwid->addr); 1250 break; 1251 } 1252 break; 1253 } 1254 } 1255 mutex_exit(&qp->replylist_lock); 1256 if (find_xid == 0) { 1257 /* RPC caller not waiting for reply */ 1258 1259 DTRACE_PROBE1(rpcib__i__nomatchxid1, 1260 int, xid); 1261 1262 rib_rbuf_free(qptoc(qp), RECV_BUFFER, 1263 (void *)(uintptr_t)rwid->addr); 1264 } 1265 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) { 1266 CONN *conn = qptoc(qp); 1267 1268 /* 1269 * Connection being flushed. Just free 1270 * the posted buffer 1271 */ 1272 rib_rbuf_free(conn, RECV_BUFFER, 1273 (void *)(uintptr_t)rwid->addr); 1274 } else { 1275 CONN *conn = qptoc(qp); 1276 /* 1277 * RC Recv Q Error Code Local state Remote State 1278 * ==================== =========== ============ 1279 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd 1280 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd 1281 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd 1282 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd 1283 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd 1284 * IBT_WC_WR_FLUSHED_ERR None None 1285 */ 1286 /* 1287 * Channel in error state. Set connection 1288 * in ERROR state. 1289 */ 1290 mutex_enter(&conn->c_lock); 1291 if (conn->c_state != C_DISCONN_PEND) 1292 conn->c_state = C_ERROR_CONN; 1293 mutex_exit(&conn->c_lock); 1294 rib_rbuf_free(conn, RECV_BUFFER, 1295 (void *)(uintptr_t)rwid->addr); 1296 } 1297 rib_free_wid(rwid); 1298 } 1299 } 1300 1301 /* Server side */ 1302 /* ARGSUSED */ 1303 static void 1304 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1305 { 1306 rdma_recv_data_t *rdp; 1307 rib_qp_t *qp; 1308 ibt_status_t ibt_status; 1309 ibt_wc_t wc; 1310 struct svc_recv *s_recvp; 1311 CONN *conn; 1312 mblk_t *mp; 1313 1314 /* 1315 * Re-enable cq notify here to avoid missing any 1316 * completion queue notification. 
1317 */
1318 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1319
1320 ibt_status = IBT_SUCCESS;
1321 while (ibt_status != IBT_CQ_EMPTY) {
1322 bzero(&wc, sizeof (wc));
1323 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1324 if (ibt_status != IBT_SUCCESS)
1325 return;
1326
1327 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1328 qp = s_recvp->qp;
1329 conn = qptoc(qp);
1330 mutex_enter(&qp->posted_rbufs_lock);
1331 qp->n_posted_rbufs--;
1332 if (qp->n_posted_rbufs == 0)
1333 cv_signal(&qp->posted_rbufs_cv);
1334 mutex_exit(&qp->posted_rbufs_lock);
1335
1336 if (wc.wc_status == IBT_WC_SUCCESS) {
1337 XDR inxdrs, *xdrs;
1338 uint_t xid, vers, op;
1339 uint32_t rdma_credit;
1340
1341 xdrs = &inxdrs;
1342 /* s_recvp->vaddr stores data */
1343 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1344 wc.wc_bytes_xfer, XDR_DECODE);
1345
1346 /*
1347 * Treat xid as opaque (xid is the first entity
1348 * in the rpc rdma message).
1349 */
1350 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1351 /* Skip xid and set the xdr position accordingly. */
1352 XDR_SETPOS(xdrs, sizeof (uint32_t));
1353 if (!xdr_u_int(xdrs, &vers) ||
1354 !xdr_u_int(xdrs, &rdma_credit) ||
1355 !xdr_u_int(xdrs, &op)) {
1356 rib_rbuf_free(conn, RECV_BUFFER,
1357 (void *)(uintptr_t)s_recvp->vaddr);
1358 XDR_DESTROY(xdrs);
1359 (void) rib_free_svc_recv(s_recvp);
1360 continue;
1361 }
1362 XDR_DESTROY(xdrs);
1363
1364 if (vers != RPCRDMA_VERS) {
1365 /*
1366 * Invalid RPC/RDMA version.
1367 * Drop rpc rdma message.
1368 */
1369 rib_rbuf_free(conn, RECV_BUFFER,
1370 (void *)(uintptr_t)s_recvp->vaddr);
1371 (void) rib_free_svc_recv(s_recvp);
1372 continue;
1373 }
1374 /*
1375 * Is this for RDMA_DONE?
1376 */
1377 if (op == RDMA_DONE) {
1378 rib_rbuf_free(conn, RECV_BUFFER,
1379 (void *)(uintptr_t)s_recvp->vaddr);
1380 /*
1381 * Wake up the thread waiting on
1382 * a RDMA_DONE for xid
1383 */
1384 mutex_enter(&qp->rdlist_lock);
1385 rdma_done_notify(qp, xid);
1386 mutex_exit(&qp->rdlist_lock);
1387 (void) rib_free_svc_recv(s_recvp);
1388 continue;
1389 }
1390
1391 mutex_enter(&plugin_state_lock);
1392 if (plugin_state == ACCEPT) {
1393 while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1394 == NULL)
1395 (void) strwaitbuf(
1396 sizeof (*rdp), BPRI_LO);
1397 /*
1398 * Plugin is in accept state, hence the master
1399 * transport queue for this is still accepting
1400 * requests, so we can call svc_queuereq to
1401 * queue this received msg.
1402 */
1403 rdp = (rdma_recv_data_t *)mp->b_rptr;
1404 rdp->conn = conn;
1405 rdp->rpcmsg.addr =
1406 (caddr_t)(uintptr_t)s_recvp->vaddr;
1407 rdp->rpcmsg.type = RECV_BUFFER;
1408 rdp->rpcmsg.len = wc.wc_bytes_xfer;
1409 rdp->status = wc.wc_status;
1410 mutex_enter(&conn->c_lock);
1411 conn->c_ref++;
1412 mutex_exit(&conn->c_lock);
1413 mp->b_wptr += sizeof (*rdp);
1414 svc_queuereq((queue_t *)rib_stat->q, mp);
1415 mutex_exit(&plugin_state_lock);
1416 } else {
1417 /*
1418 * The master transport for this is going
1419 * away and the queue is not accepting any more
1420 * requests for krpc, so don't do anything, just
1421 * free the msg.
1422 */ 1423 mutex_exit(&plugin_state_lock); 1424 rib_rbuf_free(conn, RECV_BUFFER, 1425 (void *)(uintptr_t)s_recvp->vaddr); 1426 } 1427 } else { 1428 rib_rbuf_free(conn, RECV_BUFFER, 1429 (void *)(uintptr_t)s_recvp->vaddr); 1430 } 1431 (void) rib_free_svc_recv(s_recvp); 1432 } 1433 } 1434 1435 static void 1436 rib_attach_hca() 1437 { 1438 mutex_enter(&rib_stat->open_hca_lock); 1439 rpcib_open_hcas(rib_stat); 1440 rib_listen(NULL); 1441 mutex_exit(&rib_stat->open_hca_lock); 1442 } 1443 1444 /* 1445 * Handles DR event of IBT_HCA_DETACH_EVENT. 1446 */ 1447 /* ARGSUSED */ 1448 static void 1449 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 1450 ibt_async_code_t code, ibt_async_event_t *event) 1451 { 1452 switch (code) { 1453 case IBT_HCA_ATTACH_EVENT: 1454 rib_attach_hca(); 1455 break; 1456 case IBT_HCA_DETACH_EVENT: 1457 { 1458 rib_hca_t *hca; 1459 1460 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 1461 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 1462 rw_enter(&hca->state_lock, RW_READER); 1463 if ((hca->state != HCA_DETACHED) && 1464 (hca->hca_hdl == hca_hdl)) { 1465 rw_exit(&hca->state_lock); 1466 break; 1467 } 1468 rw_exit(&hca->state_lock); 1469 } 1470 rw_exit(&rib_stat->hcas_list_lock); 1471 1472 if (hca == NULL) 1473 return; 1474 ASSERT(hca->hca_hdl == hca_hdl); 1475 rib_detach_hca(hca); 1476 #ifdef DEBUG 1477 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); 1478 #endif 1479 break; 1480 } 1481 case IBT_EVENT_PORT_UP: 1482 /* 1483 * A port is up. We should call rib_listen() since there is 1484 * a chance that rib_listen() may have failed during 1485 * rib_attach_hca() because the port had not been up yet. 1486 */ 1487 rib_listen(NULL); 1488 #ifdef DEBUG 1489 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); 1490 #endif 1491 break; 1492 #ifdef DEBUG 1493 case IBT_EVENT_PATH_MIGRATED: 1494 cmn_err(CE_NOTE, "rib_async_handler(): " 1495 "IBT_EVENT_PATH_MIGRATED\n"); 1496 break; 1497 case IBT_EVENT_SQD: 1498 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n"); 1499 break; 1500 case IBT_EVENT_COM_EST: 1501 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n"); 1502 break; 1503 case IBT_ERROR_CATASTROPHIC_CHAN: 1504 cmn_err(CE_NOTE, "rib_async_handler(): " 1505 "IBT_ERROR_CATASTROPHIC_CHAN\n"); 1506 break; 1507 case IBT_ERROR_INVALID_REQUEST_CHAN: 1508 cmn_err(CE_NOTE, "rib_async_handler(): " 1509 "IBT_ERROR_INVALID_REQUEST_CHAN\n"); 1510 break; 1511 case IBT_ERROR_ACCESS_VIOLATION_CHAN: 1512 cmn_err(CE_NOTE, "rib_async_handler(): " 1513 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n"); 1514 break; 1515 case IBT_ERROR_PATH_MIGRATE_REQ: 1516 cmn_err(CE_NOTE, "rib_async_handler(): " 1517 "IBT_ERROR_PATH_MIGRATE_REQ\n"); 1518 break; 1519 case IBT_ERROR_CQ: 1520 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n"); 1521 break; 1522 case IBT_ERROR_PORT_DOWN: 1523 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); 1524 break; 1525 case IBT_ASYNC_OPAQUE1: 1526 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); 1527 break; 1528 case IBT_ASYNC_OPAQUE2: 1529 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n"); 1530 break; 1531 case IBT_ASYNC_OPAQUE3: 1532 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n"); 1533 break; 1534 case IBT_ASYNC_OPAQUE4: 1535 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n"); 1536 break; 1537 #endif 1538 default: 1539 break; 1540 } 1541 } 1542 1543 /* 1544 * Client's reachable function. 
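 * Determines whether the server address in raddr is reachable over
 * RDMA by asking rib_ping_srv() to resolve an IB path to it; on
 * success the HCA that was found is returned through *handle.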
1545 */ 1546 static rdma_stat 1547 rib_reachable(int addr_type, struct netbuf *raddr, void **handle) 1548 { 1549 rdma_stat status; 1550 rpcib_ping_t rpt; 1551 1552 bzero(&rpt, sizeof (rpcib_ping_t)); 1553 status = rib_ping_srv(addr_type, raddr, &rpt); 1554 1555 if (status == RDMA_SUCCESS) { 1556 *handle = (void *)rpt.hca; 1557 return (RDMA_SUCCESS); 1558 } else { 1559 *handle = NULL; 1560 DTRACE_PROBE(rpcib__i__pingfailed); 1561 return (RDMA_FAILED); 1562 } 1563 } 1564 1565 /* Client side qp creation */ 1566 static rdma_stat 1567 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) 1568 { 1569 rib_qp_t *kqp = NULL; 1570 CONN *conn; 1571 rdma_clnt_cred_ctrl_t *cc_info; 1572 1573 ASSERT(qp != NULL); 1574 *qp = NULL; 1575 1576 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1577 conn = qptoc(kqp); 1578 kqp->hca = hca; 1579 kqp->rdmaconn.c_rdmamod = &rib_mod; 1580 kqp->rdmaconn.c_private = (caddr_t)kqp; 1581 1582 kqp->mode = RIB_CLIENT; 1583 kqp->chan_flags = IBT_BLOCKING; 1584 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); 1585 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); 1586 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; 1587 /* 1588 * Initialize 1589 */ 1590 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1591 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1592 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1593 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1594 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1595 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); 1596 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1597 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1598 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1599 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1600 /* 1601 * Initialize the client credit control 1602 * portion of the rdmaconn struct. 
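 * Both counters (granted and in-flight operations) start at zero,
 * and clnt_cc_cv is created for waiting on credit changes.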
1603 */ 1604 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT; 1605 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1606 cc_info->clnt_cc_granted_ops = 0; 1607 cc_info->clnt_cc_in_flight_ops = 0; 1608 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL); 1609 1610 *qp = kqp; 1611 return (RDMA_SUCCESS); 1612 } 1613 1614 /* Server side qp creation */ 1615 static rdma_stat 1616 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) 1617 { 1618 rib_qp_t *kqp = NULL; 1619 ibt_chan_sizes_t chan_sizes; 1620 ibt_rc_chan_alloc_args_t qp_attr; 1621 ibt_status_t ibt_status; 1622 rdma_srv_cred_ctrl_t *cc_info; 1623 1624 *qp = NULL; 1625 1626 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1627 kqp->hca = hca; 1628 kqp->port_num = port; 1629 kqp->rdmaconn.c_rdmamod = &rib_mod; 1630 kqp->rdmaconn.c_private = (caddr_t)kqp; 1631 1632 /* 1633 * Create the qp handle 1634 */ 1635 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1636 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; 1637 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; 1638 qp_attr.rc_pd = hca->pd_hdl; 1639 qp_attr.rc_hca_port_num = port; 1640 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1641 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1642 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1643 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1644 qp_attr.rc_clone_chan = NULL; 1645 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1646 qp_attr.rc_flags = IBT_WR_SIGNALED; 1647 1648 rw_enter(&hca->state_lock, RW_READER); 1649 if (hca->state != HCA_DETACHED) { 1650 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1651 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl, 1652 &chan_sizes); 1653 } else { 1654 rw_exit(&hca->state_lock); 1655 goto fail; 1656 } 1657 rw_exit(&hca->state_lock); 1658 1659 if (ibt_status != IBT_SUCCESS) { 1660 DTRACE_PROBE1(rpcib__i_svccreatechanfail, 1661 int, ibt_status); 1662 goto fail; 1663 } 1664 1665 kqp->mode = RIB_SERVER; 1666 kqp->chan_flags = IBT_BLOCKING; 1667 kqp->q = q; /* server ONLY */ 1668 1669 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1670 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1671 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1672 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1673 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1674 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1675 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1676 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1677 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1678 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1679 /* 1680 * Set the private data area to qp to be used in callbacks 1681 */ 1682 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); 1683 kqp->rdmaconn.c_state = C_CONNECTED; 1684 1685 /* 1686 * Initialize the server credit control 1687 * portion of the rdmaconn struct. 
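 * The server starts out having granted preposted_rbufs
 * (RDMA_BUFS_GRANT) receive buffers; srv_cc_posted mirrors that count
 * and srv_cc_cur_buffers_used starts at zero.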
1688 */
1689 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1690 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1691 cc_info->srv_cc_buffers_granted = preposted_rbufs;
1692 cc_info->srv_cc_cur_buffers_used = 0;
1693 cc_info->srv_cc_posted = preposted_rbufs;
1694
1695 *qp = kqp;
1696
1697 return (RDMA_SUCCESS);
1698 fail:
1699 if (kqp)
1700 kmem_free(kqp, sizeof (rib_qp_t));
1701
1702 return (RDMA_FAILED);
1703 }
1704
1705 /* ARGSUSED */
1706 ibt_cm_status_t
1707 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1708 ibt_cm_return_args_t *ret_args, void *priv_data,
1709 ibt_priv_data_len_t len)
1710 {
1711 rib_hca_t *hca;
1712
1713 hca = (rib_hca_t *)clnt_hdl;
1714
1715 switch (event->cm_type) {
1716
1717 /* got a connection close event */
1718 case IBT_CM_EVENT_CONN_CLOSED:
1719 {
1720 CONN *conn;
1721 rib_qp_t *qp;
1722
1723 /* check reason why connection was closed */
1724 switch (event->cm_event.closed) {
1725 case IBT_CM_CLOSED_DREP_RCVD:
1726 case IBT_CM_CLOSED_DREQ_TIMEOUT:
1727 case IBT_CM_CLOSED_DUP:
1728 case IBT_CM_CLOSED_ABORT:
1729 case IBT_CM_CLOSED_ALREADY:
1730 /*
1731 * These cases indicate the local end initiated
1732 * the closing of the channel. Nothing to do here.
1733 */
1734 break;
1735 default:
1736 /*
1737 * Reason for CONN_CLOSED event must be one of
1738 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1739 * or IBT_CM_CLOSED_STALE. These indicate cases where
1740 * the remote end is closing the channel. In these
1741 * cases free the channel and transition to error
1742 * state.
1743 */
1744 qp = ibt_get_chan_private(event->cm_channel);
1745 conn = qptoc(qp);
1746 mutex_enter(&conn->c_lock);
1747 if (conn->c_state == C_DISCONN_PEND) {
1748 mutex_exit(&conn->c_lock);
1749 break;
1750 }
1751
1752 conn->c_state = C_ERROR_CONN;
1753
1754 /*
1755 * Free the conn if c_ref is down to 0 already
1756 */
1757 if (conn->c_ref == 0) {
1758 /*
1759 * Remove from list and free conn
1760 */
1761 conn->c_state = C_DISCONN_PEND;
1762 mutex_exit(&conn->c_lock);
1763 rw_enter(&hca->state_lock, RW_READER);
1764 if (hca->state != HCA_DETACHED)
1765 (void) rib_disconnect_channel(conn,
1766 &hca->cl_conn_list);
1767 rw_exit(&hca->state_lock);
1768 } else {
1769 /*
1770 * conn will be freed when c_ref goes to 0.
1771 * Indicate to cleaning thread not to close
1772 * the connection, but just free the channel.
1773 */
1774 conn->c_flags |= C_CLOSE_NOTNEEDED;
1775 mutex_exit(&conn->c_lock);
1776 }
1777 #ifdef DEBUG
1778 if (rib_debug)
1779 cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1780 "(CONN_CLOSED) channel disconnected");
1781 #endif
1782 break;
1783 }
1784 break;
1785 }
1786 default:
1787 break;
1788 }
1789 return (IBT_CM_ACCEPT);
1790 }
1791
1792 /*
1793 * Connect to the server.
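 * Builds the IP-addressing CM private data (source/destination
 * address and the NFS/RDMA port) with ibt_format_ip_private_data(),
 * allocates an RC channel on the client CQs, and opens it with
 * ibt_open_rc_channel(); an IBT_CM_CONN_STALE reject is retried up
 * to REFRESH_ATTEMPTS times.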
1794 */ 1795 rdma_stat 1796 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp) 1797 { 1798 ibt_chan_open_args_t chan_args; /* channel args */ 1799 ibt_chan_sizes_t chan_sizes; 1800 ibt_rc_chan_alloc_args_t qp_attr; 1801 ibt_status_t ibt_status; 1802 ibt_rc_returns_t ret_args; /* conn reject info */ 1803 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */ 1804 ibt_ip_cm_info_t ipcm_info; 1805 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ]; 1806 1807 1808 (void) bzero(&chan_args, sizeof (chan_args)); 1809 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1810 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t)); 1811 1812 ipcm_info.src_addr.family = rptp->srcip.family; 1813 switch (ipcm_info.src_addr.family) { 1814 case AF_INET: 1815 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr; 1816 break; 1817 case AF_INET6: 1818 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr; 1819 break; 1820 } 1821 1822 ipcm_info.dst_addr.family = rptp->srcip.family; 1823 switch (ipcm_info.dst_addr.family) { 1824 case AF_INET: 1825 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr; 1826 break; 1827 case AF_INET6: 1828 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr; 1829 break; 1830 } 1831 1832 ipcm_info.src_port = (in_port_t)nfs_rdma_port; 1833 1834 ibt_status = ibt_format_ip_private_data(&ipcm_info, 1835 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt); 1836 1837 if (ibt_status != IBT_SUCCESS) { 1838 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n"); 1839 return (-1); 1840 } 1841 1842 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num; 1843 /* Alloc a RC channel */ 1844 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; 1845 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; 1846 qp_attr.rc_pd = hca->pd_hdl; 1847 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1848 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1849 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1850 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1851 qp_attr.rc_clone_chan = NULL; 1852 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1853 qp_attr.rc_flags = IBT_WR_SIGNALED; 1854 1855 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port); 1856 chan_args.oc_path = &rptp->path; 1857 1858 chan_args.oc_cm_handler = rib_clnt_cm_handler; 1859 chan_args.oc_cm_clnt_private = (void *)hca; 1860 chan_args.oc_rdma_ra_out = 4; 1861 chan_args.oc_rdma_ra_in = 4; 1862 chan_args.oc_path_retry_cnt = 2; 1863 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; 1864 chan_args.oc_priv_data = cmp_ip_pvt; 1865 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ; 1866 1867 refresh: 1868 rw_enter(&hca->state_lock, RW_READER); 1869 if (hca->state != HCA_DETACHED) { 1870 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1871 IBT_ACHAN_NO_FLAGS, 1872 &qp_attr, &qp->qp_hdl, 1873 &chan_sizes); 1874 } else { 1875 rw_exit(&hca->state_lock); 1876 return (RDMA_FAILED); 1877 } 1878 rw_exit(&hca->state_lock); 1879 1880 if (ibt_status != IBT_SUCCESS) { 1881 DTRACE_PROBE1(rpcib__i_conntosrv, 1882 int, ibt_status); 1883 return (RDMA_FAILED); 1884 } 1885 1886 /* Connect to the Server */ 1887 (void) bzero(&ret_args, sizeof (ret_args)); 1888 mutex_enter(&qp->cb_lock); 1889 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS, 1890 IBT_BLOCKING, &chan_args, &ret_args); 1891 if (ibt_status != IBT_SUCCESS) { 1892 DTRACE_PROBE2(rpcib__i_openrctosrv, 1893 int, ibt_status, int, ret_args.rc_status); 1894 1895 (void) ibt_free_channel(qp->qp_hdl); 1896 qp->qp_hdl = NULL; 1897 mutex_exit(&qp->cb_lock); 1898 if (refresh-- && ibt_status == IBT_CM_FAILURE 
&& 1899 ret_args.rc_status == IBT_CM_CONN_STALE) { 1900 /* 1901 * Got IBT_CM_CONN_STALE probably because of stale 1902 * data on the passive end of a channel that existed 1903 * prior to reboot. Retry establishing a channel 1904 * REFRESH_ATTEMPTS times, during which time the 1905 * stale conditions on the server might clear up. 1906 */ 1907 goto refresh; 1908 } 1909 return (RDMA_FAILED); 1910 } 1911 mutex_exit(&qp->cb_lock); 1912 /* 1913 * Set the private data area to qp to be used in callbacks 1914 */ 1915 ibt_set_chan_private(qp->qp_hdl, (void *)qp); 1916 return (RDMA_SUCCESS); 1917 } 1918 1919 rdma_stat 1920 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) 1921 { 1922 uint_t i, addr_count; 1923 ibt_status_t ibt_status; 1924 uint8_t num_paths_p; 1925 ibt_ip_path_attr_t ipattr; 1926 ibt_path_ip_src_t srcip; 1927 rpcib_ipaddrs_t addrs4; 1928 rpcib_ipaddrs_t addrs6; 1929 struct sockaddr_in *sinp; 1930 struct sockaddr_in6 *sin6p; 1931 rdma_stat retval = RDMA_FAILED; 1932 rib_hca_t *hca; 1933 1934 if ((addr_type != AF_INET) && (addr_type != AF_INET6)) 1935 return (RDMA_INVAL); 1936 ASSERT(raddr->buf != NULL); 1937 1938 bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); 1939 1940 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || 1941 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { 1942 retval = RDMA_FAILED; 1943 goto done2; 1944 } 1945 1946 if (addr_type == AF_INET) { 1947 addr_count = addrs4.ri_count; 1948 sinp = (struct sockaddr_in *)raddr->buf; 1949 rptp->dstip.family = AF_INET; 1950 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr; 1951 sinp = addrs4.ri_list; 1952 } else { 1953 addr_count = addrs6.ri_count; 1954 sin6p = (struct sockaddr_in6 *)raddr->buf; 1955 rptp->dstip.family = AF_INET6; 1956 rptp->dstip.un.ip6addr = sin6p->sin6_addr; 1957 sin6p = addrs6.ri_list; 1958 } 1959 1960 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 1961 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 1962 rw_enter(&hca->state_lock, RW_READER); 1963 if (hca->state == HCA_DETACHED) { 1964 rw_exit(&hca->state_lock); 1965 continue; 1966 } 1967 1968 ipattr.ipa_dst_ip = &rptp->dstip; 1969 ipattr.ipa_hca_guid = hca->hca_guid; 1970 ipattr.ipa_ndst = 1; 1971 ipattr.ipa_max_paths = 1; 1972 ipattr.ipa_src_ip.family = rptp->dstip.family; 1973 for (i = 0; i < addr_count; i++) { 1974 num_paths_p = 0; 1975 if (addr_type == AF_INET) { 1976 ipattr.ipa_src_ip.un.ip4addr = 1977 sinp[i].sin_addr.s_addr; 1978 } else { 1979 ipattr.ipa_src_ip.un.ip6addr = 1980 sin6p[i].sin6_addr; 1981 } 1982 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1983 1984 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1985 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, 1986 &num_paths_p, &srcip); 1987 if (ibt_status == IBT_SUCCESS && 1988 num_paths_p != 0 && 1989 rptp->path.pi_hca_guid == hca->hca_guid) { 1990 rptp->hca = hca; 1991 rw_exit(&hca->state_lock); 1992 if (addr_type == AF_INET) { 1993 rptp->srcip.family = AF_INET; 1994 rptp->srcip.un.ip4addr = 1995 srcip.ip_primary.un.ip4addr; 1996 } else { 1997 rptp->srcip.family = AF_INET6; 1998 rptp->srcip.un.ip6addr = 1999 srcip.ip_primary.un.ip6addr; 2000 2001 } 2002 retval = RDMA_SUCCESS; 2003 goto done1; 2004 } 2005 } 2006 rw_exit(&hca->state_lock); 2007 } 2008 done1: 2009 rw_exit(&rib_stat->hcas_list_lock); 2010 done2: 2011 if (addrs4.ri_size > 0) 2012 kmem_free(addrs4.ri_list, addrs4.ri_size); 2013 if (addrs6.ri_size > 0) 2014 kmem_free(addrs6.ri_list, addrs6.ri_size); 2015 return (retval); 2016 } 2017 2018 /* 2019 * Close channel, remove from connection list and 2020 * free up 
resources allocated for that channel. 2021 */ 2022 rdma_stat 2023 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list) 2024 { 2025 rib_qp_t *qp = ctoqp(conn); 2026 rib_hca_t *hca; 2027 2028 mutex_enter(&conn->c_lock); 2029 if (conn->c_timeout != NULL) { 2030 mutex_exit(&conn->c_lock); 2031 (void) untimeout(conn->c_timeout); 2032 mutex_enter(&conn->c_lock); 2033 } 2034 2035 while (conn->c_flags & C_CLOSE_PENDING) { 2036 cv_wait(&conn->c_cv, &conn->c_lock); 2037 } 2038 mutex_exit(&conn->c_lock); 2039 2040 /* 2041 * c_ref == 0 and connection is in C_DISCONN_PEND 2042 */ 2043 hca = qp->hca; 2044 if (conn_list != NULL) 2045 (void) rib_rm_conn(conn, conn_list); 2046 2047 /* 2048 * There is only one case where we get here with 2049 * qp_hdl = NULL, which is during connection setup on 2050 * the client. In such a case there are no posted 2051 * send/recv buffers. 2052 */ 2053 if (qp->qp_hdl != NULL) { 2054 mutex_enter(&qp->posted_rbufs_lock); 2055 while (qp->n_posted_rbufs) 2056 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock); 2057 mutex_exit(&qp->posted_rbufs_lock); 2058 2059 mutex_enter(&qp->send_rbufs_lock); 2060 while (qp->n_send_rbufs) 2061 cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock); 2062 mutex_exit(&qp->send_rbufs_lock); 2063 2064 (void) ibt_free_channel(qp->qp_hdl); 2065 qp->qp_hdl = NULL; 2066 } 2067 2068 ASSERT(qp->rdlist == NULL); 2069 2070 if (qp->replylist != NULL) { 2071 (void) rib_rem_replylist(qp); 2072 } 2073 2074 cv_destroy(&qp->cb_conn_cv); 2075 cv_destroy(&qp->posted_rbufs_cv); 2076 cv_destroy(&qp->send_rbufs_cv); 2077 mutex_destroy(&qp->cb_lock); 2078 mutex_destroy(&qp->replylist_lock); 2079 mutex_destroy(&qp->posted_rbufs_lock); 2080 mutex_destroy(&qp->send_rbufs_lock); 2081 mutex_destroy(&qp->rdlist_lock); 2082 2083 cv_destroy(&conn->c_cv); 2084 mutex_destroy(&conn->c_lock); 2085 2086 if (conn->c_raddr.buf != NULL) { 2087 kmem_free(conn->c_raddr.buf, conn->c_raddr.len); 2088 } 2089 if (conn->c_laddr.buf != NULL) { 2090 kmem_free(conn->c_laddr.buf, conn->c_laddr.len); 2091 } 2092 2093 /* 2094 * Credit control cleanup. 2095 */ 2096 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) { 2097 rdma_clnt_cred_ctrl_t *cc_info; 2098 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 2099 cv_destroy(&cc_info->clnt_cc_cv); 2100 } 2101 2102 kmem_free(qp, sizeof (rib_qp_t)); 2103 2104 /* 2105 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL, 2106 * then the hca is no longer being used. 2107 */ 2108 if (conn_list != NULL) { 2109 rw_enter(&hca->state_lock, RW_READER); 2110 if (hca->state == HCA_DETACHED) { 2111 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 2112 if (hca->srv_conn_list.conn_hd == NULL) { 2113 rw_enter(&hca->cl_conn_list.conn_lock, 2114 RW_READER); 2115 2116 if (hca->cl_conn_list.conn_hd == NULL) { 2117 mutex_enter(&hca->inuse_lock); 2118 hca->inuse = FALSE; 2119 cv_signal(&hca->cb_cv); 2120 mutex_exit(&hca->inuse_lock); 2121 } 2122 rw_exit(&hca->cl_conn_list.conn_lock); 2123 } 2124 rw_exit(&hca->srv_conn_list.conn_lock); 2125 } 2126 rw_exit(&hca->state_lock); 2127 } 2128 2129 return (RDMA_SUCCESS); 2130 } 2131 2132 /* 2133 * All sends are done under the protection of 2134 * the wdesc->sendwait_lock. n_send_rbufs count 2135 * is protected using the send_rbufs_lock. 
2136 * lock ordering is: 2137 * sendwait_lock -> send_rbufs_lock 2138 */ 2139 2140 void 2141 rib_send_hold(rib_qp_t *qp) 2142 { 2143 mutex_enter(&qp->send_rbufs_lock); 2144 qp->n_send_rbufs++; 2145 mutex_exit(&qp->send_rbufs_lock); 2146 } 2147 2148 void 2149 rib_send_rele(rib_qp_t *qp) 2150 { 2151 mutex_enter(&qp->send_rbufs_lock); 2152 qp->n_send_rbufs--; 2153 if (qp->n_send_rbufs == 0) 2154 cv_signal(&qp->send_rbufs_cv); 2155 mutex_exit(&qp->send_rbufs_lock); 2156 } 2157 2158 /* 2159 * Wait for send completion notification. Only on receiving a 2160 * notification be it a successful or error completion, free the 2161 * send_wid. 2162 */ 2163 static rdma_stat 2164 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 2165 { 2166 clock_t timout, cv_wait_ret; 2167 rdma_stat error = RDMA_SUCCESS; 2168 int i; 2169 2170 /* 2171 * Wait for send to complete 2172 */ 2173 ASSERT(wd != NULL); 2174 mutex_enter(&wd->sendwait_lock); 2175 if (wd->status == (uint_t)SEND_WAIT) { 2176 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 2177 ddi_get_lbolt(); 2178 2179 if (qp->mode == RIB_SERVER) { 2180 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, 2181 &wd->sendwait_lock, timout)) > 0 && 2182 wd->status == (uint_t)SEND_WAIT) 2183 ; 2184 switch (cv_wait_ret) { 2185 case -1: /* timeout */ 2186 DTRACE_PROBE(rpcib__i__srvsendwait__timeout); 2187 2188 wd->cv_sig = 0; /* no signal needed */ 2189 error = RDMA_TIMEDOUT; 2190 break; 2191 default: /* got send completion */ 2192 break; 2193 } 2194 } else { 2195 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, 2196 &wd->sendwait_lock, timout)) > 0 && 2197 wd->status == (uint_t)SEND_WAIT) 2198 ; 2199 switch (cv_wait_ret) { 2200 case -1: /* timeout */ 2201 DTRACE_PROBE(rpcib__i__clntsendwait__timeout); 2202 2203 wd->cv_sig = 0; /* no signal needed */ 2204 error = RDMA_TIMEDOUT; 2205 break; 2206 case 0: /* interrupted */ 2207 DTRACE_PROBE(rpcib__i__clntsendwait__intr); 2208 2209 wd->cv_sig = 0; /* no signal needed */ 2210 error = RDMA_INTR; 2211 break; 2212 default: /* got send completion */ 2213 break; 2214 } 2215 } 2216 } 2217 2218 if (wd->status != (uint_t)SEND_WAIT) { 2219 /* got send completion */ 2220 if (wd->status != RDMA_SUCCESS) { 2221 switch (wd->status) { 2222 case RDMA_CONNLOST: 2223 error = RDMA_CONNLOST; 2224 break; 2225 default: 2226 error = RDMA_FAILED; 2227 break; 2228 } 2229 } 2230 for (i = 0; i < wd->nsbufs; i++) { 2231 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2232 (void *)(uintptr_t)wd->sbufaddr[i]); 2233 } 2234 2235 rib_send_rele(qp); 2236 2237 mutex_exit(&wd->sendwait_lock); 2238 (void) rib_free_sendwait(wd); 2239 2240 } else { 2241 mutex_exit(&wd->sendwait_lock); 2242 } 2243 return (error); 2244 } 2245 2246 static struct send_wid * 2247 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) 2248 { 2249 struct send_wid *wd; 2250 2251 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); 2252 wd->xid = xid; 2253 wd->cv_sig = cv_sig; 2254 wd->qp = qp; 2255 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); 2256 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); 2257 wd->status = (uint_t)SEND_WAIT; 2258 2259 return (wd); 2260 } 2261 2262 static int 2263 rib_free_sendwait(struct send_wid *wdesc) 2264 { 2265 cv_destroy(&wdesc->wait_cv); 2266 mutex_destroy(&wdesc->sendwait_lock); 2267 kmem_free(wdesc, sizeof (*wdesc)); 2268 2269 return (0); 2270 } 2271 2272 static rdma_stat 2273 rib_rem_rep(rib_qp_t *qp, struct reply *rep) 2274 { 2275 mutex_enter(&qp->replylist_lock); 2276 if (rep != NULL) { 2277 (void) rib_remreply(qp, rep); 2278 
mutex_exit(&qp->replylist_lock); 2279 return (RDMA_SUCCESS); 2280 } 2281 mutex_exit(&qp->replylist_lock); 2282 return (RDMA_FAILED); 2283 } 2284 2285 /* 2286 * Send buffers are freed here only in case of error in posting 2287 * on QP. If the post succeeded, the send buffers are freed upon 2288 * send completion in rib_sendwait() or in the scq_handler. 2289 */ 2290 rdma_stat 2291 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2292 int send_sig, int cv_sig, caddr_t *swid) 2293 { 2294 struct send_wid *wdesc; 2295 struct clist *clp; 2296 ibt_status_t ibt_status = IBT_SUCCESS; 2297 rdma_stat ret = RDMA_SUCCESS; 2298 ibt_send_wr_t tx_wr; 2299 int i, nds; 2300 ibt_wr_ds_t sgl[DSEG_MAX]; 2301 uint_t total_msg_size; 2302 rib_qp_t *qp; 2303 2304 qp = ctoqp(conn); 2305 2306 ASSERT(cl != NULL); 2307 2308 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2309 2310 nds = 0; 2311 total_msg_size = 0; 2312 clp = cl; 2313 while (clp != NULL) { 2314 if (nds >= DSEG_MAX) { 2315 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); 2316 return (RDMA_FAILED); 2317 } 2318 sgl[nds].ds_va = clp->w.c_saddr; 2319 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2320 sgl[nds].ds_len = clp->c_len; 2321 total_msg_size += clp->c_len; 2322 clp = clp->c_next; 2323 nds++; 2324 } 2325 2326 if (send_sig) { 2327 /* Set SEND_SIGNAL flag. */ 2328 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2329 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2330 *swid = (caddr_t)wdesc; 2331 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2332 mutex_enter(&wdesc->sendwait_lock); 2333 wdesc->nsbufs = nds; 2334 for (i = 0; i < nds; i++) { 2335 wdesc->sbufaddr[i] = sgl[i].ds_va; 2336 } 2337 } else { 2338 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2339 *swid = NULL; 2340 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2341 } 2342 2343 tx_wr.wr_opcode = IBT_WRC_SEND; 2344 tx_wr.wr_trans = IBT_RC_SRV; 2345 tx_wr.wr_nds = nds; 2346 tx_wr.wr_sgl = sgl; 2347 2348 mutex_enter(&conn->c_lock); 2349 if (conn->c_state == C_CONNECTED) { 2350 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2351 } 2352 if (conn->c_state != C_CONNECTED || 2353 ibt_status != IBT_SUCCESS) { 2354 if (conn->c_state != C_DISCONN_PEND) 2355 conn->c_state = C_ERROR_CONN; 2356 mutex_exit(&conn->c_lock); 2357 if (send_sig) { 2358 for (i = 0; i < nds; i++) { 2359 rib_rbuf_free(conn, SEND_BUFFER, 2360 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2361 } 2362 mutex_exit(&wdesc->sendwait_lock); 2363 (void) rib_free_sendwait(wdesc); 2364 } 2365 return (RDMA_CONNLOST); 2366 } 2367 2368 mutex_exit(&conn->c_lock); 2369 2370 if (send_sig) { 2371 rib_send_hold(qp); 2372 mutex_exit(&wdesc->sendwait_lock); 2373 if (cv_sig) { 2374 /* 2375 * cv_wait for send to complete. 2376 * We can fail due to a timeout or signal or 2377 * unsuccessful send. 2378 */ 2379 ret = rib_sendwait(qp, wdesc); 2380 2381 return (ret); 2382 } 2383 } 2384 2385 return (RDMA_SUCCESS); 2386 } 2387 2388 2389 rdma_stat 2390 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2391 { 2392 rdma_stat ret; 2393 caddr_t wd; 2394 2395 /* send-wait & cv_signal */ 2396 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); 2397 return (ret); 2398 } 2399 2400 /* 2401 * Deprecated/obsolete interface not used currently 2402 * but earlier used for READ-READ protocol. 2403 * Send RPC reply and wait for RDMA_DONE. 
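* The reply is posted as a signaled send without waiting on the completion, and this thread then blocks on the rdma_done list entry until the peer's RDMA_DONE indication arrives or REPLY_WAIT_TIME expires.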
2404 */ 2405 rdma_stat 2406 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2407 { 2408 rdma_stat ret = RDMA_SUCCESS; 2409 struct rdma_done_list *rd; 2410 clock_t timout, cv_wait_ret; 2411 caddr_t *wid = NULL; 2412 rib_qp_t *qp = ctoqp(conn); 2413 2414 mutex_enter(&qp->rdlist_lock); 2415 rd = rdma_done_add(qp, msgid); 2416 2417 /* No cv_signal (whether send-wait or no-send-wait) */ 2418 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); 2419 2420 if (ret != RDMA_SUCCESS) { 2421 rdma_done_rm(qp, rd); 2422 } else { 2423 /* 2424 * Wait for RDMA_DONE from remote end 2425 */ 2426 timout = 2427 drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt(); 2428 cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, 2429 &qp->rdlist_lock, 2430 timout); 2431 2432 rdma_done_rm(qp, rd); 2433 2434 if (cv_wait_ret < 0) { 2435 ret = RDMA_TIMEDOUT; 2436 } 2437 } 2438 2439 mutex_exit(&qp->rdlist_lock); 2440 return (ret); 2441 } 2442 2443 static struct recv_wid * 2444 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2445 { 2446 struct recv_wid *rwid; 2447 2448 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2449 rwid->xid = msgid; 2450 rwid->addr = sgl->ds_va; 2451 rwid->qp = qp; 2452 2453 return (rwid); 2454 } 2455 2456 static void 2457 rib_free_wid(struct recv_wid *rwid) 2458 { 2459 kmem_free(rwid, sizeof (struct recv_wid)); 2460 } 2461 2462 rdma_stat 2463 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2464 { 2465 rib_qp_t *qp = ctoqp(conn); 2466 struct clist *clp = cl; 2467 struct reply *rep; 2468 struct recv_wid *rwid; 2469 int nds; 2470 ibt_wr_ds_t sgl[DSEG_MAX]; 2471 ibt_recv_wr_t recv_wr; 2472 rdma_stat ret; 2473 ibt_status_t ibt_status; 2474 2475 /* 2476 * rdma_clnt_postrecv uses RECV_BUFFER. 2477 */ 2478 2479 nds = 0; 2480 while (cl != NULL) { 2481 if (nds >= DSEG_MAX) { 2482 ret = RDMA_FAILED; 2483 goto done; 2484 } 2485 sgl[nds].ds_va = cl->w.c_saddr; 2486 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2487 sgl[nds].ds_len = cl->c_len; 2488 cl = cl->c_next; 2489 nds++; 2490 } 2491 2492 if (nds != 1) { 2493 ret = RDMA_FAILED; 2494 goto done; 2495 } 2496 2497 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2498 recv_wr.wr_nds = nds; 2499 recv_wr.wr_sgl = sgl; 2500 2501 rwid = rib_create_wid(qp, &sgl[0], msgid); 2502 if (rwid) { 2503 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; 2504 } else { 2505 ret = RDMA_NORESOURCE; 2506 goto done; 2507 } 2508 rep = rib_addreplylist(qp, msgid); 2509 if (!rep) { 2510 rib_free_wid(rwid); 2511 ret = RDMA_NORESOURCE; 2512 goto done; 2513 } 2514 2515 mutex_enter(&conn->c_lock); 2516 2517 if (conn->c_state == C_CONNECTED) { 2518 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2519 } 2520 2521 if (conn->c_state != C_CONNECTED || 2522 ibt_status != IBT_SUCCESS) { 2523 if (conn->c_state != C_DISCONN_PEND) 2524 conn->c_state = C_ERROR_CONN; 2525 mutex_exit(&conn->c_lock); 2526 rib_free_wid(rwid); 2527 (void) rib_rem_rep(qp, rep); 2528 ret = RDMA_CONNLOST; 2529 goto done; 2530 } 2531 mutex_exit(&conn->c_lock); 2532 return (RDMA_SUCCESS); 2533 2534 done: 2535 while (clp != NULL) { 2536 rib_rbuf_free(conn, RECV_BUFFER, 2537 (void *)(uintptr_t)clp->w.c_saddr3); 2538 clp = clp->c_next; 2539 } 2540 return (ret); 2541 } 2542 2543 rdma_stat 2544 rib_svc_post(CONN* conn, struct clist *cl) 2545 { 2546 rib_qp_t *qp = ctoqp(conn); 2547 struct svc_recv *s_recvp; 2548 int nds; 2549 ibt_wr_ds_t sgl[DSEG_MAX]; 2550 ibt_recv_wr_t recv_wr; 2551 ibt_status_t ibt_status; 2552 2553 nds = 0; 2554 while (cl != NULL) { 2555 if (nds >= DSEG_MAX) { 
2556 return (RDMA_FAILED); 2557 } 2558 sgl[nds].ds_va = cl->w.c_saddr; 2559 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2560 sgl[nds].ds_len = cl->c_len; 2561 cl = cl->c_next; 2562 nds++; 2563 } 2564 2565 if (nds != 1) { 2566 rib_rbuf_free(conn, RECV_BUFFER, 2567 (caddr_t)(uintptr_t)sgl[0].ds_va); 2568 2569 return (RDMA_FAILED); 2570 } 2571 2572 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2573 recv_wr.wr_nds = nds; 2574 recv_wr.wr_sgl = sgl; 2575 2576 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2577 /* Use s_recvp's addr as wr id */ 2578 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; 2579 mutex_enter(&conn->c_lock); 2580 if (conn->c_state == C_CONNECTED) { 2581 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2582 } 2583 if (conn->c_state != C_CONNECTED || 2584 ibt_status != IBT_SUCCESS) { 2585 if (conn->c_state != C_DISCONN_PEND) 2586 conn->c_state = C_ERROR_CONN; 2587 mutex_exit(&conn->c_lock); 2588 rib_rbuf_free(conn, RECV_BUFFER, 2589 (caddr_t)(uintptr_t)sgl[0].ds_va); 2590 (void) rib_free_svc_recv(s_recvp); 2591 2592 return (RDMA_CONNLOST); 2593 } 2594 mutex_exit(&conn->c_lock); 2595 2596 return (RDMA_SUCCESS); 2597 } 2598 2599 /* Client */ 2600 rdma_stat 2601 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2602 { 2603 2604 return (rib_clnt_post(conn, cl, msgid)); 2605 } 2606 2607 /* Client */ 2608 rdma_stat 2609 rib_post_resp_remove(CONN* conn, uint32_t msgid) 2610 { 2611 rib_qp_t *qp = ctoqp(conn); 2612 struct reply *rep; 2613 2614 mutex_enter(&qp->replylist_lock); 2615 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2616 if (rep->xid == msgid) { 2617 if (rep->vaddr_cq) { 2618 rib_rbuf_free(conn, RECV_BUFFER, 2619 (caddr_t)(uintptr_t)rep->vaddr_cq); 2620 } 2621 (void) rib_remreply(qp, rep); 2622 break; 2623 } 2624 } 2625 mutex_exit(&qp->replylist_lock); 2626 2627 return (RDMA_SUCCESS); 2628 } 2629 2630 /* Server */ 2631 rdma_stat 2632 rib_post_recv(CONN *conn, struct clist *cl) 2633 { 2634 rib_qp_t *qp = ctoqp(conn); 2635 2636 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2637 mutex_enter(&qp->posted_rbufs_lock); 2638 qp->n_posted_rbufs++; 2639 mutex_exit(&qp->posted_rbufs_lock); 2640 return (RDMA_SUCCESS); 2641 } 2642 return (RDMA_FAILED); 2643 } 2644 2645 /* 2646 * Client side only interface to "recv" the rpc reply buf 2647 * posted earlier by rib_post_resp(conn, cl, msgid). 2648 */ 2649 rdma_stat 2650 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2651 { 2652 struct reply *rep = NULL; 2653 clock_t timout, cv_wait_ret; 2654 rdma_stat ret = RDMA_SUCCESS; 2655 rib_qp_t *qp = ctoqp(conn); 2656 2657 /* 2658 * Find the reply structure for this msgid 2659 */ 2660 mutex_enter(&qp->replylist_lock); 2661 2662 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2663 if (rep->xid == msgid) 2664 break; 2665 } 2666 2667 if (rep != NULL) { 2668 /* 2669 * If message not yet received, wait. 
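* The wait is bounded by REPLY_WAIT_TIME and may be interrupted by a signal, in which case RDMA_INTR is returned.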
2670 */ 2671 if (rep->status == (uint_t)REPLY_WAIT) { 2672 timout = ddi_get_lbolt() + 2673 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2674 2675 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2676 &qp->replylist_lock, timout)) > 0 && 2677 rep->status == (uint_t)REPLY_WAIT) 2678 ; 2679 2680 switch (cv_wait_ret) { 2681 case -1: /* timeout */ 2682 ret = RDMA_TIMEDOUT; 2683 break; 2684 case 0: 2685 ret = RDMA_INTR; 2686 break; 2687 default: 2688 break; 2689 } 2690 } 2691 2692 if (rep->status == RDMA_SUCCESS) { 2693 struct clist *cl = NULL; 2694 2695 /* 2696 * Got message successfully 2697 */ 2698 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2699 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); 2700 *clp = cl; 2701 } else { 2702 if (rep->status != (uint_t)REPLY_WAIT) { 2703 /* 2704 * Got error in reply message. Free 2705 * recv buffer here. 2706 */ 2707 ret = rep->status; 2708 rib_rbuf_free(conn, RECV_BUFFER, 2709 (caddr_t)(uintptr_t)rep->vaddr_cq); 2710 } 2711 } 2712 (void) rib_remreply(qp, rep); 2713 } else { 2714 /* 2715 * No matching reply structure found for given msgid on the 2716 * reply wait list. 2717 */ 2718 ret = RDMA_INVAL; 2719 DTRACE_PROBE(rpcib__i__nomatchxid2); 2720 } 2721 2722 /* 2723 * Done. 2724 */ 2725 mutex_exit(&qp->replylist_lock); 2726 return (ret); 2727 } 2728 2729 /* 2730 * RDMA write a buffer to the remote address. 2731 */ 2732 rdma_stat 2733 rib_write(CONN *conn, struct clist *cl, int wait) 2734 { 2735 ibt_send_wr_t tx_wr; 2736 int cv_sig; 2737 ibt_wr_ds_t sgl[DSEG_MAX]; 2738 struct send_wid *wdesc; 2739 ibt_status_t ibt_status; 2740 rdma_stat ret = RDMA_SUCCESS; 2741 rib_qp_t *qp = ctoqp(conn); 2742 uint64_t n_writes = 0; 2743 2744 if (cl == NULL) { 2745 return (RDMA_FAILED); 2746 } 2747 2748 while ((cl != NULL)) { 2749 if (cl->c_len > 0) { 2750 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2751 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; 2752 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = 2753 cl->c_dmemhandle.mrc_rmr; /* rkey */ 2754 sgl[0].ds_va = cl->w.c_saddr; 2755 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2756 sgl[0].ds_len = cl->c_len; 2757 2758 if (wait) { 2759 cv_sig = 1; 2760 } else { 2761 if (n_writes > max_unsignaled_rws) { 2762 n_writes = 0; 2763 cv_sig = 1; 2764 } else { 2765 cv_sig = 0; 2766 } 2767 } 2768 2769 if (cv_sig) { 2770 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2771 wdesc = rib_init_sendwait(0, cv_sig, qp); 2772 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2773 mutex_enter(&wdesc->sendwait_lock); 2774 } else { 2775 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2776 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2777 } 2778 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2779 tx_wr.wr_trans = IBT_RC_SRV; 2780 tx_wr.wr_nds = 1; 2781 tx_wr.wr_sgl = sgl; 2782 2783 mutex_enter(&conn->c_lock); 2784 if (conn->c_state == C_CONNECTED) { 2785 ibt_status = 2786 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2787 } 2788 if (conn->c_state != C_CONNECTED || 2789 ibt_status != IBT_SUCCESS) { 2790 if (conn->c_state != C_DISCONN_PEND) 2791 conn->c_state = C_ERROR_CONN; 2792 mutex_exit(&conn->c_lock); 2793 if (cv_sig) { 2794 mutex_exit(&wdesc->sendwait_lock); 2795 (void) rib_free_sendwait(wdesc); 2796 } 2797 return (RDMA_CONNLOST); 2798 } 2799 2800 mutex_exit(&conn->c_lock); 2801 2802 /* 2803 * Wait for send to complete 2804 */ 2805 if (cv_sig) { 2806 2807 rib_send_hold(qp); 2808 mutex_exit(&wdesc->sendwait_lock); 2809 2810 ret = rib_sendwait(qp, wdesc); 2811 if (ret != 0) 2812 return (ret); 2813 } 2814 n_writes ++; 2815 } 2816 cl = cl->c_next; 2817 } 2818 return (RDMA_SUCCESS); 2819 } 2820 2821 /* 2822 
* RDMA Read a buffer from the remote address. 2823 */ 2824 rdma_stat 2825 rib_read(CONN *conn, struct clist *cl, int wait) 2826 { 2827 ibt_send_wr_t rx_wr; 2828 int cv_sig = 0; 2829 ibt_wr_ds_t sgl; 2830 struct send_wid *wdesc; 2831 ibt_status_t ibt_status = IBT_SUCCESS; 2832 rdma_stat ret = RDMA_SUCCESS; 2833 rib_qp_t *qp = ctoqp(conn); 2834 2835 if (cl == NULL) { 2836 return (RDMA_FAILED); 2837 } 2838 2839 while (cl != NULL) { 2840 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2841 /* 2842 * Remote address is at the head chunk item in list. 2843 */ 2844 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; 2845 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; 2846 2847 sgl.ds_va = cl->u.c_daddr; 2848 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2849 sgl.ds_len = cl->c_len; 2850 2851 /* 2852 * If there are multiple chunks to be read, and 2853 * wait is set, ask for signal only for the last chunk 2854 * and wait only on the last chunk. The completion of 2855 * RDMA_READ on last chunk ensures that reads on all 2856 * previous chunks are also completed. 2857 */ 2858 if (wait && (cl->c_next == NULL)) { 2859 cv_sig = 1; 2860 wdesc = rib_init_sendwait(0, cv_sig, qp); 2861 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2862 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2863 mutex_enter(&wdesc->sendwait_lock); 2864 } else { 2865 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2866 rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2867 } 2868 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2869 rx_wr.wr_trans = IBT_RC_SRV; 2870 rx_wr.wr_nds = 1; 2871 rx_wr.wr_sgl = &sgl; 2872 2873 mutex_enter(&conn->c_lock); 2874 if (conn->c_state == C_CONNECTED) { 2875 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2876 } 2877 if (conn->c_state != C_CONNECTED || 2878 ibt_status != IBT_SUCCESS) { 2879 if (conn->c_state != C_DISCONN_PEND) 2880 conn->c_state = C_ERROR_CONN; 2881 mutex_exit(&conn->c_lock); 2882 if (wait && (cl->c_next == NULL)) { 2883 mutex_exit(&wdesc->sendwait_lock); 2884 (void) rib_free_sendwait(wdesc); 2885 } 2886 return (RDMA_CONNLOST); 2887 } 2888 2889 mutex_exit(&conn->c_lock); 2890 2891 /* 2892 * Wait for send to complete if this is the 2893 * last item in the list. 2894 */ 2895 if (wait && cl->c_next == NULL) { 2896 rib_send_hold(qp); 2897 mutex_exit(&wdesc->sendwait_lock); 2898 2899 ret = rib_sendwait(qp, wdesc); 2900 2901 if (ret != 0) 2902 return (ret); 2903 } 2904 cl = cl->c_next; 2905 } 2906 return (RDMA_SUCCESS); 2907 } 2908 2909 /* 2910 * rib_srv_cm_handler() 2911 * Connection Manager callback to handle RC connection requests. 2912 */ 2913 /* ARGSUSED */ 2914 static ibt_cm_status_t 2915 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2916 ibt_cm_return_args_t *ret_args, void *priv_data, 2917 ibt_priv_data_len_t len) 2918 { 2919 queue_t *q; 2920 rib_qp_t *qp; 2921 rib_hca_t *hca; 2922 rdma_stat status = RDMA_SUCCESS; 2923 int i; 2924 struct clist cl; 2925 rdma_buf_t rdbuf = {0}; 2926 void *buf = NULL; 2927 CONN *conn; 2928 ibt_ip_cm_info_t ipinfo; 2929 struct sockaddr_in *s; 2930 struct sockaddr_in6 *s6; 2931 int sin_size = sizeof (struct sockaddr_in); 2932 int in_size = sizeof (struct in_addr); 2933 int sin6_size = sizeof (struct sockaddr_in6); 2934 2935 ASSERT(any != NULL); 2936 ASSERT(event != NULL); 2937 2938 hca = (rib_hca_t *)any; 2939 2940 /* got a connection request */ 2941 switch (event->cm_type) { 2942 case IBT_CM_EVENT_REQ_RCV: 2943 /* 2944 * If the plugin is in the NO_ACCEPT state, bail out. 
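* plugin_state is set to NO_ACCEPT by rib_listen_stop(), so incoming connection requests are rejected while the listeners are being torn down.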
2945 */ 2946 mutex_enter(&plugin_state_lock); 2947 if (plugin_state == NO_ACCEPT) { 2948 mutex_exit(&plugin_state_lock); 2949 return (IBT_CM_REJECT); 2950 } 2951 mutex_exit(&plugin_state_lock); 2952 2953 /* 2954 * Need to send a MRA MAD to CM so that it does not 2955 * timeout on us. 2956 */ 2957 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2958 event->cm_event.req.req_timeout * 8, NULL, 0); 2959 2960 mutex_enter(&rib_stat->open_hca_lock); 2961 q = rib_stat->q; 2962 mutex_exit(&rib_stat->open_hca_lock); 2963 2964 status = rib_svc_create_chan(hca, (caddr_t)q, 2965 event->cm_event.req.req_prim_hca_port, &qp); 2966 2967 if (status) { 2968 return (IBT_CM_REJECT); 2969 } 2970 2971 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2972 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 2973 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 2974 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2975 2976 /* 2977 * Pre-posts RECV buffers 2978 */ 2979 conn = qptoc(qp); 2980 for (i = 0; i < preposted_rbufs; i++) { 2981 bzero(&rdbuf, sizeof (rdbuf)); 2982 rdbuf.type = RECV_BUFFER; 2983 buf = rib_rbuf_alloc(conn, &rdbuf); 2984 if (buf == NULL) { 2985 /* 2986 * A connection is not established yet. 2987 * Just flush the channel. Buffers 2988 * posted till now will error out with 2989 * IBT_WC_WR_FLUSHED_ERR. 2990 */ 2991 (void) ibt_flush_channel(qp->qp_hdl); 2992 (void) rib_disconnect_channel(conn, NULL); 2993 return (IBT_CM_REJECT); 2994 } 2995 2996 bzero(&cl, sizeof (cl)); 2997 cl.w.c_saddr3 = (caddr_t)rdbuf.addr; 2998 cl.c_len = rdbuf.len; 2999 cl.c_smemhandle.mrc_lmr = 3000 rdbuf.handle.mrc_lmr; /* lkey */ 3001 cl.c_next = NULL; 3002 status = rib_post_recv(conn, &cl); 3003 if (status != RDMA_SUCCESS) { 3004 /* 3005 * A connection is not established yet. 3006 * Just flush the channel. Buffers 3007 * posted till now will error out with 3008 * IBT_WC_WR_FLUSHED_ERR. 
3009 */ 3010 (void) ibt_flush_channel(qp->qp_hdl); 3011 (void) rib_disconnect_channel(conn, NULL); 3012 return (IBT_CM_REJECT); 3013 } 3014 } 3015 (void) rib_add_connlist(conn, &hca->srv_conn_list); 3016 3017 /* 3018 * Get the address translation 3019 */ 3020 rw_enter(&hca->state_lock, RW_READER); 3021 if (hca->state == HCA_DETACHED) { 3022 rw_exit(&hca->state_lock); 3023 return (IBT_CM_REJECT); 3024 } 3025 rw_exit(&hca->state_lock); 3026 3027 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t)); 3028 3029 if (ibt_get_ip_data(event->cm_priv_data_len, 3030 event->cm_priv_data, 3031 &ipinfo) != IBT_SUCCESS) { 3032 3033 return (IBT_CM_REJECT); 3034 } 3035 3036 switch (ipinfo.src_addr.family) { 3037 case AF_INET: 3038 3039 conn->c_raddr.maxlen = 3040 conn->c_raddr.len = sin_size; 3041 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3042 3043 s = (struct sockaddr_in *)conn->c_raddr.buf; 3044 s->sin_family = AF_INET; 3045 3046 bcopy((void *)&ipinfo.src_addr.un.ip4addr, 3047 &s->sin_addr, in_size); 3048 3049 break; 3050 3051 case AF_INET6: 3052 3053 conn->c_raddr.maxlen = 3054 conn->c_raddr.len = sin6_size; 3055 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3056 3057 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 3058 s6->sin6_family = AF_INET6; 3059 bcopy((void *)&ipinfo.src_addr.un.ip6addr, 3060 &s6->sin6_addr, 3061 sizeof (struct in6_addr)); 3062 3063 break; 3064 3065 default: 3066 return (IBT_CM_REJECT); 3067 } 3068 3069 break; 3070 3071 case IBT_CM_EVENT_CONN_CLOSED: 3072 { 3073 CONN *conn; 3074 rib_qp_t *qp; 3075 3076 switch (event->cm_event.closed) { 3077 case IBT_CM_CLOSED_DREP_RCVD: 3078 case IBT_CM_CLOSED_DREQ_TIMEOUT: 3079 case IBT_CM_CLOSED_DUP: 3080 case IBT_CM_CLOSED_ABORT: 3081 case IBT_CM_CLOSED_ALREADY: 3082 /* 3083 * These cases indicate the local end initiated 3084 * the closing of the channel. Nothing to do here. 3085 */ 3086 break; 3087 default: 3088 /* 3089 * Reason for CONN_CLOSED event must be one of 3090 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 3091 * or IBT_CM_CLOSED_STALE. These indicate cases where 3092 * the remote end is closing the channel. In these 3093 * cases free the channel and transition to error 3094 * state. 3095 */ 3096 qp = ibt_get_chan_private(event->cm_channel); 3097 conn = qptoc(qp); 3098 mutex_enter(&conn->c_lock); 3099 if (conn->c_state == C_DISCONN_PEND) { 3100 mutex_exit(&conn->c_lock); 3101 break; 3102 } 3103 conn->c_state = C_ERROR_CONN; 3104 3105 /* 3106 * Free the conn if c_ref goes down to 0 3107 */ 3108 if (conn->c_ref == 0) { 3109 /* 3110 * Remove from list and free conn 3111 */ 3112 conn->c_state = C_DISCONN_PEND; 3113 mutex_exit(&conn->c_lock); 3114 (void) rib_disconnect_channel(conn, 3115 &hca->srv_conn_list); 3116 } else { 3117 /* 3118 * conn will be freed when c_ref goes to 0. 3119 * Indicate to the cleanup thread not to close 3120 * the connection, but just free the channel. 3121 */ 3122 conn->c_flags |= C_CLOSE_NOTNEEDED; 3123 mutex_exit(&conn->c_lock); 3124 } 3125 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect); 3126 break; 3127 } 3128 break; 3129 } 3130 case IBT_CM_EVENT_CONN_EST: 3131 /* 3132 * RTU received, hence connection established. 3133 */ 3134 if (rib_debug > 1) 3135 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3136 "(CONN_EST) channel established"); 3137 break; 3138 3139 default: 3140 if (rib_debug > 2) { 3141 /* Let CM handle the following events.
*/ 3142 if (event->cm_type == IBT_CM_EVENT_REP_RCV) { 3143 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3144 "server recv'ed IBT_CM_EVENT_REP_RCV\n"); 3145 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) { 3146 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3147 "server recv'ed IBT_CM_EVENT_LAP_RCV\n"); 3148 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) { 3149 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3150 "server recv'ed IBT_CM_EVENT_MRA_RCV\n"); 3151 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) { 3152 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3153 "server recv'ed IBT_CM_EVENT_APR_RCV\n"); 3154 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) { 3155 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3156 "server recv'ed IBT_CM_EVENT_FAILURE\n"); 3157 } 3158 } 3159 return (IBT_CM_DEFAULT); 3160 } 3161 3162 /* accept all other CM messages (i.e. let the CM handle them) */ 3163 return (IBT_CM_ACCEPT); 3164 } 3165 3166 static rdma_stat 3167 rib_register_service(rib_hca_t *hca, int service_type, 3168 uint8_t protocol_num, in_port_t dst_port) 3169 { 3170 ibt_srv_desc_t sdesc; 3171 ibt_hca_portinfo_t *port_infop; 3172 ib_svc_id_t srv_id; 3173 ibt_srv_hdl_t srv_hdl; 3174 uint_t port_size; 3175 uint_t pki, i, num_ports, nbinds; 3176 ibt_status_t ibt_status; 3177 rib_service_t *service; 3178 ib_pkey_t pkey; 3179 3180 /* 3181 * Query all ports for the given HCA 3182 */ 3183 rw_enter(&hca->state_lock, RW_READER); 3184 if (hca->state != HCA_DETACHED) { 3185 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop, 3186 &num_ports, &port_size); 3187 rw_exit(&hca->state_lock); 3188 } else { 3189 rw_exit(&hca->state_lock); 3190 return (RDMA_FAILED); 3191 } 3192 if (ibt_status != IBT_SUCCESS) { 3193 return (RDMA_FAILED); 3194 } 3195 3196 DTRACE_PROBE1(rpcib__i__regservice_numports, 3197 int, num_ports); 3198 3199 for (i = 0; i < num_ports; i++) { 3200 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) { 3201 DTRACE_PROBE1(rpcib__i__regservice__portinactive, 3202 int, i+1); 3203 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) { 3204 DTRACE_PROBE1(rpcib__i__regservice__portactive, 3205 int, i+1); 3206 } 3207 } 3208 3209 /* 3210 * Get all the IP addresses on this system to register the 3211 * given "service type" on all DNS recognized IP addrs. 3212 * Each service type such as NFS will have all the system's 3213 * IP addresses as its different names. For now the only 3214 * type of service we support in RPCIB is NFS. 3215 */ 3216 rw_enter(&rib_stat->service_list_lock, RW_WRITER); 3217 /* 3218 * Start registering and binding the service 3219 * on active ports of this HCA. 3220 */ 3221 nbinds = 0; 3222 for (service = rib_stat->service_list; 3223 service && (service->srv_type != service_type); 3224 service = service->next) 3225 ; 3226 3227 if (service == NULL) { 3228 /* 3229 * We use IP addresses as the service names for 3230 * service registration. Register each of them 3231 * with CM to obtain a svc_id and svc_hdl. We do not 3232 * register the service with the machine's loopback address.
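* The service id is derived from the protocol number and port via ibt_get_ip_sid(); if an entry for this service type already exists on rib_stat->service_list, its srv_hdl and srv_id are reused instead of registering again.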
3233 */ 3234 (void) bzero(&srv_id, sizeof (ib_svc_id_t)); 3235 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t)); 3236 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t)); 3237 sdesc.sd_handler = rib_srv_cm_handler; 3238 sdesc.sd_flags = 0; 3239 ibt_status = ibt_register_service(hca->ibt_clnt_hdl, 3240 &sdesc, ibt_get_ip_sid(protocol_num, dst_port), 3241 1, &srv_hdl, &srv_id); 3242 if ((ibt_status != IBT_SUCCESS) && 3243 (ibt_status != IBT_CM_SERVICE_EXISTS)) { 3244 rw_exit(&rib_stat->service_list_lock); 3245 DTRACE_PROBE1(rpcib__i__regservice__ibtres, 3246 int, ibt_status); 3247 ibt_free_portinfo(port_infop, port_size); 3248 return (RDMA_FAILED); 3249 } 3250 3251 /* 3252 * Allocate and prepare a service entry 3253 */ 3254 service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP); 3255 3256 service->srv_type = service_type; 3257 service->srv_hdl = srv_hdl; 3258 service->srv_id = srv_id; 3259 3260 service->next = rib_stat->service_list; 3261 rib_stat->service_list = service; 3262 DTRACE_PROBE1(rpcib__i__regservice__new__service, 3263 int, service->srv_type); 3264 } else { 3265 srv_hdl = service->srv_hdl; 3266 srv_id = service->srv_id; 3267 DTRACE_PROBE1(rpcib__i__regservice__existing__service, 3268 int, service->srv_type); 3269 } 3270 3271 for (i = 0; i < num_ports; i++) { 3272 ibt_sbind_hdl_t sbp; 3273 rib_hca_service_t *hca_srv; 3274 ib_gid_t gid; 3275 3276 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) 3277 continue; 3278 3279 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) { 3280 pkey = port_infop[i].p_pkey_tbl[pki]; 3281 3282 rw_enter(&hca->bound_services_lock, RW_READER); 3283 gid = port_infop[i].p_sgid_tbl[0]; 3284 for (hca_srv = hca->bound_services; hca_srv; 3285 hca_srv = hca_srv->next) { 3286 if ((hca_srv->srv_id == service->srv_id) && 3287 (hca_srv->gid.gid_prefix == 3288 gid.gid_prefix) && 3289 (hca_srv->gid.gid_guid == gid.gid_guid)) 3290 break; 3291 } 3292 rw_exit(&hca->bound_services_lock); 3293 if (hca_srv != NULL) { 3294 /* 3295 * port is already bound to the service 3296 */ 3297 DTRACE_PROBE1( 3298 rpcib__i__regservice__already__bound, 3299 int, i+1); 3300 nbinds++; 3301 continue; 3302 } 3303 3304 if ((pkey & IBSRM_HB) && 3305 (pkey != IB_PKEY_INVALID_FULL)) { 3306 3307 sbp = NULL; 3308 ibt_status = ibt_bind_service(srv_hdl, 3309 gid, NULL, hca, &sbp); 3310 3311 if (ibt_status == IBT_SUCCESS) { 3312 hca_srv = kmem_zalloc( 3313 sizeof (rib_hca_service_t), 3314 KM_SLEEP); 3315 hca_srv->srv_id = srv_id; 3316 hca_srv->gid = gid; 3317 hca_srv->sbind_hdl = sbp; 3318 3319 rw_enter(&hca->bound_services_lock, 3320 RW_WRITER); 3321 hca_srv->next = hca->bound_services; 3322 hca->bound_services = hca_srv; 3323 rw_exit(&hca->bound_services_lock); 3324 nbinds++; 3325 } 3326 3327 DTRACE_PROBE1(rpcib__i__regservice__bindres, 3328 int, ibt_status); 3329 } 3330 } 3331 } 3332 rw_exit(&rib_stat->service_list_lock); 3333 3334 ibt_free_portinfo(port_infop, port_size); 3335 3336 if (nbinds == 0) { 3337 return (RDMA_FAILED); 3338 } else { 3339 /* 3340 * Put this plugin into accept state, since at least 3341 * one registration was successful.
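* rib_srv_cm_handler() checks this state and rejects incoming connection requests while the plugin is in NO_ACCEPT.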
3342 */ 3343 mutex_enter(&plugin_state_lock); 3344 plugin_state = ACCEPT; 3345 mutex_exit(&plugin_state_lock); 3346 return (RDMA_SUCCESS); 3347 } 3348 } 3349 3350 void 3351 rib_listen(struct rdma_svc_data *rd) 3352 { 3353 rdma_stat status; 3354 int n_listening = 0; 3355 rib_hca_t *hca; 3356 3357 mutex_enter(&rib_stat->listen_lock); 3358 /* 3359 * If the rd parameter is NULL, it means that rib_stat->q is 3360 * already initialized by a call from RDMA and we just want to 3361 * add a newly attached HCA to the same listening state as other 3362 * HCAs. 3363 */ 3364 if (rd == NULL) { 3365 if (rib_stat->q == NULL) { 3366 mutex_exit(&rib_stat->listen_lock); 3367 return; 3368 } 3369 } else { 3370 rib_stat->q = &rd->q; 3371 } 3372 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 3373 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 3374 /* 3375 * First check if the hca is still attached 3376 */ 3377 rw_enter(&hca->state_lock, RW_READER); 3378 if (hca->state != HCA_INITED) { 3379 rw_exit(&hca->state_lock); 3380 continue; 3381 } 3382 rw_exit(&hca->state_lock); 3383 3384 /* 3385 * Right now the only service type is NFS. Hence 3386 * force feed this value. Ideally to communicate 3387 * the service type it should be passed down in 3388 * rdma_svc_data. 3389 */ 3390 status = rib_register_service(hca, NFS, 3391 IPPROTO_TCP, nfs_rdma_port); 3392 if (status == RDMA_SUCCESS) 3393 n_listening++; 3394 } 3395 rw_exit(&rib_stat->hcas_list_lock); 3396 3397 /* 3398 * Report whether the service is active on at least one HCA; 3399 * rd->err_code carries the detailed status. 3400 */ 3401 if (rd) { 3402 if (n_listening > 0) { 3403 rd->active = 1; 3404 rd->err_code = RDMA_SUCCESS; 3405 } else { 3406 rd->active = 0; 3407 rd->err_code = RDMA_FAILED; 3408 } 3409 } 3410 mutex_exit(&rib_stat->listen_lock); 3411 } 3412 3413 /* XXXX */ 3414 /* ARGSUSED */ 3415 static void 3416 rib_listen_stop(struct rdma_svc_data *svcdata) 3417 { 3418 rib_hca_t *hca; 3419 3420 mutex_enter(&rib_stat->listen_lock); 3421 /* 3422 * KRPC called the RDMATF to stop the listeners; this means we 3423 * stop sending incoming or received requests to the KRPC master 3424 * transport handle for RDMA-IB. This also means that the 3425 * master transport handle, responsible for us, is going away. 3426 */ 3427 mutex_enter(&plugin_state_lock); 3428 plugin_state = NO_ACCEPT; 3429 if (svcdata != NULL) 3430 svcdata->active = 0; 3431 mutex_exit(&plugin_state_lock); 3432 3433 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 3434 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 3435 /* 3436 * First check if the hca is still attached 3437 */ 3438 rw_enter(&hca->state_lock, RW_READER); 3439 if (hca->state == HCA_DETACHED) { 3440 rw_exit(&hca->state_lock); 3441 continue; 3442 } 3443 rib_close_channels(&hca->srv_conn_list); 3444 rib_stop_services(hca); 3445 rw_exit(&hca->state_lock); 3446 } 3447 rw_exit(&rib_stat->hcas_list_lock); 3448 3449 /* 3450 * Avoid rib_listen() using the stale q field. 3451 * This could happen if a port goes up after all services 3452 * are already unregistered. 3453 */ 3454 rib_stat->q = NULL; 3455 mutex_exit(&rib_stat->listen_lock); 3456 } 3457 3458 /* 3459 * Traverse the HCA's service list to unbind and deregister services. 3460 * For each bound service of the HCA to be removed, first find the corresponding 3461 * service handle (srv_hdl) and then unbind the service by calling 3462 * ibt_unbind_service().
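* Only the per-HCA binding records (rib_hca_service_t) are freed here; the rib_service_t entries on rib_stat->service_list are left in place.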
3463 */ 3464 static void 3465 rib_stop_services(rib_hca_t *hca) 3466 { 3467 rib_hca_service_t *srv_list, *to_remove; 3468 3469 /* 3470 * unbind and deregister the services for this service type. 3471 * Right now there is only one service type. In future it will 3472 * be passed down to this function. 3473 */ 3474 rw_enter(&hca->bound_services_lock, RW_READER); 3475 srv_list = hca->bound_services; 3476 hca->bound_services = NULL; 3477 rw_exit(&hca->bound_services_lock); 3478 3479 while (srv_list != NULL) { 3480 rib_service_t *sc; 3481 3482 to_remove = srv_list; 3483 srv_list = to_remove->next; 3484 rw_enter(&rib_stat->service_list_lock, RW_READER); 3485 for (sc = rib_stat->service_list; 3486 sc && (sc->srv_id != to_remove->srv_id); 3487 sc = sc->next) 3488 ; 3489 /* 3490 * if sc is NULL then the service doesn't exist anymore, 3491 * probably just removed completely through rib_stat. 3492 */ 3493 if (sc != NULL) 3494 (void) ibt_unbind_service(sc->srv_hdl, 3495 to_remove->sbind_hdl); 3496 rw_exit(&rib_stat->service_list_lock); 3497 kmem_free(to_remove, sizeof (rib_hca_service_t)); 3498 } 3499 } 3500 3501 static struct svc_recv * 3502 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) 3503 { 3504 struct svc_recv *recvp; 3505 3506 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); 3507 recvp->vaddr = sgl->ds_va; 3508 recvp->qp = qp; 3509 recvp->bytes_xfer = 0; 3510 return (recvp); 3511 } 3512 3513 static int 3514 rib_free_svc_recv(struct svc_recv *recvp) 3515 { 3516 kmem_free(recvp, sizeof (*recvp)); 3517 3518 return (0); 3519 } 3520 3521 static struct reply * 3522 rib_addreplylist(rib_qp_t *qp, uint32_t msgid) 3523 { 3524 struct reply *rep; 3525 3526 3527 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); 3528 if (rep == NULL) { 3529 DTRACE_PROBE(rpcib__i__addrreply__nomem); 3530 return (NULL); 3531 } 3532 rep->xid = msgid; 3533 rep->vaddr_cq = NULL; 3534 rep->bytes_xfer = 0; 3535 rep->status = (uint_t)REPLY_WAIT; 3536 rep->prev = NULL; 3537 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); 3538 3539 mutex_enter(&qp->replylist_lock); 3540 if (qp->replylist) { 3541 rep->next = qp->replylist; 3542 qp->replylist->prev = rep; 3543 } 3544 qp->rep_list_size++; 3545 3546 DTRACE_PROBE1(rpcib__i__addrreply__listsize, 3547 int, qp->rep_list_size); 3548 3549 qp->replylist = rep; 3550 mutex_exit(&qp->replylist_lock); 3551 3552 return (rep); 3553 } 3554 3555 static rdma_stat 3556 rib_rem_replylist(rib_qp_t *qp) 3557 { 3558 struct reply *r, *n; 3559 3560 mutex_enter(&qp->replylist_lock); 3561 for (r = qp->replylist; r != NULL; r = n) { 3562 n = r->next; 3563 (void) rib_remreply(qp, r); 3564 } 3565 mutex_exit(&qp->replylist_lock); 3566 3567 return (RDMA_SUCCESS); 3568 } 3569 3570 static int 3571 rib_remreply(rib_qp_t *qp, struct reply *rep) 3572 { 3573 3574 ASSERT(MUTEX_HELD(&qp->replylist_lock)); 3575 if (rep->prev) { 3576 rep->prev->next = rep->next; 3577 } 3578 if (rep->next) { 3579 rep->next->prev = rep->prev; 3580 } 3581 if (qp->replylist == rep) 3582 qp->replylist = rep->next; 3583 3584 cv_destroy(&rep->wait_cv); 3585 qp->rep_list_size--; 3586 3587 DTRACE_PROBE1(rpcib__i__remreply__listsize, 3588 int, qp->rep_list_size); 3589 3590 kmem_free(rep, sizeof (*rep)); 3591 3592 return (0); 3593 } 3594 3595 rdma_stat 3596 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3597 struct mrc *buf_handle) 3598 { 3599 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3600 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3601 rdma_stat status; 3602 rib_hca_t *hca = (ctoqp(conn))->hca; 3603 
3604 /* 3605 * Note: ALL buffer pools use the same memory type RDMARW. 3606 */ 3607 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3608 if (status == RDMA_SUCCESS) { 3609 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3610 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3611 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3612 } else { 3613 buf_handle->mrc_linfo = NULL; 3614 buf_handle->mrc_lmr = 0; 3615 buf_handle->mrc_rmr = 0; 3616 } 3617 return (status); 3618 } 3619 3620 static rdma_stat 3621 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, 3622 ibt_mr_flags_t spec, 3623 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3624 { 3625 ibt_mr_attr_t mem_attr; 3626 ibt_status_t ibt_status; 3627 mem_attr.mr_vaddr = (uintptr_t)buf; 3628 mem_attr.mr_len = (ib_msglen_t)size; 3629 mem_attr.mr_as = (struct as *)(caddr_t)adsp; 3630 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3631 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3632 IBT_MR_ENABLE_WINDOW_BIND | spec; 3633 3634 rw_enter(&hca->state_lock, RW_READER); 3635 if (hca->state != HCA_DETACHED) { 3636 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3637 &mem_attr, mr_hdlp, mr_descp); 3638 rw_exit(&hca->state_lock); 3639 } else { 3640 rw_exit(&hca->state_lock); 3641 return (RDMA_FAILED); 3642 } 3643 3644 if (ibt_status != IBT_SUCCESS) { 3645 return (RDMA_FAILED); 3646 } 3647 return (RDMA_SUCCESS); 3648 } 3649 3650 rdma_stat 3651 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3652 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 3653 { 3654 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3655 rib_lrc_entry_t *l; 3656 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3657 rdma_stat status; 3658 rib_hca_t *hca = (ctoqp(conn))->hca; 3659 3660 /* 3661 * Non-coherent memory registration. 3662 */ 3663 l = (rib_lrc_entry_t *)lrc; 3664 if (l) { 3665 if (l->registered) { 3666 buf_handle->mrc_linfo = 3667 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3668 buf_handle->mrc_lmr = 3669 (uint32_t)l->lrc_mhandle.mrc_lmr; 3670 buf_handle->mrc_rmr = 3671 (uint32_t)l->lrc_mhandle.mrc_rmr; 3672 *sync_handle = (RIB_SYNCMEM_HANDLE) 3673 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3674 return (RDMA_SUCCESS); 3675 } else { 3676 /* Always register the whole buffer */ 3677 buf = (caddr_t)l->lrc_buf; 3678 buflen = l->lrc_len; 3679 } 3680 } 3681 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3682 3683 if (status == RDMA_SUCCESS) { 3684 if (l) { 3685 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; 3686 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey; 3687 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey; 3688 l->registered = TRUE; 3689 } 3690 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3691 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3692 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3693 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3694 } else { 3695 buf_handle->mrc_linfo = NULL; 3696 buf_handle->mrc_lmr = 0; 3697 buf_handle->mrc_rmr = 0; 3698 } 3699 return (status); 3700 } 3701 3702 /* ARGSUSED */ 3703 rdma_stat 3704 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3705 { 3706 rib_hca_t *hca = (ctoqp(conn))->hca; 3707 /* 3708 * Allow memory deregistration even if HCA is 3709 * getting detached. Need all outstanding 3710 * memory registrations to be deregistered 3711 * before HCA_DETACH_EVENT can be accepted. 
3712 */ 3713 (void) ibt_deregister_mr(hca->hca_hdl, 3714 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 3715 return (RDMA_SUCCESS); 3716 } 3717 3718 /* ARGSUSED */ 3719 rdma_stat 3720 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3721 RIB_SYNCMEM_HANDLE sync_handle, void *lrc) 3722 { 3723 rib_lrc_entry_t *l; 3724 l = (rib_lrc_entry_t *)lrc; 3725 if (l) 3726 if (l->registered) 3727 return (RDMA_SUCCESS); 3728 3729 (void) rib_deregistermem(conn, buf, buf_handle); 3730 3731 return (RDMA_SUCCESS); 3732 } 3733 3734 /* ARGSUSED */ 3735 rdma_stat 3736 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3737 int len, int cpu) 3738 { 3739 ibt_status_t status; 3740 rib_hca_t *hca = (ctoqp(conn))->hca; 3741 ibt_mr_sync_t mr_segment; 3742 3743 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3744 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 3745 mr_segment.ms_len = (ib_memlen_t)len; 3746 if (cpu) { 3747 /* make incoming data visible to memory */ 3748 mr_segment.ms_flags = IBT_SYNC_WRITE; 3749 } else { 3750 /* make memory changes visible to IO */ 3751 mr_segment.ms_flags = IBT_SYNC_READ; 3752 } 3753 rw_enter(&hca->state_lock, RW_READER); 3754 if (hca->state != HCA_DETACHED) { 3755 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3756 rw_exit(&hca->state_lock); 3757 } else { 3758 rw_exit(&hca->state_lock); 3759 return (RDMA_FAILED); 3760 } 3761 3762 if (status == IBT_SUCCESS) 3763 return (RDMA_SUCCESS); 3764 else { 3765 return (RDMA_FAILED); 3766 } 3767 } 3768 3769 /* 3770 * XXXX ???? 3771 */ 3772 static rdma_stat 3773 rib_getinfo(rdma_info_t *info) 3774 { 3775 /* 3776 * XXXX Hack! 3777 */ 3778 info->addrlen = 16; 3779 info->mts = 1000000; 3780 info->mtu = 1000000; 3781 3782 return (RDMA_SUCCESS); 3783 } 3784 3785 rib_bufpool_t * 3786 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3787 { 3788 rib_bufpool_t *rbp = NULL; 3789 bufpool_t *bp = NULL; 3790 caddr_t buf; 3791 ibt_mr_attr_t mem_attr; 3792 ibt_status_t ibt_status; 3793 int i, j; 3794 3795 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3796 3797 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3798 num * sizeof (void *), KM_SLEEP); 3799 3800 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3801 bp->numelems = num; 3802 3803 3804 switch (ptype) { 3805 case SEND_BUFFER: 3806 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3807 bp->rsize = RPC_MSG_SZ; 3808 break; 3809 case RECV_BUFFER: 3810 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3811 bp->rsize = RPC_BUF_SIZE; 3812 break; 3813 default: 3814 goto fail; 3815 } 3816 3817 /* 3818 * Register the pool. 
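* Each of the num buffers is registered as a separate memory region, so every buffer carries its own lkey/rkey in mr_desc[].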
3819 */ 3820 bp->bufsize = num * bp->rsize; 3821 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3822 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3823 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3824 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3825 sizeof (ibt_mr_desc_t), KM_SLEEP); 3826 rw_enter(&hca->state_lock, RW_READER); 3827 3828 if (hca->state == HCA_DETACHED) { 3829 rw_exit(&hca->state_lock); 3830 goto fail; 3831 } 3832 3833 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3834 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3835 mem_attr.mr_vaddr = (uintptr_t)buf; 3836 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3837 mem_attr.mr_as = NULL; 3838 ibt_status = ibt_register_mr(hca->hca_hdl, 3839 hca->pd_hdl, &mem_attr, 3840 &rbp->mr_hdl[i], 3841 &rbp->mr_desc[i]); 3842 if (ibt_status != IBT_SUCCESS) { 3843 for (j = 0; j < i; j++) { 3844 (void) ibt_deregister_mr(hca->hca_hdl, 3845 rbp->mr_hdl[j]); 3846 } 3847 rw_exit(&hca->state_lock); 3848 goto fail; 3849 } 3850 } 3851 rw_exit(&hca->state_lock); 3852 buf = (caddr_t)bp->buf; 3853 for (i = 0; i < num; i++, buf += bp->rsize) { 3854 bp->buflist[i] = (void *)buf; 3855 } 3856 bp->buffree = num - 1; /* no. of free buffers */ 3857 rbp->bpool = bp; 3858 3859 return (rbp); 3860 fail: 3861 if (bp) { 3862 if (bp->buf) 3863 kmem_free(bp->buf, bp->bufsize); 3864 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3865 } 3866 if (rbp) { 3867 if (rbp->mr_hdl) 3868 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3869 if (rbp->mr_desc) 3870 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3871 kmem_free(rbp, sizeof (rib_bufpool_t)); 3872 } 3873 return (NULL); 3874 } 3875 3876 static void 3877 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3878 { 3879 int i; 3880 rib_bufpool_t *rbp = NULL; 3881 bufpool_t *bp; 3882 3883 /* 3884 * Obtain pool address based on type of pool 3885 */ 3886 switch (ptype) { 3887 case SEND_BUFFER: 3888 rbp = hca->send_pool; 3889 break; 3890 case RECV_BUFFER: 3891 rbp = hca->recv_pool; 3892 break; 3893 default: 3894 return; 3895 } 3896 if (rbp == NULL) 3897 return; 3898 3899 bp = rbp->bpool; 3900 3901 /* 3902 * Deregister the pool memory and free it. 3903 */ 3904 for (i = 0; i < bp->numelems; i++) { 3905 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3906 } 3907 } 3908 3909 static void 3910 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3911 { 3912 3913 rib_bufpool_t *rbp = NULL; 3914 bufpool_t *bp; 3915 3916 /* 3917 * Obtain pool address based on type of pool 3918 */ 3919 switch (ptype) { 3920 case SEND_BUFFER: 3921 rbp = hca->send_pool; 3922 break; 3923 case RECV_BUFFER: 3924 rbp = hca->recv_pool; 3925 break; 3926 default: 3927 return; 3928 } 3929 if (rbp == NULL) 3930 return; 3931 3932 bp = rbp->bpool; 3933 3934 /* 3935 * Free the pool memory. 3936 */ 3937 if (rbp->mr_hdl) 3938 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 3939 3940 if (rbp->mr_desc) 3941 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 3942 if (bp->buf) 3943 kmem_free(bp->buf, bp->bufsize); 3944 mutex_destroy(&bp->buflock); 3945 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 3946 kmem_free(rbp, sizeof (rib_bufpool_t)); 3947 } 3948 3949 void 3950 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 3951 { 3952 /* 3953 * Deregister the pool memory and free it. 3954 */ 3955 rib_rbufpool_deregister(hca, ptype); 3956 rib_rbufpool_free(hca, ptype); 3957 } 3958 3959 /* 3960 * Fetch a buffer from the pool of type specified in rdbuf->type. 
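* RDMA_LONG_BUFFER requests are satisfied from the long reply cache via rib_get_cache_buf(); SEND_BUFFER and RECV_BUFFER requests come from the pre-registered pools via rib_rbuf_alloc().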
3961 */ 3962 static rdma_stat 3963 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 3964 { 3965 rib_lrc_entry_t *rlep; 3966 3967 if (rdbuf->type == RDMA_LONG_BUFFER) { 3968 rlep = rib_get_cache_buf(conn, rdbuf->len); 3969 rdbuf->rb_private = (caddr_t)rlep; 3970 rdbuf->addr = rlep->lrc_buf; 3971 rdbuf->handle = rlep->lrc_mhandle; 3972 return (RDMA_SUCCESS); 3973 } 3974 3975 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 3976 if (rdbuf->addr) { 3977 switch (rdbuf->type) { 3978 case SEND_BUFFER: 3979 rdbuf->len = RPC_MSG_SZ; /* 1K */ 3980 break; 3981 case RECV_BUFFER: 3982 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 3983 break; 3984 default: 3985 rdbuf->len = 0; 3986 } 3987 return (RDMA_SUCCESS); 3988 } else 3989 return (RDMA_FAILED); 3990 } 3991 3992 /* 3993 * Fetch a buffer of specified type. 3994 * Note that rdbuf->handle is mw's rkey. 3995 */ 3996 static void * 3997 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 3998 { 3999 rib_qp_t *qp = ctoqp(conn); 4000 rib_hca_t *hca = qp->hca; 4001 rdma_btype ptype = rdbuf->type; 4002 void *buf; 4003 rib_bufpool_t *rbp = NULL; 4004 bufpool_t *bp; 4005 int i; 4006 4007 /* 4008 * Obtain pool address based on type of pool 4009 */ 4010 switch (ptype) { 4011 case SEND_BUFFER: 4012 rbp = hca->send_pool; 4013 break; 4014 case RECV_BUFFER: 4015 rbp = hca->recv_pool; 4016 break; 4017 default: 4018 return (NULL); 4019 } 4020 if (rbp == NULL) 4021 return (NULL); 4022 4023 bp = rbp->bpool; 4024 4025 mutex_enter(&bp->buflock); 4026 if (bp->buffree < 0) { 4027 mutex_exit(&bp->buflock); 4028 return (NULL); 4029 } 4030 4031 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */ 4032 buf = bp->buflist[bp->buffree]; 4033 rdbuf->addr = buf; 4034 rdbuf->len = bp->rsize; 4035 for (i = bp->numelems - 1; i >= 0; i--) { 4036 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 4037 rdbuf->handle.mrc_rmr = 4038 (uint32_t)rbp->mr_desc[i].md_rkey; 4039 rdbuf->handle.mrc_linfo = 4040 (uintptr_t)rbp->mr_hdl[i]; 4041 rdbuf->handle.mrc_lmr = 4042 (uint32_t)rbp->mr_desc[i].md_lkey; 4043 bp->buffree--; 4044 4045 mutex_exit(&bp->buflock); 4046 4047 return (buf); 4048 } 4049 } 4050 4051 mutex_exit(&bp->buflock); 4052 4053 return (NULL); 4054 } 4055 4056 static void 4057 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 4058 { 4059 4060 if (rdbuf->type == RDMA_LONG_BUFFER) { 4061 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private); 4062 rdbuf->rb_private = NULL; 4063 return; 4064 } 4065 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 4066 } 4067 4068 static void 4069 rib_rbuf_free(CONN *conn, int ptype, void *buf) 4070 { 4071 rib_qp_t *qp = ctoqp(conn); 4072 rib_hca_t *hca = qp->hca; 4073 rib_bufpool_t *rbp = NULL; 4074 bufpool_t *bp; 4075 4076 /* 4077 * Obtain pool address based on type of pool 4078 */ 4079 switch (ptype) { 4080 case SEND_BUFFER: 4081 rbp = hca->send_pool; 4082 break; 4083 case RECV_BUFFER: 4084 rbp = hca->recv_pool; 4085 break; 4086 default: 4087 return; 4088 } 4089 if (rbp == NULL) 4090 return; 4091 4092 bp = rbp->bpool; 4093 4094 mutex_enter(&bp->buflock); 4095 if (++bp->buffree >= bp->numelems) { 4096 /* 4097 * Should never happen 4098 */ 4099 bp->buffree--; 4100 } else { 4101 bp->buflist[bp->buffree] = buf; 4102 } 4103 mutex_exit(&bp->buflock); 4104 } 4105 4106 static rdma_stat 4107 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 4108 { 4109 rw_enter(&connlist->conn_lock, RW_WRITER); 4110 if (connlist->conn_hd) { 4111 cn->c_next = connlist->conn_hd; 4112 connlist->conn_hd->c_prev = cn; 4113 } 4114 connlist->conn_hd = cn; 4115 
rw_exit(&connlist->conn_lock); 4116 4117 return (RDMA_SUCCESS); 4118 } 4119 4120 static rdma_stat 4121 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) 4122 { 4123 rw_enter(&connlist->conn_lock, RW_WRITER); 4124 if (cn->c_prev) { 4125 cn->c_prev->c_next = cn->c_next; 4126 } 4127 if (cn->c_next) { 4128 cn->c_next->c_prev = cn->c_prev; 4129 } 4130 if (connlist->conn_hd == cn) 4131 connlist->conn_hd = cn->c_next; 4132 rw_exit(&connlist->conn_lock); 4133 4134 return (RDMA_SUCCESS); 4135 } 4136 4137 /* 4138 * rib_find_hca_connection 4139 * 4140 * if there is an existing connection to the specified address then 4141 * it will be returned in conn, otherwise conn will be set to NULL. 4142 * Also cleans up any connection that is in error state. 4143 */ 4144 static int 4145 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, 4146 struct netbuf *d_svcaddr, CONN **conn) 4147 { 4148 CONN *cn; 4149 clock_t cv_stat, timout; 4150 4151 *conn = NULL; 4152 again: 4153 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4154 cn = hca->cl_conn_list.conn_hd; 4155 while (cn != NULL) { 4156 /* 4157 * First, clear up any connection in the ERROR state 4158 */ 4159 mutex_enter(&cn->c_lock); 4160 if (cn->c_state == C_ERROR_CONN) { 4161 if (cn->c_ref == 0) { 4162 /* 4163 * Remove connection from list and destroy it. 4164 */ 4165 cn->c_state = C_DISCONN_PEND; 4166 mutex_exit(&cn->c_lock); 4167 rw_exit(&hca->cl_conn_list.conn_lock); 4168 rib_conn_close((void *)cn); 4169 goto again; 4170 } 4171 mutex_exit(&cn->c_lock); 4172 cn = cn->c_next; 4173 continue; 4174 } 4175 if (cn->c_state == C_DISCONN_PEND) { 4176 mutex_exit(&cn->c_lock); 4177 cn = cn->c_next; 4178 continue; 4179 } 4180 4181 /* 4182 * source address is only checked for if there is one, 4183 * this is the case for retries. 4184 */ 4185 if ((cn->c_raddr.len == d_svcaddr->len) && 4186 (bcmp(d_svcaddr->buf, cn->c_raddr.buf, 4187 d_svcaddr->len) == 0) && 4188 ((s_svcaddr->len == 0) || 4189 ((cn->c_laddr.len == s_svcaddr->len) && 4190 (bcmp(s_svcaddr->buf, cn->c_laddr.buf, 4191 s_svcaddr->len) == 0)))) { 4192 /* 4193 * Our connection. Give up conn list lock 4194 * as we are done traversing the list. 4195 */ 4196 rw_exit(&hca->cl_conn_list.conn_lock); 4197 if (cn->c_state == C_CONNECTED) { 4198 cn->c_ref++; /* sharing a conn */ 4199 mutex_exit(&cn->c_lock); 4200 *conn = cn; 4201 return (RDMA_SUCCESS); 4202 } 4203 if (cn->c_state == C_CONN_PEND) { 4204 /* 4205 * Hold a reference to this conn before 4206 * we give up the lock. 4207 */ 4208 cn->c_ref++; 4209 timout = ddi_get_lbolt() + 4210 drv_usectohz(CONN_WAIT_TIME * 1000000); 4211 while ((cv_stat = cv_timedwait_sig(&cn->c_cv, 4212 &cn->c_lock, timout)) > 0 && 4213 cn->c_state == C_CONN_PEND) 4214 ; 4215 if (cv_stat == 0) { 4216 cn->c_ref--; 4217 mutex_exit(&cn->c_lock); 4218 return (RDMA_INTR); 4219 } 4220 if (cv_stat < 0) { 4221 cn->c_ref--; 4222 mutex_exit(&cn->c_lock); 4223 return (RDMA_TIMEDOUT); 4224 } 4225 if (cn->c_state == C_CONNECTED) { 4226 *conn = cn; 4227 mutex_exit(&cn->c_lock); 4228 return (RDMA_SUCCESS); 4229 } else { 4230 cn->c_ref--; 4231 mutex_exit(&cn->c_lock); 4232 return (RDMA_TIMEDOUT); 4233 } 4234 } 4235 } 4236 mutex_exit(&cn->c_lock); 4237 cn = cn->c_next; 4238 } 4239 rw_exit(&hca->cl_conn_list.conn_lock); 4240 *conn = NULL; 4241 return (RDMA_FAILED); 4242 } 4243 4244 /* 4245 * Connection management. 4246 * IBTF does not support recycling of channels. So connections are only 4247 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or 4248 * C_DISCONN_PEND state. 
No C_IDLE state. 4249 * C_CONN_PEND state: Connection establishment in progress to the server. 4250 * C_CONNECTED state: Once established, a connection is in the C_CONNECTED state. 4251 * It has an RC channel associated with it. ibt_post_send/recv are allowed 4252 * only in this state. 4253 * C_ERROR_CONN state: A connection transitions to this state when WRs on the 4254 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event 4255 * happens on the channel or an IBT_HCA_DETACH_EVENT occurs on the HCA. 4256 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when 4257 * c_ref drops to 0 (this indicates that RPC has no more references to this 4258 * connection), the connection should be destroyed. A connection transitions 4259 * into this state when it is being destroyed. 4260 */
4261 /* ARGSUSED */ 4262 static rdma_stat 4263 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, 4264 int addr_type, void *handle, CONN **conn) 4265 { 4266 CONN *cn; 4267 int status; 4268 rib_hca_t *hca; 4269 rib_qp_t *qp; 4270 rpcib_ping_t rpt; 4271 int s_addr_len; 4272 char *s_addr_buf; 4273 4274 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 4275 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 4276 rw_enter(&hca->state_lock, RW_READER); 4277 if (hca->state != HCA_DETACHED) { 4278 status = rib_find_hca_connection(hca, s_svcaddr, 4279 d_svcaddr, conn); 4280 rw_exit(&hca->state_lock); 4281 if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) { 4282 rw_exit(&rib_stat->hcas_list_lock); 4283 return (status); 4284 } 4285 } else 4286 rw_exit(&hca->state_lock); 4287 } 4288 rw_exit(&rib_stat->hcas_list_lock); 4289
4290 /* 4291 * No existing connection found, establish a new connection. 4292 */ 4293 bzero(&rpt, sizeof (rpcib_ping_t)); 4294 4295 status = rib_ping_srv(addr_type, d_svcaddr, &rpt); 4296 if (status != RDMA_SUCCESS) { 4297 return (RDMA_FAILED); 4298 } 4299 hca = rpt.hca; 4300 4301 if (rpt.srcip.family == AF_INET) { 4302 s_addr_len = sizeof (rpt.srcip.un.ip4addr); 4303 s_addr_buf = (char *)&rpt.srcip.un.ip4addr; 4304 } else if (rpt.srcip.family == AF_INET6) { 4305 s_addr_len = sizeof (rpt.srcip.un.ip6addr); 4306 s_addr_buf = (char *)&rpt.srcip.un.ip6addr; 4307 } else 4308 return (RDMA_FAILED); 4309 4310 /* 4311 * Channel to server doesn't exist yet, create one. 4312 */ 4313 if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) { 4314 return (RDMA_FAILED); 4315 } 4316 cn = qptoc(qp); 4317 cn->c_state = C_CONN_PEND; 4318 cn->c_ref = 1; 4319 4320 cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP); 4321 bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len); 4322 cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len; 4323 4324 /* 4325 * Add to conn list. 4326 * We had given up the READER lock. In the time since then, 4327 * another thread might have created the connection we are 4328 * trying here. But for now, that is quite all right - there 4329 * might be two connections between a pair of hosts instead 4330 * of one. If we really want to close that window, 4331 * then we need to check the list after acquiring the 4332 * WRITER lock.
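 * Any duplicate connection created in that window is harmless; it is
 * reaped later, once its c_ref drops to zero and the idle timeout fires.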
4333 */ 4334 (void) rib_add_connlist(cn, &hca->cl_conn_list); 4335 status = rib_conn_to_srv(hca, qp, &rpt); 4336 mutex_enter(&cn->c_lock); 4337 if (status == RDMA_SUCCESS) { 4338 cn->c_state = C_CONNECTED; 4339 *conn = cn; 4340 } else { 4341 cn->c_state = C_ERROR_CONN; 4342 cn->c_ref--; 4343 } 4344 cv_broadcast(&cn->c_cv); 4345 mutex_exit(&cn->c_lock); 4346 return (status); 4347 } 4348 4349 static void 4350 rib_conn_close(void *rarg) 4351 { 4352 CONN *conn = (CONN *)rarg; 4353 rib_qp_t *qp = ctoqp(conn); 4354 4355 mutex_enter(&conn->c_lock); 4356 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { 4357 4358 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); 4359 /* 4360 * Live connection in CONNECTED state. 4361 */ 4362 if (conn->c_state == C_CONNECTED) { 4363 conn->c_state = C_ERROR_CONN; 4364 } 4365 mutex_exit(&conn->c_lock); 4366 4367 rib_close_a_channel(conn); 4368 4369 mutex_enter(&conn->c_lock); 4370 conn->c_flags &= ~C_CLOSE_PENDING; 4371 cv_signal(&conn->c_cv); 4372 } 4373 4374 mutex_exit(&conn->c_lock); 4375 4376 if (qp->mode == RIB_SERVER) 4377 (void) rib_disconnect_channel(conn, 4378 &qp->hca->srv_conn_list); 4379 else 4380 (void) rib_disconnect_channel(conn, 4381 &qp->hca->cl_conn_list); 4382 } 4383 4384 static void 4385 rib_conn_timeout_call(void *carg) 4386 { 4387 time_t idle_time; 4388 CONN *conn = (CONN *)carg; 4389 rib_hca_t *hca = ctoqp(conn)->hca; 4390 int error; 4391 4392 mutex_enter(&conn->c_lock); 4393 if ((conn->c_ref > 0) || 4394 (conn->c_state == C_DISCONN_PEND)) { 4395 conn->c_timeout = NULL; 4396 mutex_exit(&conn->c_lock); 4397 return; 4398 } 4399 4400 idle_time = (gethrestime_sec() - conn->c_last_used); 4401 4402 if ((idle_time <= rib_conn_timeout) && 4403 (conn->c_state != C_ERROR_CONN)) { 4404 /* 4405 * There was activity after the last timeout. 4406 * Extend the conn life. Unless the conn is 4407 * already in error state. 4408 */ 4409 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4410 SEC_TO_TICK(rib_conn_timeout - idle_time)); 4411 mutex_exit(&conn->c_lock); 4412 return; 4413 } 4414 4415 error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close, 4416 (void *)conn, DDI_NOSLEEP); 4417 4418 /* 4419 * If taskq dispatch fails above, then reset the timeout 4420 * to try again after 10 secs. 4421 */ 4422 4423 if (error != DDI_SUCCESS) { 4424 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4425 SEC_TO_TICK(RDMA_CONN_REAP_RETRY)); 4426 mutex_exit(&conn->c_lock); 4427 return; 4428 } 4429 4430 conn->c_state = C_DISCONN_PEND; 4431 mutex_exit(&conn->c_lock); 4432 } 4433 4434 static rdma_stat 4435 rib_conn_release(CONN *conn) 4436 { 4437 4438 mutex_enter(&conn->c_lock); 4439 conn->c_ref--; 4440 4441 conn->c_last_used = gethrestime_sec(); 4442 if (conn->c_ref > 0) { 4443 mutex_exit(&conn->c_lock); 4444 return (RDMA_SUCCESS); 4445 } 4446 4447 /* 4448 * If a conn is C_ERROR_CONN, close the channel. 
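 * The reference count has just reached zero, so the channel can be torn
 * down immediately rather than waiting for the idle timeout.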
4449 */ 4450 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) { 4451 conn->c_state = C_DISCONN_PEND; 4452 mutex_exit(&conn->c_lock); 4453 rib_conn_close((void *)conn); 4454 return (RDMA_SUCCESS); 4455 } 4456 4457 /* 4458 * c_ref == 0, set a timeout for conn release 4459 */ 4460 4461 if (conn->c_timeout == NULL) { 4462 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4463 SEC_TO_TICK(rib_conn_timeout)); 4464 } 4465 4466 mutex_exit(&conn->c_lock); 4467 return (RDMA_SUCCESS); 4468 } 4469 4470 /* 4471 * Add at front of list 4472 */ 4473 static struct rdma_done_list * 4474 rdma_done_add(rib_qp_t *qp, uint32_t xid) 4475 { 4476 struct rdma_done_list *rd; 4477 4478 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4479 4480 rd = kmem_alloc(sizeof (*rd), KM_SLEEP); 4481 rd->xid = xid; 4482 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL); 4483 4484 rd->prev = NULL; 4485 rd->next = qp->rdlist; 4486 if (qp->rdlist != NULL) 4487 qp->rdlist->prev = rd; 4488 qp->rdlist = rd; 4489 4490 return (rd); 4491 } 4492 4493 static void 4494 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd) 4495 { 4496 struct rdma_done_list *r; 4497 4498 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4499 4500 r = rd->next; 4501 if (r != NULL) { 4502 r->prev = rd->prev; 4503 } 4504 4505 r = rd->prev; 4506 if (r != NULL) { 4507 r->next = rd->next; 4508 } else { 4509 qp->rdlist = rd->next; 4510 } 4511 4512 cv_destroy(&rd->rdma_done_cv); 4513 kmem_free(rd, sizeof (*rd)); 4514 } 4515 4516 static void 4517 rdma_done_rem_list(rib_qp_t *qp) 4518 { 4519 struct rdma_done_list *r, *n; 4520 4521 mutex_enter(&qp->rdlist_lock); 4522 for (r = qp->rdlist; r != NULL; r = n) { 4523 n = r->next; 4524 rdma_done_rm(qp, r); 4525 } 4526 mutex_exit(&qp->rdlist_lock); 4527 } 4528 4529 static void 4530 rdma_done_notify(rib_qp_t *qp, uint32_t xid) 4531 { 4532 struct rdma_done_list *r = qp->rdlist; 4533 4534 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4535 4536 while (r) { 4537 if (r->xid == xid) { 4538 cv_signal(&r->rdma_done_cv); 4539 return; 4540 } else { 4541 r = r->next; 4542 } 4543 } 4544 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid, 4545 int, xid); 4546 } 4547 4548 /* 4549 * Expects conn->c_lock to be held by the caller. 4550 */ 4551 4552 static void 4553 rib_close_a_channel(CONN *conn) 4554 { 4555 rib_qp_t *qp; 4556 qp = ctoqp(conn); 4557 4558 if (qp->qp_hdl == NULL) { 4559 /* channel already freed */ 4560 return; 4561 } 4562 4563 /* 4564 * Call ibt_close_rc_channel in blocking mode 4565 * with no callbacks. 4566 */ 4567 (void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS, 4568 NULL, 0, NULL, NULL, 0); 4569 } 4570 4571 /* 4572 * Goes through all connections and closes the channel 4573 * This will cause all the WRs on those channels to be 4574 * flushed. 4575 */ 4576 static void 4577 rib_close_channels(rib_conn_list_t *connlist) 4578 { 4579 CONN *conn, *tmp; 4580 4581 rw_enter(&connlist->conn_lock, RW_READER); 4582 conn = connlist->conn_hd; 4583 while (conn != NULL) { 4584 mutex_enter(&conn->c_lock); 4585 tmp = conn->c_next; 4586 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { 4587 4588 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); 4589 4590 /* 4591 * Live connection in CONNECTED state. 
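 * Move it to C_ERROR_CONN so that no new work is posted while the
 * channel is being closed below.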
4592 */ 4593 if (conn->c_state == C_CONNECTED) 4594 conn->c_state = C_ERROR_CONN; 4595 mutex_exit(&conn->c_lock); 4596 4597 rib_close_a_channel(conn); 4598 4599 mutex_enter(&conn->c_lock); 4600 conn->c_flags &= ~C_CLOSE_PENDING; 4601 /* Signal a pending rib_disconnect_channel() */ 4602 cv_signal(&conn->c_cv); 4603 } 4604 mutex_exit(&conn->c_lock); 4605 conn = tmp; 4606 } 4607 rw_exit(&connlist->conn_lock); 4608 } 4609 4610 /* 4611 * Frees up all connections that are no longer being referenced 4612 */ 4613 static void 4614 rib_purge_connlist(rib_conn_list_t *connlist) 4615 { 4616 CONN *conn; 4617 4618 top: 4619 rw_enter(&connlist->conn_lock, RW_READER); 4620 conn = connlist->conn_hd; 4621 while (conn != NULL) { 4622 mutex_enter(&conn->c_lock); 4623 4624 /* 4625 * At this point connection is either in ERROR 4626 * or DISCONN_PEND state. If in DISCONN_PEND state 4627 * then some other thread is culling that connection. 4628 * If not and if c_ref is 0, then destroy the connection. 4629 */ 4630 if (conn->c_ref == 0 && 4631 conn->c_state != C_DISCONN_PEND) { 4632 /* 4633 * Cull the connection 4634 */ 4635 conn->c_state = C_DISCONN_PEND; 4636 mutex_exit(&conn->c_lock); 4637 rw_exit(&connlist->conn_lock); 4638 (void) rib_disconnect_channel(conn, connlist); 4639 goto top; 4640 } else { 4641 /* 4642 * conn disconnect already scheduled or will 4643 * happen from conn_release when c_ref drops to 0. 4644 */ 4645 mutex_exit(&conn->c_lock); 4646 } 4647 conn = conn->c_next; 4648 } 4649 rw_exit(&connlist->conn_lock); 4650 4651 /* 4652 * At this point, only connections with c_ref != 0 are on the list 4653 */ 4654 } 4655 4656 /* 4657 * Free all the HCA resources and close 4658 * the hca. 4659 */ 4660 4661 static void 4662 rib_free_hca(rib_hca_t *hca) 4663 { 4664 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl); 4665 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl); 4666 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl); 4667 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl); 4668 4669 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t)); 4670 kmem_free(hca->clnt_scq, sizeof (rib_cq_t)); 4671 kmem_free(hca->svc_rcq, sizeof (rib_cq_t)); 4672 kmem_free(hca->svc_scq, sizeof (rib_cq_t)); 4673 4674 rib_rbufpool_destroy(hca, RECV_BUFFER); 4675 rib_rbufpool_destroy(hca, SEND_BUFFER); 4676 rib_destroy_cache(hca); 4677 if (rib_mod.rdma_count == 0) 4678 rdma_unregister_mod(&rib_mod); 4679 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 4680 (void) ibt_close_hca(hca->hca_hdl); 4681 hca->hca_hdl = NULL; 4682 } 4683 4684 4685 static void 4686 rib_stop_hca_services(rib_hca_t *hca) 4687 { 4688 rib_stop_services(hca); 4689 rib_close_channels(&hca->cl_conn_list); 4690 rib_close_channels(&hca->srv_conn_list); 4691 4692 rib_purge_connlist(&hca->cl_conn_list); 4693 rib_purge_connlist(&hca->srv_conn_list); 4694 4695 if ((rib_stat->hcas_list == NULL) && stats_enabled) { 4696 kstat_delete_byname_zone("unix", 0, "rpcib_cache", 4697 GLOBAL_ZONEID); 4698 stats_enabled = FALSE; 4699 } 4700 4701 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 4702 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4703 if (hca->srv_conn_list.conn_hd == NULL && 4704 hca->cl_conn_list.conn_hd == NULL) { 4705 /* 4706 * conn_lists are NULL, so destroy 4707 * buffers, close hca and be done. 
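 * Otherwise the HCA is freed further below, once it is no longer
 * marked in use.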
4708 */ 4709 rib_free_hca(hca); 4710 } 4711 rw_exit(&hca->cl_conn_list.conn_lock); 4712 rw_exit(&hca->srv_conn_list.conn_lock); 4713 4714 if (hca->hca_hdl != NULL) { 4715 mutex_enter(&hca->inuse_lock); 4716 while (hca->inuse) 4717 cv_wait(&hca->cb_cv, &hca->inuse_lock); 4718 mutex_exit(&hca->inuse_lock); 4719 4720 rib_free_hca(hca); 4721 } 4722 rw_destroy(&hca->bound_services_lock); 4723 4724 if (hca->cleanup_helper != NULL) { 4725 ddi_taskq_destroy(hca->cleanup_helper); 4726 hca->cleanup_helper = NULL; 4727 } 4728 } 4729 4730 /* 4731 * Cleans and closes up all uses of the HCA 4732 */ 4733 static void 4734 rib_detach_hca(rib_hca_t *hca) 4735 { 4736 rib_hca_t **hcap; 4737 4738 /* 4739 * Stop all services on the HCA 4740 * Go through cl_conn_list and close all rc_channels 4741 * Go through svr_conn_list and close all rc_channels 4742 * Free connections whose c_ref has dropped to 0 4743 * Destroy all CQs 4744 * Deregister and released all buffer pool memory after all 4745 * connections are destroyed 4746 * Free the protection domain 4747 * ibt_close_hca() 4748 */ 4749 rw_enter(&hca->state_lock, RW_WRITER); 4750 if (hca->state == HCA_DETACHED) { 4751 rw_exit(&hca->state_lock); 4752 return; 4753 } 4754 4755 hca->state = HCA_DETACHED; 4756 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); 4757 for (hcap = &rib_stat->hcas_list; *hcap && (*hcap != hca); 4758 hcap = &(*hcap)->next) 4759 ; 4760 ASSERT(*hcap == hca); 4761 *hcap = hca->next; 4762 rib_stat->nhca_inited--; 4763 rib_mod.rdma_count--; 4764 rw_exit(&rib_stat->hcas_list_lock); 4765 rw_exit(&hca->state_lock); 4766 4767 rib_stop_hca_services(hca); 4768 4769 kmem_free(hca, sizeof (*hca)); 4770 } 4771 4772 static void 4773 rib_server_side_cache_reclaim(void *argp) 4774 { 4775 cache_avl_struct_t *rcas; 4776 rib_lrc_entry_t *rb; 4777 rib_hca_t *hca = (rib_hca_t *)argp; 4778 4779 rw_enter(&hca->avl_rw_lock, RW_WRITER); 4780 rcas = avl_first(&hca->avl_tree); 4781 if (rcas != NULL) 4782 avl_remove(&hca->avl_tree, rcas); 4783 4784 while (rcas != NULL) { 4785 while (rcas->r.forw != &rcas->r) { 4786 rcas->elements--; 4787 rb = rcas->r.forw; 4788 remque(rb); 4789 if (rb->registered) 4790 (void) rib_deregistermem_via_hca(hca, 4791 rb->lrc_buf, rb->lrc_mhandle); 4792 4793 hca->cache_allocation -= rb->lrc_len; 4794 kmem_free(rb->lrc_buf, rb->lrc_len); 4795 kmem_free(rb, sizeof (rib_lrc_entry_t)); 4796 } 4797 mutex_destroy(&rcas->node_lock); 4798 kmem_cache_free(hca->server_side_cache, rcas); 4799 rcas = avl_first(&hca->avl_tree); 4800 if (rcas != NULL) 4801 avl_remove(&hca->avl_tree, rcas); 4802 } 4803 rw_exit(&hca->avl_rw_lock); 4804 } 4805 4806 static void 4807 rib_server_side_cache_cleanup(void *argp) 4808 { 4809 cache_avl_struct_t *rcas; 4810 rib_lrc_entry_t *rb; 4811 rib_hca_t *hca = (rib_hca_t *)argp; 4812 4813 mutex_enter(&hca->cache_allocation_lock); 4814 if (hca->cache_allocation < cache_limit) { 4815 mutex_exit(&hca->cache_allocation_lock); 4816 return; 4817 } 4818 mutex_exit(&hca->cache_allocation_lock); 4819 4820 rw_enter(&hca->avl_rw_lock, RW_WRITER); 4821 rcas = avl_last(&hca->avl_tree); 4822 if (rcas != NULL) 4823 avl_remove(&hca->avl_tree, rcas); 4824 4825 while (rcas != NULL) { 4826 while (rcas->r.forw != &rcas->r) { 4827 rcas->elements--; 4828 rb = rcas->r.forw; 4829 remque(rb); 4830 if (rb->registered) 4831 (void) rib_deregistermem_via_hca(hca, 4832 rb->lrc_buf, rb->lrc_mhandle); 4833 4834 hca->cache_allocation -= rb->lrc_len; 4835 4836 kmem_free(rb->lrc_buf, rb->lrc_len); 4837 kmem_free(rb, sizeof (rib_lrc_entry_t)); 4838 } 4839 
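		/* Every cached buffer of this length has been freed; release the AVL node itself. */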
mutex_destroy(&rcas->node_lock); 4840 if (hca->server_side_cache) { 4841 kmem_cache_free(hca->server_side_cache, rcas); 4842 } 4843 4844 if (hca->cache_allocation < cache_limit) { 4845 rw_exit(&hca->avl_rw_lock); 4846 return; 4847 } 4848 4849 rcas = avl_last(&hca->avl_tree); 4850 if (rcas != NULL) 4851 avl_remove(&hca->avl_tree, rcas); 4852 } 4853 rw_exit(&hca->avl_rw_lock); 4854 } 4855 4856 static int 4857 avl_compare(const void *t1, const void *t2) 4858 { 4859 if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len) 4860 return (0); 4861 4862 if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len) 4863 return (-1); 4864 4865 return (1); 4866 } 4867 4868 static void 4869 rib_destroy_cache(rib_hca_t *hca) 4870 { 4871 if (hca->avl_init) { 4872 rib_server_side_cache_reclaim((void *)hca); 4873 if (hca->server_side_cache) { 4874 kmem_cache_destroy(hca->server_side_cache); 4875 hca->server_side_cache = NULL; 4876 } 4877 avl_destroy(&hca->avl_tree); 4878 mutex_destroy(&hca->cache_allocation_lock); 4879 rw_destroy(&hca->avl_rw_lock); 4880 } 4881 hca->avl_init = FALSE; 4882 } 4883 4884 static void 4885 rib_force_cleanup(void *hca) 4886 { 4887 if (((rib_hca_t *)hca)->cleanup_helper != NULL) 4888 (void) ddi_taskq_dispatch( 4889 ((rib_hca_t *)hca)->cleanup_helper, 4890 rib_server_side_cache_cleanup, 4891 (void *)hca, DDI_NOSLEEP); 4892 } 4893 4894 static rib_lrc_entry_t * 4895 rib_get_cache_buf(CONN *conn, uint32_t len) 4896 { 4897 cache_avl_struct_t cas, *rcas; 4898 rib_hca_t *hca = (ctoqp(conn))->hca; 4899 rib_lrc_entry_t *reply_buf; 4900 avl_index_t where = NULL; 4901 uint64_t c_alloc = 0; 4902 4903 if (!hca->avl_init) 4904 goto error_alloc; 4905 4906 cas.len = len; 4907 4908 rw_enter(&hca->avl_rw_lock, RW_READER); 4909 4910 mutex_enter(&hca->cache_allocation_lock); 4911 c_alloc = hca->cache_allocation; 4912 mutex_exit(&hca->cache_allocation_lock); 4913 4914 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas, 4915 &where)) == NULL) { 4916 /* Am I above the cache limit */ 4917 if ((c_alloc + len) >= cache_limit) { 4918 rib_force_cleanup((void *)hca); 4919 rw_exit(&hca->avl_rw_lock); 4920 mutex_enter(&hca->cache_allocation_lock); 4921 hca->cache_misses_above_the_limit ++; 4922 mutex_exit(&hca->cache_allocation_lock); 4923 4924 /* Allocate and register the buffer directly */ 4925 goto error_alloc; 4926 } 4927 4928 rw_exit(&hca->avl_rw_lock); 4929 rw_enter(&hca->avl_rw_lock, RW_WRITER); 4930 4931 /* Recheck to make sure no other thread added the entry in */ 4932 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, 4933 &cas, &where)) == NULL) { 4934 /* Allocate an avl tree entry */ 4935 rcas = (cache_avl_struct_t *) 4936 kmem_cache_alloc(hca->server_side_cache, KM_SLEEP); 4937 4938 bzero(rcas, sizeof (cache_avl_struct_t)); 4939 rcas->elements = 0; 4940 rcas->r.forw = &rcas->r; 4941 rcas->r.back = &rcas->r; 4942 rcas->len = len; 4943 mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL); 4944 avl_insert(&hca->avl_tree, rcas, where); 4945 } 4946 } 4947 4948 mutex_enter(&rcas->node_lock); 4949 4950 if (rcas->r.forw != &rcas->r && rcas->elements > 0) { 4951 reply_buf = rcas->r.forw; 4952 remque(reply_buf); 4953 rcas->elements--; 4954 mutex_exit(&rcas->node_lock); 4955 rw_exit(&hca->avl_rw_lock); 4956 4957 mutex_enter(&hca->cache_allocation_lock); 4958 hca->cache_hits++; 4959 hca->cache_allocation -= len; 4960 mutex_exit(&hca->cache_allocation_lock); 4961 } else { 4962 /* Am I above the cache limit */ 4963 mutex_exit(&rcas->node_lock); 4964 if ((c_alloc + 
len) >= cache_limit) { 4965 rib_force_cleanup((void *)hca); 4966 rw_exit(&hca->avl_rw_lock); 4967 4968 mutex_enter(&hca->cache_allocation_lock); 4969 hca->cache_misses_above_the_limit++; 4970 mutex_exit(&hca->cache_allocation_lock); 4971 /* Allocate and register the buffer directly */ 4972 goto error_alloc; 4973 } 4974 rw_exit(&hca->avl_rw_lock); 4975 mutex_enter(&hca->cache_allocation_lock); 4976 hca->cache_misses++; 4977 mutex_exit(&hca->cache_allocation_lock); 4978 /* Allocate a reply_buf entry */ 4979 reply_buf = (rib_lrc_entry_t *) 4980 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP); 4981 bzero(reply_buf, sizeof (rib_lrc_entry_t)); 4982 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP); 4983 reply_buf->lrc_len = len; 4984 reply_buf->registered = FALSE; 4985 reply_buf->avl_node = (void *)rcas; 4986 } 4987 4988 return (reply_buf); 4989 4990 error_alloc: 4991 reply_buf = (rib_lrc_entry_t *) 4992 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP); 4993 bzero(reply_buf, sizeof (rib_lrc_entry_t)); 4994 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP); 4995 reply_buf->lrc_len = len; 4996 reply_buf->registered = FALSE; 4997 reply_buf->avl_node = NULL; 4998 4999 return (reply_buf); 5000 } 5001
5002 /* 5003 * Return a pre-registered buffer back to the cache (without 5004 * unregistering the buffer). 5005 */ 5006 5007 static void 5008 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf) 5009 { 5010 cache_avl_struct_t cas, *rcas; 5011 avl_index_t where = NULL; 5012 rib_hca_t *hca = (ctoqp(conn))->hca; 5013 5014 if (!hca->avl_init) 5015 goto error_free; 5016 5017 cas.len = reg_buf->lrc_len; 5018 rw_enter(&hca->avl_rw_lock, RW_READER); 5019 if ((rcas = (cache_avl_struct_t *) 5020 avl_find(&hca->avl_tree, &cas, &where)) == NULL) { 5021 rw_exit(&hca->avl_rw_lock); 5022 goto error_free; 5023 } else { 5024 cas.len = reg_buf->lrc_len; 5025 mutex_enter(&rcas->node_lock); 5026 insque(reg_buf, &rcas->r); 5027 rcas->elements++; 5028 mutex_exit(&rcas->node_lock); 5029 rw_exit(&hca->avl_rw_lock); 5030 mutex_enter(&hca->cache_allocation_lock); 5031 hca->cache_allocation += cas.len; 5032 mutex_exit(&hca->cache_allocation_lock); 5033 } 5034 5035 return; 5036 5037 error_free: 5038 5039 if (reg_buf->registered) 5040 (void) rib_deregistermem_via_hca(hca, 5041 reg_buf->lrc_buf, reg_buf->lrc_mhandle); 5042 kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len); 5043 kmem_free(reg_buf, sizeof (rib_lrc_entry_t)); 5044 } 5045
5046 static rdma_stat 5047 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf, 5048 uint_t buflen, struct mrc *buf_handle) 5049 { 5050 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 5051 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 5052 rdma_stat status; 5053 5054 5055 /* 5056 * Note: ALL buffer pools use the same memory type RDMARW.
5057 */ 5058 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 5059 if (status == RDMA_SUCCESS) { 5060 buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl; 5061 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 5062 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 5063 } else { 5064 buf_handle->mrc_linfo = NULL; 5065 buf_handle->mrc_lmr = 0; 5066 buf_handle->mrc_rmr = 0; 5067 } 5068 return (status); 5069 } 5070 5071 /* ARGSUSED */ 5072 static rdma_stat 5073 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf, 5074 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle) 5075 { 5076 5077 (void) rib_deregistermem_via_hca(hca, buf, buf_handle); 5078 return (RDMA_SUCCESS); 5079 } 5080 5081 /* ARGSUSED */ 5082 static rdma_stat 5083 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle) 5084 { 5085 5086 (void) ibt_deregister_mr(hca->hca_hdl, 5087 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 5088 return (RDMA_SUCCESS); 5089 } 5090 5091 /* 5092 * Check if the IP interface named by `lifrp' is RDMA-capable. 5093 */ 5094 static boolean_t 5095 rpcib_rdma_capable_interface(struct lifreq *lifrp) 5096 { 5097 char ifname[LIFNAMSIZ]; 5098 char *cp; 5099 5100 if (lifrp->lifr_type == IFT_IB) 5101 return (B_TRUE); 5102 5103 /* 5104 * Strip off the logical interface portion before getting 5105 * intimate with the name. 5106 */ 5107 (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); 5108 if ((cp = strchr(ifname, ':')) != NULL) 5109 *cp = '\0'; 5110 5111 return (strcmp("lo0", ifname) == 0); 5112 } 5113 5114 static int 5115 rpcib_do_ip_ioctl(int cmd, int len, void *arg) 5116 { 5117 vnode_t *kvp, *vp; 5118 TIUSER *tiptr; 5119 struct strioctl iocb; 5120 k_sigset_t smask; 5121 int err = 0; 5122 5123 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) { 5124 if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE, 5125 &tiptr, CRED()) == 0) { 5126 vp = tiptr->fp->f_vnode; 5127 } else { 5128 VN_RELE(kvp); 5129 return (EPROTO); 5130 } 5131 } else { 5132 return (EPROTO); 5133 } 5134 5135 iocb.ic_cmd = cmd; 5136 iocb.ic_timout = 0; 5137 iocb.ic_len = len; 5138 iocb.ic_dp = (caddr_t)arg; 5139 sigintr(&smask, 0); 5140 err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb); 5141 sigunintr(&smask); 5142 (void) t_kclose(tiptr, 0); 5143 VN_RELE(kvp); 5144 return (err); 5145 } 5146 5147 /* 5148 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'. 5149 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes. 5150 */ 5151 static int 5152 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep) 5153 { 5154 int err; 5155 struct lifnum lifn; 5156 5157 bzero(&lifn, sizeof (struct lifnum)); 5158 lifn.lifn_family = AF_UNSPEC; 5159 5160 err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn); 5161 if (err != 0) 5162 return (err); 5163 5164 /* 5165 * Pad the interface count to account for additional interfaces that 5166 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 
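 * Over-sizing the request is harmless: SIOCGLIFCONF rewrites lifc_len
 * to the length actually used.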
5167 */ 5168 lifn.lifn_count += 4; 5169 5170 bzero(lifcp, sizeof (struct lifconf)); 5171 lifcp->lifc_family = AF_UNSPEC; 5172 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 5173 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 5174 5175 err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp); 5176 if (err != 0) { 5177 kmem_free(lifcp->lifc_buf, *bufsizep); 5178 return (err); 5179 } 5180 return (0); 5181 } 5182 5183 static boolean_t 5184 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6) 5185 { 5186 uint_t i, nifs; 5187 uint_t bufsize; 5188 struct lifconf lifc; 5189 struct lifreq *lifrp; 5190 struct sockaddr_in *sinp; 5191 struct sockaddr_in6 *sin6p; 5192 5193 bzero(addrs4, sizeof (rpcib_ipaddrs_t)); 5194 bzero(addrs6, sizeof (rpcib_ipaddrs_t)); 5195 5196 if (rpcib_do_lifconf(&lifc, &bufsize) != 0) 5197 return (B_FALSE); 5198 5199 if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) { 5200 kmem_free(lifc.lifc_buf, bufsize); 5201 return (B_FALSE); 5202 } 5203 5204 /* 5205 * Worst case is that all of the addresses are IB-capable and have 5206 * the same address family, so size our buffers accordingly. 5207 */ 5208 addrs4->ri_size = nifs * sizeof (struct sockaddr_in); 5209 addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP); 5210 addrs6->ri_size = nifs * sizeof (struct sockaddr_in6); 5211 addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP); 5212 5213 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { 5214 if (!rpcib_rdma_capable_interface(lifrp)) 5215 continue; 5216 5217 if (lifrp->lifr_addr.ss_family == AF_INET) { 5218 sinp = addrs4->ri_list; 5219 bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++], 5220 sizeof (struct sockaddr_in)); 5221 } else if (lifrp->lifr_addr.ss_family == AF_INET6) { 5222 sin6p = addrs6->ri_list; 5223 bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++], 5224 sizeof (struct sockaddr_in6)); 5225 } 5226 } 5227 5228 kmem_free(lifc.lifc_buf, bufsize); 5229 return (B_TRUE); 5230 } 5231 5232 /* ARGSUSED */ 5233 static int 5234 rpcib_cache_kstat_update(kstat_t *ksp, int rw) 5235 { 5236 rib_hca_t *hca; 5237 5238 if (KSTAT_WRITE == rw) { 5239 return (EACCES); 5240 } 5241 5242 rpcib_kstat.cache_limit.value.ui64 = 5243 (uint64_t)cache_limit; 5244 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 5245 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 5246 rpcib_kstat.cache_allocation.value.ui64 += 5247 (uint64_t)hca->cache_allocation; 5248 rpcib_kstat.cache_hits.value.ui64 += 5249 (uint64_t)hca->cache_hits; 5250 rpcib_kstat.cache_misses.value.ui64 += 5251 (uint64_t)hca->cache_misses; 5252 rpcib_kstat.cache_misses_above_the_limit.value.ui64 += 5253 (uint64_t)hca->cache_misses_above_the_limit; 5254 } 5255 rw_exit(&rib_stat->hcas_list_lock); 5256 return (0); 5257 } 5258
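/*
 * Illustrative sketch (not part of the driver): the send/recv buffer pools
 * above are managed as a simple stack of free buffer pointers, where
 * bp->buflist[] holds the free entries and bp->buffree indexes the top of
 * the stack (-1 when the pool is exhausted), which is what rib_rbuf_alloc()
 * pops and rib_rbuf_free() pushes.  The userland model below is guarded by
 * "#if 0" so it is never built with this file; extracted on its own it
 * compiles as an ordinary program (link with -lpthread).  All "sketch_"
 * names, the pthread mutex and the sizes are assumptions of this example,
 * and the memory-region (rkey/lkey) lookup done by the real allocator is
 * omitted.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

typedef struct sketch_bufpool {
	pthread_mutex_t	buflock;	/* protects buffree and buflist[] */
	int		numelems;	/* total buffers owned by the pool */
	int		buffree;	/* index of top free entry, -1 if none */
	void		**buflist;	/* stack of free buffer pointers */
} sketch_bufpool_t;

static sketch_bufpool_t *
sketch_bufpool_create(int numelems, size_t rsize)
{
	sketch_bufpool_t *bp = calloc(1, sizeof (*bp));
	int i;

	pthread_mutex_init(&bp->buflock, NULL);
	bp->numelems = numelems;
	bp->buflist = calloc(numelems, sizeof (void *));
	for (i = 0; i < numelems; i++)
		bp->buflist[i] = malloc(rsize);
	bp->buffree = numelems - 1;	/* every buffer starts out free */
	return (bp);
}

static void *
sketch_rbuf_alloc(sketch_bufpool_t *bp)
{
	void *buf = NULL;

	pthread_mutex_lock(&bp->buflock);
	if (bp->buffree >= 0)
		buf = bp->buflist[bp->buffree--];	/* pop the stack */
	pthread_mutex_unlock(&bp->buflock);
	return (buf);
}

static void
sketch_rbuf_free(sketch_bufpool_t *bp, void *buf)
{
	pthread_mutex_lock(&bp->buflock);
	if (bp->buffree + 1 < bp->numelems)
		bp->buflist[++bp->buffree] = buf;	/* push it back */
	pthread_mutex_unlock(&bp->buflock);
}

int
main(void)
{
	sketch_bufpool_t *bp = sketch_bufpool_create(4, 1024);
	void *a = sketch_rbuf_alloc(bp);
	void *b = sketch_rbuf_alloc(bp);

	printf("allocated %p and %p, %d buffers left\n", a, b, bp->buffree + 1);
	sketch_rbuf_free(bp, b);
	sketch_rbuf_free(bp, a);
	printf("after free, %d buffers free\n", bp->buffree + 1);
	return (0);
}
#endif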