1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 2007, The Ohio State University. All rights reserved. 28 * 29 * Portions of this source code are developed by the team members of 30 * The Ohio State University's Network-Based Computing Laboratory (NBCL), 31 * headed by Professor Dhabaleswar K. (DK) Panda. 32 * 33 * Acknowledgements to contributions from developers: 34 * Ranjit Noronha: noronha@cse.ohio-state.edu 35 * Lei Chai : chail@cse.ohio-state.edu 36 * Weikuan Yu : yuw@cse.ohio-state.edu 37 * 38 */ 39 40 /* 41 * The rpcib plugin. Implements the interface for RDMATF's 42 * interaction with IBTF. 43 */ 44 45 #include <sys/param.h> 46 #include <sys/types.h> 47 #include <sys/user.h> 48 #include <sys/systm.h> 49 #include <sys/sysmacros.h> 50 #include <sys/proc.h> 51 #include <sys/socket.h> 52 #include <sys/file.h> 53 #include <sys/stream.h> 54 #include <sys/strsubr.h> 55 #include <sys/stropts.h> 56 #include <sys/errno.h> 57 #include <sys/kmem.h> 58 #include <sys/debug.h> 59 #include <sys/pathname.h> 60 #include <sys/kstat.h> 61 #include <sys/t_lock.h> 62 #include <sys/ddi.h> 63 #include <sys/cmn_err.h> 64 #include <sys/time.h> 65 #include <sys/isa_defs.h> 66 #include <sys/callb.h> 67 #include <sys/sunddi.h> 68 #include <sys/sunndi.h> 69 #include <sys/sdt.h> 70 #include <sys/ib/ibtl/ibti.h> 71 #include <rpc/rpc.h> 72 #include <rpc/ib.h> 73 #include <sys/modctl.h> 74 #include <sys/kstr.h> 75 #include <sys/sockio.h> 76 #include <sys/vnode.h> 77 #include <sys/tiuser.h> 78 #include <net/if.h> 79 #include <net/if_types.h> 80 #include <sys/cred.h> 81 #include <rpc/rpc_rdma.h> 82 #include <nfs/nfs.h> 83 #include <sys/atomic.h> 84 85 #define NFS_RDMA_PORT 20049 86 87 88 /* 89 * Convenience structures for connection management 90 */ 91 typedef struct rpcib_ipaddrs { 92 void *ri_list; /* pointer to list of addresses */ 93 uint_t ri_count; /* number of addresses in list */ 94 uint_t ri_size; /* size of ri_list in bytes */ 95 } rpcib_ipaddrs_t; 96 97 98 typedef struct rpcib_ping { 99 rib_hca_t *hca; 100 ibt_path_info_t path; 101 ibt_ip_addr_t srcip; 102 ibt_ip_addr_t dstip; 103 } rpcib_ping_t; 104 105 /* 106 * Prototype declarations for driver ops 107 */ 108 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t); 109 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t, 110 void *, void **); 111 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t); 112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *); 113 static int rpcib_do_ip_ioctl(int, int, void *); 114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *); 115 static int
rpcib_cache_kstat_update(kstat_t *, int); 116 static void rib_force_cleanup(void *); 117 static void rib_stop_hca_services(rib_hca_t *); 118 static void rib_attach_hca(void); 119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, 120 struct netbuf *d_svcaddr, CONN **conn); 121 122 struct { 123 kstat_named_t cache_limit; 124 kstat_named_t cache_allocation; 125 kstat_named_t cache_hits; 126 kstat_named_t cache_misses; 127 kstat_named_t cache_misses_above_the_limit; 128 } rpcib_kstat = { 129 {"cache_limit", KSTAT_DATA_UINT64 }, 130 {"cache_allocation", KSTAT_DATA_UINT64 }, 131 {"cache_hits", KSTAT_DATA_UINT64 }, 132 {"cache_misses", KSTAT_DATA_UINT64 }, 133 {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 }, 134 }; 135 136 /* rpcib cb_ops */ 137 static struct cb_ops rpcib_cbops = { 138 nulldev, /* open */ 139 nulldev, /* close */ 140 nodev, /* strategy */ 141 nodev, /* print */ 142 nodev, /* dump */ 143 nodev, /* read */ 144 nodev, /* write */ 145 nodev, /* ioctl */ 146 nodev, /* devmap */ 147 nodev, /* mmap */ 148 nodev, /* segmap */ 149 nochpoll, /* poll */ 150 ddi_prop_op, /* prop_op */ 151 NULL, /* stream */ 152 D_MP, /* cb_flag */ 153 CB_REV, /* rev */ 154 nodev, /* int (*cb_aread)() */ 155 nodev /* int (*cb_awrite)() */ 156 }; 157 158 /* 159 * Device options 160 */ 161 static struct dev_ops rpcib_ops = { 162 DEVO_REV, /* devo_rev, */ 163 0, /* refcnt */ 164 rpcib_getinfo, /* info */ 165 nulldev, /* identify */ 166 nulldev, /* probe */ 167 rpcib_attach, /* attach */ 168 rpcib_detach, /* detach */ 169 nodev, /* reset */ 170 &rpcib_cbops, /* driver ops - devctl interfaces */ 171 NULL, /* bus operations */ 172 NULL, /* power */ 173 ddi_quiesce_not_needed, /* quiesce */ 174 }; 175 176 /* 177 * Module linkage information. 178 */ 179 180 static struct modldrv rib_modldrv = { 181 &mod_driverops, /* Driver module */ 182 "RPCIB plugin driver", /* Driver name and version */ 183 &rpcib_ops, /* Driver ops */ 184 }; 185 186 static struct modlinkage rib_modlinkage = { 187 MODREV_1, 188 (void *)&rib_modldrv, 189 NULL 190 }; 191 192 typedef struct rib_lrc_entry { 193 struct rib_lrc_entry *forw; 194 struct rib_lrc_entry *back; 195 char *lrc_buf; 196 197 uint32_t lrc_len; 198 void *avl_node; 199 bool_t registered; 200 201 struct mrc lrc_mhandle; 202 bool_t lrc_on_freed_list; 203 } rib_lrc_entry_t; 204 205 typedef struct cache_struct { 206 rib_lrc_entry_t r; 207 uint32_t len; 208 uint32_t elements; 209 kmutex_t node_lock; 210 avl_node_t avl_link; 211 } cache_avl_struct_t; 212 213 uint64_t cache_limit = 100 * 1024 * 1024; 214 static uint64_t cache_watermark = 80 * 1024 * 1024; 215 static bool_t stats_enabled = FALSE; 216 217 static uint64_t max_unsignaled_rws = 5; 218 int nfs_rdma_port = NFS_RDMA_PORT; 219 220 #define RIBNETID_TCP "tcp" 221 #define RIBNETID_TCP6 "tcp6" 222 223 /* 224 * rib_stat: private data pointer used when registering 225 * with the IBTF. It is returned to the consumer 226 * in all callbacks. 227 */ 228 static rpcib_state_t *rib_stat = NULL; 229 230 #define RNR_RETRIES IBT_RNR_RETRY_1 231 #define MAX_PORTS 2 232 #define RDMA_DUMMY_WRID 0x4D3A1D4D3A1D 233 #define RDMA_CONN_REAP_RETRY 10 /* 10 secs */ 234 235 int preposted_rbufs = RDMA_BUFS_GRANT; 236 int send_threshold = 1; 237 238 /* 239 * Old cards with Tavor driver have limited memory footprint 240 * when booted in 32bit. The rib_max_rbufs tunable can be 241 * tuned for more buffers if needed. 
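* (As initialized below, rib_max_rbufs defaults to MAX_BUFS on 32-bit non-SPARC kernels and to 10 * MAX_BUFS on all other platforms.)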
242 */ 243 244 #if !defined(_ELF64) && !defined(__sparc) 245 int rib_max_rbufs = MAX_BUFS; 246 #else 247 int rib_max_rbufs = 10 * MAX_BUFS; 248 #endif /* !(_ELF64) && !(__sparc) */ 249 250 int rib_conn_timeout = 60 * 12; /* 12 minutes */ 251 252 /* 253 * State of the plugin. 254 * ACCEPT = accepting new connections and requests. 255 * NO_ACCEPT = not accepting new connections and requests. 256 * This should eventually move to the rpcib_state_t structure, since this 257 * will tell in which state the plugin is for a particular type of service 258 * like NFS, NLM or the v4 Callback daemon. The plugin might be in accept 259 * state for one and in no_accept state for the other. 260 */ 261 int plugin_state; 262 kmutex_t plugin_state_lock; 263 264 ldi_ident_t rpcib_li; 265 266 /* 267 * RPCIB RDMATF operations 268 */ 269 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle); 270 static rdma_stat rib_disconnect(CONN *conn); 271 static void rib_listen(struct rdma_svc_data *rd); 272 static void rib_listen_stop(struct rdma_svc_data *rd); 273 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, 274 uint_t buflen, struct mrc *buf_handle); 275 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf, 276 struct mrc buf_handle); 277 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, 278 caddr_t buf, uint_t buflen, struct mrc *buf_handle); 279 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, 280 struct mrc buf_handle); 281 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, 282 uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, 283 void *lrc); 284 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf, 285 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *); 286 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, 287 caddr_t buf, int len, int cpu); 288 289 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf); 290 291 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf); 292 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *); 293 294 static void rib_rbuf_free(CONN *conn, int ptype, void *buf); 295 296 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid); 297 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid); 298 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid); 299 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid); 300 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl); 301 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid); 302 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait); 303 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait); 304 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *); 305 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *, 306 int addr_type, void *, CONN **); 307 static rdma_stat rib_conn_release(CONN *conn); 308 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int, 309 rpcib_ping_t *, CONN **); 310 static rdma_stat rib_getinfo(rdma_info_t *info); 311 312 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len); 313 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf); 314 static void rib_destroy_cache(rib_hca_t *hca); 315 static void rib_server_side_cache_reclaim(void *argp); 316 static int avl_compare(const void *t1, const void *t2); 317 318 static void rib_stop_services(rib_hca_t
*); 319 static void rib_close_channels(rib_conn_list_t *); 320 static void rib_conn_close(void *); 321 static void rib_recv_rele(rib_qp_t *); 322 static rdma_stat rib_conn_release_locked(CONN *conn); 323 324 /* 325 * RPCIB addressing operations 326 */ 327 328 /* 329 * RDMA operations the RPCIB module exports 330 */ 331 static rdmaops_t rib_ops = { 332 rib_reachable, 333 rib_conn_get, 334 rib_conn_release, 335 rib_listen, 336 rib_listen_stop, 337 rib_registermem, 338 rib_deregistermem, 339 rib_registermemsync, 340 rib_deregistermemsync, 341 rib_syncmem, 342 rib_reg_buf_alloc, 343 rib_reg_buf_free, 344 rib_send, 345 rib_send_resp, 346 rib_post_resp, 347 rib_post_resp_remove, 348 rib_post_recv, 349 rib_recv, 350 rib_read, 351 rib_write, 352 rib_getinfo, 353 }; 354 355 /* 356 * RDMATF RPCIB plugin details 357 */ 358 static rdma_mod_t rib_mod = { 359 "ibtf", /* api name */ 360 RDMATF_VERS_1, 361 0, 362 &rib_ops, /* rdma op vector for ibtf */ 363 }; 364 365 static rdma_stat rpcib_open_hcas(rpcib_state_t *); 366 static rdma_stat rib_qp_init(rib_qp_t *, int); 367 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *); 368 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *); 369 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *); 370 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *); 371 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num); 372 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t, 373 ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *); 374 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t, 375 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t); 376 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *); 377 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *, 378 rib_qp_t **); 379 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t, 380 rib_qp_t **); 381 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *); 382 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *); 383 static int rib_free_sendwait(struct send_wid *); 384 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid); 385 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd); 386 static void rdma_done_rem_list(rib_qp_t *); 387 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid); 388 389 static void rib_async_handler(void *, 390 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *); 391 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *); 392 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *); 393 static int rib_free_svc_recv(struct svc_recv *); 394 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t); 395 static void rib_free_wid(struct recv_wid *); 396 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *); 397 static void rib_detach_hca(ibt_hca_hdl_t); 398 static void rib_close_a_channel(CONN *); 399 static void rib_send_hold(rib_qp_t *); 400 static void rib_send_rele(rib_qp_t *); 401 402 /* 403 * Registration with IBTF as a consumer 404 */ 405 static struct ibt_clnt_modinfo_s rib_modinfo = { 406 IBTI_V_CURR, 407 IBT_GENERIC, 408 rib_async_handler, /* async event handler */ 409 NULL, /* Memory Region Handler */ 410 "nfs/ib" 411 }; 412 413 /* 414 * Global structure 415 */ 416 417 typedef struct rpcib_s { 418 dev_info_t *rpcib_dip; 419 kmutex_t rpcib_mutex; 420 } rpcib_t; 421 422 rpcib_t rpcib; 423 424 /* 425 * /etc/system controlled variable to control 426 * debugging in rpcib kernel
module. 427 * Set it to a value greater than 1 to control 428 * the amount of debugging messages produced. 429 */ 430 int rib_debug = 0; 431 432 int 433 _init(void) 434 { 435 int error; 436 437 error = mod_install((struct modlinkage *)&rib_modlinkage); 438 if (error != 0) { 439 /* 440 * Could not load module 441 */ 442 return (error); 443 } 444 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL); 445 return (0); 446 } 447 448 int 449 _fini() 450 { 451 int status; 452 453 /* 454 * Remove module 455 */ 456 if ((status = mod_remove(&rib_modlinkage)) != 0) { 457 return (status); 458 } 459 mutex_destroy(&plugin_state_lock); 460 return (0); 461 } 462 463 int 464 _info(struct modinfo *modinfop) 465 { 466 return (mod_info(&rib_modlinkage, modinfop)); 467 } 468 469 /* 470 * rpcib_getinfo() 471 * Given the device number, return the devinfo pointer or the 472 * instance number. 473 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach. 474 */ 475 476 /*ARGSUSED*/ 477 static int 478 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 479 { 480 int ret = DDI_SUCCESS; 481 482 switch (cmd) { 483 case DDI_INFO_DEVT2DEVINFO: 484 if (rpcib.rpcib_dip != NULL) 485 *result = rpcib.rpcib_dip; 486 else { 487 *result = NULL; 488 ret = DDI_FAILURE; 489 } 490 break; 491 492 case DDI_INFO_DEVT2INSTANCE: 493 *result = NULL; 494 break; 495 496 default: 497 ret = DDI_FAILURE; 498 } 499 return (ret); 500 } 501 502 static void 503 rpcib_free_hca_list() 504 { 505 rib_hca_t *hca, *hcap; 506 507 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); 508 hca = rib_stat->hcas_list; 509 rib_stat->hcas_list = NULL; 510 rw_exit(&rib_stat->hcas_list_lock); 511 while (hca != NULL) { 512 rw_enter(&hca->state_lock, RW_WRITER); 513 hcap = hca; 514 hca = hca->next; 515 rib_stat->nhca_inited--; 516 rib_mod.rdma_count--; 517 hcap->state = HCA_DETACHED; 518 rw_exit(&hcap->state_lock); 519 rib_stop_hca_services(hcap); 520 521 kmem_free(hcap, sizeof (*hcap)); 522 } 523 } 524 525 static rdma_stat 526 rpcib_free_service_list() 527 { 528 rib_service_t *service; 529 ibt_status_t ret; 530 531 rw_enter(&rib_stat->service_list_lock, RW_WRITER); 532 while (rib_stat->service_list != NULL) { 533 service = rib_stat->service_list; 534 ret = ibt_unbind_all_services(service->srv_hdl); 535 if (ret != IBT_SUCCESS) { 536 rw_exit(&rib_stat->service_list_lock); 537 #ifdef DEBUG 538 cmn_err(CE_NOTE, "rpcib_free_service_list: " 539 "ibt_unbind_all_services failed (%d)\n", (int)ret); 540 #endif 541 return (RDMA_FAILED); 542 } 543 ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl, 544 service->srv_hdl); 545 if (ret != IBT_SUCCESS) { 546 rw_exit(&rib_stat->service_list_lock); 547 #ifdef DEBUG 548 cmn_err(CE_NOTE, "rpcib_free_service_list: " 549 "ibt_deregister_service failed (%d)\n", (int)ret); 550 #endif 551 return (RDMA_FAILED); 552 } 553 rib_stat->service_list = service->next; 554 kmem_free(service, sizeof (rib_service_t)); 555 } 556 rw_exit(&rib_stat->service_list_lock); 557 558 return (RDMA_SUCCESS); 559 } 560 561 static int 562 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 563 { 564 ibt_status_t ibt_status; 565 rdma_stat r_status; 566 567 switch (cmd) { 568 case DDI_ATTACH: 569 break; 570 case DDI_RESUME: 571 return (DDI_SUCCESS); 572 default: 573 return (DDI_FAILURE); 574 } 575 576 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL); 577 578 mutex_enter(&rpcib.rpcib_mutex); 579 if (rpcib.rpcib_dip != NULL) { 580 mutex_exit(&rpcib.rpcib_mutex); 581 return (DDI_FAILURE); 582 } 583 rpcib.rpcib_dip = dip; 584
mutex_exit(&rpcib.rpcib_mutex); 585 /* 586 * Create the "rpcib" minor-node. 587 */ 588 if (ddi_create_minor_node(dip, 589 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { 590 /* Error message, no cmn_err as they print on console */ 591 return (DDI_FAILURE); 592 } 593 594 if (rib_stat == NULL) { 595 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); 596 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); 597 rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL); 598 mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL); 599 } 600 601 rib_stat->hca_count = ibt_get_hca_list(NULL); 602 if (rib_stat->hca_count < 1) { 603 mutex_destroy(&rib_stat->listen_lock); 604 rw_destroy(&rib_stat->hcas_list_lock); 605 mutex_destroy(&rib_stat->open_hca_lock); 606 kmem_free(rib_stat, sizeof (*rib_stat)); 607 rib_stat = NULL; 608 return (DDI_FAILURE); 609 } 610 611 ibt_status = ibt_attach(&rib_modinfo, dip, 612 (void *)rib_stat, &rib_stat->ibt_clnt_hdl); 613 614 if (ibt_status != IBT_SUCCESS) { 615 mutex_destroy(&rib_stat->listen_lock); 616 rw_destroy(&rib_stat->hcas_list_lock); 617 mutex_destroy(&rib_stat->open_hca_lock); 618 kmem_free(rib_stat, sizeof (*rib_stat)); 619 rib_stat = NULL; 620 return (DDI_FAILURE); 621 } 622 623 rib_stat->service_list = NULL; 624 rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL); 625 mutex_enter(&rib_stat->open_hca_lock); 626 if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) { 627 mutex_exit(&rib_stat->open_hca_lock); 628 goto open_fail; 629 } 630 mutex_exit(&rib_stat->open_hca_lock); 631 632 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 633 DDI_PROP_SUCCESS) { 634 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update " 635 "failed."); 636 goto register_fail; 637 } 638 639 /* 640 * Register with rdmatf 641 */ 642 r_status = rdma_register_mod(&rib_mod); 643 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { 644 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, " 645 "status = %d", r_status); 646 goto register_fail; 647 } 648 649 return (DDI_SUCCESS); 650 651 register_fail: 652 653 open_fail: 654 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 655 rpcib_free_hca_list(); 656 (void) rpcib_free_service_list(); 657 mutex_destroy(&rib_stat->listen_lock); 658 rw_destroy(&rib_stat->hcas_list_lock); 659 mutex_destroy(&rib_stat->open_hca_lock); 660 rw_destroy(&rib_stat->service_list_lock); 661 kmem_free(rib_stat, sizeof (*rib_stat)); 662 rib_stat = NULL; 663 return (DDI_FAILURE); 664 } 665 666 /*ARGSUSED*/ 667 static int 668 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 669 { 670 switch (cmd) { 671 672 case DDI_DETACH: 673 break; 674 675 case DDI_SUSPEND: 676 default: 677 return (DDI_FAILURE); 678 } 679 680 /* 681 * Detach the hca and free resources 682 */ 683 mutex_enter(&plugin_state_lock); 684 plugin_state = NO_ACCEPT; 685 mutex_exit(&plugin_state_lock); 686 687 if (rpcib_free_service_list() != RDMA_SUCCESS) 688 return (DDI_FAILURE); 689 rpcib_free_hca_list(); 690 691 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 692 mutex_destroy(&rib_stat->listen_lock); 693 rw_destroy(&rib_stat->hcas_list_lock); 694 mutex_destroy(&rib_stat->open_hca_lock); 695 rw_destroy(&rib_stat->service_list_lock); 696 697 kmem_free(rib_stat, sizeof (*rib_stat)); 698 rib_stat = NULL; 699 700 mutex_enter(&rpcib.rpcib_mutex); 701 rpcib.rpcib_dip = NULL; 702 mutex_exit(&rpcib.rpcib_mutex); 703 mutex_destroy(&rpcib.rpcib_mutex); 704 return (DDI_SUCCESS); 705 } 706 707 708 static void rib_rbufpool_free(rib_hca_t *, int); 709 static 
void rib_rbufpool_deregister(rib_hca_t *, int); 710 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype); 711 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); 712 static rdma_stat rib_rem_replylist(rib_qp_t *); 713 static int rib_remreply(rib_qp_t *, struct reply *); 714 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); 715 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); 716 717 718 /* 719 * One CQ pair per HCA 720 */ 721 static rdma_stat 722 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, 723 rib_cq_t **cqp) 724 { 725 rib_cq_t *cq; 726 ibt_cq_attr_t cq_attr; 727 uint32_t real_size; 728 ibt_status_t status; 729 rdma_stat error = RDMA_SUCCESS; 730 731 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); 732 cq->rib_hca = hca; 733 cq_attr.cq_size = cq_size; 734 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 735 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, 736 &real_size); 737 if (status != IBT_SUCCESS) { 738 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed," 739 " status=%d", status); 740 error = RDMA_FAILED; 741 goto fail; 742 } 743 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca); 744 745 /* 746 * Enable CQ callbacks. CQ Callbacks are single shot 747 * (e.g. you have to call ibt_enable_cq_notify() 748 * after each callback to get another one). 749 */ 750 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION); 751 if (status != IBT_SUCCESS) { 752 cmn_err(CE_WARN, "rib_create_cq: " 753 "enable_cq_notify failed, status %d", status); 754 error = RDMA_FAILED; 755 goto fail; 756 } 757 *cqp = cq; 758 759 return (error); 760 fail: 761 if (cq->rib_cq_hdl) 762 (void) ibt_free_cq(cq->rib_cq_hdl); 763 if (cq) 764 kmem_free(cq, sizeof (rib_cq_t)); 765 return (error); 766 } 767 768 /* 769 * rpcib_find_hca 770 * 771 * Caller should have already locked the hcas_lock before calling 772 * this function. 773 */ 774 static rib_hca_t * 775 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid) 776 { 777 rib_hca_t *hca = ribstat->hcas_list; 778 779 while (hca && hca->hca_guid != guid) 780 hca = hca->next; 781 782 return (hca); 783 } 784 785 static rdma_stat 786 rpcib_open_hcas(rpcib_state_t *ribstat) 787 { 788 rib_hca_t *hca; 789 ibt_status_t ibt_status; 790 rdma_stat status; 791 ibt_hca_portinfo_t *pinfop; 792 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; 793 uint_t size, cq_size; 794 int i; 795 kstat_t *ksp; 796 cache_avl_struct_t example_avl_node; 797 char rssc_name[32]; 798 int old_nhca_inited = ribstat->nhca_inited; 799 ib_guid_t *hca_guids; 800 801 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); 802 803 ribstat->hca_count = ibt_get_hca_list(&hca_guids); 804 if (ribstat->hca_count == 0) 805 return (RDMA_FAILED); 806 807 rw_enter(&ribstat->hcas_list_lock, RW_WRITER); 808 /* 809 * Open a hca and setup for RDMA 810 */ 811 for (i = 0; i < ribstat->hca_count; i++) { 812 if (rpcib_find_hca(ribstat, hca_guids[i])) 813 continue; 814 hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP); 815 816 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, 817 hca_guids[i], &hca->hca_hdl); 818 if (ibt_status != IBT_SUCCESS) { 819 kmem_free(hca, sizeof (rib_hca_t)); 820 continue; 821 } 822 hca->hca_guid = hca_guids[i]; 823 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; 824 hca->state = HCA_INITED; 825 826 /* 827 * query HCA info 828 */ 829 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs); 830 if (ibt_status != IBT_SUCCESS) { 831 goto fail1; 832 } 833 834 /* 835 * One PD (Protection Domain) per HCA. 
836 * A qp is allowed to access a memory region 837 * only when it's in the same PD as that of 838 * the memory region. 839 */ 840 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl); 841 if (ibt_status != IBT_SUCCESS) { 842 goto fail1; 843 } 844 845 /* 846 * query HCA ports 847 */ 848 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 849 0, &pinfop, &hca->hca_nports, &size); 850 if (ibt_status != IBT_SUCCESS) { 851 goto fail2; 852 } 853 hca->hca_ports = pinfop; 854 hca->hca_pinfosz = size; 855 pinfop = NULL; 856 857 cq_size = DEF_CQ_SIZE; /* default cq size */ 858 /* 859 * Create 2 pairs of cq's (1 pair for client 860 * and the other pair for server) on this hca. 861 * If number of qp's gets too large, then several 862 * cq's will be needed. 863 */ 864 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, 865 &hca->svc_rcq); 866 if (status != RDMA_SUCCESS) { 867 goto fail3; 868 } 869 870 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, 871 &hca->svc_scq); 872 if (status != RDMA_SUCCESS) { 873 goto fail3; 874 } 875 876 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, 877 &hca->clnt_rcq); 878 if (status != RDMA_SUCCESS) { 879 goto fail3; 880 } 881 882 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, 883 &hca->clnt_scq); 884 if (status != RDMA_SUCCESS) { 885 goto fail3; 886 } 887 888 /* 889 * Create buffer pools. 890 * Note rib_rbuf_create also allocates memory windows. 891 */ 892 hca->recv_pool = rib_rbufpool_create(hca, 893 RECV_BUFFER, rib_max_rbufs); 894 if (hca->recv_pool == NULL) { 895 goto fail3; 896 } 897 898 hca->send_pool = rib_rbufpool_create(hca, 899 SEND_BUFFER, rib_max_rbufs); 900 if (hca->send_pool == NULL) { 901 rib_rbufpool_destroy(hca, RECV_BUFFER); 902 goto fail3; 903 } 904 905 if (hca->server_side_cache == NULL) { 906 (void) sprintf(rssc_name, 907 "rib_srvr_cache_%llx", 908 (long long unsigned int) hca->hca_guid); 909 hca->server_side_cache = kmem_cache_create( 910 rssc_name, 911 sizeof (cache_avl_struct_t), 0, 912 NULL, 913 NULL, 914 rib_server_side_cache_reclaim, 915 hca, NULL, 0); 916 } 917 918 avl_create(&hca->avl_tree, 919 avl_compare, 920 sizeof (cache_avl_struct_t), 921 (uint_t)(uintptr_t)&example_avl_node.avl_link- 922 (uint_t)(uintptr_t)&example_avl_node); 923 924 rw_init(&hca->bound_services_lock, NULL, RW_DRIVER, 925 hca->iblock); 926 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); 927 rw_init(&hca->avl_rw_lock, 928 NULL, RW_DRIVER, hca->iblock); 929 mutex_init(&hca->cache_allocation_lock, 930 NULL, MUTEX_DRIVER, NULL); 931 hca->avl_init = TRUE; 932 933 /* Create kstats for the cache */ 934 ASSERT(INGLOBALZONE(curproc)); 935 936 if (!stats_enabled) { 937 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc", 938 KSTAT_TYPE_NAMED, 939 sizeof (rpcib_kstat) / sizeof (kstat_named_t), 940 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, 941 GLOBAL_ZONEID); 942 if (ksp) { 943 ksp->ks_data = (void *) &rpcib_kstat; 944 ksp->ks_update = rpcib_cache_kstat_update; 945 kstat_install(ksp); 946 stats_enabled = TRUE; 947 } 948 } 949 if (hca->cleanup_helper == NULL) { 950 char tq_name[sizeof (hca->hca_guid) * 2 + 1]; 951 952 (void) snprintf(tq_name, sizeof (tq_name), "%llX", 953 (unsigned long long int) hca->hca_guid); 954 hca->cleanup_helper = ddi_taskq_create(NULL, 955 tq_name, 1, TASKQ_DEFAULTPRI, 0); 956 } 957 958 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 959 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); 960 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, 961 hca->iblock); 962 
rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, 963 hca->iblock); 964 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); 965 hca->inuse = TRUE; 966 967 hca->next = ribstat->hcas_list; 968 ribstat->hcas_list = hca; 969 ribstat->nhca_inited++; 970 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 971 continue; 972 973 fail3: 974 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 975 fail2: 976 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 977 fail1: 978 (void) ibt_close_hca(hca->hca_hdl); 979 kmem_free(hca, sizeof (rib_hca_t)); 980 } 981 rw_exit(&ribstat->hcas_list_lock); 982 ibt_free_hca_list(hca_guids, ribstat->hca_count); 983 rib_mod.rdma_count = rib_stat->nhca_inited; 984 985 /* 986 * return success if at least one new hca has been configured. 987 */ 988 if (ribstat->nhca_inited != old_nhca_inited) 989 return (RDMA_SUCCESS); 990 else 991 return (RDMA_FAILED); 992 } 993 994 /* 995 * Callback routines 996 */ 997 998 /* 999 * SCQ handlers 1000 */ 1001 /* ARGSUSED */ 1002 static void 1003 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1004 { 1005 ibt_status_t ibt_status; 1006 ibt_wc_t wc; 1007 struct send_wid *wd; 1008 CONN *conn; 1009 rib_qp_t *qp; 1010 int i; 1011 1012 /* 1013 * Re-enable cq notify here to avoid missing any 1014 * completion queue notification. 1015 */ 1016 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1017 1018 ibt_status = IBT_SUCCESS; 1019 while (ibt_status != IBT_CQ_EMPTY) { 1020 bzero(&wc, sizeof (wc)); 1021 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1022 if (ibt_status != IBT_SUCCESS) 1023 return; 1024 1025 /* 1026 * Got a send completion 1027 */ 1028 if (wc.wc_id != RDMA_DUMMY_WRID) { 1029 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1030 qp = wd->qp; 1031 conn = qptoc(qp); 1032 1033 mutex_enter(&wd->sendwait_lock); 1034 switch (wc.wc_status) { 1035 case IBT_WC_SUCCESS: 1036 wd->status = RDMA_SUCCESS; 1037 break; 1038 default: 1039 /* 1040 * RC Send Q Error Code Local state Remote State 1041 * ==================== =========== ============ 1042 * IBT_WC_BAD_RESPONSE_ERR ERROR None 1043 * IBT_WC_LOCAL_LEN_ERR ERROR None 1044 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None 1045 * IBT_WC_LOCAL_PROTECT_ERR ERROR None 1046 * IBT_WC_MEM_WIN_BIND_ERR ERROR None 1047 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR 1048 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR 1049 * IBT_WC_REMOTE_OP_ERR ERROR ERROR 1050 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None 1051 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None 1052 * IBT_WC_WR_FLUSHED_ERR ERROR None 1053 */ 1054 /* 1055 * Channel in error state. Set connection to 1056 * ERROR and cleanup will happen either from 1057 * conn_release or from rib_conn_get 1058 */ 1059 wd->status = RDMA_FAILED; 1060 mutex_enter(&conn->c_lock); 1061 if (conn->c_state != C_DISCONN_PEND) 1062 conn->c_state = C_ERROR_CONN; 1063 mutex_exit(&conn->c_lock); 1064 break; 1065 } 1066 1067 if (wd->cv_sig == 1) { 1068 /* 1069 * Notify poster 1070 */ 1071 cv_signal(&wd->wait_cv); 1072 mutex_exit(&wd->sendwait_lock); 1073 } else { 1074 /* 1075 * Poster not waiting for notification. 
1076 * Free the send buffers and send_wid 1077 */ 1078 for (i = 0; i < wd->nsbufs; i++) { 1079 rib_rbuf_free(qptoc(wd->qp), 1080 SEND_BUFFER, 1081 (void *)(uintptr_t)wd->sbufaddr[i]); 1082 } 1083 1084 /* decrement the send ref count */ 1085 rib_send_rele(qp); 1086 1087 mutex_exit(&wd->sendwait_lock); 1088 (void) rib_free_sendwait(wd); 1089 } 1090 } 1091 } 1092 } 1093 1094 /* ARGSUSED */ 1095 static void 1096 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1097 { 1098 ibt_status_t ibt_status; 1099 ibt_wc_t wc; 1100 struct send_wid *wd; 1101 rib_qp_t *qp; 1102 CONN *conn; 1103 int i; 1104 1105 /* 1106 * Re-enable cq notify here to avoid missing any 1107 * completion queue notification. 1108 */ 1109 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1110 1111 ibt_status = IBT_SUCCESS; 1112 while (ibt_status != IBT_CQ_EMPTY) { 1113 bzero(&wc, sizeof (wc)); 1114 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1115 if (ibt_status != IBT_SUCCESS) 1116 return; 1117 1118 /* 1119 * Got a send completion 1120 */ 1121 if (wc.wc_id != RDMA_DUMMY_WRID) { 1122 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1123 qp = wd->qp; 1124 conn = qptoc(qp); 1125 mutex_enter(&wd->sendwait_lock); 1126 1127 switch (wc.wc_status) { 1128 case IBT_WC_SUCCESS: 1129 wd->status = RDMA_SUCCESS; 1130 break; 1131 default: 1132 /* 1133 * Channel in error state. Set connection to 1134 * ERROR and cleanup will happen either from 1135 * conn_release or conn timeout. 1136 */ 1137 wd->status = RDMA_FAILED; 1138 mutex_enter(&conn->c_lock); 1139 if (conn->c_state != C_DISCONN_PEND) 1140 conn->c_state = C_ERROR_CONN; 1141 mutex_exit(&conn->c_lock); 1142 break; 1143 } 1144 1145 if (wd->cv_sig == 1) { 1146 /* 1147 * Update completion status and notify poster 1148 */ 1149 cv_signal(&wd->wait_cv); 1150 mutex_exit(&wd->sendwait_lock); 1151 } else { 1152 /* 1153 * Poster not waiting for notification. 1154 * Free the send buffers and send_wid 1155 */ 1156 for (i = 0; i < wd->nsbufs; i++) { 1157 rib_rbuf_free(qptoc(wd->qp), 1158 SEND_BUFFER, 1159 (void *)(uintptr_t)wd->sbufaddr[i]); 1160 } 1161 1162 /* decrement the send ref count */ 1163 rib_send_rele(qp); 1164 1165 mutex_exit(&wd->sendwait_lock); 1166 (void) rib_free_sendwait(wd); 1167 } 1168 } 1169 } 1170 } 1171 1172 /* 1173 * RCQ handler 1174 */ 1175 /* ARGSUSED */ 1176 static void 1177 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1178 { 1179 rib_qp_t *qp; 1180 ibt_status_t ibt_status; 1181 ibt_wc_t wc; 1182 struct recv_wid *rwid; 1183 1184 /* 1185 * Re-enable cq notify here to avoid missing any 1186 * completion queue notification. 1187 */ 1188 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1189 1190 ibt_status = IBT_SUCCESS; 1191 while (ibt_status != IBT_CQ_EMPTY) { 1192 bzero(&wc, sizeof (wc)); 1193 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1194 if (ibt_status != IBT_SUCCESS) 1195 return; 1196 1197 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id; 1198 qp = rwid->qp; 1199 1200 if (wc.wc_status == IBT_WC_SUCCESS) { 1201 XDR inxdrs, *xdrs; 1202 uint_t xid, vers, op, find_xid = 0; 1203 struct reply *r; 1204 CONN *conn = qptoc(qp); 1205 uint32_t rdma_credit = 0; 1206 1207 xdrs = &inxdrs; 1208 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr, 1209 wc.wc_bytes_xfer, XDR_DECODE); 1210 /* 1211 * Treat xid as opaque (xid is the first entity 1212 * in the rpc rdma message). 1213 */ 1214 xid = *(uint32_t *)(uintptr_t)rwid->addr; 1215 1216 /* Skip xid and set the xdr position accordingly. 
*/ 1217 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1218 (void) xdr_u_int(xdrs, &vers); 1219 (void) xdr_u_int(xdrs, &rdma_credit); 1220 (void) xdr_u_int(xdrs, &op); 1221 XDR_DESTROY(xdrs); 1222 1223 if (vers != RPCRDMA_VERS) { 1224 /* 1225 * Invalid RPC/RDMA version. Cannot 1226 * interoperate. Set connection to 1227 * ERROR state and bail out. 1228 */ 1229 mutex_enter(&conn->c_lock); 1230 if (conn->c_state != C_DISCONN_PEND) 1231 conn->c_state = C_ERROR_CONN; 1232 mutex_exit(&conn->c_lock); 1233 rib_rbuf_free(conn, RECV_BUFFER, 1234 (void *)(uintptr_t)rwid->addr); 1235 rib_free_wid(rwid); 1236 rib_recv_rele(qp); 1237 continue; 1238 } 1239 1240 mutex_enter(&qp->replylist_lock); 1241 for (r = qp->replylist; r != NULL; r = r->next) { 1242 if (r->xid == xid) { 1243 find_xid = 1; 1244 switch (op) { 1245 case RDMA_MSG: 1246 case RDMA_NOMSG: 1247 case RDMA_MSGP: 1248 r->status = RDMA_SUCCESS; 1249 r->vaddr_cq = rwid->addr; 1250 r->bytes_xfer = 1251 wc.wc_bytes_xfer; 1252 cv_signal(&r->wait_cv); 1253 break; 1254 default: 1255 rib_rbuf_free(qptoc(qp), 1256 RECV_BUFFER, 1257 (void *)(uintptr_t) 1258 rwid->addr); 1259 break; 1260 } 1261 break; 1262 } 1263 } 1264 mutex_exit(&qp->replylist_lock); 1265 if (find_xid == 0) { 1266 /* RPC caller not waiting for reply */ 1267 1268 DTRACE_PROBE1(rpcib__i__nomatchxid1, 1269 int, xid); 1270 1271 rib_rbuf_free(qptoc(qp), RECV_BUFFER, 1272 (void *)(uintptr_t)rwid->addr); 1273 } 1274 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) { 1275 CONN *conn = qptoc(qp); 1276 1277 /* 1278 * Connection being flushed. Just free 1279 * the posted buffer 1280 */ 1281 rib_rbuf_free(conn, RECV_BUFFER, 1282 (void *)(uintptr_t)rwid->addr); 1283 } else { 1284 CONN *conn = qptoc(qp); 1285 /* 1286 * RC Recv Q Error Code Local state Remote State 1287 * ==================== =========== ============ 1288 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd 1289 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd 1290 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd 1291 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd 1292 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd 1293 * IBT_WC_WR_FLUSHED_ERR None None 1294 */ 1295 /* 1296 * Channel in error state. Set connection 1297 * in ERROR state. 1298 */ 1299 mutex_enter(&conn->c_lock); 1300 if (conn->c_state != C_DISCONN_PEND) 1301 conn->c_state = C_ERROR_CONN; 1302 mutex_exit(&conn->c_lock); 1303 rib_rbuf_free(conn, RECV_BUFFER, 1304 (void *)(uintptr_t)rwid->addr); 1305 } 1306 rib_free_wid(rwid); 1307 rib_recv_rele(qp); 1308 } 1309 } 1310 1311 /* Server side */ 1312 /* ARGSUSED */ 1313 static void 1314 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1315 { 1316 rdma_recv_data_t *rdp; 1317 rib_qp_t *qp; 1318 ibt_status_t ibt_status; 1319 ibt_wc_t wc; 1320 struct svc_recv *s_recvp; 1321 CONN *conn; 1322 mblk_t *mp; 1323 1324 /* 1325 * Re-enable cq notify here to avoid missing any 1326 * completion queue notification. 
1327 */ 1328 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1329 1330 ibt_status = IBT_SUCCESS; 1331 while (ibt_status != IBT_CQ_EMPTY) { 1332 bzero(&wc, sizeof (wc)); 1333 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1334 if (ibt_status != IBT_SUCCESS) 1335 return; 1336 1337 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id; 1338 qp = s_recvp->qp; 1339 conn = qptoc(qp); 1340 1341 if (wc.wc_status == IBT_WC_SUCCESS) { 1342 XDR inxdrs, *xdrs; 1343 uint_t xid, vers, op; 1344 uint32_t rdma_credit; 1345 1346 xdrs = &inxdrs; 1347 /* s_recvp->vaddr stores data */ 1348 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr, 1349 wc.wc_bytes_xfer, XDR_DECODE); 1350 1351 /* 1352 * Treat xid as opaque (xid is the first entity 1353 * in the rpc rdma message). 1354 */ 1355 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr; 1356 /* Skip xid and set the xdr position accordingly. */ 1357 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1358 if (!xdr_u_int(xdrs, &vers) || 1359 !xdr_u_int(xdrs, &rdma_credit) || 1360 !xdr_u_int(xdrs, &op)) { 1361 rib_rbuf_free(conn, RECV_BUFFER, 1362 (void *)(uintptr_t)s_recvp->vaddr); 1363 XDR_DESTROY(xdrs); 1364 rib_recv_rele(qp); 1365 (void) rib_free_svc_recv(s_recvp); 1366 continue; 1367 } 1368 XDR_DESTROY(xdrs); 1369 1370 if (vers != RPCRDMA_VERS) { 1371 /* 1372 * Invalid RPC/RDMA version. 1373 * Drop rpc rdma message. 1374 */ 1375 rib_rbuf_free(conn, RECV_BUFFER, 1376 (void *)(uintptr_t)s_recvp->vaddr); 1377 rib_recv_rele(qp); 1378 (void) rib_free_svc_recv(s_recvp); 1379 continue; 1380 } 1381 /* 1382 * Is this for RDMA_DONE? 1383 */ 1384 if (op == RDMA_DONE) { 1385 rib_rbuf_free(conn, RECV_BUFFER, 1386 (void *)(uintptr_t)s_recvp->vaddr); 1387 /* 1388 * Wake up the thread waiting on 1389 * a RDMA_DONE for xid 1390 */ 1391 mutex_enter(&qp->rdlist_lock); 1392 rdma_done_notify(qp, xid); 1393 mutex_exit(&qp->rdlist_lock); 1394 rib_recv_rele(qp); 1395 (void) rib_free_svc_recv(s_recvp); 1396 continue; 1397 } 1398 1399 mutex_enter(&plugin_state_lock); 1400 if (plugin_state == ACCEPT) { 1401 while ((mp = allocb(sizeof (*rdp), BPRI_LO)) 1402 == NULL) 1403 (void) strwaitbuf( 1404 sizeof (*rdp), BPRI_LO); 1405 /* 1406 * Plugin is in accept state, hence the master 1407 * transport queue for this is still accepting 1408 * requests. Hence we can call svc_queuereq to 1409 * queue this received msg. 1410 */ 1411 rdp = (rdma_recv_data_t *)mp->b_rptr; 1412 rdp->conn = conn; 1413 rdp->rpcmsg.addr = 1414 (caddr_t)(uintptr_t)s_recvp->vaddr; 1415 rdp->rpcmsg.type = RECV_BUFFER; 1416 rdp->rpcmsg.len = wc.wc_bytes_xfer; 1417 rdp->status = wc.wc_status; 1418 mutex_enter(&conn->c_lock); 1419 conn->c_ref++; 1420 mutex_exit(&conn->c_lock); 1421 mp->b_wptr += sizeof (*rdp); 1422 svc_queuereq((queue_t *)rib_stat->q, mp); 1423 mutex_exit(&plugin_state_lock); 1424 } else { 1425 /* 1426 * The master transport for this is going 1427 * away and the queue is not accepting any more 1428 * requests for krpc, so don't do anything, just 1429 * free the msg.
1430 */ 1431 mutex_exit(&plugin_state_lock); 1432 rib_rbuf_free(conn, RECV_BUFFER, 1433 (void *)(uintptr_t)s_recvp->vaddr); 1434 } 1435 } else { 1436 rib_rbuf_free(conn, RECV_BUFFER, 1437 (void *)(uintptr_t)s_recvp->vaddr); 1438 } 1439 rib_recv_rele(qp); 1440 (void) rib_free_svc_recv(s_recvp); 1441 } 1442 } 1443 1444 static void 1445 rib_attach_hca() 1446 { 1447 mutex_enter(&rib_stat->open_hca_lock); 1448 (void) rpcib_open_hcas(rib_stat); 1449 rib_listen(NULL); 1450 mutex_exit(&rib_stat->open_hca_lock); 1451 } 1452 1453 /* 1454 * Handles DR event of IBT_HCA_DETACH_EVENT. 1455 */ 1456 /* ARGSUSED */ 1457 static void 1458 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 1459 ibt_async_code_t code, ibt_async_event_t *event) 1460 { 1461 switch (code) { 1462 case IBT_HCA_ATTACH_EVENT: 1463 rib_attach_hca(); 1464 break; 1465 case IBT_HCA_DETACH_EVENT: 1466 rib_detach_hca(hca_hdl); 1467 #ifdef DEBUG 1468 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); 1469 #endif 1470 break; 1471 case IBT_EVENT_PORT_UP: 1472 /* 1473 * A port is up. We should call rib_listen() since there is 1474 * a chance that rib_listen() may have failed during 1475 * rib_attach_hca() because the port had not been up yet. 1476 */ 1477 rib_listen(NULL); 1478 #ifdef DEBUG 1479 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); 1480 #endif 1481 break; 1482 #ifdef DEBUG 1483 case IBT_EVENT_PATH_MIGRATED: 1484 cmn_err(CE_NOTE, "rib_async_handler(): " 1485 "IBT_EVENT_PATH_MIGRATED\n"); 1486 break; 1487 case IBT_EVENT_SQD: 1488 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n"); 1489 break; 1490 case IBT_EVENT_COM_EST: 1491 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n"); 1492 break; 1493 case IBT_ERROR_CATASTROPHIC_CHAN: 1494 cmn_err(CE_NOTE, "rib_async_handler(): " 1495 "IBT_ERROR_CATASTROPHIC_CHAN\n"); 1496 break; 1497 case IBT_ERROR_INVALID_REQUEST_CHAN: 1498 cmn_err(CE_NOTE, "rib_async_handler(): " 1499 "IBT_ERROR_INVALID_REQUEST_CHAN\n"); 1500 break; 1501 case IBT_ERROR_ACCESS_VIOLATION_CHAN: 1502 cmn_err(CE_NOTE, "rib_async_handler(): " 1503 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n"); 1504 break; 1505 case IBT_ERROR_PATH_MIGRATE_REQ: 1506 cmn_err(CE_NOTE, "rib_async_handler(): " 1507 "IBT_ERROR_PATH_MIGRATE_REQ\n"); 1508 break; 1509 case IBT_ERROR_CQ: 1510 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n"); 1511 break; 1512 case IBT_ERROR_PORT_DOWN: 1513 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); 1514 break; 1515 case IBT_ASYNC_OPAQUE1: 1516 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); 1517 break; 1518 case IBT_ASYNC_OPAQUE2: 1519 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n"); 1520 break; 1521 case IBT_ASYNC_OPAQUE3: 1522 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n"); 1523 break; 1524 case IBT_ASYNC_OPAQUE4: 1525 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n"); 1526 break; 1527 #endif 1528 default: 1529 break; 1530 } 1531 } 1532 1533 /* 1534 * Client's reachable function. 
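* rib_reachable() simply attempts a full connection to the server through rib_connect(); on success it hands back the HCA recorded in the ping record as the opaque handle and releases the connection reference it acquired.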
1535 */ 1536 static rdma_stat 1537 rib_reachable(int addr_type, struct netbuf *raddr, void **handle) 1538 { 1539 rdma_stat status; 1540 rpcib_ping_t rpt; 1541 struct netbuf saddr; 1542 CONN *conn; 1543 1544 bzero(&saddr, sizeof (struct netbuf)); 1545 status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn); 1546 1547 if (status == RDMA_SUCCESS) { 1548 *handle = (void *)rpt.hca; 1549 /* release the reference */ 1550 (void) rib_conn_release(conn); 1551 return (RDMA_SUCCESS); 1552 } else { 1553 *handle = NULL; 1554 DTRACE_PROBE(rpcib__i__pingfailed); 1555 return (RDMA_FAILED); 1556 } 1557 } 1558 1559 /* Client side qp creation */ 1560 static rdma_stat 1561 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) 1562 { 1563 rib_qp_t *kqp = NULL; 1564 CONN *conn; 1565 rdma_clnt_cred_ctrl_t *cc_info; 1566 1567 ASSERT(qp != NULL); 1568 *qp = NULL; 1569 1570 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1571 conn = qptoc(kqp); 1572 kqp->hca = hca; 1573 kqp->rdmaconn.c_rdmamod = &rib_mod; 1574 kqp->rdmaconn.c_private = (caddr_t)kqp; 1575 1576 kqp->mode = RIB_CLIENT; 1577 kqp->chan_flags = IBT_BLOCKING; 1578 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); 1579 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); 1580 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; 1581 /* 1582 * Initialize 1583 */ 1584 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1585 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1586 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1587 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1588 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1589 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); 1590 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1591 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1592 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1593 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1594 /* 1595 * Initialize the client credit control 1596 * portion of the rdmaconn struct. 
1597 */ 1598 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT; 1599 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1600 cc_info->clnt_cc_granted_ops = 0; 1601 cc_info->clnt_cc_in_flight_ops = 0; 1602 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL); 1603 1604 *qp = kqp; 1605 return (RDMA_SUCCESS); 1606 } 1607 1608 /* Server side qp creation */ 1609 static rdma_stat 1610 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) 1611 { 1612 rib_qp_t *kqp = NULL; 1613 ibt_chan_sizes_t chan_sizes; 1614 ibt_rc_chan_alloc_args_t qp_attr; 1615 ibt_status_t ibt_status; 1616 rdma_srv_cred_ctrl_t *cc_info; 1617 1618 *qp = NULL; 1619 1620 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1621 kqp->hca = hca; 1622 kqp->port_num = port; 1623 kqp->rdmaconn.c_rdmamod = &rib_mod; 1624 kqp->rdmaconn.c_private = (caddr_t)kqp; 1625 1626 /* 1627 * Create the qp handle 1628 */ 1629 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1630 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; 1631 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; 1632 qp_attr.rc_pd = hca->pd_hdl; 1633 qp_attr.rc_hca_port_num = port; 1634 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1635 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1636 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1637 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1638 qp_attr.rc_clone_chan = NULL; 1639 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1640 qp_attr.rc_flags = IBT_WR_SIGNALED; 1641 1642 rw_enter(&hca->state_lock, RW_READER); 1643 if (hca->state != HCA_DETACHED) { 1644 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1645 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl, 1646 &chan_sizes); 1647 } else { 1648 rw_exit(&hca->state_lock); 1649 goto fail; 1650 } 1651 rw_exit(&hca->state_lock); 1652 1653 if (ibt_status != IBT_SUCCESS) { 1654 DTRACE_PROBE1(rpcib__i_svccreatechanfail, 1655 int, ibt_status); 1656 goto fail; 1657 } 1658 1659 kqp->mode = RIB_SERVER; 1660 kqp->chan_flags = IBT_BLOCKING; 1661 kqp->q = q; /* server ONLY */ 1662 1663 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1664 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1665 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1666 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1667 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1668 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1669 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1670 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1671 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1672 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1673 /* 1674 * Set the private data area to qp to be used in callbacks 1675 */ 1676 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); 1677 kqp->rdmaconn.c_state = C_CONNECTED; 1678 1679 /* 1680 * Initialize the server credit control 1681 * portion of the rdmaconn struct. 
1682 */ 1683 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV; 1684 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc; 1685 cc_info->srv_cc_buffers_granted = preposted_rbufs; 1686 cc_info->srv_cc_cur_buffers_used = 0; 1687 cc_info->srv_cc_posted = preposted_rbufs; 1688 1689 *qp = kqp; 1690 1691 return (RDMA_SUCCESS); 1692 fail: 1693 if (kqp) 1694 kmem_free(kqp, sizeof (rib_qp_t)); 1695 1696 return (RDMA_FAILED); 1697 } 1698 1699 /* ARGSUSED */ 1700 ibt_cm_status_t 1701 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event, 1702 ibt_cm_return_args_t *ret_args, void *priv_data, 1703 ibt_priv_data_len_t len) 1704 { 1705 rib_hca_t *hca; 1706 1707 hca = (rib_hca_t *)clnt_hdl; 1708 1709 switch (event->cm_type) { 1710 1711 /* got a connection close event */ 1712 case IBT_CM_EVENT_CONN_CLOSED: 1713 { 1714 CONN *conn; 1715 rib_qp_t *qp; 1716 1717 /* check reason why connection was closed */ 1718 switch (event->cm_event.closed) { 1719 case IBT_CM_CLOSED_DREP_RCVD: 1720 case IBT_CM_CLOSED_DREQ_TIMEOUT: 1721 case IBT_CM_CLOSED_DUP: 1722 case IBT_CM_CLOSED_ABORT: 1723 case IBT_CM_CLOSED_ALREADY: 1724 /* 1725 * These cases indicate the local end initiated 1726 * the closing of the channel. Nothing to do here. 1727 */ 1728 break; 1729 default: 1730 /* 1731 * Reason for CONN_CLOSED event must be one of 1732 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 1733 * or IBT_CM_CLOSED_STALE. These indicate cases where 1734 * the remote end is closing the channel. In these 1735 * cases free the channel and transition to the error 1736 * state. 1737 */ 1738 qp = ibt_get_chan_private(event->cm_channel); 1739 conn = qptoc(qp); 1740 mutex_enter(&conn->c_lock); 1741 if (conn->c_state == C_DISCONN_PEND) { 1742 mutex_exit(&conn->c_lock); 1743 break; 1744 } 1745 1746 conn->c_state = C_ERROR_CONN; 1747 1748 /* 1749 * Free the conn if c_ref is down to 0 already 1750 */ 1751 if (conn->c_ref == 0) { 1752 /* 1753 * Remove from list and free conn 1754 */ 1755 conn->c_state = C_DISCONN_PEND; 1756 mutex_exit(&conn->c_lock); 1757 rw_enter(&hca->state_lock, RW_READER); 1758 if (hca->state != HCA_DETACHED) 1759 (void) rib_disconnect_channel(conn, 1760 &hca->cl_conn_list); 1761 rw_exit(&hca->state_lock); 1762 } else { 1763 /* 1764 * conn will be freed when c_ref goes to 0. 1765 * Indicate to cleaning thread not to close 1766 * the connection, but just free the channel. 1767 */ 1768 conn->c_flags |= C_CLOSE_NOTNEEDED; 1769 mutex_exit(&conn->c_lock); 1770 } 1771 #ifdef DEBUG 1772 if (rib_debug) 1773 cmn_err(CE_NOTE, "rib_clnt_cm_handler: " 1774 "(CONN_CLOSED) channel disconnected"); 1775 #endif 1776 break; 1777 } 1778 break; 1779 } 1780 default: 1781 break; 1782 } 1783 return (IBT_CM_ACCEPT); 1784 } 1785 1786 /* 1787 * Connect to the server.
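* rib_conn_to_srv() builds the IP CM private data for the destination, allocates a client RC channel on the HCA and opens it with ibt_open_rc_channel(); an open that fails with IBT_CM_CONN_STALE is retried up to REFRESH_ATTEMPTS times.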
1788 */ 1789 rdma_stat 1790 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp) 1791 { 1792 ibt_chan_open_args_t chan_args; /* channel args */ 1793 ibt_chan_sizes_t chan_sizes; 1794 ibt_rc_chan_alloc_args_t qp_attr; 1795 ibt_status_t ibt_status; 1796 ibt_rc_returns_t ret_args; /* conn reject info */ 1797 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */ 1798 ibt_ip_cm_info_t ipcm_info; 1799 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ]; 1800 1801 1802 (void) bzero(&chan_args, sizeof (chan_args)); 1803 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1804 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t)); 1805 1806 ipcm_info.src_addr.family = rptp->srcip.family; 1807 switch (ipcm_info.src_addr.family) { 1808 case AF_INET: 1809 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr; 1810 break; 1811 case AF_INET6: 1812 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr; 1813 break; 1814 } 1815 1816 ipcm_info.dst_addr.family = rptp->srcip.family; 1817 switch (ipcm_info.dst_addr.family) { 1818 case AF_INET: 1819 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr; 1820 break; 1821 case AF_INET6: 1822 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr; 1823 break; 1824 } 1825 1826 ipcm_info.src_port = (in_port_t)nfs_rdma_port; 1827 1828 ibt_status = ibt_format_ip_private_data(&ipcm_info, 1829 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt); 1830 1831 if (ibt_status != IBT_SUCCESS) { 1832 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n"); 1833 return (-1); 1834 } 1835 1836 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num; 1837 /* Alloc a RC channel */ 1838 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; 1839 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; 1840 qp_attr.rc_pd = hca->pd_hdl; 1841 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1842 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1843 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1844 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1845 qp_attr.rc_clone_chan = NULL; 1846 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1847 qp_attr.rc_flags = IBT_WR_SIGNALED; 1848 1849 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port); 1850 chan_args.oc_path = &rptp->path; 1851 1852 chan_args.oc_cm_handler = rib_clnt_cm_handler; 1853 chan_args.oc_cm_clnt_private = (void *)hca; 1854 chan_args.oc_rdma_ra_out = 4; 1855 chan_args.oc_rdma_ra_in = 4; 1856 chan_args.oc_path_retry_cnt = 2; 1857 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; 1858 chan_args.oc_priv_data = cmp_ip_pvt; 1859 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ; 1860 1861 refresh: 1862 rw_enter(&hca->state_lock, RW_READER); 1863 if (hca->state != HCA_DETACHED) { 1864 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1865 IBT_ACHAN_NO_FLAGS, 1866 &qp_attr, &qp->qp_hdl, 1867 &chan_sizes); 1868 } else { 1869 rw_exit(&hca->state_lock); 1870 return (RDMA_FAILED); 1871 } 1872 rw_exit(&hca->state_lock); 1873 1874 if (ibt_status != IBT_SUCCESS) { 1875 DTRACE_PROBE1(rpcib__i_conntosrv, 1876 int, ibt_status); 1877 return (RDMA_FAILED); 1878 } 1879 1880 /* Connect to the Server */ 1881 (void) bzero(&ret_args, sizeof (ret_args)); 1882 mutex_enter(&qp->cb_lock); 1883 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS, 1884 IBT_BLOCKING, &chan_args, &ret_args); 1885 if (ibt_status != IBT_SUCCESS) { 1886 DTRACE_PROBE2(rpcib__i_openrctosrv, 1887 int, ibt_status, int, ret_args.rc_status); 1888 1889 (void) ibt_free_channel(qp->qp_hdl); 1890 qp->qp_hdl = NULL; 1891 mutex_exit(&qp->cb_lock); 1892 if (refresh-- && ibt_status == IBT_CM_FAILURE 
&& 1893 ret_args.rc_status == IBT_CM_CONN_STALE) { 1894 /* 1895 * Got IBT_CM_CONN_STALE probably because of stale 1896 * data on the passive end of a channel that existed 1897 * prior to reboot. Retry establishing a channel 1898 * REFRESH_ATTEMPTS times, during which time the 1899 * stale conditions on the server might clear up. 1900 */ 1901 goto refresh; 1902 } 1903 return (RDMA_FAILED); 1904 } 1905 mutex_exit(&qp->cb_lock); 1906 /* 1907 * Set the private data area to qp to be used in callbacks 1908 */ 1909 ibt_set_chan_private(qp->qp_hdl, (void *)qp); 1910 return (RDMA_SUCCESS); 1911 } 1912 1913 rdma_stat 1914 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) 1915 { 1916 uint_t i, addr_count; 1917 ibt_status_t ibt_status; 1918 uint8_t num_paths_p; 1919 ibt_ip_path_attr_t ipattr; 1920 ibt_path_ip_src_t srcip; 1921 rpcib_ipaddrs_t addrs4; 1922 rpcib_ipaddrs_t addrs6; 1923 struct sockaddr_in *sinp; 1924 struct sockaddr_in6 *sin6p; 1925 rdma_stat retval = RDMA_FAILED; 1926 rib_hca_t *hca; 1927 1928 if ((addr_type != AF_INET) && (addr_type != AF_INET6)) 1929 return (RDMA_INVAL); 1930 ASSERT(raddr->buf != NULL); 1931 1932 bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); 1933 1934 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || 1935 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { 1936 retval = RDMA_FAILED; 1937 goto done2; 1938 } 1939 1940 if (addr_type == AF_INET) { 1941 addr_count = addrs4.ri_count; 1942 sinp = (struct sockaddr_in *)raddr->buf; 1943 rptp->dstip.family = AF_INET; 1944 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr; 1945 sinp = addrs4.ri_list; 1946 } else { 1947 addr_count = addrs6.ri_count; 1948 sin6p = (struct sockaddr_in6 *)raddr->buf; 1949 rptp->dstip.family = AF_INET6; 1950 rptp->dstip.un.ip6addr = sin6p->sin6_addr; 1951 sin6p = addrs6.ri_list; 1952 } 1953 1954 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 1955 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 1956 rw_enter(&hca->state_lock, RW_READER); 1957 if (hca->state == HCA_DETACHED) { 1958 rw_exit(&hca->state_lock); 1959 continue; 1960 } 1961 1962 ipattr.ipa_dst_ip = &rptp->dstip; 1963 ipattr.ipa_hca_guid = hca->hca_guid; 1964 ipattr.ipa_ndst = 1; 1965 ipattr.ipa_max_paths = 1; 1966 ipattr.ipa_src_ip.family = rptp->dstip.family; 1967 for (i = 0; i < addr_count; i++) { 1968 num_paths_p = 0; 1969 if (addr_type == AF_INET) { 1970 ipattr.ipa_src_ip.un.ip4addr = 1971 sinp[i].sin_addr.s_addr; 1972 } else { 1973 ipattr.ipa_src_ip.un.ip6addr = 1974 sin6p[i].sin6_addr; 1975 } 1976 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1977 1978 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1979 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, 1980 &num_paths_p, &srcip); 1981 if (ibt_status == IBT_SUCCESS && 1982 num_paths_p != 0 && 1983 rptp->path.pi_hca_guid == hca->hca_guid) { 1984 rptp->hca = hca; 1985 rw_exit(&hca->state_lock); 1986 if (addr_type == AF_INET) { 1987 rptp->srcip.family = AF_INET; 1988 rptp->srcip.un.ip4addr = 1989 srcip.ip_primary.un.ip4addr; 1990 } else { 1991 rptp->srcip.family = AF_INET6; 1992 rptp->srcip.un.ip6addr = 1993 srcip.ip_primary.un.ip6addr; 1994 1995 } 1996 retval = RDMA_SUCCESS; 1997 goto done1; 1998 } 1999 } 2000 rw_exit(&hca->state_lock); 2001 } 2002 done1: 2003 rw_exit(&rib_stat->hcas_list_lock); 2004 done2: 2005 if (addrs4.ri_size > 0) 2006 kmem_free(addrs4.ri_list, addrs4.ri_size); 2007 if (addrs6.ri_size > 0) 2008 kmem_free(addrs6.ri_list, addrs6.ri_size); 2009 return (retval); 2010 } 2011 2012 /* 2013 * Close channel, remove from connection list and 2014 * free up 
resources allocated for that channel. 2015 */ 2016 rdma_stat 2017 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list) 2018 { 2019 rib_qp_t *qp = ctoqp(conn); 2020 rib_hca_t *hca; 2021 2022 mutex_enter(&conn->c_lock); 2023 if (conn->c_timeout != NULL) { 2024 mutex_exit(&conn->c_lock); 2025 (void) untimeout(conn->c_timeout); 2026 mutex_enter(&conn->c_lock); 2027 } 2028 2029 while (conn->c_flags & C_CLOSE_PENDING) { 2030 cv_wait(&conn->c_cv, &conn->c_lock); 2031 } 2032 mutex_exit(&conn->c_lock); 2033 2034 /* 2035 * c_ref == 0 and connection is in C_DISCONN_PEND 2036 */ 2037 hca = qp->hca; 2038 if (conn_list != NULL) 2039 (void) rib_rm_conn(conn, conn_list); 2040 2041 /* 2042 * There is only one case where we get here with 2043 * qp_hdl = NULL, which is during connection setup on 2044 * the client. In such a case there are no posted 2045 * send/recv buffers. 2046 */ 2047 if (qp->qp_hdl != NULL) { 2048 mutex_enter(&qp->posted_rbufs_lock); 2049 while (qp->n_posted_rbufs) 2050 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock); 2051 mutex_exit(&qp->posted_rbufs_lock); 2052 2053 mutex_enter(&qp->send_rbufs_lock); 2054 while (qp->n_send_rbufs) 2055 cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock); 2056 mutex_exit(&qp->send_rbufs_lock); 2057 2058 (void) ibt_free_channel(qp->qp_hdl); 2059 qp->qp_hdl = NULL; 2060 } 2061 2062 ASSERT(qp->rdlist == NULL); 2063 2064 if (qp->replylist != NULL) { 2065 (void) rib_rem_replylist(qp); 2066 } 2067 2068 cv_destroy(&qp->cb_conn_cv); 2069 cv_destroy(&qp->posted_rbufs_cv); 2070 cv_destroy(&qp->send_rbufs_cv); 2071 mutex_destroy(&qp->cb_lock); 2072 mutex_destroy(&qp->replylist_lock); 2073 mutex_destroy(&qp->posted_rbufs_lock); 2074 mutex_destroy(&qp->send_rbufs_lock); 2075 mutex_destroy(&qp->rdlist_lock); 2076 2077 cv_destroy(&conn->c_cv); 2078 mutex_destroy(&conn->c_lock); 2079 2080 if (conn->c_raddr.buf != NULL) { 2081 kmem_free(conn->c_raddr.buf, conn->c_raddr.len); 2082 } 2083 if (conn->c_laddr.buf != NULL) { 2084 kmem_free(conn->c_laddr.buf, conn->c_laddr.len); 2085 } 2086 if (conn->c_netid != NULL) { 2087 kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1)); 2088 } 2089 2090 /* 2091 * Credit control cleanup. 2092 */ 2093 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) { 2094 rdma_clnt_cred_ctrl_t *cc_info; 2095 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 2096 cv_destroy(&cc_info->clnt_cc_cv); 2097 } 2098 2099 kmem_free(qp, sizeof (rib_qp_t)); 2100 2101 /* 2102 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL, 2103 * then the hca is no longer being used. 2104 */ 2105 if (conn_list != NULL) { 2106 rw_enter(&hca->state_lock, RW_READER); 2107 if (hca->state == HCA_DETACHED) { 2108 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 2109 if (hca->srv_conn_list.conn_hd == NULL) { 2110 rw_enter(&hca->cl_conn_list.conn_lock, 2111 RW_READER); 2112 2113 if (hca->cl_conn_list.conn_hd == NULL) { 2114 mutex_enter(&hca->inuse_lock); 2115 hca->inuse = FALSE; 2116 cv_signal(&hca->cb_cv); 2117 mutex_exit(&hca->inuse_lock); 2118 } 2119 rw_exit(&hca->cl_conn_list.conn_lock); 2120 } 2121 rw_exit(&hca->srv_conn_list.conn_lock); 2122 } 2123 rw_exit(&hca->state_lock); 2124 } 2125 2126 return (RDMA_SUCCESS); 2127 } 2128 2129 /* 2130 * All sends are done under the protection of 2131 * the wdesc->sendwait_lock. n_send_rbufs count 2132 * is protected using the send_rbufs_lock. 
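 * For example, the signaled-send path in rib_send_and_wait() below
 * roughly does:
 *	mutex_enter(&wdesc->sendwait_lock);
 *	rib_send_hold(qp);		(acquires qp->send_rbufs_lock)
 *	mutex_exit(&wdesc->sendwait_lock);
 * and rib_sendwait() calls rib_send_rele() the same way.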
2133 * lock ordering is: 2134 * sendwait_lock -> send_rbufs_lock 2135 */ 2136 2137 void 2138 rib_send_hold(rib_qp_t *qp) 2139 { 2140 mutex_enter(&qp->send_rbufs_lock); 2141 qp->n_send_rbufs++; 2142 mutex_exit(&qp->send_rbufs_lock); 2143 } 2144 2145 void 2146 rib_send_rele(rib_qp_t *qp) 2147 { 2148 mutex_enter(&qp->send_rbufs_lock); 2149 qp->n_send_rbufs--; 2150 if (qp->n_send_rbufs == 0) 2151 cv_signal(&qp->send_rbufs_cv); 2152 mutex_exit(&qp->send_rbufs_lock); 2153 } 2154 2155 void 2156 rib_recv_rele(rib_qp_t *qp) 2157 { 2158 mutex_enter(&qp->posted_rbufs_lock); 2159 qp->n_posted_rbufs--; 2160 if (qp->n_posted_rbufs == 0) 2161 cv_signal(&qp->posted_rbufs_cv); 2162 mutex_exit(&qp->posted_rbufs_lock); 2163 } 2164 2165 /* 2166 * Wait for send completion notification. Only on receiving a 2167 * notification be it a successful or error completion, free the 2168 * send_wid. 2169 */ 2170 static rdma_stat 2171 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 2172 { 2173 clock_t timout, cv_wait_ret; 2174 rdma_stat error = RDMA_SUCCESS; 2175 int i; 2176 2177 /* 2178 * Wait for send to complete 2179 */ 2180 ASSERT(wd != NULL); 2181 mutex_enter(&wd->sendwait_lock); 2182 if (wd->status == (uint_t)SEND_WAIT) { 2183 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 2184 ddi_get_lbolt(); 2185 2186 if (qp->mode == RIB_SERVER) { 2187 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, 2188 &wd->sendwait_lock, timout)) > 0 && 2189 wd->status == (uint_t)SEND_WAIT) 2190 ; 2191 switch (cv_wait_ret) { 2192 case -1: /* timeout */ 2193 DTRACE_PROBE(rpcib__i__srvsendwait__timeout); 2194 2195 wd->cv_sig = 0; /* no signal needed */ 2196 error = RDMA_TIMEDOUT; 2197 break; 2198 default: /* got send completion */ 2199 break; 2200 } 2201 } else { 2202 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, 2203 &wd->sendwait_lock, timout)) > 0 && 2204 wd->status == (uint_t)SEND_WAIT) 2205 ; 2206 switch (cv_wait_ret) { 2207 case -1: /* timeout */ 2208 DTRACE_PROBE(rpcib__i__clntsendwait__timeout); 2209 2210 wd->cv_sig = 0; /* no signal needed */ 2211 error = RDMA_TIMEDOUT; 2212 break; 2213 case 0: /* interrupted */ 2214 DTRACE_PROBE(rpcib__i__clntsendwait__intr); 2215 2216 wd->cv_sig = 0; /* no signal needed */ 2217 error = RDMA_INTR; 2218 break; 2219 default: /* got send completion */ 2220 break; 2221 } 2222 } 2223 } 2224 2225 if (wd->status != (uint_t)SEND_WAIT) { 2226 /* got send completion */ 2227 if (wd->status != RDMA_SUCCESS) { 2228 switch (wd->status) { 2229 case RDMA_CONNLOST: 2230 error = RDMA_CONNLOST; 2231 break; 2232 default: 2233 error = RDMA_FAILED; 2234 break; 2235 } 2236 } 2237 for (i = 0; i < wd->nsbufs; i++) { 2238 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2239 (void *)(uintptr_t)wd->sbufaddr[i]); 2240 } 2241 2242 rib_send_rele(qp); 2243 2244 mutex_exit(&wd->sendwait_lock); 2245 (void) rib_free_sendwait(wd); 2246 2247 } else { 2248 mutex_exit(&wd->sendwait_lock); 2249 } 2250 return (error); 2251 } 2252 2253 static struct send_wid * 2254 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) 2255 { 2256 struct send_wid *wd; 2257 2258 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); 2259 wd->xid = xid; 2260 wd->cv_sig = cv_sig; 2261 wd->qp = qp; 2262 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); 2263 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); 2264 wd->status = (uint_t)SEND_WAIT; 2265 2266 return (wd); 2267 } 2268 2269 static int 2270 rib_free_sendwait(struct send_wid *wdesc) 2271 { 2272 cv_destroy(&wdesc->wait_cv); 2273 mutex_destroy(&wdesc->sendwait_lock); 2274 kmem_free(wdesc, sizeof 
(*wdesc)); 2275 2276 return (0); 2277 } 2278 2279 static rdma_stat 2280 rib_rem_rep(rib_qp_t *qp, struct reply *rep) 2281 { 2282 mutex_enter(&qp->replylist_lock); 2283 if (rep != NULL) { 2284 (void) rib_remreply(qp, rep); 2285 mutex_exit(&qp->replylist_lock); 2286 return (RDMA_SUCCESS); 2287 } 2288 mutex_exit(&qp->replylist_lock); 2289 return (RDMA_FAILED); 2290 } 2291 2292 /* 2293 * Send buffers are freed here only in case of error in posting 2294 * on QP. If the post succeeded, the send buffers are freed upon 2295 * send completion in rib_sendwait() or in the scq_handler. 2296 */ 2297 rdma_stat 2298 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2299 int send_sig, int cv_sig, caddr_t *swid) 2300 { 2301 struct send_wid *wdesc; 2302 struct clist *clp; 2303 ibt_status_t ibt_status = IBT_SUCCESS; 2304 rdma_stat ret = RDMA_SUCCESS; 2305 ibt_send_wr_t tx_wr; 2306 int i, nds; 2307 ibt_wr_ds_t sgl[DSEG_MAX]; 2308 uint_t total_msg_size; 2309 rib_qp_t *qp; 2310 2311 qp = ctoqp(conn); 2312 2313 ASSERT(cl != NULL); 2314 2315 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2316 2317 nds = 0; 2318 total_msg_size = 0; 2319 clp = cl; 2320 while (clp != NULL) { 2321 if (nds >= DSEG_MAX) { 2322 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); 2323 return (RDMA_FAILED); 2324 } 2325 sgl[nds].ds_va = clp->w.c_saddr; 2326 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2327 sgl[nds].ds_len = clp->c_len; 2328 total_msg_size += clp->c_len; 2329 clp = clp->c_next; 2330 nds++; 2331 } 2332 2333 if (send_sig) { 2334 /* Set SEND_SIGNAL flag. */ 2335 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2336 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2337 *swid = (caddr_t)wdesc; 2338 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2339 mutex_enter(&wdesc->sendwait_lock); 2340 wdesc->nsbufs = nds; 2341 for (i = 0; i < nds; i++) { 2342 wdesc->sbufaddr[i] = sgl[i].ds_va; 2343 } 2344 } else { 2345 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2346 *swid = NULL; 2347 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2348 } 2349 2350 tx_wr.wr_opcode = IBT_WRC_SEND; 2351 tx_wr.wr_trans = IBT_RC_SRV; 2352 tx_wr.wr_nds = nds; 2353 tx_wr.wr_sgl = sgl; 2354 2355 mutex_enter(&conn->c_lock); 2356 if (conn->c_state == C_CONNECTED) { 2357 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2358 } 2359 if (conn->c_state != C_CONNECTED || 2360 ibt_status != IBT_SUCCESS) { 2361 if (conn->c_state != C_DISCONN_PEND) 2362 conn->c_state = C_ERROR_CONN; 2363 mutex_exit(&conn->c_lock); 2364 if (send_sig) { 2365 for (i = 0; i < nds; i++) { 2366 rib_rbuf_free(conn, SEND_BUFFER, 2367 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2368 } 2369 mutex_exit(&wdesc->sendwait_lock); 2370 (void) rib_free_sendwait(wdesc); 2371 } 2372 return (RDMA_CONNLOST); 2373 } 2374 2375 mutex_exit(&conn->c_lock); 2376 2377 if (send_sig) { 2378 rib_send_hold(qp); 2379 mutex_exit(&wdesc->sendwait_lock); 2380 if (cv_sig) { 2381 /* 2382 * cv_wait for send to complete. 2383 * We can fail due to a timeout or signal or 2384 * unsuccessful send. 2385 */ 2386 ret = rib_sendwait(qp, wdesc); 2387 2388 return (ret); 2389 } 2390 } 2391 2392 return (RDMA_SUCCESS); 2393 } 2394 2395 2396 rdma_stat 2397 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2398 { 2399 rdma_stat ret; 2400 caddr_t wd; 2401 2402 /* send-wait & cv_signal */ 2403 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); 2404 return (ret); 2405 } 2406 2407 /* 2408 * Deprecated/obsolete interface not used currently 2409 * but earlier used for READ-READ protocol. 2410 * Send RPC reply and wait for RDMA_DONE. 
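 * Roughly: rdma_done_add() queues an entry for the xid under
 * rdlist_lock, the reply is posted via rib_send_and_wait() with
 * cv_sig == 0 (no wait on the send completion), and the thread then
 * sleeps up to REPLY_WAIT_TIME seconds on rdma_done_cv for the
 * client's RDMA_DONE before the entry is removed again.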
2411 */ 2412 rdma_stat 2413 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2414 { 2415 rdma_stat ret = RDMA_SUCCESS; 2416 struct rdma_done_list *rd; 2417 clock_t cv_wait_ret; 2418 caddr_t *wid = NULL; 2419 rib_qp_t *qp = ctoqp(conn); 2420 2421 mutex_enter(&qp->rdlist_lock); 2422 rd = rdma_done_add(qp, msgid); 2423 2424 /* No cv_signal (whether send-wait or no-send-wait) */ 2425 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); 2426 2427 if (ret != RDMA_SUCCESS) { 2428 rdma_done_rm(qp, rd); 2429 } else { 2430 /* 2431 * Wait for RDMA_DONE from remote end 2432 */ 2433 cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv, 2434 &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000), 2435 TR_CLOCK_TICK); 2436 2437 rdma_done_rm(qp, rd); 2438 2439 if (cv_wait_ret < 0) { 2440 ret = RDMA_TIMEDOUT; 2441 } 2442 } 2443 2444 mutex_exit(&qp->rdlist_lock); 2445 return (ret); 2446 } 2447 2448 static struct recv_wid * 2449 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2450 { 2451 struct recv_wid *rwid; 2452 2453 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2454 rwid->xid = msgid; 2455 rwid->addr = sgl->ds_va; 2456 rwid->qp = qp; 2457 2458 return (rwid); 2459 } 2460 2461 static void 2462 rib_free_wid(struct recv_wid *rwid) 2463 { 2464 kmem_free(rwid, sizeof (struct recv_wid)); 2465 } 2466 2467 rdma_stat 2468 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2469 { 2470 rib_qp_t *qp = ctoqp(conn); 2471 struct clist *clp = cl; 2472 struct reply *rep; 2473 struct recv_wid *rwid; 2474 int nds; 2475 ibt_wr_ds_t sgl[DSEG_MAX]; 2476 ibt_recv_wr_t recv_wr; 2477 rdma_stat ret; 2478 ibt_status_t ibt_status; 2479 2480 /* 2481 * rdma_clnt_postrecv uses RECV_BUFFER. 2482 */ 2483 2484 nds = 0; 2485 while (cl != NULL) { 2486 if (nds >= DSEG_MAX) { 2487 ret = RDMA_FAILED; 2488 goto done; 2489 } 2490 sgl[nds].ds_va = cl->w.c_saddr; 2491 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2492 sgl[nds].ds_len = cl->c_len; 2493 cl = cl->c_next; 2494 nds++; 2495 } 2496 2497 if (nds != 1) { 2498 ret = RDMA_FAILED; 2499 goto done; 2500 } 2501 2502 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2503 recv_wr.wr_nds = nds; 2504 recv_wr.wr_sgl = sgl; 2505 2506 rwid = rib_create_wid(qp, &sgl[0], msgid); 2507 if (rwid) { 2508 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; 2509 } else { 2510 ret = RDMA_NORESOURCE; 2511 goto done; 2512 } 2513 rep = rib_addreplylist(qp, msgid); 2514 if (!rep) { 2515 rib_free_wid(rwid); 2516 ret = RDMA_NORESOURCE; 2517 goto done; 2518 } 2519 2520 mutex_enter(&conn->c_lock); 2521 2522 if (conn->c_state == C_CONNECTED) { 2523 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2524 } 2525 2526 if (conn->c_state != C_CONNECTED || 2527 ibt_status != IBT_SUCCESS) { 2528 if (conn->c_state != C_DISCONN_PEND) 2529 conn->c_state = C_ERROR_CONN; 2530 mutex_exit(&conn->c_lock); 2531 rib_free_wid(rwid); 2532 (void) rib_rem_rep(qp, rep); 2533 ret = RDMA_CONNLOST; 2534 goto done; 2535 } 2536 2537 mutex_enter(&qp->posted_rbufs_lock); 2538 qp->n_posted_rbufs++; 2539 mutex_exit(&qp->posted_rbufs_lock); 2540 2541 mutex_exit(&conn->c_lock); 2542 return (RDMA_SUCCESS); 2543 2544 done: 2545 while (clp != NULL) { 2546 rib_rbuf_free(conn, RECV_BUFFER, 2547 (void *)(uintptr_t)clp->w.c_saddr3); 2548 clp = clp->c_next; 2549 } 2550 return (ret); 2551 } 2552 2553 rdma_stat 2554 rib_svc_post(CONN* conn, struct clist *cl) 2555 { 2556 rib_qp_t *qp = ctoqp(conn); 2557 struct svc_recv *s_recvp; 2558 int nds; 2559 ibt_wr_ds_t sgl[DSEG_MAX]; 2560 ibt_recv_wr_t recv_wr; 2561 ibt_status_t 
ibt_status; 2562 2563 nds = 0; 2564 while (cl != NULL) { 2565 if (nds >= DSEG_MAX) { 2566 return (RDMA_FAILED); 2567 } 2568 sgl[nds].ds_va = cl->w.c_saddr; 2569 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2570 sgl[nds].ds_len = cl->c_len; 2571 cl = cl->c_next; 2572 nds++; 2573 } 2574 2575 if (nds != 1) { 2576 rib_rbuf_free(conn, RECV_BUFFER, 2577 (caddr_t)(uintptr_t)sgl[0].ds_va); 2578 2579 return (RDMA_FAILED); 2580 } 2581 2582 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2583 recv_wr.wr_nds = nds; 2584 recv_wr.wr_sgl = sgl; 2585 2586 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2587 /* Use s_recvp's addr as wr id */ 2588 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; 2589 mutex_enter(&conn->c_lock); 2590 if (conn->c_state == C_CONNECTED) { 2591 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2592 } 2593 if (conn->c_state != C_CONNECTED || 2594 ibt_status != IBT_SUCCESS) { 2595 if (conn->c_state != C_DISCONN_PEND) 2596 conn->c_state = C_ERROR_CONN; 2597 mutex_exit(&conn->c_lock); 2598 rib_rbuf_free(conn, RECV_BUFFER, 2599 (caddr_t)(uintptr_t)sgl[0].ds_va); 2600 (void) rib_free_svc_recv(s_recvp); 2601 2602 return (RDMA_CONNLOST); 2603 } 2604 mutex_exit(&conn->c_lock); 2605 2606 return (RDMA_SUCCESS); 2607 } 2608 2609 /* Client */ 2610 rdma_stat 2611 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2612 { 2613 return (rib_clnt_post(conn, cl, msgid)); 2614 } 2615 2616 /* Client */ 2617 rdma_stat 2618 rib_post_resp_remove(CONN* conn, uint32_t msgid) 2619 { 2620 rib_qp_t *qp = ctoqp(conn); 2621 struct reply *rep; 2622 2623 mutex_enter(&qp->replylist_lock); 2624 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2625 if (rep->xid == msgid) { 2626 if (rep->vaddr_cq) { 2627 rib_rbuf_free(conn, RECV_BUFFER, 2628 (caddr_t)(uintptr_t)rep->vaddr_cq); 2629 } 2630 (void) rib_remreply(qp, rep); 2631 break; 2632 } 2633 } 2634 mutex_exit(&qp->replylist_lock); 2635 2636 return (RDMA_SUCCESS); 2637 } 2638 2639 /* Server */ 2640 rdma_stat 2641 rib_post_recv(CONN *conn, struct clist *cl) 2642 { 2643 rib_qp_t *qp = ctoqp(conn); 2644 2645 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2646 mutex_enter(&qp->posted_rbufs_lock); 2647 qp->n_posted_rbufs++; 2648 mutex_exit(&qp->posted_rbufs_lock); 2649 return (RDMA_SUCCESS); 2650 } 2651 return (RDMA_FAILED); 2652 } 2653 2654 /* 2655 * Client side only interface to "recv" the rpc reply buf 2656 * posted earlier by rib_post_resp(conn, cl, msgid). 2657 */ 2658 rdma_stat 2659 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2660 { 2661 struct reply *rep = NULL; 2662 clock_t timout, cv_wait_ret; 2663 rdma_stat ret = RDMA_SUCCESS; 2664 rib_qp_t *qp = ctoqp(conn); 2665 2666 /* 2667 * Find the reply structure for this msgid 2668 */ 2669 mutex_enter(&qp->replylist_lock); 2670 2671 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2672 if (rep->xid == msgid) 2673 break; 2674 } 2675 2676 if (rep != NULL) { 2677 /* 2678 * If message not yet received, wait. 
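 * (The wait below is bounded by REPLY_WAIT_TIME seconds; a
 * cv_timedwait_sig() return of -1 maps to RDMA_TIMEDOUT and a
 * return of 0 maps to RDMA_INTR.)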
2679 */ 2680 if (rep->status == (uint_t)REPLY_WAIT) { 2681 timout = ddi_get_lbolt() + 2682 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2683 2684 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2685 &qp->replylist_lock, timout)) > 0 && 2686 rep->status == (uint_t)REPLY_WAIT) 2687 ; 2688 2689 switch (cv_wait_ret) { 2690 case -1: /* timeout */ 2691 ret = RDMA_TIMEDOUT; 2692 break; 2693 case 0: 2694 ret = RDMA_INTR; 2695 break; 2696 default: 2697 break; 2698 } 2699 } 2700 2701 if (rep->status == RDMA_SUCCESS) { 2702 struct clist *cl = NULL; 2703 2704 /* 2705 * Got message successfully 2706 */ 2707 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2708 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); 2709 *clp = cl; 2710 } else { 2711 if (rep->status != (uint_t)REPLY_WAIT) { 2712 /* 2713 * Got error in reply message. Free 2714 * recv buffer here. 2715 */ 2716 ret = rep->status; 2717 rib_rbuf_free(conn, RECV_BUFFER, 2718 (caddr_t)(uintptr_t)rep->vaddr_cq); 2719 } 2720 } 2721 (void) rib_remreply(qp, rep); 2722 } else { 2723 /* 2724 * No matching reply structure found for given msgid on the 2725 * reply wait list. 2726 */ 2727 ret = RDMA_INVAL; 2728 DTRACE_PROBE(rpcib__i__nomatchxid2); 2729 } 2730 2731 /* 2732 * Done. 2733 */ 2734 mutex_exit(&qp->replylist_lock); 2735 return (ret); 2736 } 2737 2738 /* 2739 * RDMA write a buffer to the remote address. 2740 */ 2741 rdma_stat 2742 rib_write(CONN *conn, struct clist *cl, int wait) 2743 { 2744 ibt_send_wr_t tx_wr; 2745 int cv_sig; 2746 ibt_wr_ds_t sgl[DSEG_MAX]; 2747 struct send_wid *wdesc; 2748 ibt_status_t ibt_status; 2749 rdma_stat ret = RDMA_SUCCESS; 2750 rib_qp_t *qp = ctoqp(conn); 2751 uint64_t n_writes = 0; 2752 2753 if (cl == NULL) { 2754 return (RDMA_FAILED); 2755 } 2756 2757 while ((cl != NULL)) { 2758 if (cl->c_len > 0) { 2759 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2760 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; 2761 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = 2762 cl->c_dmemhandle.mrc_rmr; /* rkey */ 2763 sgl[0].ds_va = cl->w.c_saddr; 2764 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2765 sgl[0].ds_len = cl->c_len; 2766 2767 if (wait) { 2768 cv_sig = 1; 2769 } else { 2770 if (n_writes > max_unsignaled_rws) { 2771 n_writes = 0; 2772 cv_sig = 1; 2773 } else { 2774 cv_sig = 0; 2775 } 2776 } 2777 2778 if (cv_sig) { 2779 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2780 wdesc = rib_init_sendwait(0, cv_sig, qp); 2781 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2782 mutex_enter(&wdesc->sendwait_lock); 2783 } else { 2784 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2785 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2786 } 2787 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2788 tx_wr.wr_trans = IBT_RC_SRV; 2789 tx_wr.wr_nds = 1; 2790 tx_wr.wr_sgl = sgl; 2791 2792 mutex_enter(&conn->c_lock); 2793 if (conn->c_state == C_CONNECTED) { 2794 ibt_status = 2795 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2796 } 2797 if (conn->c_state != C_CONNECTED || 2798 ibt_status != IBT_SUCCESS) { 2799 if (conn->c_state != C_DISCONN_PEND) 2800 conn->c_state = C_ERROR_CONN; 2801 mutex_exit(&conn->c_lock); 2802 if (cv_sig) { 2803 mutex_exit(&wdesc->sendwait_lock); 2804 (void) rib_free_sendwait(wdesc); 2805 } 2806 return (RDMA_CONNLOST); 2807 } 2808 2809 mutex_exit(&conn->c_lock); 2810 2811 /* 2812 * Wait for send to complete 2813 */ 2814 if (cv_sig) { 2815 2816 rib_send_hold(qp); 2817 mutex_exit(&wdesc->sendwait_lock); 2818 2819 ret = rib_sendwait(qp, wdesc); 2820 if (ret != 0) 2821 return (ret); 2822 } 2823 n_writes ++; 2824 } 2825 cl = cl->c_next; 2826 } 2827 return (RDMA_SUCCESS); 2828 } 2829 2830 /* 2831 
* RDMA Read a buffer from the remote address. 2832 */ 2833 rdma_stat 2834 rib_read(CONN *conn, struct clist *cl, int wait) 2835 { 2836 ibt_send_wr_t rx_wr; 2837 int cv_sig = 0; 2838 ibt_wr_ds_t sgl; 2839 struct send_wid *wdesc; 2840 ibt_status_t ibt_status = IBT_SUCCESS; 2841 rdma_stat ret = RDMA_SUCCESS; 2842 rib_qp_t *qp = ctoqp(conn); 2843 2844 if (cl == NULL) { 2845 return (RDMA_FAILED); 2846 } 2847 2848 while (cl != NULL) { 2849 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2850 /* 2851 * Remote address is at the head chunk item in list. 2852 */ 2853 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; 2854 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; 2855 2856 sgl.ds_va = cl->u.c_daddr; 2857 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2858 sgl.ds_len = cl->c_len; 2859 2860 /* 2861 * If there are multiple chunks to be read, and 2862 * wait is set, ask for signal only for the last chunk 2863 * and wait only on the last chunk. The completion of 2864 * RDMA_READ on last chunk ensures that reads on all 2865 * previous chunks are also completed. 2866 */ 2867 if (wait && (cl->c_next == NULL)) { 2868 cv_sig = 1; 2869 wdesc = rib_init_sendwait(0, cv_sig, qp); 2870 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2871 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2872 mutex_enter(&wdesc->sendwait_lock); 2873 } else { 2874 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2875 rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2876 } 2877 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2878 rx_wr.wr_trans = IBT_RC_SRV; 2879 rx_wr.wr_nds = 1; 2880 rx_wr.wr_sgl = &sgl; 2881 2882 mutex_enter(&conn->c_lock); 2883 if (conn->c_state == C_CONNECTED) { 2884 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2885 } 2886 if (conn->c_state != C_CONNECTED || 2887 ibt_status != IBT_SUCCESS) { 2888 if (conn->c_state != C_DISCONN_PEND) 2889 conn->c_state = C_ERROR_CONN; 2890 mutex_exit(&conn->c_lock); 2891 if (wait && (cl->c_next == NULL)) { 2892 mutex_exit(&wdesc->sendwait_lock); 2893 (void) rib_free_sendwait(wdesc); 2894 } 2895 return (RDMA_CONNLOST); 2896 } 2897 2898 mutex_exit(&conn->c_lock); 2899 2900 /* 2901 * Wait for send to complete if this is the 2902 * last item in the list. 2903 */ 2904 if (wait && cl->c_next == NULL) { 2905 rib_send_hold(qp); 2906 mutex_exit(&wdesc->sendwait_lock); 2907 2908 ret = rib_sendwait(qp, wdesc); 2909 2910 if (ret != 0) 2911 return (ret); 2912 } 2913 cl = cl->c_next; 2914 } 2915 return (RDMA_SUCCESS); 2916 } 2917 2918 /* 2919 * rib_srv_cm_handler() 2920 * Connection Manager callback to handle RC connection requests. 2921 */ 2922 /* ARGSUSED */ 2923 static ibt_cm_status_t 2924 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2925 ibt_cm_return_args_t *ret_args, void *priv_data, 2926 ibt_priv_data_len_t len) 2927 { 2928 queue_t *q; 2929 rib_qp_t *qp; 2930 rib_hca_t *hca; 2931 rdma_stat status = RDMA_SUCCESS; 2932 int i; 2933 struct clist cl; 2934 rdma_buf_t rdbuf = {0}; 2935 void *buf = NULL; 2936 CONN *conn; 2937 ibt_ip_cm_info_t ipinfo; 2938 struct sockaddr_in *s; 2939 struct sockaddr_in6 *s6; 2940 int sin_size = sizeof (struct sockaddr_in); 2941 int in_size = sizeof (struct in_addr); 2942 int sin6_size = sizeof (struct sockaddr_in6); 2943 2944 ASSERT(any != NULL); 2945 ASSERT(event != NULL); 2946 2947 hca = (rib_hca_t *)any; 2948 2949 /* got a connection request */ 2950 switch (event->cm_type) { 2951 case IBT_CM_EVENT_REQ_RCV: 2952 /* 2953 * If the plugin is in the NO_ACCEPT state, bail out. 
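 * NO_ACCEPT is set by rib_listen_stop() and switched back to ACCEPT
 * in rib_register_service() once at least one bind succeeds, so
 * incoming connection requests are rejected while the listeners are
 * being torn down.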
2954 */ 2955 mutex_enter(&plugin_state_lock); 2956 if (plugin_state == NO_ACCEPT) { 2957 mutex_exit(&plugin_state_lock); 2958 return (IBT_CM_REJECT); 2959 } 2960 mutex_exit(&plugin_state_lock); 2961 2962 /* 2963 * Need to send a MRA MAD to CM so that it does not 2964 * timeout on us. 2965 */ 2966 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2967 event->cm_event.req.req_timeout * 8, NULL, 0); 2968 2969 mutex_enter(&rib_stat->open_hca_lock); 2970 q = rib_stat->q; 2971 mutex_exit(&rib_stat->open_hca_lock); 2972 2973 status = rib_svc_create_chan(hca, (caddr_t)q, 2974 event->cm_event.req.req_prim_hca_port, &qp); 2975 2976 if (status) { 2977 return (IBT_CM_REJECT); 2978 } 2979 2980 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2981 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 2982 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 2983 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2984 2985 /* 2986 * Pre-posts RECV buffers 2987 */ 2988 conn = qptoc(qp); 2989 for (i = 0; i < preposted_rbufs; i++) { 2990 bzero(&rdbuf, sizeof (rdbuf)); 2991 rdbuf.type = RECV_BUFFER; 2992 buf = rib_rbuf_alloc(conn, &rdbuf); 2993 if (buf == NULL) { 2994 /* 2995 * A connection is not established yet. 2996 * Just flush the channel. Buffers 2997 * posted till now will error out with 2998 * IBT_WC_WR_FLUSHED_ERR. 2999 */ 3000 (void) ibt_flush_channel(qp->qp_hdl); 3001 (void) rib_disconnect_channel(conn, NULL); 3002 return (IBT_CM_REJECT); 3003 } 3004 3005 bzero(&cl, sizeof (cl)); 3006 cl.w.c_saddr3 = (caddr_t)rdbuf.addr; 3007 cl.c_len = rdbuf.len; 3008 cl.c_smemhandle.mrc_lmr = 3009 rdbuf.handle.mrc_lmr; /* lkey */ 3010 cl.c_next = NULL; 3011 status = rib_post_recv(conn, &cl); 3012 if (status != RDMA_SUCCESS) { 3013 /* 3014 * A connection is not established yet. 3015 * Just flush the channel. Buffers 3016 * posted till now will error out with 3017 * IBT_WC_WR_FLUSHED_ERR. 
3018 */ 3019 (void) ibt_flush_channel(qp->qp_hdl); 3020 (void) rib_disconnect_channel(conn, NULL); 3021 return (IBT_CM_REJECT); 3022 } 3023 } 3024 (void) rib_add_connlist(conn, &hca->srv_conn_list); 3025 3026 /* 3027 * Get the address translation 3028 */ 3029 rw_enter(&hca->state_lock, RW_READER); 3030 if (hca->state == HCA_DETACHED) { 3031 rw_exit(&hca->state_lock); 3032 return (IBT_CM_REJECT); 3033 } 3034 rw_exit(&hca->state_lock); 3035 3036 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t)); 3037 3038 if (ibt_get_ip_data(event->cm_priv_data_len, 3039 event->cm_priv_data, 3040 &ipinfo) != IBT_SUCCESS) { 3041 3042 return (IBT_CM_REJECT); 3043 } 3044 3045 switch (ipinfo.src_addr.family) { 3046 case AF_INET: 3047 3048 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, 3049 KM_SLEEP); 3050 (void) strcpy(conn->c_netid, RIBNETID_TCP); 3051 3052 conn->c_raddr.maxlen = 3053 conn->c_raddr.len = sin_size; 3054 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3055 3056 s = (struct sockaddr_in *)conn->c_raddr.buf; 3057 s->sin_family = AF_INET; 3058 bcopy((void *)&ipinfo.src_addr.un.ip4addr, 3059 &s->sin_addr, in_size); 3060 3061 conn->c_laddr.maxlen = 3062 conn->c_laddr.len = sin_size; 3063 conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3064 3065 s = (struct sockaddr_in *)conn->c_laddr.buf; 3066 s->sin_family = AF_INET; 3067 bcopy((void *)&ipinfo.dst_addr.un.ip4addr, 3068 &s->sin_addr, in_size); 3069 3070 break; 3071 3072 case AF_INET6: 3073 3074 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, 3075 KM_SLEEP); 3076 (void) strcpy(conn->c_netid, RIBNETID_TCP6); 3077 3078 conn->c_raddr.maxlen = 3079 conn->c_raddr.len = sin6_size; 3080 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3081 3082 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 3083 s6->sin6_family = AF_INET6; 3084 bcopy((void *)&ipinfo.src_addr.un.ip6addr, 3085 &s6->sin6_addr, 3086 sizeof (struct in6_addr)); 3087 3088 conn->c_laddr.maxlen = 3089 conn->c_laddr.len = sin6_size; 3090 conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3091 3092 s6 = (struct sockaddr_in6 *)conn->c_laddr.buf; 3093 s6->sin6_family = AF_INET6; 3094 bcopy((void *)&ipinfo.dst_addr.un.ip6addr, 3095 &s6->sin6_addr, 3096 sizeof (struct in6_addr)); 3097 3098 break; 3099 3100 default: 3101 return (IBT_CM_REJECT); 3102 } 3103 3104 break; 3105 3106 case IBT_CM_EVENT_CONN_CLOSED: 3107 { 3108 CONN *conn; 3109 rib_qp_t *qp; 3110 3111 switch (event->cm_event.closed) { 3112 case IBT_CM_CLOSED_DREP_RCVD: 3113 case IBT_CM_CLOSED_DREQ_TIMEOUT: 3114 case IBT_CM_CLOSED_DUP: 3115 case IBT_CM_CLOSED_ABORT: 3116 case IBT_CM_CLOSED_ALREADY: 3117 /* 3118 * These cases indicate the local end initiated 3119 * the closing of the channel. Nothing to do here. 3120 */ 3121 break; 3122 default: 3123 /* 3124 * Reason for CONN_CLOSED event must be one of 3125 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 3126 * or IBT_CM_CLOSED_STALE. These indicate cases were 3127 * the remote end is closing the channel. 
In these 3128 * cases free the channel and transition to error 3129 * state 3130 */ 3131 qp = ibt_get_chan_private(event->cm_channel); 3132 conn = qptoc(qp); 3133 mutex_enter(&conn->c_lock); 3134 if (conn->c_state == C_DISCONN_PEND) { 3135 mutex_exit(&conn->c_lock); 3136 break; 3137 } 3138 conn->c_state = C_ERROR_CONN; 3139 3140 /* 3141 * Free the conn if c_ref goes down to 0 3142 */ 3143 if (conn->c_ref == 0) { 3144 /* 3145 * Remove from list and free conn 3146 */ 3147 conn->c_state = C_DISCONN_PEND; 3148 mutex_exit(&conn->c_lock); 3149 (void) rib_disconnect_channel(conn, 3150 &hca->srv_conn_list); 3151 } else { 3152 /* 3153 * conn will be freed when c_ref goes to 0. 3154 * Indicate to cleaning thread not to close 3155 * the connection, but just free the channel. 3156 */ 3157 conn->c_flags |= C_CLOSE_NOTNEEDED; 3158 mutex_exit(&conn->c_lock); 3159 } 3160 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect); 3161 break; 3162 } 3163 break; 3164 } 3165 case IBT_CM_EVENT_CONN_EST: 3166 /* 3167 * RTU received, hence connection established. 3168 */ 3169 if (rib_debug > 1) 3170 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3171 "(CONN_EST) channel established"); 3172 break; 3173 3174 default: 3175 if (rib_debug > 2) { 3176 /* Let CM handle the following events. */ 3177 if (event->cm_type == IBT_CM_EVENT_REP_RCV) { 3178 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3179 "server recv'ed IBT_CM_EVENT_REP_RCV\n"); 3180 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) { 3181 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3182 "server recv'ed IBT_CM_EVENT_LAP_RCV\n"); 3183 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) { 3184 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3185 "server recv'ed IBT_CM_EVENT_MRA_RCV\n"); 3186 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) { 3187 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3188 "server recv'ed IBT_CM_EVENT_APR_RCV\n"); 3189 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) { 3190 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3191 "server recv'ed IBT_CM_EVENT_FAILURE\n"); 3192 } 3193 } 3194 return (IBT_CM_DEFAULT); 3195 } 3196 3197 /* accept all other CM messages (i.e. let the CM handle them) */ 3198 return (IBT_CM_ACCEPT); 3199 } 3200 3201 static rdma_stat 3202 rib_register_service(rib_hca_t *hca, int service_type, 3203 uint8_t protocol_num, in_port_t dst_port) 3204 { 3205 ibt_srv_desc_t sdesc; 3206 ibt_hca_portinfo_t *port_infop; 3207 ib_svc_id_t srv_id; 3208 ibt_srv_hdl_t srv_hdl; 3209 uint_t port_size; 3210 uint_t pki, i, num_ports, nbinds; 3211 ibt_status_t ibt_status; 3212 rib_service_t *service; 3213 ib_pkey_t pkey; 3214 3215 /* 3216 * Query all ports for the given HCA 3217 */ 3218 rw_enter(&hca->state_lock, RW_READER); 3219 if (hca->state != HCA_DETACHED) { 3220 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop, 3221 &num_ports, &port_size); 3222 rw_exit(&hca->state_lock); 3223 } else { 3224 rw_exit(&hca->state_lock); 3225 return (RDMA_FAILED); 3226 } 3227 if (ibt_status != IBT_SUCCESS) { 3228 return (RDMA_FAILED); 3229 } 3230 3231 DTRACE_PROBE1(rpcib__i__regservice_numports, 3232 int, num_ports); 3233 3234 for (i = 0; i < num_ports; i++) { 3235 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) { 3236 DTRACE_PROBE1(rpcib__i__regservice__portinactive, 3237 int, i+1); 3238 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) { 3239 DTRACE_PROBE1(rpcib__i__regservice__portactive, 3240 int, i+1); 3241 } 3242 } 3243 3244 /* 3245 * Get all the IP addresses on this system to register the 3246 * given "service type" on all DNS recognized IP addrs. 
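 * (For NFS the service id used below is derived from the protocol
 * and port via ibt_get_ip_sid(protocol_num, dst_port), i.e.
 * IPPROTO_TCP and nfs_rdma_port as passed in from rib_listen(), and
 * the service is then bound on each active port's GID.)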
3247 * Each service type such as NFS will have all the system's 3248 * IP addresses as its different names. For now the only 3249 * type of service we support in RPCIB is NFS. 3250 */ 3251 rw_enter(&rib_stat->service_list_lock, RW_WRITER); 3252 /* 3253 * Start registering and binding the service on 3254 * the active ports of this HCA. 3255 */ 3256 nbinds = 0; 3257 for (service = rib_stat->service_list; 3258 service && (service->srv_type != service_type); 3259 service = service->next) 3260 ; 3261 3262 if (service == NULL) { 3263 /* 3264 * We use IP addresses as the service names for 3265 * service registration. Register each of them 3266 * with CM to obtain a svc_id and svc_hdl. We do not 3267 * register the service with the machine's loopback address. 3268 */ 3269 (void) bzero(&srv_id, sizeof (ib_svc_id_t)); 3270 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t)); 3271 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t)); 3272 sdesc.sd_handler = rib_srv_cm_handler; 3273 sdesc.sd_flags = 0; 3274 ibt_status = ibt_register_service(hca->ibt_clnt_hdl, 3275 &sdesc, ibt_get_ip_sid(protocol_num, dst_port), 3276 1, &srv_hdl, &srv_id); 3277 if ((ibt_status != IBT_SUCCESS) && 3278 (ibt_status != IBT_CM_SERVICE_EXISTS)) { 3279 rw_exit(&rib_stat->service_list_lock); 3280 DTRACE_PROBE1(rpcib__i__regservice__ibtres, 3281 int, ibt_status); 3282 ibt_free_portinfo(port_infop, port_size); 3283 return (RDMA_FAILED); 3284 } 3285 3286 /* 3287 * Allocate and prepare a service entry 3288 */ 3289 service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP); 3290 3291 service->srv_type = service_type; 3292 service->srv_hdl = srv_hdl; 3293 service->srv_id = srv_id; 3294 3295 service->next = rib_stat->service_list; 3296 rib_stat->service_list = service; 3297 DTRACE_PROBE1(rpcib__i__regservice__new__service, 3298 int, service->srv_type); 3299 } else { 3300 srv_hdl = service->srv_hdl; 3301 srv_id = service->srv_id; 3302 DTRACE_PROBE1(rpcib__i__regservice__existing__service, 3303 int, service->srv_type); 3304 } 3305 3306 for (i = 0; i < num_ports; i++) { 3307 ibt_sbind_hdl_t sbp; 3308 rib_hca_service_t *hca_srv; 3309 ib_gid_t gid; 3310 3311 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) 3312 continue; 3313 3314 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) { 3315 pkey = port_infop[i].p_pkey_tbl[pki]; 3316 3317 rw_enter(&hca->bound_services_lock, RW_READER); 3318 gid = port_infop[i].p_sgid_tbl[0]; 3319 for (hca_srv = hca->bound_services; hca_srv; 3320 hca_srv = hca_srv->next) { 3321 if ((hca_srv->srv_id == service->srv_id) && 3322 (hca_srv->gid.gid_prefix == 3323 gid.gid_prefix) && 3324 (hca_srv->gid.gid_guid == gid.gid_guid)) 3325 break; 3326 } 3327 rw_exit(&hca->bound_services_lock); 3328 if (hca_srv != NULL) { 3329 /* 3330 * port is already bound to the service 3331 */ 3332 DTRACE_PROBE1( 3333 rpcib__i__regservice__already__bound, 3334 int, i+1); 3335 nbinds++; 3336 continue; 3337 } 3338 3339 if ((pkey & IBSRM_HB) && 3340 (pkey != IB_PKEY_INVALID_FULL)) { 3341 3342 sbp = NULL; 3343 ibt_status = ibt_bind_service(srv_hdl, 3344 gid, NULL, hca, &sbp); 3345 3346 if (ibt_status == IBT_SUCCESS) { 3347 hca_srv = kmem_zalloc( 3348 sizeof (rib_hca_service_t), 3349 KM_SLEEP); 3350 hca_srv->srv_id = srv_id; 3351 hca_srv->gid = gid; 3352 hca_srv->sbind_hdl = sbp; 3353 3354 rw_enter(&hca->bound_services_lock, 3355 RW_WRITER); 3356 hca_srv->next = hca->bound_services; 3357 hca->bound_services = hca_srv; 3358 rw_exit(&hca->bound_services_lock); 3359 nbinds++; 3360 } 3361 3362 DTRACE_PROBE1(rpcib__i__regservice__bindres, 3363 int,
ibt_status); 3364 } 3365 } 3366 } 3367 rw_exit(&rib_stat->service_list_lock); 3368 3369 ibt_free_portinfo(port_infop, port_size); 3370 3371 if (nbinds == 0) { 3372 return (RDMA_FAILED); 3373 } else { 3374 /* 3375 * Put this plugin into accept state, since at least 3376 * one registration was successful. 3377 */ 3378 mutex_enter(&plugin_state_lock); 3379 plugin_state = ACCEPT; 3380 mutex_exit(&plugin_state_lock); 3381 return (RDMA_SUCCESS); 3382 } 3383 } 3384 3385 void 3386 rib_listen(struct rdma_svc_data *rd) 3387 { 3388 rdma_stat status; 3389 int n_listening = 0; 3390 rib_hca_t *hca; 3391 3392 mutex_enter(&rib_stat->listen_lock); 3393 /* 3394 * If the rd parameter is NULL, it means that rib_stat->q is 3395 * already initialized by a call from RDMA and we just want to 3396 * add a newly attached HCA to the same listening state as the 3397 * other HCAs. 3398 */ 3399 if (rd == NULL) { 3400 if (rib_stat->q == NULL) { 3401 mutex_exit(&rib_stat->listen_lock); 3402 return; 3403 } 3404 } else { 3405 rib_stat->q = &rd->q; 3406 } 3407 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 3408 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 3409 /* 3410 * First check if the HCA is still attached 3411 */ 3412 rw_enter(&hca->state_lock, RW_READER); 3413 if (hca->state != HCA_INITED) { 3414 rw_exit(&hca->state_lock); 3415 continue; 3416 } 3417 rw_exit(&hca->state_lock); 3418 3419 /* 3420 * Right now the only service type is NFS, hence 3421 * the value is hard-coded here. Ideally the 3422 * service type should be passed down in 3423 * rdma_svc_data. 3424 */ 3425 status = rib_register_service(hca, NFS, 3426 IPPROTO_TCP, nfs_rdma_port); 3427 if (status == RDMA_SUCCESS) 3428 n_listening++; 3429 } 3430 rw_exit(&rib_stat->hcas_list_lock); 3431 3432 /* 3433 * Report whether a service is now active on any HCA; 3434 * rd->err_code carries a more specific error on failure. 3435 */ 3436 if (rd) { 3437 if (n_listening > 0) { 3438 rd->active = 1; 3439 rd->err_code = RDMA_SUCCESS; 3440 } else { 3441 rd->active = 0; 3442 rd->err_code = RDMA_FAILED; 3443 } 3444 } 3445 mutex_exit(&rib_stat->listen_lock); 3446 } 3447 3448 /* XXXX */ 3449 /* ARGSUSED */ 3450 static void 3451 rib_listen_stop(struct rdma_svc_data *svcdata) 3452 { 3453 rib_hca_t *hca; 3454 3455 mutex_enter(&rib_stat->listen_lock); 3456 /* 3457 * KRPC called the RDMATF to stop the listeners; this means we 3458 * stop passing incoming or received requests to the KRPC master 3459 * transport handle for RDMA-IB. It also means that the 3460 * master transport handle, responsible for us, is going away. 3461 */ 3462 mutex_enter(&plugin_state_lock); 3463 plugin_state = NO_ACCEPT; 3464 if (svcdata != NULL) 3465 svcdata->active = 0; 3466 mutex_exit(&plugin_state_lock); 3467 3468 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 3469 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 3470 /* 3471 * First check if the HCA is still attached 3472 */ 3473 rw_enter(&hca->state_lock, RW_READER); 3474 if (hca->state == HCA_DETACHED) { 3475 rw_exit(&hca->state_lock); 3476 continue; 3477 } 3478 rib_close_channels(&hca->srv_conn_list); 3479 rib_stop_services(hca); 3480 rw_exit(&hca->state_lock); 3481 } 3482 rw_exit(&rib_stat->hcas_list_lock); 3483 3484 /* 3485 * Avoid rib_listen() using the stale q field. 3486 * This could happen if a port goes up after all services 3487 * are already unregistered. 3488 */ 3489 rib_stat->q = NULL; 3490 mutex_exit(&rib_stat->listen_lock); 3491 } 3492 3493 /* 3494 * Traverse the HCA's service list to unbind and deregister services.
3495 * For each bound service of HCA to be removed, first find the corresponding 3496 * service handle (srv_hdl) and then unbind the service by calling 3497 * ibt_unbind_service(). 3498 */ 3499 static void 3500 rib_stop_services(rib_hca_t *hca) 3501 { 3502 rib_hca_service_t *srv_list, *to_remove; 3503 3504 /* 3505 * unbind and deregister the services for this service type. 3506 * Right now there is only one service type. In future it will 3507 * be passed down to this function. 3508 */ 3509 rw_enter(&hca->bound_services_lock, RW_READER); 3510 srv_list = hca->bound_services; 3511 hca->bound_services = NULL; 3512 rw_exit(&hca->bound_services_lock); 3513 3514 while (srv_list != NULL) { 3515 rib_service_t *sc; 3516 3517 to_remove = srv_list; 3518 srv_list = to_remove->next; 3519 rw_enter(&rib_stat->service_list_lock, RW_READER); 3520 for (sc = rib_stat->service_list; 3521 sc && (sc->srv_id != to_remove->srv_id); 3522 sc = sc->next) 3523 ; 3524 /* 3525 * if sc is NULL then the service doesn't exist anymore, 3526 * probably just removed completely through rib_stat. 3527 */ 3528 if (sc != NULL) 3529 (void) ibt_unbind_service(sc->srv_hdl, 3530 to_remove->sbind_hdl); 3531 rw_exit(&rib_stat->service_list_lock); 3532 kmem_free(to_remove, sizeof (rib_hca_service_t)); 3533 } 3534 } 3535 3536 static struct svc_recv * 3537 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) 3538 { 3539 struct svc_recv *recvp; 3540 3541 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); 3542 recvp->vaddr = sgl->ds_va; 3543 recvp->qp = qp; 3544 recvp->bytes_xfer = 0; 3545 return (recvp); 3546 } 3547 3548 static int 3549 rib_free_svc_recv(struct svc_recv *recvp) 3550 { 3551 kmem_free(recvp, sizeof (*recvp)); 3552 3553 return (0); 3554 } 3555 3556 static struct reply * 3557 rib_addreplylist(rib_qp_t *qp, uint32_t msgid) 3558 { 3559 struct reply *rep; 3560 3561 3562 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); 3563 if (rep == NULL) { 3564 DTRACE_PROBE(rpcib__i__addrreply__nomem); 3565 return (NULL); 3566 } 3567 rep->xid = msgid; 3568 rep->vaddr_cq = NULL; 3569 rep->bytes_xfer = 0; 3570 rep->status = (uint_t)REPLY_WAIT; 3571 rep->prev = NULL; 3572 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); 3573 3574 mutex_enter(&qp->replylist_lock); 3575 if (qp->replylist) { 3576 rep->next = qp->replylist; 3577 qp->replylist->prev = rep; 3578 } 3579 qp->rep_list_size++; 3580 3581 DTRACE_PROBE1(rpcib__i__addrreply__listsize, 3582 int, qp->rep_list_size); 3583 3584 qp->replylist = rep; 3585 mutex_exit(&qp->replylist_lock); 3586 3587 return (rep); 3588 } 3589 3590 static rdma_stat 3591 rib_rem_replylist(rib_qp_t *qp) 3592 { 3593 struct reply *r, *n; 3594 3595 mutex_enter(&qp->replylist_lock); 3596 for (r = qp->replylist; r != NULL; r = n) { 3597 n = r->next; 3598 (void) rib_remreply(qp, r); 3599 } 3600 mutex_exit(&qp->replylist_lock); 3601 3602 return (RDMA_SUCCESS); 3603 } 3604 3605 static int 3606 rib_remreply(rib_qp_t *qp, struct reply *rep) 3607 { 3608 3609 ASSERT(MUTEX_HELD(&qp->replylist_lock)); 3610 if (rep->prev) { 3611 rep->prev->next = rep->next; 3612 } 3613 if (rep->next) { 3614 rep->next->prev = rep->prev; 3615 } 3616 if (qp->replylist == rep) 3617 qp->replylist = rep->next; 3618 3619 cv_destroy(&rep->wait_cv); 3620 qp->rep_list_size--; 3621 3622 DTRACE_PROBE1(rpcib__i__remreply__listsize, 3623 int, qp->rep_list_size); 3624 3625 kmem_free(rep, sizeof (*rep)); 3626 3627 return (0); 3628 } 3629 3630 rdma_stat 3631 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3632 struct mrc *buf_handle) 3633 { 
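	/*
	 * On success buf_handle carries the MR handle (mrc_linfo) and the
	 * local/remote keys (mrc_lmr/mrc_rmr) that callers in this file
	 * later use as the SGL ds_key and the RDMA rdma_rkey respectively.
	 */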
3634 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3635 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3636 rdma_stat status; 3637 rib_hca_t *hca = (ctoqp(conn))->hca; 3638 3639 /* 3640 * Note: ALL buffer pools use the same memory type RDMARW. 3641 */ 3642 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3643 if (status == RDMA_SUCCESS) { 3644 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3645 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3646 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3647 } else { 3648 buf_handle->mrc_linfo = NULL; 3649 buf_handle->mrc_lmr = 0; 3650 buf_handle->mrc_rmr = 0; 3651 } 3652 return (status); 3653 } 3654 3655 static rdma_stat 3656 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, 3657 ibt_mr_flags_t spec, 3658 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3659 { 3660 ibt_mr_attr_t mem_attr; 3661 ibt_status_t ibt_status; 3662 mem_attr.mr_vaddr = (uintptr_t)buf; 3663 mem_attr.mr_len = (ib_msglen_t)size; 3664 mem_attr.mr_as = (struct as *)(caddr_t)adsp; 3665 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3666 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3667 IBT_MR_ENABLE_WINDOW_BIND | spec; 3668 3669 rw_enter(&hca->state_lock, RW_READER); 3670 if (hca->state != HCA_DETACHED) { 3671 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3672 &mem_attr, mr_hdlp, mr_descp); 3673 rw_exit(&hca->state_lock); 3674 } else { 3675 rw_exit(&hca->state_lock); 3676 return (RDMA_FAILED); 3677 } 3678 3679 if (ibt_status != IBT_SUCCESS) { 3680 return (RDMA_FAILED); 3681 } 3682 return (RDMA_SUCCESS); 3683 } 3684 3685 rdma_stat 3686 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3687 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 3688 { 3689 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3690 rib_lrc_entry_t *l; 3691 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3692 rdma_stat status; 3693 rib_hca_t *hca = (ctoqp(conn))->hca; 3694 3695 /* 3696 * Non-coherent memory registration. 3697 */ 3698 l = (rib_lrc_entry_t *)lrc; 3699 if (l) { 3700 if (l->registered) { 3701 buf_handle->mrc_linfo = 3702 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3703 buf_handle->mrc_lmr = 3704 (uint32_t)l->lrc_mhandle.mrc_lmr; 3705 buf_handle->mrc_rmr = 3706 (uint32_t)l->lrc_mhandle.mrc_rmr; 3707 *sync_handle = (RIB_SYNCMEM_HANDLE) 3708 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3709 return (RDMA_SUCCESS); 3710 } else { 3711 /* Always register the whole buffer */ 3712 buf = (caddr_t)l->lrc_buf; 3713 buflen = l->lrc_len; 3714 } 3715 } 3716 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3717 3718 if (status == RDMA_SUCCESS) { 3719 if (l) { 3720 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; 3721 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey; 3722 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey; 3723 l->registered = TRUE; 3724 } 3725 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3726 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3727 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3728 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3729 } else { 3730 buf_handle->mrc_linfo = NULL; 3731 buf_handle->mrc_lmr = 0; 3732 buf_handle->mrc_rmr = 0; 3733 } 3734 return (status); 3735 } 3736 3737 /* ARGSUSED */ 3738 rdma_stat 3739 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3740 { 3741 rib_hca_t *hca = (ctoqp(conn))->hca; 3742 /* 3743 * Allow memory deregistration even if HCA is 3744 * getting detached. 
Need all outstanding 3745 * memory registrations to be deregistered 3746 * before HCA_DETACH_EVENT can be accepted. 3747 */ 3748 (void) ibt_deregister_mr(hca->hca_hdl, 3749 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 3750 return (RDMA_SUCCESS); 3751 } 3752 3753 /* ARGSUSED */ 3754 rdma_stat 3755 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3756 RIB_SYNCMEM_HANDLE sync_handle, void *lrc) 3757 { 3758 rib_lrc_entry_t *l; 3759 l = (rib_lrc_entry_t *)lrc; 3760 if (l) 3761 if (l->registered) 3762 return (RDMA_SUCCESS); 3763 3764 (void) rib_deregistermem(conn, buf, buf_handle); 3765 3766 return (RDMA_SUCCESS); 3767 } 3768 3769 /* ARGSUSED */ 3770 rdma_stat 3771 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3772 int len, int cpu) 3773 { 3774 ibt_status_t status; 3775 rib_hca_t *hca = (ctoqp(conn))->hca; 3776 ibt_mr_sync_t mr_segment; 3777 3778 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3779 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 3780 mr_segment.ms_len = (ib_memlen_t)len; 3781 if (cpu) { 3782 /* make incoming data visible to memory */ 3783 mr_segment.ms_flags = IBT_SYNC_WRITE; 3784 } else { 3785 /* make memory changes visible to IO */ 3786 mr_segment.ms_flags = IBT_SYNC_READ; 3787 } 3788 rw_enter(&hca->state_lock, RW_READER); 3789 if (hca->state != HCA_DETACHED) { 3790 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3791 rw_exit(&hca->state_lock); 3792 } else { 3793 rw_exit(&hca->state_lock); 3794 return (RDMA_FAILED); 3795 } 3796 3797 if (status == IBT_SUCCESS) 3798 return (RDMA_SUCCESS); 3799 else { 3800 return (RDMA_FAILED); 3801 } 3802 } 3803 3804 /* 3805 * XXXX ???? 3806 */ 3807 static rdma_stat 3808 rib_getinfo(rdma_info_t *info) 3809 { 3810 /* 3811 * XXXX Hack! 3812 */ 3813 info->addrlen = 16; 3814 info->mts = 1000000; 3815 info->mtu = 1000000; 3816 3817 return (RDMA_SUCCESS); 3818 } 3819 3820 rib_bufpool_t * 3821 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3822 { 3823 rib_bufpool_t *rbp = NULL; 3824 bufpool_t *bp = NULL; 3825 caddr_t buf; 3826 ibt_mr_attr_t mem_attr; 3827 ibt_status_t ibt_status; 3828 int i, j; 3829 3830 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3831 3832 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3833 num * sizeof (void *), KM_SLEEP); 3834 3835 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3836 bp->numelems = num; 3837 3838 3839 switch (ptype) { 3840 case SEND_BUFFER: 3841 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3842 bp->rsize = RPC_MSG_SZ; 3843 break; 3844 case RECV_BUFFER: 3845 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3846 bp->rsize = RPC_BUF_SIZE; 3847 break; 3848 default: 3849 goto fail; 3850 } 3851 3852 /* 3853 * Register the pool. 
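 * The pool is a single kmem area of num * rsize bytes; each
 * rsize-sized slot is registered as its own memory region below and
 * its handle/descriptor are kept at the matching index in
 * mr_hdl[]/mr_desc[].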
3854 */ 3855 bp->bufsize = num * bp->rsize; 3856 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3857 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3858 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3859 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3860 sizeof (ibt_mr_desc_t), KM_SLEEP); 3861 rw_enter(&hca->state_lock, RW_READER); 3862 3863 if (hca->state == HCA_DETACHED) { 3864 rw_exit(&hca->state_lock); 3865 goto fail; 3866 } 3867 3868 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3869 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3870 mem_attr.mr_vaddr = (uintptr_t)buf; 3871 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3872 mem_attr.mr_as = NULL; 3873 ibt_status = ibt_register_mr(hca->hca_hdl, 3874 hca->pd_hdl, &mem_attr, 3875 &rbp->mr_hdl[i], 3876 &rbp->mr_desc[i]); 3877 if (ibt_status != IBT_SUCCESS) { 3878 for (j = 0; j < i; j++) { 3879 (void) ibt_deregister_mr(hca->hca_hdl, 3880 rbp->mr_hdl[j]); 3881 } 3882 rw_exit(&hca->state_lock); 3883 goto fail; 3884 } 3885 } 3886 rw_exit(&hca->state_lock); 3887 buf = (caddr_t)bp->buf; 3888 for (i = 0; i < num; i++, buf += bp->rsize) { 3889 bp->buflist[i] = (void *)buf; 3890 } 3891 bp->buffree = num - 1; /* no. of free buffers */ 3892 rbp->bpool = bp; 3893 3894 return (rbp); 3895 fail: 3896 if (bp) { 3897 if (bp->buf) 3898 kmem_free(bp->buf, bp->bufsize); 3899 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3900 } 3901 if (rbp) { 3902 if (rbp->mr_hdl) 3903 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3904 if (rbp->mr_desc) 3905 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3906 kmem_free(rbp, sizeof (rib_bufpool_t)); 3907 } 3908 return (NULL); 3909 } 3910 3911 static void 3912 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3913 { 3914 int i; 3915 rib_bufpool_t *rbp = NULL; 3916 bufpool_t *bp; 3917 3918 /* 3919 * Obtain pool address based on type of pool 3920 */ 3921 switch (ptype) { 3922 case SEND_BUFFER: 3923 rbp = hca->send_pool; 3924 break; 3925 case RECV_BUFFER: 3926 rbp = hca->recv_pool; 3927 break; 3928 default: 3929 return; 3930 } 3931 if (rbp == NULL) 3932 return; 3933 3934 bp = rbp->bpool; 3935 3936 /* 3937 * Deregister the pool memory and free it. 3938 */ 3939 for (i = 0; i < bp->numelems; i++) { 3940 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3941 } 3942 } 3943 3944 static void 3945 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3946 { 3947 3948 rib_bufpool_t *rbp = NULL; 3949 bufpool_t *bp; 3950 3951 /* 3952 * Obtain pool address based on type of pool 3953 */ 3954 switch (ptype) { 3955 case SEND_BUFFER: 3956 rbp = hca->send_pool; 3957 break; 3958 case RECV_BUFFER: 3959 rbp = hca->recv_pool; 3960 break; 3961 default: 3962 return; 3963 } 3964 if (rbp == NULL) 3965 return; 3966 3967 bp = rbp->bpool; 3968 3969 /* 3970 * Free the pool memory. 3971 */ 3972 if (rbp->mr_hdl) 3973 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 3974 3975 if (rbp->mr_desc) 3976 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 3977 if (bp->buf) 3978 kmem_free(bp->buf, bp->bufsize); 3979 mutex_destroy(&bp->buflock); 3980 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 3981 kmem_free(rbp, sizeof (rib_bufpool_t)); 3982 } 3983 3984 void 3985 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 3986 { 3987 /* 3988 * Deregister the pool memory and free it. 3989 */ 3990 rib_rbufpool_deregister(hca, ptype); 3991 rib_rbufpool_free(hca, ptype); 3992 } 3993 3994 /* 3995 * Fetch a buffer from the pool of type specified in rdbuf->type. 
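 * (RDMA_LONG_BUFFER requests are satisfied from the registered
 * buffer cache via rib_get_cache_buf(); SEND_BUFFER and RECV_BUFFER
 * come from the pre-registered pools and are returned with fixed
 * lengths of RPC_MSG_SZ and RPC_BUF_SIZE respectively.)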
3996 */ 3997 static rdma_stat 3998 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 3999 { 4000 rib_lrc_entry_t *rlep; 4001 4002 if (rdbuf->type == RDMA_LONG_BUFFER) { 4003 rlep = rib_get_cache_buf(conn, rdbuf->len); 4004 rdbuf->rb_private = (caddr_t)rlep; 4005 rdbuf->addr = rlep->lrc_buf; 4006 rdbuf->handle = rlep->lrc_mhandle; 4007 return (RDMA_SUCCESS); 4008 } 4009 4010 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 4011 if (rdbuf->addr) { 4012 switch (rdbuf->type) { 4013 case SEND_BUFFER: 4014 rdbuf->len = RPC_MSG_SZ; /* 1K */ 4015 break; 4016 case RECV_BUFFER: 4017 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 4018 break; 4019 default: 4020 rdbuf->len = 0; 4021 } 4022 return (RDMA_SUCCESS); 4023 } else 4024 return (RDMA_FAILED); 4025 } 4026 4027 /* 4028 * Fetch a buffer of specified type. 4029 * Note that rdbuf->handle is mw's rkey. 4030 */ 4031 static void * 4032 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4033 { 4034 rib_qp_t *qp = ctoqp(conn); 4035 rib_hca_t *hca = qp->hca; 4036 rdma_btype ptype = rdbuf->type; 4037 void *buf; 4038 rib_bufpool_t *rbp = NULL; 4039 bufpool_t *bp; 4040 int i; 4041 4042 /* 4043 * Obtain pool address based on type of pool 4044 */ 4045 switch (ptype) { 4046 case SEND_BUFFER: 4047 rbp = hca->send_pool; 4048 break; 4049 case RECV_BUFFER: 4050 rbp = hca->recv_pool; 4051 break; 4052 default: 4053 return (NULL); 4054 } 4055 if (rbp == NULL) 4056 return (NULL); 4057 4058 bp = rbp->bpool; 4059 4060 mutex_enter(&bp->buflock); 4061 if (bp->buffree < 0) { 4062 mutex_exit(&bp->buflock); 4063 return (NULL); 4064 } 4065 4066 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */ 4067 buf = bp->buflist[bp->buffree]; 4068 rdbuf->addr = buf; 4069 rdbuf->len = bp->rsize; 4070 for (i = bp->numelems - 1; i >= 0; i--) { 4071 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 4072 rdbuf->handle.mrc_rmr = 4073 (uint32_t)rbp->mr_desc[i].md_rkey; 4074 rdbuf->handle.mrc_linfo = 4075 (uintptr_t)rbp->mr_hdl[i]; 4076 rdbuf->handle.mrc_lmr = 4077 (uint32_t)rbp->mr_desc[i].md_lkey; 4078 bp->buffree--; 4079 4080 mutex_exit(&bp->buflock); 4081 4082 return (buf); 4083 } 4084 } 4085 4086 mutex_exit(&bp->buflock); 4087 4088 return (NULL); 4089 } 4090 4091 static void 4092 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 4093 { 4094 4095 if (rdbuf->type == RDMA_LONG_BUFFER) { 4096 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private); 4097 rdbuf->rb_private = NULL; 4098 return; 4099 } 4100 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 4101 } 4102 4103 static void 4104 rib_rbuf_free(CONN *conn, int ptype, void *buf) 4105 { 4106 rib_qp_t *qp = ctoqp(conn); 4107 rib_hca_t *hca = qp->hca; 4108 rib_bufpool_t *rbp = NULL; 4109 bufpool_t *bp; 4110 4111 /* 4112 * Obtain pool address based on type of pool 4113 */ 4114 switch (ptype) { 4115 case SEND_BUFFER: 4116 rbp = hca->send_pool; 4117 break; 4118 case RECV_BUFFER: 4119 rbp = hca->recv_pool; 4120 break; 4121 default: 4122 return; 4123 } 4124 if (rbp == NULL) 4125 return; 4126 4127 bp = rbp->bpool; 4128 4129 mutex_enter(&bp->buflock); 4130 if (++bp->buffree >= bp->numelems) { 4131 /* 4132 * Should never happen 4133 */ 4134 bp->buffree--; 4135 } else { 4136 bp->buflist[bp->buffree] = buf; 4137 } 4138 mutex_exit(&bp->buflock); 4139 } 4140 4141 static rdma_stat 4142 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 4143 { 4144 rw_enter(&connlist->conn_lock, RW_WRITER); 4145 if (connlist->conn_hd) { 4146 cn->c_next = connlist->conn_hd; 4147 connlist->conn_hd->c_prev = cn; 4148 } 4149 connlist->conn_hd = cn; 4150 
rw_exit(&connlist->conn_lock); 4151 4152 return (RDMA_SUCCESS); 4153 } 4154 4155 static rdma_stat 4156 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) 4157 { 4158 rw_enter(&connlist->conn_lock, RW_WRITER); 4159 if (cn->c_prev) { 4160 cn->c_prev->c_next = cn->c_next; 4161 } 4162 if (cn->c_next) { 4163 cn->c_next->c_prev = cn->c_prev; 4164 } 4165 if (connlist->conn_hd == cn) 4166 connlist->conn_hd = cn->c_next; 4167 rw_exit(&connlist->conn_lock); 4168 4169 return (RDMA_SUCCESS); 4170 } 4171 4172 /* ARGSUSED */ 4173 static rdma_stat 4174 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, 4175 int addr_type, void *handle, CONN **conn) 4176 { 4177 rdma_stat status; 4178 rpcib_ping_t rpt; 4179 4180 status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn); 4181 return (status); 4182 } 4183 4184 /* 4185 * rib_find_hca_connection 4186 * 4187 * if there is an existing connection to the specified address then 4188 * it will be returned in conn, otherwise conn will be set to NULL. 4189 * Also cleans up any connection that is in error state. 4190 */ 4191 static int 4192 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, 4193 struct netbuf *d_svcaddr, CONN **conn) 4194 { 4195 CONN *cn; 4196 clock_t cv_stat, timout; 4197 4198 *conn = NULL; 4199 again: 4200 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4201 cn = hca->cl_conn_list.conn_hd; 4202 while (cn != NULL) { 4203 /* 4204 * First, clear up any connection in the ERROR state 4205 */ 4206 mutex_enter(&cn->c_lock); 4207 if (cn->c_state == C_ERROR_CONN) { 4208 if (cn->c_ref == 0) { 4209 /* 4210 * Remove connection from list and destroy it. 4211 */ 4212 cn->c_state = C_DISCONN_PEND; 4213 mutex_exit(&cn->c_lock); 4214 rw_exit(&hca->cl_conn_list.conn_lock); 4215 rib_conn_close((void *)cn); 4216 goto again; 4217 } 4218 mutex_exit(&cn->c_lock); 4219 cn = cn->c_next; 4220 continue; 4221 } 4222 if (cn->c_state == C_DISCONN_PEND) { 4223 mutex_exit(&cn->c_lock); 4224 cn = cn->c_next; 4225 continue; 4226 } 4227 4228 /* 4229 * source address is only checked for if there is one, 4230 * this is the case for retries. 4231 */ 4232 if ((cn->c_raddr.len == d_svcaddr->len) && 4233 (bcmp(d_svcaddr->buf, cn->c_raddr.buf, 4234 d_svcaddr->len) == 0) && 4235 ((s_svcaddr->len == 0) || 4236 ((cn->c_laddr.len == s_svcaddr->len) && 4237 (bcmp(s_svcaddr->buf, cn->c_laddr.buf, 4238 s_svcaddr->len) == 0)))) { 4239 /* 4240 * Our connection. Give up conn list lock 4241 * as we are done traversing the list. 4242 */ 4243 rw_exit(&hca->cl_conn_list.conn_lock); 4244 if (cn->c_state == C_CONNECTED) { 4245 cn->c_ref++; /* sharing a conn */ 4246 mutex_exit(&cn->c_lock); 4247 *conn = cn; 4248 return (RDMA_SUCCESS); 4249 } 4250 if (cn->c_state == C_CONN_PEND) { 4251 /* 4252 * Hold a reference to this conn before 4253 * we give up the lock. 
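 * The waiter bumps c_ref, then sleeps on c_cv for at most
 * CONN_WAIT_TIME seconds; on interrupt or timeout the reference is
 * dropped again via rib_conn_release_locked().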
4254 */ 4255 cn->c_ref++; 4256 timout = ddi_get_lbolt() + 4257 drv_usectohz(CONN_WAIT_TIME * 1000000); 4258 while ((cv_stat = cv_timedwait_sig(&cn->c_cv, 4259 &cn->c_lock, timout)) > 0 && 4260 cn->c_state == C_CONN_PEND) 4261 ; 4262 if (cv_stat == 0) { 4263 (void) rib_conn_release_locked(cn); 4264 return (RDMA_INTR); 4265 } 4266 if (cv_stat < 0) { 4267 (void) rib_conn_release_locked(cn); 4268 return (RDMA_TIMEDOUT); 4269 } 4270 if (cn->c_state == C_CONNECTED) { 4271 *conn = cn; 4272 mutex_exit(&cn->c_lock); 4273 return (RDMA_SUCCESS); 4274 } else { 4275 (void) rib_conn_release_locked(cn); 4276 return (RDMA_TIMEDOUT); 4277 } 4278 } 4279 } 4280 mutex_exit(&cn->c_lock); 4281 cn = cn->c_next; 4282 } 4283 rw_exit(&hca->cl_conn_list.conn_lock); 4284 *conn = NULL; 4285 return (RDMA_FAILED); 4286 } 4287 4288 /* 4289 * Connection management. 4290 * IBTF does not support recycling of channels. So connections are only 4291 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or 4292 * C_DISCONN_PEND state. No C_IDLE state. 4293 * C_CONN_PEND state: Connection establishment in progress to the server. 4294 * C_CONNECTED state: A connection when created is in C_CONNECTED state. 4295 * It has an RC channel associated with it. ibt_post_send/recv are allowed 4296 * only in this state. 4297 * C_ERROR_CONN state: A connection transitions to this state when WRs on the 4298 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event 4299 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA. 4300 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when 4301 * c_ref drops to 0 (this indicates that RPC has no more references to this 4302 * connection), the connection should be destroyed. A connection transitions 4303 * into this state when it is being destroyed. 4304 */ 4305 /* ARGSUSED */ 4306 static rdma_stat 4307 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, 4308 int addr_type, rpcib_ping_t *rpt, CONN **conn) 4309 { 4310 CONN *cn; 4311 int status; 4312 rib_hca_t *hca; 4313 rib_qp_t *qp; 4314 int s_addr_len; 4315 char *s_addr_buf; 4316 4317 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 4318 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 4319 rw_enter(&hca->state_lock, RW_READER); 4320 if (hca->state != HCA_DETACHED) { 4321 status = rib_find_hca_connection(hca, s_svcaddr, 4322 d_svcaddr, conn); 4323 rw_exit(&hca->state_lock); 4324 if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) { 4325 rw_exit(&rib_stat->hcas_list_lock); 4326 return (status); 4327 } 4328 } else 4329 rw_exit(&hca->state_lock); 4330 } 4331 rw_exit(&rib_stat->hcas_list_lock); 4332 4333 /* 4334 * No existing connection found, establish a new connection. 4335 */ 4336 bzero(rpt, sizeof (rpcib_ping_t)); 4337 4338 status = rib_ping_srv(addr_type, d_svcaddr, rpt); 4339 if (status != RDMA_SUCCESS) { 4340 return (RDMA_FAILED); 4341 } 4342 hca = rpt->hca; 4343 4344 if (rpt->srcip.family == AF_INET) { 4345 s_addr_len = sizeof (rpt->srcip.un.ip4addr); 4346 s_addr_buf = (char *)&rpt->srcip.un.ip4addr; 4347 } else if (rpt->srcip.family == AF_INET6) { 4348 s_addr_len = sizeof (rpt->srcip.un.ip6addr); 4349 s_addr_buf = (char *)&rpt->srcip.un.ip6addr; 4350 } else { 4351 return (RDMA_FAILED); 4352 } 4353 4354 /* 4355 * Channel to server doesn't exist yet, create one. 
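 * The new channel starts out in C_CONN_PEND with a single reference;
 * the local address and the netid ("tcp" or "tcp6") are recorded
 * before the connect attempt so that later connection lookups can
 * match on the source address.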
4356 */ 4357 if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) { 4358 return (RDMA_FAILED); 4359 } 4360 cn = qptoc(qp); 4361 cn->c_state = C_CONN_PEND; 4362 cn->c_ref = 1; 4363 4364 cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP); 4365 bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len); 4366 cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len; 4367 4368 if (rpt->srcip.family == AF_INET) { 4369 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP); 4370 (void) strcpy(cn->c_netid, RIBNETID_TCP); 4371 } else { 4372 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP); 4373 (void) strcpy(cn->c_netid, RIBNETID_TCP6); 4374 } 4375 4376 /* 4377 * Add to conn list. 4378 * We had given up the READER lock. In the time since then, 4379 * another thread might have created the connection we are 4380 * trying here. But for now, that is quite all right - there 4381 * might be two connections between a pair of hosts instead 4382 * of one. If we really want to close that window, 4383 * then we need to check the list after acquiring the 4384 * WRITER lock. 4385 */ 4386 (void) rib_add_connlist(cn, &hca->cl_conn_list); 4387 status = rib_conn_to_srv(hca, qp, rpt); 4388 mutex_enter(&cn->c_lock); 4389 4390 if (cn->c_flags & C_CLOSE_PENDING) { 4391 /* 4392 * This handles the case where the module or 4393 * HCA detached while the connection was being 4394 * established. In such a case, close the 4395 * connection immediately if this is the 4396 * only reference. 4397 */ 4398 if (cn->c_ref == 1) { 4399 cn->c_ref--; 4400 cn->c_state = C_DISCONN_PEND; 4401 mutex_exit(&cn->c_lock); 4402 rib_conn_close((void *)cn); 4403 return (RDMA_FAILED); 4404 } 4405 4406 /* 4407 * Connection to be closed later when c_ref = 0 4408 */ 4409 status = RDMA_FAILED; 4410 } 4411 4412 if (status == RDMA_SUCCESS) { 4413 cn->c_state = C_CONNECTED; 4414 *conn = cn; 4415 } else { 4416 cn->c_state = C_ERROR_CONN; 4417 cn->c_ref--; 4418 } 4419 cv_signal(&cn->c_cv); 4420 mutex_exit(&cn->c_lock); 4421 return (status); 4422 } 4423 4424 static void 4425 rib_conn_close(void *rarg) 4426 { 4427 CONN *conn = (CONN *)rarg; 4428 rib_qp_t *qp = ctoqp(conn); 4429 4430 mutex_enter(&conn->c_lock); 4431 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { 4432 4433 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); 4434 4435 /* 4436 * Live connection in CONNECTED state. 4437 */ 4438 if (conn->c_state == C_CONNECTED) { 4439 conn->c_state = C_ERROR_CONN; 4440 } 4441 mutex_exit(&conn->c_lock); 4442 4443 rib_close_a_channel(conn); 4444 4445 mutex_enter(&conn->c_lock); 4446 conn->c_flags &= ~C_CLOSE_PENDING; 4447 } 4448 4449 mutex_exit(&conn->c_lock); 4450 4451 if (qp->mode == RIB_SERVER) 4452 (void) rib_disconnect_channel(conn, 4453 &qp->hca->srv_conn_list); 4454 else 4455 (void) rib_disconnect_channel(conn, 4456 &qp->hca->cl_conn_list); 4457 } 4458 4459 static void 4460 rib_conn_timeout_call(void *carg) 4461 { 4462 time_t idle_time; 4463 CONN *conn = (CONN *)carg; 4464 rib_hca_t *hca = ctoqp(conn)->hca; 4465 int error; 4466 4467 mutex_enter(&conn->c_lock); 4468 if ((conn->c_ref > 0) || 4469 (conn->c_state == C_DISCONN_PEND)) { 4470 conn->c_timeout = NULL; 4471 mutex_exit(&conn->c_lock); 4472 return; 4473 } 4474 4475 idle_time = (gethrestime_sec() - conn->c_last_used); 4476 4477 if ((idle_time <= rib_conn_timeout) && 4478 (conn->c_state != C_ERROR_CONN)) { 4479 /* 4480 * There was activity after the last timeout. 4481 * Extend the conn life. Unless the conn is 4482 * already in error state.
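 * The timeout below is re-armed for just the unused portion of the
 * idle allowance, i.e. (rib_conn_timeout - idle_time) seconds.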
4483 */ 4484 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4485 SEC_TO_TICK(rib_conn_timeout - idle_time)); 4486 mutex_exit(&conn->c_lock); 4487 return; 4488 } 4489 4490 error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close, 4491 (void *)conn, DDI_NOSLEEP); 4492 4493 /* 4494 * If taskq dispatch fails above, then reset the timeout 4495 * to try again after 10 secs. 4496 */ 4497 4498 if (error != DDI_SUCCESS) { 4499 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4500 SEC_TO_TICK(RDMA_CONN_REAP_RETRY)); 4501 mutex_exit(&conn->c_lock); 4502 return; 4503 } 4504 4505 conn->c_state = C_DISCONN_PEND; 4506 mutex_exit(&conn->c_lock); 4507 } 4508 4509 static rdma_stat 4510 rib_conn_release(CONN *conn) 4511 { 4512 mutex_enter(&conn->c_lock); 4513 return (rib_conn_release_locked(conn)); 4514 } 4515 4516 /* 4517 * Expects conn->c_lock to be held on entry. 4518 * c_lock released on return 4519 */ 4520 static rdma_stat 4521 rib_conn_release_locked(CONN *conn) 4522 { 4523 conn->c_ref--; 4524 4525 conn->c_last_used = gethrestime_sec(); 4526 if (conn->c_ref > 0) { 4527 mutex_exit(&conn->c_lock); 4528 return (RDMA_SUCCESS); 4529 } 4530 4531 /* 4532 * If a conn is C_ERROR_CONN, close the channel. 4533 */ 4534 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) { 4535 conn->c_state = C_DISCONN_PEND; 4536 mutex_exit(&conn->c_lock); 4537 rib_conn_close((void *)conn); 4538 return (RDMA_SUCCESS); 4539 } 4540 4541 /* 4542 * c_ref == 0, set a timeout for conn release 4543 */ 4544 4545 if (conn->c_timeout == NULL) { 4546 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4547 SEC_TO_TICK(rib_conn_timeout)); 4548 } 4549 4550 mutex_exit(&conn->c_lock); 4551 return (RDMA_SUCCESS); 4552 } 4553 4554 /* 4555 * Add at front of list 4556 */ 4557 static struct rdma_done_list * 4558 rdma_done_add(rib_qp_t *qp, uint32_t xid) 4559 { 4560 struct rdma_done_list *rd; 4561 4562 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4563 4564 rd = kmem_alloc(sizeof (*rd), KM_SLEEP); 4565 rd->xid = xid; 4566 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL); 4567 4568 rd->prev = NULL; 4569 rd->next = qp->rdlist; 4570 if (qp->rdlist != NULL) 4571 qp->rdlist->prev = rd; 4572 qp->rdlist = rd; 4573 4574 return (rd); 4575 } 4576 4577 static void 4578 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd) 4579 { 4580 struct rdma_done_list *r; 4581 4582 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4583 4584 r = rd->next; 4585 if (r != NULL) { 4586 r->prev = rd->prev; 4587 } 4588 4589 r = rd->prev; 4590 if (r != NULL) { 4591 r->next = rd->next; 4592 } else { 4593 qp->rdlist = rd->next; 4594 } 4595 4596 cv_destroy(&rd->rdma_done_cv); 4597 kmem_free(rd, sizeof (*rd)); 4598 } 4599 4600 static void 4601 rdma_done_rem_list(rib_qp_t *qp) 4602 { 4603 struct rdma_done_list *r, *n; 4604 4605 mutex_enter(&qp->rdlist_lock); 4606 for (r = qp->rdlist; r != NULL; r = n) { 4607 n = r->next; 4608 rdma_done_rm(qp, r); 4609 } 4610 mutex_exit(&qp->rdlist_lock); 4611 } 4612 4613 static void 4614 rdma_done_notify(rib_qp_t *qp, uint32_t xid) 4615 { 4616 struct rdma_done_list *r = qp->rdlist; 4617 4618 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4619 4620 while (r) { 4621 if (r->xid == xid) { 4622 cv_signal(&r->rdma_done_cv); 4623 return; 4624 } else { 4625 r = r->next; 4626 } 4627 } 4628 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid, 4629 int, xid); 4630 } 4631 4632 /* 4633 * Expects conn->c_lock to be held by the caller. 
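 * Closing the RC channel below causes any WRs still outstanding on
 * it to be flushed and completed in error.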
4634 */ 4635 4636 static void 4637 rib_close_a_channel(CONN *conn) 4638 { 4639 rib_qp_t *qp; 4640 qp = ctoqp(conn); 4641 4642 if (qp->qp_hdl == NULL) { 4643 /* channel already freed */ 4644 return; 4645 } 4646 4647 /* 4648 * Call ibt_close_rc_channel in blocking mode 4649 * with no callbacks. 4650 */ 4651 (void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS, 4652 NULL, 0, NULL, NULL, 0); 4653 } 4654 4655 /* 4656 * Goes through all connections and closes the channel 4657 * This will cause all the WRs on those channels to be 4658 * flushed. 4659 */ 4660 static void 4661 rib_close_channels(rib_conn_list_t *connlist) 4662 { 4663 CONN *conn, *tmp; 4664 4665 rw_enter(&connlist->conn_lock, RW_READER); 4666 conn = connlist->conn_hd; 4667 while (conn != NULL) { 4668 mutex_enter(&conn->c_lock); 4669 tmp = conn->c_next; 4670 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { 4671 4672 if (conn->c_state == C_CONN_PEND) { 4673 conn->c_flags |= C_CLOSE_PENDING; 4674 goto next; 4675 } 4676 4677 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); 4678 4679 /* 4680 * Live connection in CONNECTED state. 4681 */ 4682 if (conn->c_state == C_CONNECTED) 4683 conn->c_state = C_ERROR_CONN; 4684 mutex_exit(&conn->c_lock); 4685 4686 rib_close_a_channel(conn); 4687 4688 mutex_enter(&conn->c_lock); 4689 conn->c_flags &= ~C_CLOSE_PENDING; 4690 /* Signal a pending rib_disconnect_channel() */ 4691 cv_signal(&conn->c_cv); 4692 } 4693 next: 4694 mutex_exit(&conn->c_lock); 4695 conn = tmp; 4696 } 4697 rw_exit(&connlist->conn_lock); 4698 } 4699 4700 /* 4701 * Frees up all connections that are no longer being referenced 4702 */ 4703 static void 4704 rib_purge_connlist(rib_conn_list_t *connlist) 4705 { 4706 CONN *conn; 4707 4708 top: 4709 rw_enter(&connlist->conn_lock, RW_READER); 4710 conn = connlist->conn_hd; 4711 while (conn != NULL) { 4712 mutex_enter(&conn->c_lock); 4713 4714 /* 4715 * At this point connection is either in ERROR 4716 * or DISCONN_PEND state. If in DISCONN_PEND state 4717 * then some other thread is culling that connection. 4718 * If not and if c_ref is 0, then destroy the connection. 4719 */ 4720 if (conn->c_ref == 0 && 4721 conn->c_state != C_DISCONN_PEND) { 4722 /* 4723 * Cull the connection 4724 */ 4725 conn->c_state = C_DISCONN_PEND; 4726 mutex_exit(&conn->c_lock); 4727 rw_exit(&connlist->conn_lock); 4728 (void) rib_disconnect_channel(conn, connlist); 4729 goto top; 4730 } else { 4731 /* 4732 * conn disconnect already scheduled or will 4733 * happen from conn_release when c_ref drops to 0. 4734 */ 4735 mutex_exit(&conn->c_lock); 4736 } 4737 conn = conn->c_next; 4738 } 4739 rw_exit(&connlist->conn_lock); 4740 4741 /* 4742 * At this point, only connections with c_ref != 0 are on the list 4743 */ 4744 } 4745 4746 /* 4747 * Free all the HCA resources and close 4748 * the hca. 
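 * This frees the four completion queues (client and server, send and
 * receive), destroys the send and receive buffer pools and the
 * server-side cache, frees the protection domain and finally closes
 * the HCA handle. When no HCAs remain registered, the rpcib RDMA
 * module is unregistered as well.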
4749 */ 4750 4751 static void 4752 rib_free_hca(rib_hca_t *hca) 4753 { 4754 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl); 4755 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl); 4756 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl); 4757 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl); 4758 4759 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t)); 4760 kmem_free(hca->clnt_scq, sizeof (rib_cq_t)); 4761 kmem_free(hca->svc_rcq, sizeof (rib_cq_t)); 4762 kmem_free(hca->svc_scq, sizeof (rib_cq_t)); 4763 4764 rib_rbufpool_destroy(hca, RECV_BUFFER); 4765 rib_rbufpool_destroy(hca, SEND_BUFFER); 4766 rib_destroy_cache(hca); 4767 if (rib_mod.rdma_count == 0) 4768 (void) rdma_unregister_mod(&rib_mod); 4769 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 4770 (void) ibt_close_hca(hca->hca_hdl); 4771 hca->hca_hdl = NULL; 4772 } 4773 4774 4775 static void 4776 rib_stop_hca_services(rib_hca_t *hca) 4777 { 4778 rib_stop_services(hca); 4779 rib_close_channels(&hca->cl_conn_list); 4780 rib_close_channels(&hca->srv_conn_list); 4781 4782 rib_purge_connlist(&hca->cl_conn_list); 4783 rib_purge_connlist(&hca->srv_conn_list); 4784 4785 if ((rib_stat->hcas_list == NULL) && stats_enabled) { 4786 kstat_delete_byname_zone("unix", 0, "rpcib_cache", 4787 GLOBAL_ZONEID); 4788 stats_enabled = FALSE; 4789 } 4790 4791 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 4792 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4793 if (hca->srv_conn_list.conn_hd == NULL && 4794 hca->cl_conn_list.conn_hd == NULL) { 4795 /* 4796 * conn_lists are NULL, so destroy 4797 * buffers, close hca and be done. 4798 */ 4799 rib_free_hca(hca); 4800 } 4801 rw_exit(&hca->cl_conn_list.conn_lock); 4802 rw_exit(&hca->srv_conn_list.conn_lock); 4803 4804 if (hca->hca_hdl != NULL) { 4805 mutex_enter(&hca->inuse_lock); 4806 while (hca->inuse) 4807 cv_wait(&hca->cb_cv, &hca->inuse_lock); 4808 mutex_exit(&hca->inuse_lock); 4809 4810 rib_free_hca(hca); 4811 } 4812 rw_destroy(&hca->bound_services_lock); 4813 4814 if (hca->cleanup_helper != NULL) { 4815 ddi_taskq_destroy(hca->cleanup_helper); 4816 hca->cleanup_helper = NULL; 4817 } 4818 } 4819 4820 /* 4821 * Cleans and closes up all uses of the HCA 4822 */ 4823 static void 4824 rib_detach_hca(ibt_hca_hdl_t hca_hdl) 4825 { 4826 rib_hca_t *hca = NULL; 4827 rib_hca_t **hcap; 4828 4829 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); 4830 for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) { 4831 hca = *hcap; 4832 rw_enter(&hca->state_lock, RW_WRITER); 4833 if (hca->hca_hdl == hca_hdl) { 4834 /* 4835 * Mark as detached and remove from 4836 * hca list. 
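 * The HCA is unlinked from the list while the hcas_list WRITER lock
 * and the per-HCA state lock are held; the actual teardown
 * (rib_stop_hca_services()) and the final kmem_free() happen after
 * the locks have been dropped.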
4837 */ 4838 hca->state = HCA_DETACHED; 4839 *hcap = hca->next; 4840 rib_stat->nhca_inited--; 4841 rib_mod.rdma_count--; 4842 rw_exit(&hca->state_lock); 4843 break; 4844 } 4845 rw_exit(&hca->state_lock); 4846 } 4847 rw_exit(&rib_stat->hcas_list_lock); 4848 4849 if (hca == NULL) 4850 return; 4851 ASSERT(hca->hca_hdl == hca_hdl); 4852 4853 /* 4854 * Stop all services on the HCA 4855 * Go through cl_conn_list and close all rc_channels 4856 * Go through svr_conn_list and close all rc_channels 4857 * Free connections whose c_ref has dropped to 0 4858 * Destroy all CQs 4859 * Deregister and released all buffer pool memory after all 4860 * connections are destroyed 4861 * Free the protection domain 4862 * ibt_close_hca() 4863 */ 4864 rib_stop_hca_services(hca); 4865 4866 kmem_free(hca, sizeof (*hca)); 4867 } 4868 4869 static void 4870 rib_server_side_cache_reclaim(void *argp) 4871 { 4872 cache_avl_struct_t *rcas; 4873 rib_lrc_entry_t *rb; 4874 rib_hca_t *hca = (rib_hca_t *)argp; 4875 4876 rw_enter(&hca->avl_rw_lock, RW_WRITER); 4877 rcas = avl_first(&hca->avl_tree); 4878 if (rcas != NULL) 4879 avl_remove(&hca->avl_tree, rcas); 4880 4881 while (rcas != NULL) { 4882 while (rcas->r.forw != &rcas->r) { 4883 rcas->elements--; 4884 rb = rcas->r.forw; 4885 remque(rb); 4886 if (rb->registered) 4887 (void) rib_deregistermem_via_hca(hca, 4888 rb->lrc_buf, rb->lrc_mhandle); 4889 4890 hca->cache_allocation -= rb->lrc_len; 4891 kmem_free(rb->lrc_buf, rb->lrc_len); 4892 kmem_free(rb, sizeof (rib_lrc_entry_t)); 4893 } 4894 mutex_destroy(&rcas->node_lock); 4895 kmem_cache_free(hca->server_side_cache, rcas); 4896 rcas = avl_first(&hca->avl_tree); 4897 if (rcas != NULL) 4898 avl_remove(&hca->avl_tree, rcas); 4899 } 4900 rw_exit(&hca->avl_rw_lock); 4901 } 4902 4903 static void 4904 rib_server_side_cache_cleanup(void *argp) 4905 { 4906 cache_avl_struct_t *rcas; 4907 rib_lrc_entry_t *rb; 4908 rib_hca_t *hca = (rib_hca_t *)argp; 4909 4910 mutex_enter(&hca->cache_allocation_lock); 4911 if (hca->cache_allocation < cache_limit) { 4912 mutex_exit(&hca->cache_allocation_lock); 4913 return; 4914 } 4915 mutex_exit(&hca->cache_allocation_lock); 4916 4917 rw_enter(&hca->avl_rw_lock, RW_WRITER); 4918 rcas = avl_last(&hca->avl_tree); 4919 if (rcas != NULL) 4920 avl_remove(&hca->avl_tree, rcas); 4921 4922 while (rcas != NULL) { 4923 while (rcas->r.forw != &rcas->r) { 4924 rcas->elements--; 4925 rb = rcas->r.forw; 4926 remque(rb); 4927 if (rb->registered) 4928 (void) rib_deregistermem_via_hca(hca, 4929 rb->lrc_buf, rb->lrc_mhandle); 4930 4931 hca->cache_allocation -= rb->lrc_len; 4932 4933 kmem_free(rb->lrc_buf, rb->lrc_len); 4934 kmem_free(rb, sizeof (rib_lrc_entry_t)); 4935 } 4936 mutex_destroy(&rcas->node_lock); 4937 if (hca->server_side_cache) { 4938 kmem_cache_free(hca->server_side_cache, rcas); 4939 } 4940 4941 if (hca->cache_allocation < cache_limit) { 4942 rw_exit(&hca->avl_rw_lock); 4943 return; 4944 } 4945 4946 rcas = avl_last(&hca->avl_tree); 4947 if (rcas != NULL) 4948 avl_remove(&hca->avl_tree, rcas); 4949 } 4950 rw_exit(&hca->avl_rw_lock); 4951 } 4952 4953 static int 4954 avl_compare(const void *t1, const void *t2) 4955 { 4956 if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len) 4957 return (0); 4958 4959 if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len) 4960 return (-1); 4961 4962 return (1); 4963 } 4964 4965 static void 4966 rib_destroy_cache(rib_hca_t *hca) 4967 { 4968 if (hca->avl_init) { 4969 rib_server_side_cache_reclaim((void *)hca); 4970 if (hca->server_side_cache) { 
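/*
 * The reclaim above has already freed every cached entry, so the
 * kmem cache itself can be destroyed now.
 */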
4971 kmem_cache_destroy(hca->server_side_cache); 4972 hca->server_side_cache = NULL; 4973 } 4974 avl_destroy(&hca->avl_tree); 4975 mutex_destroy(&hca->cache_allocation_lock); 4976 rw_destroy(&hca->avl_rw_lock); 4977 } 4978 hca->avl_init = FALSE; 4979 } 4980 4981 static void 4982 rib_force_cleanup(void *hca) 4983 { 4984 if (((rib_hca_t *)hca)->cleanup_helper != NULL) 4985 (void) ddi_taskq_dispatch( 4986 ((rib_hca_t *)hca)->cleanup_helper, 4987 rib_server_side_cache_cleanup, 4988 (void *)hca, DDI_NOSLEEP); 4989 } 4990 4991 static rib_lrc_entry_t * 4992 rib_get_cache_buf(CONN *conn, uint32_t len) 4993 { 4994 cache_avl_struct_t cas, *rcas; 4995 rib_hca_t *hca = (ctoqp(conn))->hca; 4996 rib_lrc_entry_t *reply_buf; 4997 avl_index_t where = NULL; 4998 uint64_t c_alloc = 0; 4999 5000 if (!hca->avl_init) 5001 goto error_alloc; 5002 5003 cas.len = len; 5004 5005 rw_enter(&hca->avl_rw_lock, RW_READER); 5006 5007 mutex_enter(&hca->cache_allocation_lock); 5008 c_alloc = hca->cache_allocation; 5009 mutex_exit(&hca->cache_allocation_lock); 5010 5011 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas, 5012 &where)) == NULL) { 5013 /* Am I above the cache limit */ 5014 if ((c_alloc + len) >= cache_limit) { 5015 rib_force_cleanup((void *)hca); 5016 rw_exit(&hca->avl_rw_lock); 5017 mutex_enter(&hca->cache_allocation_lock); 5018 hca->cache_misses_above_the_limit ++; 5019 mutex_exit(&hca->cache_allocation_lock); 5020 5021 /* Allocate and register the buffer directly */ 5022 goto error_alloc; 5023 } 5024 5025 rw_exit(&hca->avl_rw_lock); 5026 rw_enter(&hca->avl_rw_lock, RW_WRITER); 5027 5028 /* Recheck to make sure no other thread added the entry in */ 5029 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, 5030 &cas, &where)) == NULL) { 5031 /* Allocate an avl tree entry */ 5032 rcas = (cache_avl_struct_t *) 5033 kmem_cache_alloc(hca->server_side_cache, KM_SLEEP); 5034 5035 bzero(rcas, sizeof (cache_avl_struct_t)); 5036 rcas->elements = 0; 5037 rcas->r.forw = &rcas->r; 5038 rcas->r.back = &rcas->r; 5039 rcas->len = len; 5040 mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL); 5041 avl_insert(&hca->avl_tree, rcas, where); 5042 } 5043 } 5044 5045 mutex_enter(&rcas->node_lock); 5046 5047 if (rcas->r.forw != &rcas->r && rcas->elements > 0) { 5048 reply_buf = rcas->r.forw; 5049 remque(reply_buf); 5050 rcas->elements--; 5051 mutex_exit(&rcas->node_lock); 5052 rw_exit(&hca->avl_rw_lock); 5053 5054 mutex_enter(&hca->cache_allocation_lock); 5055 hca->cache_hits++; 5056 hca->cache_allocation -= len; 5057 mutex_exit(&hca->cache_allocation_lock); 5058 } else { 5059 /* Am I above the cache limit */ 5060 mutex_exit(&rcas->node_lock); 5061 if ((c_alloc + len) >= cache_limit) { 5062 rib_force_cleanup((void *)hca); 5063 rw_exit(&hca->avl_rw_lock); 5064 5065 mutex_enter(&hca->cache_allocation_lock); 5066 hca->cache_misses_above_the_limit++; 5067 mutex_exit(&hca->cache_allocation_lock); 5068 /* Allocate and register the buffer directly */ 5069 goto error_alloc; 5070 } 5071 rw_exit(&hca->avl_rw_lock); 5072 mutex_enter(&hca->cache_allocation_lock); 5073 hca->cache_misses++; 5074 mutex_exit(&hca->cache_allocation_lock); 5075 /* Allocate a reply_buf entry */ 5076 reply_buf = (rib_lrc_entry_t *) 5077 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP); 5078 bzero(reply_buf, sizeof (rib_lrc_entry_t)); 5079 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP); 5080 reply_buf->lrc_len = len; 5081 reply_buf->registered = FALSE; 5082 reply_buf->avl_node = (void *)rcas; 5083 } 5084 5085 return (reply_buf); 5086 5087 
error_alloc: 5088 reply_buf = (rib_lrc_entry_t *) 5089 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP); 5090 bzero(reply_buf, sizeof (rib_lrc_entry_t)); 5091 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP); 5092 reply_buf->lrc_len = len; 5093 reply_buf->registered = FALSE; 5094 reply_buf->avl_node = NULL; 5095 5096 return (reply_buf); 5097 } 5098 5099 /* 5100 * Return a pre-registered buffer back to the cache (without 5101 * deregistering the buffer). 5102 */ 5103 5104 static void 5105 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf) 5106 { 5107 cache_avl_struct_t cas, *rcas; 5108 avl_index_t where = NULL; 5109 rib_hca_t *hca = (ctoqp(conn))->hca; 5110 5111 if (!hca->avl_init) 5112 goto error_free; 5113 5114 cas.len = reg_buf->lrc_len; 5115 rw_enter(&hca->avl_rw_lock, RW_READER); 5116 if ((rcas = (cache_avl_struct_t *) 5117 avl_find(&hca->avl_tree, &cas, &where)) == NULL) { 5118 rw_exit(&hca->avl_rw_lock); 5119 goto error_free; 5120 } else { 5121 cas.len = reg_buf->lrc_len; 5122 mutex_enter(&rcas->node_lock); 5123 insque(reg_buf, &rcas->r); 5124 rcas->elements++; 5125 mutex_exit(&rcas->node_lock); 5126 rw_exit(&hca->avl_rw_lock); 5127 mutex_enter(&hca->cache_allocation_lock); 5128 hca->cache_allocation += cas.len; 5129 mutex_exit(&hca->cache_allocation_lock); 5130 } 5131 5132 return; 5133 5134 error_free: 5135 5136 if (reg_buf->registered) 5137 (void) rib_deregistermem_via_hca(hca, 5138 reg_buf->lrc_buf, reg_buf->lrc_mhandle); 5139 kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len); 5140 kmem_free(reg_buf, sizeof (rib_lrc_entry_t)); 5141 } 5142 5143 static rdma_stat 5144 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf, 5145 uint_t buflen, struct mrc *buf_handle) 5146 { 5147 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 5148 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 5149 rdma_stat status; 5150 5151 5152 /* 5153 * Note: ALL buffer pools use the same memory type RDMARW. 5154 */ 5155 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 5156 if (status == RDMA_SUCCESS) { 5157 buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl; 5158 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 5159 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 5160 } else { 5161 buf_handle->mrc_linfo = NULL; 5162 buf_handle->mrc_lmr = 0; 5163 buf_handle->mrc_rmr = 0; 5164 } 5165 return (status); 5166 } 5167 5168 /* ARGSUSED */ 5169 static rdma_stat 5170 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf, 5171 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle) 5172 { 5173 5174 (void) rib_deregistermem_via_hca(hca, buf, buf_handle); 5175 return (RDMA_SUCCESS); 5176 } 5177 5178 /* ARGSUSED */ 5179 static rdma_stat 5180 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle) 5181 { 5182 5183 (void) ibt_deregister_mr(hca->hca_hdl, 5184 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 5185 return (RDMA_SUCCESS); 5186 } 5187 5188 /* 5189 * Check if the IP interface named by `lifrp' is RDMA-capable. 5190 */ 5191 static boolean_t 5192 rpcib_rdma_capable_interface(struct lifreq *lifrp) 5193 { 5194 char ifname[LIFNAMSIZ]; 5195 char *cp; 5196 5197 if (lifrp->lifr_type == IFT_IB) 5198 return (B_TRUE); 5199 5200 /* 5201 * Strip off the logical interface portion before getting 5202 * intimate with the name.
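 * For example, "lo0:1" is reduced to "lo0" before the loopback
 * comparison below.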
5203 */ 5204 (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); 5205 if ((cp = strchr(ifname, ':')) != NULL) 5206 *cp = '\0'; 5207 5208 return (strcmp("lo0", ifname) == 0); 5209 } 5210 5211 static int 5212 rpcib_do_ip_ioctl(int cmd, int len, void *arg) 5213 { 5214 vnode_t *kkvp, *vp; 5215 TIUSER *tiptr; 5216 struct strioctl iocb; 5217 k_sigset_t smask; 5218 int err = 0; 5219 5220 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) { 5221 if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE, 5222 &tiptr, CRED()) == 0) { 5223 vp = tiptr->fp->f_vnode; 5224 } else { 5225 VN_RELE(kkvp); 5226 return (EPROTO); 5227 } 5228 } else { 5229 return (EPROTO); 5230 } 5231 5232 iocb.ic_cmd = cmd; 5233 iocb.ic_timout = 0; 5234 iocb.ic_len = len; 5235 iocb.ic_dp = (caddr_t)arg; 5236 sigintr(&smask, 0); 5237 err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb); 5238 sigunintr(&smask); 5239 (void) t_kclose(tiptr, 0); 5240 VN_RELE(kkvp); 5241 return (err); 5242 } 5243 5244 /* 5245 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'. 5246 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes. 5247 */ 5248 static int 5249 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep) 5250 { 5251 int err; 5252 struct lifnum lifn; 5253 5254 bzero(&lifn, sizeof (struct lifnum)); 5255 lifn.lifn_family = AF_UNSPEC; 5256 5257 err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn); 5258 if (err != 0) 5259 return (err); 5260 5261 /* 5262 * Pad the interface count to account for additional interfaces that 5263 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 5264 */ 5265 lifn.lifn_count += 4; 5266 5267 bzero(lifcp, sizeof (struct lifconf)); 5268 lifcp->lifc_family = AF_UNSPEC; 5269 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 5270 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 5271 5272 err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp); 5273 if (err != 0) { 5274 kmem_free(lifcp->lifc_buf, *bufsizep); 5275 return (err); 5276 } 5277 return (0); 5278 } 5279 5280 static boolean_t 5281 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6) 5282 { 5283 uint_t i, nifs; 5284 uint_t bufsize; 5285 struct lifconf lifc; 5286 struct lifreq *lifrp; 5287 struct sockaddr_in *sinp; 5288 struct sockaddr_in6 *sin6p; 5289 5290 bzero(addrs4, sizeof (rpcib_ipaddrs_t)); 5291 bzero(addrs6, sizeof (rpcib_ipaddrs_t)); 5292 5293 if (rpcib_do_lifconf(&lifc, &bufsize) != 0) 5294 return (B_FALSE); 5295 5296 if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) { 5297 kmem_free(lifc.lifc_buf, bufsize); 5298 return (B_FALSE); 5299 } 5300 5301 /* 5302 * Worst case is that all of the addresses are IB-capable and have 5303 * the same address family, so size our buffers accordingly. 
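 * Each list is therefore sized for nifs entries of its sockaddr
 * type; ri_count below tracks how many entries are actually filled
 * in.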
5304 */ 5305 addrs4->ri_size = nifs * sizeof (struct sockaddr_in); 5306 addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP); 5307 addrs6->ri_size = nifs * sizeof (struct sockaddr_in6); 5308 addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP); 5309 5310 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { 5311 if (!rpcib_rdma_capable_interface(lifrp)) 5312 continue; 5313 5314 if (lifrp->lifr_addr.ss_family == AF_INET) { 5315 sinp = addrs4->ri_list; 5316 bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++], 5317 sizeof (struct sockaddr_in)); 5318 } else if (lifrp->lifr_addr.ss_family == AF_INET6) { 5319 sin6p = addrs6->ri_list; 5320 bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++], 5321 sizeof (struct sockaddr_in6)); 5322 } 5323 } 5324 5325 kmem_free(lifc.lifc_buf, bufsize); 5326 return (B_TRUE); 5327 } 5328 5329 /* ARGSUSED */ 5330 static int 5331 rpcib_cache_kstat_update(kstat_t *ksp, int rw) 5332 { 5333 rib_hca_t *hca; 5334 5335 if (KSTAT_WRITE == rw) { 5336 return (EACCES); 5337 } 5338 5339 rpcib_kstat.cache_limit.value.ui64 = 5340 (uint64_t)cache_limit; 5341 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 5342 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 5343 rpcib_kstat.cache_allocation.value.ui64 += 5344 (uint64_t)hca->cache_allocation; 5345 rpcib_kstat.cache_hits.value.ui64 += 5346 (uint64_t)hca->cache_hits; 5347 rpcib_kstat.cache_misses.value.ui64 += 5348 (uint64_t)hca->cache_misses; 5349 rpcib_kstat.cache_misses_above_the_limit.value.ui64 += 5350 (uint64_t)hca->cache_misses_above_the_limit; 5351 } 5352 rw_exit(&rib_stat->hcas_list_lock); 5353 return (0); 5354 } 5355
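/*
 * Note: these aggregated counters back the "rpcib_cache" kstat
 * (module "unix", instance 0); presumably they can be examined from
 * userland with, e.g., "kstat -m unix -n rpcib_cache".
 */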