1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 2007, The Ohio State University. All rights reserved. 28 * 29 * Portions of this source code is developed by the team members of 30 * The Ohio State University's Network-Based Computing Laboratory (NBCL), 31 * headed by Professor Dhabaleswar K. (DK) Panda. 32 * 33 * Acknowledgements to contributions from developors: 34 * Ranjit Noronha: noronha@cse.ohio-state.edu 35 * Lei Chai : chail@cse.ohio-state.edu 36 * Weikuan Yu : yuw@cse.ohio-state.edu 37 * 38 */ 39 40 /* 41 * The rpcib plugin. Implements the interface for RDMATF's 42 * interaction with IBTF. 43 */ 44 45 #include <sys/param.h> 46 #include <sys/types.h> 47 #include <sys/user.h> 48 #include <sys/systm.h> 49 #include <sys/sysmacros.h> 50 #include <sys/proc.h> 51 #include <sys/socket.h> 52 #include <sys/file.h> 53 #include <sys/stream.h> 54 #include <sys/strsubr.h> 55 #include <sys/stropts.h> 56 #include <sys/errno.h> 57 #include <sys/kmem.h> 58 #include <sys/debug.h> 59 #include <sys/pathname.h> 60 #include <sys/kstat.h> 61 #include <sys/t_lock.h> 62 #include <sys/ddi.h> 63 #include <sys/cmn_err.h> 64 #include <sys/time.h> 65 #include <sys/isa_defs.h> 66 #include <sys/callb.h> 67 #include <sys/sunddi.h> 68 #include <sys/sunndi.h> 69 #include <sys/sdt.h> 70 #include <sys/ib/ibtl/ibti.h> 71 #include <rpc/rpc.h> 72 #include <rpc/ib.h> 73 #include <sys/modctl.h> 74 #include <sys/kstr.h> 75 #include <sys/sockio.h> 76 #include <sys/vnode.h> 77 #include <sys/tiuser.h> 78 #include <net/if.h> 79 #include <net/if_types.h> 80 #include <sys/cred.h> 81 #include <rpc/rpc_rdma.h> 82 #include <nfs/nfs.h> 83 #include <sys/atomic.h> 84 85 #define NFS_RDMA_PORT 20049 86 87 88 /* 89 * Convenience structures for connection management 90 */ 91 typedef struct rpcib_ipaddrs { 92 void *ri_list; /* pointer to list of addresses */ 93 uint_t ri_count; /* number of addresses in list */ 94 uint_t ri_size; /* size of ri_list in bytes */ 95 } rpcib_ipaddrs_t; 96 97 98 typedef struct rpcib_ping { 99 rib_hca_t *hca; 100 ibt_path_info_t path; 101 ibt_ip_addr_t srcip; 102 ibt_ip_addr_t dstip; 103 } rpcib_ping_t; 104 105 /* 106 * Prototype declarations for driver ops 107 */ 108 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t); 109 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t, 110 void *, void **); 111 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t); 112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *); 113 static int rpcib_do_ip_ioctl(int, int, void *); 114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *); 115 static int 
rpcib_cache_kstat_update(kstat_t *, int); 116 static void rib_force_cleanup(void *); 117 static void rib_stop_hca_services(rib_hca_t *); 118 static void rib_attach_hca(void); 119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, 120 struct netbuf *d_svcaddr, CONN **conn); 121 122 struct { 123 kstat_named_t cache_limit; 124 kstat_named_t cache_allocation; 125 kstat_named_t cache_hits; 126 kstat_named_t cache_misses; 127 kstat_named_t cache_misses_above_the_limit; 128 } rpcib_kstat = { 129 {"cache_limit", KSTAT_DATA_UINT64 }, 130 {"cache_allocation", KSTAT_DATA_UINT64 }, 131 {"cache_hits", KSTAT_DATA_UINT64 }, 132 {"cache_misses", KSTAT_DATA_UINT64 }, 133 {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 }, 134 }; 135 136 /* rpcib cb_ops */ 137 static struct cb_ops rpcib_cbops = { 138 nulldev, /* open */ 139 nulldev, /* close */ 140 nodev, /* strategy */ 141 nodev, /* print */ 142 nodev, /* dump */ 143 nodev, /* read */ 144 nodev, /* write */ 145 nodev, /* ioctl */ 146 nodev, /* devmap */ 147 nodev, /* mmap */ 148 nodev, /* segmap */ 149 nochpoll, /* poll */ 150 ddi_prop_op, /* prop_op */ 151 NULL, /* stream */ 152 D_MP, /* cb_flag */ 153 CB_REV, /* rev */ 154 nodev, /* int (*cb_aread)() */ 155 nodev /* int (*cb_awrite)() */ 156 }; 157 158 /* 159 * Device options 160 */ 161 static struct dev_ops rpcib_ops = { 162 DEVO_REV, /* devo_rev, */ 163 0, /* refcnt */ 164 rpcib_getinfo, /* info */ 165 nulldev, /* identify */ 166 nulldev, /* probe */ 167 rpcib_attach, /* attach */ 168 rpcib_detach, /* detach */ 169 nodev, /* reset */ 170 &rpcib_cbops, /* driver ops - devctl interfaces */ 171 NULL, /* bus operations */ 172 NULL, /* power */ 173 ddi_quiesce_not_needed, /* quiesce */ 174 }; 175 176 /* 177 * Module linkage information. 178 */ 179 180 static struct modldrv rib_modldrv = { 181 &mod_driverops, /* Driver module */ 182 "RPCIB plugin driver", /* Driver name and version */ 183 &rpcib_ops, /* Driver ops */ 184 }; 185 186 static struct modlinkage rib_modlinkage = { 187 MODREV_1, 188 (void *)&rib_modldrv, 189 NULL 190 }; 191 192 typedef struct rib_lrc_entry { 193 struct rib_lrc_entry *forw; 194 struct rib_lrc_entry *back; 195 char *lrc_buf; 196 197 uint32_t lrc_len; 198 void *avl_node; 199 bool_t registered; 200 201 struct mrc lrc_mhandle; 202 bool_t lrc_on_freed_list; 203 } rib_lrc_entry_t; 204 205 typedef struct cache_struct { 206 rib_lrc_entry_t r; 207 uint32_t len; 208 uint32_t elements; 209 kmutex_t node_lock; 210 avl_node_t avl_link; 211 } cache_avl_struct_t; 212 213 uint64_t cache_limit = 100 * 1024 * 1024; 214 static uint64_t cache_watermark = 80 * 1024 * 1024; 215 static bool_t stats_enabled = FALSE; 216 217 static uint64_t max_unsignaled_rws = 5; 218 int nfs_rdma_port = NFS_RDMA_PORT; 219 220 #define RIBNETID_TCP "tcp" 221 #define RIBNETID_TCP6 "tcp6" 222 223 /* 224 * rib_stat: private data pointer used when registering 225 * with the IBTF. It is returned to the consumer 226 * in all callbacks. 227 */ 228 static rpcib_state_t *rib_stat = NULL; 229 230 #define RNR_RETRIES IBT_RNR_RETRY_1 231 #define MAX_PORTS 2 232 #define RDMA_DUMMY_WRID 0x4D3A1D4D3A1D 233 #define RDMA_CONN_REAP_RETRY 10 /* 10 secs */ 234 235 int preposted_rbufs = RDMA_BUFS_GRANT; 236 int send_threshold = 1; 237 238 /* 239 * Old cards with Tavor driver have limited memory footprint 240 * when booted in 32bit. The rib_max_rbufs tunable can be 241 * tuned for more buffers if needed. 
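 * For example, it could be raised through /etc/system (illustrative
 * value only, not a tested recommendation):
 *	set rpcib:rib_max_rbufs = 256
 * /etc/system settings take effect after the next reboot.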
242 */ 243 244 #if !defined(_ELF64) && !defined(__sparc) 245 int rib_max_rbufs = MAX_BUFS; 246 #else 247 int rib_max_rbufs = 10 * MAX_BUFS; 248 #endif /* !(_ELF64) && !(__sparc) */ 249 250 int rib_conn_timeout = 60 * 12; /* 12 minutes */ 251 252 /* 253 * State of the plugin. 254 * ACCEPT = accepting new connections and requests. 255 * NO_ACCEPT = not accepting new connections and requests. 256 * This should eventually move to the rpcib_state_t structure, since this 257 * will tell in which state the plugin is for a particular type of service 258 * like NFS, NLM or v4 Callback daemon. The plugin might be in accept 259 * state for one and in no_accept state for the other. 260 */ 261 int plugin_state; 262 kmutex_t plugin_state_lock; 263 264 ldi_ident_t rpcib_li; 265 266 /* 267 * RPCIB RDMATF operations 268 */ 269 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle); 270 static rdma_stat rib_disconnect(CONN *conn); 271 static void rib_listen(struct rdma_svc_data *rd); 272 static void rib_listen_stop(struct rdma_svc_data *rd); 273 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, 274 uint_t buflen, struct mrc *buf_handle); 275 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf, 276 struct mrc buf_handle); 277 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, 278 caddr_t buf, uint_t buflen, struct mrc *buf_handle); 279 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, 280 struct mrc buf_handle); 281 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, 282 uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, 283 void *lrc); 284 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf, 285 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *); 286 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, 287 caddr_t buf, int len, int cpu); 288 289 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf); 290 291 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf); 292 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *); 293 294 static void rib_rbuf_free(CONN *conn, int ptype, void *buf); 295 296 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid); 297 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid); 298 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid); 299 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid); 300 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl); 301 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid); 302 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait); 303 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait); 304 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *); 305 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *, 306 int addr_type, void *, CONN **); 307 static rdma_stat rib_conn_release(CONN *conn); 308 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int, 309 rpcib_ping_t *, CONN **); 310 static rdma_stat rib_getinfo(rdma_info_t *info); 311 312 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len); 313 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf); 314 static void rib_destroy_cache(rib_hca_t *hca); 315 static void rib_server_side_cache_reclaim(void *argp); 316 static int avl_compare(const void *t1, const void *t2); 317 318 static void rib_stop_services(rib_hca_t
*); 319 static void rib_close_channels(rib_conn_list_t *); 320 static void rib_conn_close(void *); 321 322 /* 323 * RPCIB addressing operations 324 */ 325 326 /* 327 * RDMA operations the RPCIB module exports 328 */ 329 static rdmaops_t rib_ops = { 330 rib_reachable, 331 rib_conn_get, 332 rib_conn_release, 333 rib_listen, 334 rib_listen_stop, 335 rib_registermem, 336 rib_deregistermem, 337 rib_registermemsync, 338 rib_deregistermemsync, 339 rib_syncmem, 340 rib_reg_buf_alloc, 341 rib_reg_buf_free, 342 rib_send, 343 rib_send_resp, 344 rib_post_resp, 345 rib_post_resp_remove, 346 rib_post_recv, 347 rib_recv, 348 rib_read, 349 rib_write, 350 rib_getinfo, 351 }; 352 353 /* 354 * RDMATF RPCIB plugin details 355 */ 356 static rdma_mod_t rib_mod = { 357 "ibtf", /* api name */ 358 RDMATF_VERS_1, 359 0, 360 &rib_ops, /* rdma op vector for ibtf */ 361 }; 362 363 static rdma_stat rpcib_open_hcas(rpcib_state_t *); 364 static rdma_stat rib_qp_init(rib_qp_t *, int); 365 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *); 366 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *); 367 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *); 368 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *); 369 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num); 370 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t, 371 ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *); 372 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t, 373 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t); 374 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *); 375 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *, 376 rib_qp_t **); 377 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t, 378 rib_qp_t **); 379 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *); 380 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *); 381 static int rib_free_sendwait(struct send_wid *); 382 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid); 383 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd); 384 static void rdma_done_rem_list(rib_qp_t *); 385 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid); 386 387 static void rib_async_handler(void *, 388 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *); 389 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *); 390 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *); 391 static int rib_free_svc_recv(struct svc_recv *); 392 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t); 393 static void rib_free_wid(struct recv_wid *); 394 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *); 395 static void rib_detach_hca(rib_hca_t *); 396 static void rib_close_a_channel(CONN *); 397 static void rib_send_hold(rib_qp_t *); 398 static void rib_send_rele(rib_qp_t *); 399 400 /* 401 * Registration with IBTF as a consumer 402 */ 403 static struct ibt_clnt_modinfo_s rib_modinfo = { 404 IBTI_V_CURR, 405 IBT_GENERIC, 406 rib_async_handler, /* async event handler */ 407 NULL, /* Memory Region Handler */ 408 "nfs/ib" 409 }; 410 411 /* 412 * Global structure 413 */ 414 415 typedef struct rpcib_s { 416 dev_info_t *rpcib_dip; 417 kmutex_t rpcib_mutex; 418 } rpcib_t; 419 420 rpcib_t rpcib; 421 422 /* 423 * /etc/system controlled variable to control 424 * debugging in the rpcib kernel module. 425 * Set it to values greater than 1 to increase 426 * the amount of debugging output produced.
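 * For example (illustrative only), adding
 *	set rpcib:rib_debug = 2
 * to /etc/system turns on more verbose debug messages after a reboot.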
427 */ 428 int rib_debug = 0; 429 430 int 431 _init(void) 432 { 433 int error; 434 435 error = mod_install((struct modlinkage *)&rib_modlinkage); 436 if (error != 0) { 437 /* 438 * Could not load module 439 */ 440 return (error); 441 } 442 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL); 443 return (0); 444 } 445 446 int 447 _fini() 448 { 449 int status; 450 451 /* 452 * Remove module 453 */ 454 if ((status = mod_remove(&rib_modlinkage)) != 0) { 455 return (status); 456 } 457 mutex_destroy(&plugin_state_lock); 458 return (0); 459 } 460 461 int 462 _info(struct modinfo *modinfop) 463 { 464 return (mod_info(&rib_modlinkage, modinfop)); 465 } 466 467 /* 468 * rpcib_getinfo() 469 * Given the device number, return the devinfo pointer or the 470 * instance number. 471 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach. 472 */ 473 474 /*ARGSUSED*/ 475 static int 476 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 477 { 478 int ret = DDI_SUCCESS; 479 480 switch (cmd) { 481 case DDI_INFO_DEVT2DEVINFO: 482 if (rpcib.rpcib_dip != NULL) 483 *result = rpcib.rpcib_dip; 484 else { 485 *result = NULL; 486 ret = DDI_FAILURE; 487 } 488 break; 489 490 case DDI_INFO_DEVT2INSTANCE: 491 *result = NULL; 492 break; 493 494 default: 495 ret = DDI_FAILURE; 496 } 497 return (ret); 498 } 499 500 static void 501 rpcib_free_hca_list() 502 { 503 rib_hca_t *hca, *hcap; 504 505 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); 506 hca = rib_stat->hcas_list; 507 rib_stat->hcas_list = NULL; 508 rw_exit(&rib_stat->hcas_list_lock); 509 while (hca != NULL) { 510 rw_enter(&hca->state_lock, RW_WRITER); 511 hcap = hca; 512 hca = hca->next; 513 rib_stat->nhca_inited--; 514 rib_mod.rdma_count--; 515 hcap->state = HCA_DETACHED; 516 rw_exit(&hcap->state_lock); 517 rib_stop_hca_services(hcap); 518 519 kmem_free(hcap, sizeof (*hcap)); 520 } 521 } 522 523 static rdma_stat 524 rpcib_free_service_list() 525 { 526 rib_service_t *service; 527 ibt_status_t ret; 528 529 rw_enter(&rib_stat->service_list_lock, RW_WRITER); 530 while (rib_stat->service_list != NULL) { 531 service = rib_stat->service_list; 532 ret = ibt_unbind_all_services(service->srv_hdl); 533 if (ret != IBT_SUCCESS) { 534 rw_exit(&rib_stat->service_list_lock); 535 #ifdef DEBUG 536 cmn_err(CE_NOTE, "rpcib_free_service_list: " 537 "ibt_unbind_all_services failed (%d)\n", (int)ret); 538 #endif 539 return (RDMA_FAILED); 540 } 541 ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl, 542 service->srv_hdl); 543 if (ret != IBT_SUCCESS) { 544 rw_exit(&rib_stat->service_list_lock); 545 #ifdef DEBUG 546 cmn_err(CE_NOTE, "rpcib_free_service_list: " 547 "ibt_deregister_service failed (%d)\n", (int)ret); 548 #endif 549 return (RDMA_FAILED); 550 } 551 rib_stat->service_list = service->next; 552 kmem_free(service, sizeof (rib_service_t)); 553 } 554 rw_exit(&rib_stat->service_list_lock); 555 556 return (RDMA_SUCCESS); 557 } 558 559 static int 560 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 561 { 562 ibt_status_t ibt_status; 563 rdma_stat r_status; 564 565 switch (cmd) { 566 case DDI_ATTACH: 567 break; 568 case DDI_RESUME: 569 return (DDI_SUCCESS); 570 default: 571 return (DDI_FAILURE); 572 } 573 574 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL); 575 576 mutex_enter(&rpcib.rpcib_mutex); 577 if (rpcib.rpcib_dip != NULL) { 578 mutex_exit(&rpcib.rpcib_mutex); 579 return (DDI_FAILURE); 580 } 581 rpcib.rpcib_dip = dip; 582 mutex_exit(&rpcib.rpcib_mutex); 583 /* 584 * Create the "rpcib" minor-node. 
585 */ 586 if (ddi_create_minor_node(dip, 587 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { 588 /* Error message, no cmn_err as they print on console */ 589 return (DDI_FAILURE); 590 } 591 592 if (rib_stat == NULL) { 593 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); 594 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); 595 rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL); 596 mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL); 597 } 598 599 rib_stat->hca_count = ibt_get_hca_list(NULL); 600 if (rib_stat->hca_count < 1) { 601 mutex_destroy(&rib_stat->listen_lock); 602 rw_destroy(&rib_stat->hcas_list_lock); 603 mutex_destroy(&rib_stat->open_hca_lock); 604 kmem_free(rib_stat, sizeof (*rib_stat)); 605 rib_stat = NULL; 606 return (DDI_FAILURE); 607 } 608 609 ibt_status = ibt_attach(&rib_modinfo, dip, 610 (void *)rib_stat, &rib_stat->ibt_clnt_hdl); 611 612 if (ibt_status != IBT_SUCCESS) { 613 mutex_destroy(&rib_stat->listen_lock); 614 rw_destroy(&rib_stat->hcas_list_lock); 615 mutex_destroy(&rib_stat->open_hca_lock); 616 kmem_free(rib_stat, sizeof (*rib_stat)); 617 rib_stat = NULL; 618 return (DDI_FAILURE); 619 } 620 621 rib_stat->service_list = NULL; 622 rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL); 623 mutex_enter(&rib_stat->open_hca_lock); 624 if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) { 625 mutex_exit(&rib_stat->open_hca_lock); 626 goto open_fail; 627 } 628 mutex_exit(&rib_stat->open_hca_lock); 629 630 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 631 DDI_PROP_SUCCESS) { 632 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update " 633 "failed."); 634 goto register_fail; 635 } 636 637 /* 638 * Register with rdmatf 639 */ 640 r_status = rdma_register_mod(&rib_mod); 641 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { 642 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, " 643 "status = %d", r_status); 644 goto register_fail; 645 } 646 647 return (DDI_SUCCESS); 648 649 register_fail: 650 651 open_fail: 652 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 653 rpcib_free_hca_list(); 654 (void) rpcib_free_service_list(); 655 mutex_destroy(&rib_stat->listen_lock); 656 rw_destroy(&rib_stat->hcas_list_lock); 657 mutex_destroy(&rib_stat->open_hca_lock); 658 rw_destroy(&rib_stat->service_list_lock); 659 kmem_free(rib_stat, sizeof (*rib_stat)); 660 rib_stat = NULL; 661 return (DDI_FAILURE); 662 } 663 664 /*ARGSUSED*/ 665 static int 666 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 667 { 668 switch (cmd) { 669 670 case DDI_DETACH: 671 break; 672 673 case DDI_SUSPEND: 674 default: 675 return (DDI_FAILURE); 676 } 677 678 /* 679 * Detach the hca and free resources 680 */ 681 mutex_enter(&plugin_state_lock); 682 plugin_state = NO_ACCEPT; 683 mutex_exit(&plugin_state_lock); 684 685 if (rpcib_free_service_list() != RDMA_SUCCESS) 686 return (DDI_FAILURE); 687 rpcib_free_hca_list(); 688 689 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 690 mutex_destroy(&rib_stat->listen_lock); 691 rw_destroy(&rib_stat->hcas_list_lock); 692 mutex_destroy(&rib_stat->open_hca_lock); 693 rw_destroy(&rib_stat->service_list_lock); 694 695 kmem_free(rib_stat, sizeof (*rib_stat)); 696 rib_stat = NULL; 697 698 mutex_enter(&rpcib.rpcib_mutex); 699 rpcib.rpcib_dip = NULL; 700 mutex_exit(&rpcib.rpcib_mutex); 701 mutex_destroy(&rpcib.rpcib_mutex); 702 return (DDI_SUCCESS); 703 } 704 705 706 static void rib_rbufpool_free(rib_hca_t *, int); 707 static void rib_rbufpool_deregister(rib_hca_t *, int); 708 static void 
rib_rbufpool_destroy(rib_hca_t *hca, int ptype); 709 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); 710 static rdma_stat rib_rem_replylist(rib_qp_t *); 711 static int rib_remreply(rib_qp_t *, struct reply *); 712 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); 713 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); 714 715 716 /* 717 * One CQ pair per HCA 718 */ 719 static rdma_stat 720 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, 721 rib_cq_t **cqp) 722 { 723 rib_cq_t *cq; 724 ibt_cq_attr_t cq_attr; 725 uint32_t real_size; 726 ibt_status_t status; 727 rdma_stat error = RDMA_SUCCESS; 728 729 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); 730 cq->rib_hca = hca; 731 cq_attr.cq_size = cq_size; 732 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 733 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, 734 &real_size); 735 if (status != IBT_SUCCESS) { 736 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed," 737 " status=%d", status); 738 error = RDMA_FAILED; 739 goto fail; 740 } 741 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca); 742 743 /* 744 * Enable CQ callbacks. CQ Callbacks are single shot 745 * (e.g. you have to call ibt_enable_cq_notify() 746 * after each callback to get another one). 747 */ 748 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION); 749 if (status != IBT_SUCCESS) { 750 cmn_err(CE_WARN, "rib_create_cq: " 751 "enable_cq_notify failed, status %d", status); 752 error = RDMA_FAILED; 753 goto fail; 754 } 755 *cqp = cq; 756 757 return (error); 758 fail: 759 if (cq->rib_cq_hdl) 760 (void) ibt_free_cq(cq->rib_cq_hdl); 761 if (cq) 762 kmem_free(cq, sizeof (rib_cq_t)); 763 return (error); 764 } 765 766 /* 767 * rpcib_find_hca 768 * 769 * Caller should have already locked the hcas_lock before calling 770 * this function. 771 */ 772 static rib_hca_t * 773 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid) 774 { 775 rib_hca_t *hca = ribstat->hcas_list; 776 777 while (hca && hca->hca_guid != guid) 778 hca = hca->next; 779 780 return (hca); 781 } 782 783 static rdma_stat 784 rpcib_open_hcas(rpcib_state_t *ribstat) 785 { 786 rib_hca_t *hca; 787 ibt_status_t ibt_status; 788 rdma_stat status; 789 ibt_hca_portinfo_t *pinfop; 790 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; 791 uint_t size, cq_size; 792 int i; 793 kstat_t *ksp; 794 cache_avl_struct_t example_avl_node; 795 char rssc_name[32]; 796 int old_nhca_inited = ribstat->nhca_inited; 797 ib_guid_t *hca_guids; 798 799 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); 800 801 ribstat->hca_count = ibt_get_hca_list(&hca_guids); 802 if (ribstat->hca_count == 0) 803 return (RDMA_FAILED); 804 805 rw_enter(&ribstat->hcas_list_lock, RW_WRITER); 806 /* 807 * Open a hca and setup for RDMA 808 */ 809 for (i = 0; i < ribstat->hca_count; i++) { 810 if (rpcib_find_hca(ribstat, hca_guids[i])) 811 continue; 812 hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP); 813 814 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, 815 hca_guids[i], &hca->hca_hdl); 816 if (ibt_status != IBT_SUCCESS) { 817 kmem_free(hca, sizeof (rib_hca_t)); 818 continue; 819 } 820 hca->hca_guid = hca_guids[i]; 821 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; 822 hca->state = HCA_INITED; 823 824 /* 825 * query HCA info 826 */ 827 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs); 828 if (ibt_status != IBT_SUCCESS) { 829 goto fail1; 830 } 831 832 /* 833 * One PD (Protection Domain) per HCA. 
834 * A qp is allowed to access a memory region 835 * only when it's in the same PD as that of 836 * the memory region. 837 */ 838 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl); 839 if (ibt_status != IBT_SUCCESS) { 840 goto fail1; 841 } 842 843 /* 844 * query HCA ports 845 */ 846 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 847 0, &pinfop, &hca->hca_nports, &size); 848 if (ibt_status != IBT_SUCCESS) { 849 goto fail2; 850 } 851 hca->hca_ports = pinfop; 852 hca->hca_pinfosz = size; 853 pinfop = NULL; 854 855 cq_size = DEF_CQ_SIZE; /* default cq size */ 856 /* 857 * Create 2 pairs of cq's (1 pair for client 858 * and the other pair for server) on this hca. 859 * If number of qp's gets too large, then several 860 * cq's will be needed. 861 */ 862 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, 863 &hca->svc_rcq); 864 if (status != RDMA_SUCCESS) { 865 goto fail3; 866 } 867 868 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, 869 &hca->svc_scq); 870 if (status != RDMA_SUCCESS) { 871 goto fail3; 872 } 873 874 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, 875 &hca->clnt_rcq); 876 if (status != RDMA_SUCCESS) { 877 goto fail3; 878 } 879 880 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, 881 &hca->clnt_scq); 882 if (status != RDMA_SUCCESS) { 883 goto fail3; 884 } 885 886 /* 887 * Create buffer pools. 888 * Note rib_rbuf_create also allocates memory windows. 889 */ 890 hca->recv_pool = rib_rbufpool_create(hca, 891 RECV_BUFFER, rib_max_rbufs); 892 if (hca->recv_pool == NULL) { 893 goto fail3; 894 } 895 896 hca->send_pool = rib_rbufpool_create(hca, 897 SEND_BUFFER, rib_max_rbufs); 898 if (hca->send_pool == NULL) { 899 rib_rbufpool_destroy(hca, RECV_BUFFER); 900 goto fail3; 901 } 902 903 if (hca->server_side_cache == NULL) { 904 (void) sprintf(rssc_name, 905 "rib_srvr_cache_%llx", 906 (long long unsigned int) hca->hca_guid); 907 hca->server_side_cache = kmem_cache_create( 908 rssc_name, 909 sizeof (cache_avl_struct_t), 0, 910 NULL, 911 NULL, 912 rib_server_side_cache_reclaim, 913 hca, NULL, 0); 914 } 915 916 avl_create(&hca->avl_tree, 917 avl_compare, 918 sizeof (cache_avl_struct_t), 919 (uint_t)(uintptr_t)&example_avl_node.avl_link- 920 (uint_t)(uintptr_t)&example_avl_node); 921 922 rw_init(&hca->bound_services_lock, NULL, RW_DRIVER, 923 hca->iblock); 924 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); 925 rw_init(&hca->avl_rw_lock, 926 NULL, RW_DRIVER, hca->iblock); 927 mutex_init(&hca->cache_allocation_lock, 928 NULL, MUTEX_DRIVER, NULL); 929 hca->avl_init = TRUE; 930 931 /* Create kstats for the cache */ 932 ASSERT(INGLOBALZONE(curproc)); 933 934 if (!stats_enabled) { 935 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc", 936 KSTAT_TYPE_NAMED, 937 sizeof (rpcib_kstat) / sizeof (kstat_named_t), 938 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, 939 GLOBAL_ZONEID); 940 if (ksp) { 941 ksp->ks_data = (void *) &rpcib_kstat; 942 ksp->ks_update = rpcib_cache_kstat_update; 943 kstat_install(ksp); 944 stats_enabled = TRUE; 945 } 946 } 947 if (hca->cleanup_helper == NULL) { 948 char tq_name[sizeof (hca->hca_guid) * 2 + 1]; 949 950 (void) snprintf(tq_name, sizeof (tq_name), "%llX", 951 (unsigned long long int) hca->hca_guid); 952 hca->cleanup_helper = ddi_taskq_create(NULL, 953 tq_name, 1, TASKQ_DEFAULTPRI, 0); 954 } 955 956 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 957 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); 958 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, 959 hca->iblock); 960 
rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, 961 hca->iblock); 962 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); 963 hca->inuse = TRUE; 964 965 hca->next = ribstat->hcas_list; 966 ribstat->hcas_list = hca; 967 ribstat->nhca_inited++; 968 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 969 continue; 970 971 fail3: 972 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 973 fail2: 974 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 975 fail1: 976 (void) ibt_close_hca(hca->hca_hdl); 977 kmem_free(hca, sizeof (rib_hca_t)); 978 } 979 rw_exit(&ribstat->hcas_list_lock); 980 ibt_free_hca_list(hca_guids, ribstat->hca_count); 981 rib_mod.rdma_count = rib_stat->nhca_inited; 982 983 /* 984 * return success if at least one new hca has been configured. 985 */ 986 if (ribstat->nhca_inited != old_nhca_inited) 987 return (RDMA_SUCCESS); 988 else 989 return (RDMA_FAILED); 990 } 991 992 /* 993 * Callback routines 994 */ 995 996 /* 997 * SCQ handlers 998 */ 999 /* ARGSUSED */ 1000 static void 1001 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1002 { 1003 ibt_status_t ibt_status; 1004 ibt_wc_t wc; 1005 struct send_wid *wd; 1006 CONN *conn; 1007 rib_qp_t *qp; 1008 int i; 1009 1010 /* 1011 * Re-enable cq notify here to avoid missing any 1012 * completion queue notification. 1013 */ 1014 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1015 1016 ibt_status = IBT_SUCCESS; 1017 while (ibt_status != IBT_CQ_EMPTY) { 1018 bzero(&wc, sizeof (wc)); 1019 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1020 if (ibt_status != IBT_SUCCESS) 1021 return; 1022 1023 /* 1024 * Got a send completion 1025 */ 1026 if (wc.wc_id != RDMA_DUMMY_WRID) { 1027 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1028 qp = wd->qp; 1029 conn = qptoc(qp); 1030 1031 mutex_enter(&wd->sendwait_lock); 1032 switch (wc.wc_status) { 1033 case IBT_WC_SUCCESS: 1034 wd->status = RDMA_SUCCESS; 1035 break; 1036 default: 1037 /* 1038 * RC Send Q Error Code Local state Remote State 1039 * ==================== =========== ============ 1040 * IBT_WC_BAD_RESPONSE_ERR ERROR None 1041 * IBT_WC_LOCAL_LEN_ERR ERROR None 1042 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None 1043 * IBT_WC_LOCAL_PROTECT_ERR ERROR None 1044 * IBT_WC_MEM_WIN_BIND_ERR ERROR None 1045 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR 1046 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR 1047 * IBT_WC_REMOTE_OP_ERR ERROR ERROR 1048 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None 1049 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None 1050 * IBT_WC_WR_FLUSHED_ERR ERROR None 1051 */ 1052 /* 1053 * Channel in error state. Set connection to 1054 * ERROR and cleanup will happen either from 1055 * conn_release or from rib_conn_get 1056 */ 1057 wd->status = RDMA_FAILED; 1058 mutex_enter(&conn->c_lock); 1059 if (conn->c_state != C_DISCONN_PEND) 1060 conn->c_state = C_ERROR_CONN; 1061 mutex_exit(&conn->c_lock); 1062 break; 1063 } 1064 1065 if (wd->cv_sig == 1) { 1066 /* 1067 * Notify poster 1068 */ 1069 cv_signal(&wd->wait_cv); 1070 mutex_exit(&wd->sendwait_lock); 1071 } else { 1072 /* 1073 * Poster not waiting for notification. 
1074 * Free the send buffers and send_wid 1075 */ 1076 for (i = 0; i < wd->nsbufs; i++) { 1077 rib_rbuf_free(qptoc(wd->qp), 1078 SEND_BUFFER, 1079 (void *)(uintptr_t)wd->sbufaddr[i]); 1080 } 1081 1082 /* decrement the send ref count */ 1083 rib_send_rele(qp); 1084 1085 mutex_exit(&wd->sendwait_lock); 1086 (void) rib_free_sendwait(wd); 1087 } 1088 } 1089 } 1090 } 1091 1092 /* ARGSUSED */ 1093 static void 1094 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1095 { 1096 ibt_status_t ibt_status; 1097 ibt_wc_t wc; 1098 struct send_wid *wd; 1099 rib_qp_t *qp; 1100 CONN *conn; 1101 int i; 1102 1103 /* 1104 * Re-enable cq notify here to avoid missing any 1105 * completion queue notification. 1106 */ 1107 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1108 1109 ibt_status = IBT_SUCCESS; 1110 while (ibt_status != IBT_CQ_EMPTY) { 1111 bzero(&wc, sizeof (wc)); 1112 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1113 if (ibt_status != IBT_SUCCESS) 1114 return; 1115 1116 /* 1117 * Got a send completion 1118 */ 1119 if (wc.wc_id != RDMA_DUMMY_WRID) { 1120 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1121 qp = wd->qp; 1122 conn = qptoc(qp); 1123 mutex_enter(&wd->sendwait_lock); 1124 1125 switch (wc.wc_status) { 1126 case IBT_WC_SUCCESS: 1127 wd->status = RDMA_SUCCESS; 1128 break; 1129 default: 1130 /* 1131 * Channel in error state. Set connection to 1132 * ERROR and cleanup will happen either from 1133 * conn_release or conn timeout. 1134 */ 1135 wd->status = RDMA_FAILED; 1136 mutex_enter(&conn->c_lock); 1137 if (conn->c_state != C_DISCONN_PEND) 1138 conn->c_state = C_ERROR_CONN; 1139 mutex_exit(&conn->c_lock); 1140 break; 1141 } 1142 1143 if (wd->cv_sig == 1) { 1144 /* 1145 * Update completion status and notify poster 1146 */ 1147 cv_signal(&wd->wait_cv); 1148 mutex_exit(&wd->sendwait_lock); 1149 } else { 1150 /* 1151 * Poster not waiting for notification. 1152 * Free the send buffers and send_wid 1153 */ 1154 for (i = 0; i < wd->nsbufs; i++) { 1155 rib_rbuf_free(qptoc(wd->qp), 1156 SEND_BUFFER, 1157 (void *)(uintptr_t)wd->sbufaddr[i]); 1158 } 1159 1160 /* decrement the send ref count */ 1161 rib_send_rele(qp); 1162 1163 mutex_exit(&wd->sendwait_lock); 1164 (void) rib_free_sendwait(wd); 1165 } 1166 } 1167 } 1168 } 1169 1170 /* 1171 * RCQ handler 1172 */ 1173 /* ARGSUSED */ 1174 static void 1175 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1176 { 1177 rib_qp_t *qp; 1178 ibt_status_t ibt_status; 1179 ibt_wc_t wc; 1180 struct recv_wid *rwid; 1181 1182 /* 1183 * Re-enable cq notify here to avoid missing any 1184 * completion queue notification. 1185 */ 1186 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1187 1188 ibt_status = IBT_SUCCESS; 1189 while (ibt_status != IBT_CQ_EMPTY) { 1190 bzero(&wc, sizeof (wc)); 1191 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1192 if (ibt_status != IBT_SUCCESS) 1193 return; 1194 1195 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id; 1196 qp = rwid->qp; 1197 if (wc.wc_status == IBT_WC_SUCCESS) { 1198 XDR inxdrs, *xdrs; 1199 uint_t xid, vers, op, find_xid = 0; 1200 struct reply *r; 1201 CONN *conn = qptoc(qp); 1202 uint32_t rdma_credit = 0; 1203 1204 xdrs = &inxdrs; 1205 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr, 1206 wc.wc_bytes_xfer, XDR_DECODE); 1207 /* 1208 * Treat xid as opaque (xid is the first entity 1209 * in the rpc rdma message). 1210 */ 1211 xid = *(uint32_t *)(uintptr_t)rwid->addr; 1212 1213 /* Skip xid and set the xdr position accordingly. 
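 * (The RPC/RDMA header starts with four 32-bit words: xid, version,
 * credit and op; the position is advanced past the raw xid before the
 * three xdr_u_int() decodes below.)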
*/ 1214 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1215 (void) xdr_u_int(xdrs, &vers); 1216 (void) xdr_u_int(xdrs, &rdma_credit); 1217 (void) xdr_u_int(xdrs, &op); 1218 XDR_DESTROY(xdrs); 1219 1220 if (vers != RPCRDMA_VERS) { 1221 /* 1222 * Invalid RPC/RDMA version. Cannot 1223 * interoperate. Set connection to 1224 * ERROR state and bail out. 1225 */ 1226 mutex_enter(&conn->c_lock); 1227 if (conn->c_state != C_DISCONN_PEND) 1228 conn->c_state = C_ERROR_CONN; 1229 mutex_exit(&conn->c_lock); 1230 rib_rbuf_free(conn, RECV_BUFFER, 1231 (void *)(uintptr_t)rwid->addr); 1232 rib_free_wid(rwid); 1233 continue; 1234 } 1235 1236 mutex_enter(&qp->replylist_lock); 1237 for (r = qp->replylist; r != NULL; r = r->next) { 1238 if (r->xid == xid) { 1239 find_xid = 1; 1240 switch (op) { 1241 case RDMA_MSG: 1242 case RDMA_NOMSG: 1243 case RDMA_MSGP: 1244 r->status = RDMA_SUCCESS; 1245 r->vaddr_cq = rwid->addr; 1246 r->bytes_xfer = 1247 wc.wc_bytes_xfer; 1248 cv_signal(&r->wait_cv); 1249 break; 1250 default: 1251 rib_rbuf_free(qptoc(qp), 1252 RECV_BUFFER, 1253 (void *)(uintptr_t) 1254 rwid->addr); 1255 break; 1256 } 1257 break; 1258 } 1259 } 1260 mutex_exit(&qp->replylist_lock); 1261 if (find_xid == 0) { 1262 /* RPC caller not waiting for reply */ 1263 1264 DTRACE_PROBE1(rpcib__i__nomatchxid1, 1265 int, xid); 1266 1267 rib_rbuf_free(qptoc(qp), RECV_BUFFER, 1268 (void *)(uintptr_t)rwid->addr); 1269 } 1270 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) { 1271 CONN *conn = qptoc(qp); 1272 1273 /* 1274 * Connection being flushed. Just free 1275 * the posted buffer 1276 */ 1277 rib_rbuf_free(conn, RECV_BUFFER, 1278 (void *)(uintptr_t)rwid->addr); 1279 } else { 1280 CONN *conn = qptoc(qp); 1281 /* 1282 * RC Recv Q Error Code Local state Remote State 1283 * ==================== =========== ============ 1284 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd 1285 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd 1286 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd 1287 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd 1288 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd 1289 * IBT_WC_WR_FLUSHED_ERR None None 1290 */ 1291 /* 1292 * Channel in error state. Set connection 1293 * in ERROR state. 1294 */ 1295 mutex_enter(&conn->c_lock); 1296 if (conn->c_state != C_DISCONN_PEND) 1297 conn->c_state = C_ERROR_CONN; 1298 mutex_exit(&conn->c_lock); 1299 rib_rbuf_free(conn, RECV_BUFFER, 1300 (void *)(uintptr_t)rwid->addr); 1301 } 1302 rib_free_wid(rwid); 1303 } 1304 } 1305 1306 /* Server side */ 1307 /* ARGSUSED */ 1308 static void 1309 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1310 { 1311 rdma_recv_data_t *rdp; 1312 rib_qp_t *qp; 1313 ibt_status_t ibt_status; 1314 ibt_wc_t wc; 1315 struct svc_recv *s_recvp; 1316 CONN *conn; 1317 mblk_t *mp; 1318 1319 /* 1320 * Re-enable cq notify here to avoid missing any 1321 * completion queue notification. 
1322 */ 1323 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1324 1325 ibt_status = IBT_SUCCESS; 1326 while (ibt_status != IBT_CQ_EMPTY) { 1327 bzero(&wc, sizeof (wc)); 1328 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1329 if (ibt_status != IBT_SUCCESS) 1330 return; 1331 1332 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id; 1333 qp = s_recvp->qp; 1334 conn = qptoc(qp); 1335 mutex_enter(&qp->posted_rbufs_lock); 1336 qp->n_posted_rbufs--; 1337 if (qp->n_posted_rbufs == 0) 1338 cv_signal(&qp->posted_rbufs_cv); 1339 mutex_exit(&qp->posted_rbufs_lock); 1340 1341 if (wc.wc_status == IBT_WC_SUCCESS) { 1342 XDR inxdrs, *xdrs; 1343 uint_t xid, vers, op; 1344 uint32_t rdma_credit; 1345 1346 xdrs = &inxdrs; 1347 /* s_recvp->vaddr stores data */ 1348 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr, 1349 wc.wc_bytes_xfer, XDR_DECODE); 1350 1351 /* 1352 * Treat xid as opaque (xid is the first entity 1353 * in the rpc rdma message). 1354 */ 1355 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr; 1356 /* Skip xid and set the xdr position accordingly. */ 1357 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1358 if (!xdr_u_int(xdrs, &vers) || 1359 !xdr_u_int(xdrs, &rdma_credit) || 1360 !xdr_u_int(xdrs, &op)) { 1361 rib_rbuf_free(conn, RECV_BUFFER, 1362 (void *)(uintptr_t)s_recvp->vaddr); 1363 XDR_DESTROY(xdrs); 1364 (void) rib_free_svc_recv(s_recvp); 1365 continue; 1366 } 1367 XDR_DESTROY(xdrs); 1368 1369 if (vers != RPCRDMA_VERS) { 1370 /* 1371 * Invalid RPC/RDMA version. 1372 * Drop rpc rdma message. 1373 */ 1374 rib_rbuf_free(conn, RECV_BUFFER, 1375 (void *)(uintptr_t)s_recvp->vaddr); 1376 (void) rib_free_svc_recv(s_recvp); 1377 continue; 1378 } 1379 /* 1380 * Is this for RDMA_DONE? 1381 */ 1382 if (op == RDMA_DONE) { 1383 rib_rbuf_free(conn, RECV_BUFFER, 1384 (void *)(uintptr_t)s_recvp->vaddr); 1385 /* 1386 * Wake up the thread waiting on 1387 * a RDMA_DONE for xid 1388 */ 1389 mutex_enter(&qp->rdlist_lock); 1390 rdma_done_notify(qp, xid); 1391 mutex_exit(&qp->rdlist_lock); 1392 (void) rib_free_svc_recv(s_recvp); 1393 continue; 1394 } 1395 1396 mutex_enter(&plugin_state_lock); 1397 if (plugin_state == ACCEPT) { 1398 while ((mp = allocb(sizeof (*rdp), BPRI_LO)) 1399 == NULL) 1400 (void) strwaitbuf( 1401 sizeof (*rdp), BPRI_LO); 1402 /* 1403 * Plugin is in accept state, hence the master 1404 * transport queue for this is still accepting 1405 * requests, so we can call svc_queuereq to 1406 * queue this received msg. 1407 */ 1408 rdp = (rdma_recv_data_t *)mp->b_rptr; 1409 rdp->conn = conn; 1410 rdp->rpcmsg.addr = 1411 (caddr_t)(uintptr_t)s_recvp->vaddr; 1412 rdp->rpcmsg.type = RECV_BUFFER; 1413 rdp->rpcmsg.len = wc.wc_bytes_xfer; 1414 rdp->status = wc.wc_status; 1415 mutex_enter(&conn->c_lock); 1416 conn->c_ref++; 1417 mutex_exit(&conn->c_lock); 1418 mp->b_wptr += sizeof (*rdp); 1419 svc_queuereq((queue_t *)rib_stat->q, mp); 1420 mutex_exit(&plugin_state_lock); 1421 } else { 1422 /* 1423 * The master transport for this is going 1424 * away and the queue is not accepting any more 1425 * requests for krpc, so don't do anything, just 1426 * free the msg.
1427 */ 1428 mutex_exit(&plugin_state_lock); 1429 rib_rbuf_free(conn, RECV_BUFFER, 1430 (void *)(uintptr_t)s_recvp->vaddr); 1431 } 1432 } else { 1433 rib_rbuf_free(conn, RECV_BUFFER, 1434 (void *)(uintptr_t)s_recvp->vaddr); 1435 } 1436 (void) rib_free_svc_recv(s_recvp); 1437 } 1438 } 1439 1440 static void 1441 rib_attach_hca() 1442 { 1443 mutex_enter(&rib_stat->open_hca_lock); 1444 (void) rpcib_open_hcas(rib_stat); 1445 rib_listen(NULL); 1446 mutex_exit(&rib_stat->open_hca_lock); 1447 } 1448 1449 /* 1450 * Handles DR event of IBT_HCA_DETACH_EVENT. 1451 */ 1452 /* ARGSUSED */ 1453 static void 1454 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 1455 ibt_async_code_t code, ibt_async_event_t *event) 1456 { 1457 switch (code) { 1458 case IBT_HCA_ATTACH_EVENT: 1459 rib_attach_hca(); 1460 break; 1461 case IBT_HCA_DETACH_EVENT: 1462 { 1463 rib_hca_t *hca; 1464 1465 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 1466 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 1467 rw_enter(&hca->state_lock, RW_READER); 1468 if ((hca->state != HCA_DETACHED) && 1469 (hca->hca_hdl == hca_hdl)) { 1470 rw_exit(&hca->state_lock); 1471 break; 1472 } 1473 rw_exit(&hca->state_lock); 1474 } 1475 rw_exit(&rib_stat->hcas_list_lock); 1476 1477 if (hca == NULL) 1478 return; 1479 ASSERT(hca->hca_hdl == hca_hdl); 1480 rib_detach_hca(hca); 1481 #ifdef DEBUG 1482 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); 1483 #endif 1484 break; 1485 } 1486 case IBT_EVENT_PORT_UP: 1487 /* 1488 * A port is up. We should call rib_listen() since there is 1489 * a chance that rib_listen() may have failed during 1490 * rib_attach_hca() because the port had not been up yet. 1491 */ 1492 rib_listen(NULL); 1493 #ifdef DEBUG 1494 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); 1495 #endif 1496 break; 1497 #ifdef DEBUG 1498 case IBT_EVENT_PATH_MIGRATED: 1499 cmn_err(CE_NOTE, "rib_async_handler(): " 1500 "IBT_EVENT_PATH_MIGRATED\n"); 1501 break; 1502 case IBT_EVENT_SQD: 1503 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n"); 1504 break; 1505 case IBT_EVENT_COM_EST: 1506 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n"); 1507 break; 1508 case IBT_ERROR_CATASTROPHIC_CHAN: 1509 cmn_err(CE_NOTE, "rib_async_handler(): " 1510 "IBT_ERROR_CATASTROPHIC_CHAN\n"); 1511 break; 1512 case IBT_ERROR_INVALID_REQUEST_CHAN: 1513 cmn_err(CE_NOTE, "rib_async_handler(): " 1514 "IBT_ERROR_INVALID_REQUEST_CHAN\n"); 1515 break; 1516 case IBT_ERROR_ACCESS_VIOLATION_CHAN: 1517 cmn_err(CE_NOTE, "rib_async_handler(): " 1518 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n"); 1519 break; 1520 case IBT_ERROR_PATH_MIGRATE_REQ: 1521 cmn_err(CE_NOTE, "rib_async_handler(): " 1522 "IBT_ERROR_PATH_MIGRATE_REQ\n"); 1523 break; 1524 case IBT_ERROR_CQ: 1525 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n"); 1526 break; 1527 case IBT_ERROR_PORT_DOWN: 1528 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); 1529 break; 1530 case IBT_ASYNC_OPAQUE1: 1531 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); 1532 break; 1533 case IBT_ASYNC_OPAQUE2: 1534 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n"); 1535 break; 1536 case IBT_ASYNC_OPAQUE3: 1537 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n"); 1538 break; 1539 case IBT_ASYNC_OPAQUE4: 1540 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n"); 1541 break; 1542 #endif 1543 default: 1544 break; 1545 } 1546 } 1547 1548 /* 1549 * Client's reachable function. 
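 * Reachability is determined by actually setting up a connection to
 * the server and then releasing it; on success the HCA used for the
 * connection is handed back to the caller as the opaque handle.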
1550 */ 1551 static rdma_stat 1552 rib_reachable(int addr_type, struct netbuf *raddr, void **handle) 1553 { 1554 rdma_stat status; 1555 rpcib_ping_t rpt; 1556 struct netbuf saddr; 1557 CONN *conn; 1558 1559 bzero(&saddr, sizeof (struct netbuf)); 1560 status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn); 1561 1562 if (status == RDMA_SUCCESS) { 1563 *handle = (void *)rpt.hca; 1564 /* release the reference */ 1565 (void) rib_conn_release(conn); 1566 return (RDMA_SUCCESS); 1567 } else { 1568 *handle = NULL; 1569 DTRACE_PROBE(rpcib__i__pingfailed); 1570 return (RDMA_FAILED); 1571 } 1572 } 1573 1574 /* Client side qp creation */ 1575 static rdma_stat 1576 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) 1577 { 1578 rib_qp_t *kqp = NULL; 1579 CONN *conn; 1580 rdma_clnt_cred_ctrl_t *cc_info; 1581 1582 ASSERT(qp != NULL); 1583 *qp = NULL; 1584 1585 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1586 conn = qptoc(kqp); 1587 kqp->hca = hca; 1588 kqp->rdmaconn.c_rdmamod = &rib_mod; 1589 kqp->rdmaconn.c_private = (caddr_t)kqp; 1590 1591 kqp->mode = RIB_CLIENT; 1592 kqp->chan_flags = IBT_BLOCKING; 1593 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); 1594 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); 1595 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; 1596 /* 1597 * Initialize 1598 */ 1599 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1600 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1601 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1602 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1603 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1604 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); 1605 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1606 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1607 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1608 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1609 /* 1610 * Initialize the client credit control 1611 * portion of the rdmaconn struct. 
1612 */ 1613 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT; 1614 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1615 cc_info->clnt_cc_granted_ops = 0; 1616 cc_info->clnt_cc_in_flight_ops = 0; 1617 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL); 1618 1619 *qp = kqp; 1620 return (RDMA_SUCCESS); 1621 } 1622 1623 /* Server side qp creation */ 1624 static rdma_stat 1625 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) 1626 { 1627 rib_qp_t *kqp = NULL; 1628 ibt_chan_sizes_t chan_sizes; 1629 ibt_rc_chan_alloc_args_t qp_attr; 1630 ibt_status_t ibt_status; 1631 rdma_srv_cred_ctrl_t *cc_info; 1632 1633 *qp = NULL; 1634 1635 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1636 kqp->hca = hca; 1637 kqp->port_num = port; 1638 kqp->rdmaconn.c_rdmamod = &rib_mod; 1639 kqp->rdmaconn.c_private = (caddr_t)kqp; 1640 1641 /* 1642 * Create the qp handle 1643 */ 1644 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1645 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; 1646 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; 1647 qp_attr.rc_pd = hca->pd_hdl; 1648 qp_attr.rc_hca_port_num = port; 1649 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1650 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1651 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1652 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1653 qp_attr.rc_clone_chan = NULL; 1654 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1655 qp_attr.rc_flags = IBT_WR_SIGNALED; 1656 1657 rw_enter(&hca->state_lock, RW_READER); 1658 if (hca->state != HCA_DETACHED) { 1659 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1660 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl, 1661 &chan_sizes); 1662 } else { 1663 rw_exit(&hca->state_lock); 1664 goto fail; 1665 } 1666 rw_exit(&hca->state_lock); 1667 1668 if (ibt_status != IBT_SUCCESS) { 1669 DTRACE_PROBE1(rpcib__i_svccreatechanfail, 1670 int, ibt_status); 1671 goto fail; 1672 } 1673 1674 kqp->mode = RIB_SERVER; 1675 kqp->chan_flags = IBT_BLOCKING; 1676 kqp->q = q; /* server ONLY */ 1677 1678 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1679 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1680 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1681 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1682 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1683 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1684 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1685 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1686 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1687 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1688 /* 1689 * Set the private data area to qp to be used in callbacks 1690 */ 1691 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); 1692 kqp->rdmaconn.c_state = C_CONNECTED; 1693 1694 /* 1695 * Initialize the server credit control 1696 * portion of the rdmaconn struct. 
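 * The server starts out with preposted_rbufs buffers granted to the
 * client and none of them currently in use.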
1697 */ 1698 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV; 1699 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc; 1700 cc_info->srv_cc_buffers_granted = preposted_rbufs; 1701 cc_info->srv_cc_cur_buffers_used = 0; 1702 cc_info->srv_cc_posted = preposted_rbufs; 1703 1704 *qp = kqp; 1705 1706 return (RDMA_SUCCESS); 1707 fail: 1708 if (kqp) 1709 kmem_free(kqp, sizeof (rib_qp_t)); 1710 1711 return (RDMA_FAILED); 1712 } 1713 1714 /* ARGSUSED */ 1715 ibt_cm_status_t 1716 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event, 1717 ibt_cm_return_args_t *ret_args, void *priv_data, 1718 ibt_priv_data_len_t len) 1719 { 1720 rib_hca_t *hca; 1721 1722 hca = (rib_hca_t *)clnt_hdl; 1723 1724 switch (event->cm_type) { 1725 1726 /* got a connection close event */ 1727 case IBT_CM_EVENT_CONN_CLOSED: 1728 { 1729 CONN *conn; 1730 rib_qp_t *qp; 1731 1732 /* check reason why connection was closed */ 1733 switch (event->cm_event.closed) { 1734 case IBT_CM_CLOSED_DREP_RCVD: 1735 case IBT_CM_CLOSED_DREQ_TIMEOUT: 1736 case IBT_CM_CLOSED_DUP: 1737 case IBT_CM_CLOSED_ABORT: 1738 case IBT_CM_CLOSED_ALREADY: 1739 /* 1740 * These cases indicate the local end initiated 1741 * the closing of the channel. Nothing to do here. 1742 */ 1743 break; 1744 default: 1745 /* 1746 * Reason for CONN_CLOSED event must be one of 1747 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 1748 * or IBT_CM_CLOSED_STALE. These indicate cases were 1749 * the remote end is closing the channel. In these 1750 * cases free the channel and transition to error 1751 * state 1752 */ 1753 qp = ibt_get_chan_private(event->cm_channel); 1754 conn = qptoc(qp); 1755 mutex_enter(&conn->c_lock); 1756 if (conn->c_state == C_DISCONN_PEND) { 1757 mutex_exit(&conn->c_lock); 1758 break; 1759 } 1760 1761 conn->c_state = C_ERROR_CONN; 1762 1763 /* 1764 * Free the conn if c_ref is down to 0 already 1765 */ 1766 if (conn->c_ref == 0) { 1767 /* 1768 * Remove from list and free conn 1769 */ 1770 conn->c_state = C_DISCONN_PEND; 1771 mutex_exit(&conn->c_lock); 1772 rw_enter(&hca->state_lock, RW_READER); 1773 if (hca->state != HCA_DETACHED) 1774 (void) rib_disconnect_channel(conn, 1775 &hca->cl_conn_list); 1776 rw_exit(&hca->state_lock); 1777 } else { 1778 /* 1779 * conn will be freed when c_ref goes to 0. 1780 * Indicate to cleaning thread not to close 1781 * the connection, but just free the channel. 1782 */ 1783 conn->c_flags |= C_CLOSE_NOTNEEDED; 1784 mutex_exit(&conn->c_lock); 1785 } 1786 #ifdef DEBUG 1787 if (rib_debug) 1788 cmn_err(CE_NOTE, "rib_clnt_cm_handler: " 1789 "(CONN_CLOSED) channel disconnected"); 1790 #endif 1791 break; 1792 } 1793 break; 1794 } 1795 default: 1796 break; 1797 } 1798 return (IBT_CM_ACCEPT); 1799 } 1800 1801 /* 1802 * Connect to the server. 
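 * In outline: the source and destination IP addresses are packed into
 * the CM private data, an RC channel is allocated against the client
 * CQs, and ibt_open_rc_channel() is called in blocking mode, retrying
 * a few times if the passive side reports a stale connection.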
1803 */ 1804 rdma_stat 1805 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp) 1806 { 1807 ibt_chan_open_args_t chan_args; /* channel args */ 1808 ibt_chan_sizes_t chan_sizes; 1809 ibt_rc_chan_alloc_args_t qp_attr; 1810 ibt_status_t ibt_status; 1811 ibt_rc_returns_t ret_args; /* conn reject info */ 1812 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */ 1813 ibt_ip_cm_info_t ipcm_info; 1814 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ]; 1815 1816 1817 (void) bzero(&chan_args, sizeof (chan_args)); 1818 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1819 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t)); 1820 1821 ipcm_info.src_addr.family = rptp->srcip.family; 1822 switch (ipcm_info.src_addr.family) { 1823 case AF_INET: 1824 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr; 1825 break; 1826 case AF_INET6: 1827 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr; 1828 break; 1829 } 1830 1831 ipcm_info.dst_addr.family = rptp->srcip.family; 1832 switch (ipcm_info.dst_addr.family) { 1833 case AF_INET: 1834 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr; 1835 break; 1836 case AF_INET6: 1837 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr; 1838 break; 1839 } 1840 1841 ipcm_info.src_port = (in_port_t)nfs_rdma_port; 1842 1843 ibt_status = ibt_format_ip_private_data(&ipcm_info, 1844 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt); 1845 1846 if (ibt_status != IBT_SUCCESS) { 1847 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n"); 1848 return (-1); 1849 } 1850 1851 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num; 1852 /* Alloc a RC channel */ 1853 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; 1854 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; 1855 qp_attr.rc_pd = hca->pd_hdl; 1856 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1857 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1858 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1859 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1860 qp_attr.rc_clone_chan = NULL; 1861 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1862 qp_attr.rc_flags = IBT_WR_SIGNALED; 1863 1864 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port); 1865 chan_args.oc_path = &rptp->path; 1866 1867 chan_args.oc_cm_handler = rib_clnt_cm_handler; 1868 chan_args.oc_cm_clnt_private = (void *)hca; 1869 chan_args.oc_rdma_ra_out = 4; 1870 chan_args.oc_rdma_ra_in = 4; 1871 chan_args.oc_path_retry_cnt = 2; 1872 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; 1873 chan_args.oc_priv_data = cmp_ip_pvt; 1874 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ; 1875 1876 refresh: 1877 rw_enter(&hca->state_lock, RW_READER); 1878 if (hca->state != HCA_DETACHED) { 1879 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1880 IBT_ACHAN_NO_FLAGS, 1881 &qp_attr, &qp->qp_hdl, 1882 &chan_sizes); 1883 } else { 1884 rw_exit(&hca->state_lock); 1885 return (RDMA_FAILED); 1886 } 1887 rw_exit(&hca->state_lock); 1888 1889 if (ibt_status != IBT_SUCCESS) { 1890 DTRACE_PROBE1(rpcib__i_conntosrv, 1891 int, ibt_status); 1892 return (RDMA_FAILED); 1893 } 1894 1895 /* Connect to the Server */ 1896 (void) bzero(&ret_args, sizeof (ret_args)); 1897 mutex_enter(&qp->cb_lock); 1898 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS, 1899 IBT_BLOCKING, &chan_args, &ret_args); 1900 if (ibt_status != IBT_SUCCESS) { 1901 DTRACE_PROBE2(rpcib__i_openrctosrv, 1902 int, ibt_status, int, ret_args.rc_status); 1903 1904 (void) ibt_free_channel(qp->qp_hdl); 1905 qp->qp_hdl = NULL; 1906 mutex_exit(&qp->cb_lock); 1907 if (refresh-- && ibt_status == IBT_CM_FAILURE 
&& 1908 ret_args.rc_status == IBT_CM_CONN_STALE) { 1909 /* 1910 * Got IBT_CM_CONN_STALE probably because of stale 1911 * data on the passive end of a channel that existed 1912 * prior to reboot. Retry establishing a channel 1913 * REFRESH_ATTEMPTS times, during which time the 1914 * stale conditions on the server might clear up. 1915 */ 1916 goto refresh; 1917 } 1918 return (RDMA_FAILED); 1919 } 1920 mutex_exit(&qp->cb_lock); 1921 /* 1922 * Set the private data area to qp to be used in callbacks 1923 */ 1924 ibt_set_chan_private(qp->qp_hdl, (void *)qp); 1925 return (RDMA_SUCCESS); 1926 } 1927 1928 rdma_stat 1929 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) 1930 { 1931 uint_t i, addr_count; 1932 ibt_status_t ibt_status; 1933 uint8_t num_paths_p; 1934 ibt_ip_path_attr_t ipattr; 1935 ibt_path_ip_src_t srcip; 1936 rpcib_ipaddrs_t addrs4; 1937 rpcib_ipaddrs_t addrs6; 1938 struct sockaddr_in *sinp; 1939 struct sockaddr_in6 *sin6p; 1940 rdma_stat retval = RDMA_FAILED; 1941 rib_hca_t *hca; 1942 1943 if ((addr_type != AF_INET) && (addr_type != AF_INET6)) 1944 return (RDMA_INVAL); 1945 ASSERT(raddr->buf != NULL); 1946 1947 bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); 1948 1949 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || 1950 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { 1951 retval = RDMA_FAILED; 1952 goto done2; 1953 } 1954 1955 if (addr_type == AF_INET) { 1956 addr_count = addrs4.ri_count; 1957 sinp = (struct sockaddr_in *)raddr->buf; 1958 rptp->dstip.family = AF_INET; 1959 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr; 1960 sinp = addrs4.ri_list; 1961 } else { 1962 addr_count = addrs6.ri_count; 1963 sin6p = (struct sockaddr_in6 *)raddr->buf; 1964 rptp->dstip.family = AF_INET6; 1965 rptp->dstip.un.ip6addr = sin6p->sin6_addr; 1966 sin6p = addrs6.ri_list; 1967 } 1968 1969 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 1970 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 1971 rw_enter(&hca->state_lock, RW_READER); 1972 if (hca->state == HCA_DETACHED) { 1973 rw_exit(&hca->state_lock); 1974 continue; 1975 } 1976 1977 ipattr.ipa_dst_ip = &rptp->dstip; 1978 ipattr.ipa_hca_guid = hca->hca_guid; 1979 ipattr.ipa_ndst = 1; 1980 ipattr.ipa_max_paths = 1; 1981 ipattr.ipa_src_ip.family = rptp->dstip.family; 1982 for (i = 0; i < addr_count; i++) { 1983 num_paths_p = 0; 1984 if (addr_type == AF_INET) { 1985 ipattr.ipa_src_ip.un.ip4addr = 1986 sinp[i].sin_addr.s_addr; 1987 } else { 1988 ipattr.ipa_src_ip.un.ip6addr = 1989 sin6p[i].sin6_addr; 1990 } 1991 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1992 1993 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1994 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, 1995 &num_paths_p, &srcip); 1996 if (ibt_status == IBT_SUCCESS && 1997 num_paths_p != 0 && 1998 rptp->path.pi_hca_guid == hca->hca_guid) { 1999 rptp->hca = hca; 2000 rw_exit(&hca->state_lock); 2001 if (addr_type == AF_INET) { 2002 rptp->srcip.family = AF_INET; 2003 rptp->srcip.un.ip4addr = 2004 srcip.ip_primary.un.ip4addr; 2005 } else { 2006 rptp->srcip.family = AF_INET6; 2007 rptp->srcip.un.ip6addr = 2008 srcip.ip_primary.un.ip6addr; 2009 2010 } 2011 retval = RDMA_SUCCESS; 2012 goto done1; 2013 } 2014 } 2015 rw_exit(&hca->state_lock); 2016 } 2017 done1: 2018 rw_exit(&rib_stat->hcas_list_lock); 2019 done2: 2020 if (addrs4.ri_size > 0) 2021 kmem_free(addrs4.ri_list, addrs4.ri_size); 2022 if (addrs6.ri_size > 0) 2023 kmem_free(addrs6.ri_list, addrs6.ri_size); 2024 return (retval); 2025 } 2026 2027 /* 2028 * Close channel, remove from connection list and 2029 * free up 
resources allocated for that channel. 2030 */ 2031 rdma_stat 2032 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list) 2033 { 2034 rib_qp_t *qp = ctoqp(conn); 2035 rib_hca_t *hca; 2036 2037 mutex_enter(&conn->c_lock); 2038 if (conn->c_timeout != NULL) { 2039 mutex_exit(&conn->c_lock); 2040 (void) untimeout(conn->c_timeout); 2041 mutex_enter(&conn->c_lock); 2042 } 2043 2044 while (conn->c_flags & C_CLOSE_PENDING) { 2045 cv_wait(&conn->c_cv, &conn->c_lock); 2046 } 2047 mutex_exit(&conn->c_lock); 2048 2049 /* 2050 * c_ref == 0 and connection is in C_DISCONN_PEND 2051 */ 2052 hca = qp->hca; 2053 if (conn_list != NULL) 2054 (void) rib_rm_conn(conn, conn_list); 2055 2056 /* 2057 * There is only one case where we get here with 2058 * qp_hdl = NULL, which is during connection setup on 2059 * the client. In such a case there are no posted 2060 * send/recv buffers. 2061 */ 2062 if (qp->qp_hdl != NULL) { 2063 mutex_enter(&qp->posted_rbufs_lock); 2064 while (qp->n_posted_rbufs) 2065 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock); 2066 mutex_exit(&qp->posted_rbufs_lock); 2067 2068 mutex_enter(&qp->send_rbufs_lock); 2069 while (qp->n_send_rbufs) 2070 cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock); 2071 mutex_exit(&qp->send_rbufs_lock); 2072 2073 (void) ibt_free_channel(qp->qp_hdl); 2074 qp->qp_hdl = NULL; 2075 } 2076 2077 ASSERT(qp->rdlist == NULL); 2078 2079 if (qp->replylist != NULL) { 2080 (void) rib_rem_replylist(qp); 2081 } 2082 2083 cv_destroy(&qp->cb_conn_cv); 2084 cv_destroy(&qp->posted_rbufs_cv); 2085 cv_destroy(&qp->send_rbufs_cv); 2086 mutex_destroy(&qp->cb_lock); 2087 mutex_destroy(&qp->replylist_lock); 2088 mutex_destroy(&qp->posted_rbufs_lock); 2089 mutex_destroy(&qp->send_rbufs_lock); 2090 mutex_destroy(&qp->rdlist_lock); 2091 2092 cv_destroy(&conn->c_cv); 2093 mutex_destroy(&conn->c_lock); 2094 2095 if (conn->c_raddr.buf != NULL) { 2096 kmem_free(conn->c_raddr.buf, conn->c_raddr.len); 2097 } 2098 if (conn->c_laddr.buf != NULL) { 2099 kmem_free(conn->c_laddr.buf, conn->c_laddr.len); 2100 } 2101 if (conn->c_netid != NULL) { 2102 kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1)); 2103 } 2104 2105 /* 2106 * Credit control cleanup. 2107 */ 2108 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) { 2109 rdma_clnt_cred_ctrl_t *cc_info; 2110 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 2111 cv_destroy(&cc_info->clnt_cc_cv); 2112 } 2113 2114 kmem_free(qp, sizeof (rib_qp_t)); 2115 2116 /* 2117 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL, 2118 * then the hca is no longer being used. 2119 */ 2120 if (conn_list != NULL) { 2121 rw_enter(&hca->state_lock, RW_READER); 2122 if (hca->state == HCA_DETACHED) { 2123 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 2124 if (hca->srv_conn_list.conn_hd == NULL) { 2125 rw_enter(&hca->cl_conn_list.conn_lock, 2126 RW_READER); 2127 2128 if (hca->cl_conn_list.conn_hd == NULL) { 2129 mutex_enter(&hca->inuse_lock); 2130 hca->inuse = FALSE; 2131 cv_signal(&hca->cb_cv); 2132 mutex_exit(&hca->inuse_lock); 2133 } 2134 rw_exit(&hca->cl_conn_list.conn_lock); 2135 } 2136 rw_exit(&hca->srv_conn_list.conn_lock); 2137 } 2138 rw_exit(&hca->state_lock); 2139 } 2140 2141 return (RDMA_SUCCESS); 2142 } 2143 2144 /* 2145 * All sends are done under the protection of 2146 * the wdesc->sendwait_lock. n_send_rbufs count 2147 * is protected using the send_rbufs_lock. 
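 * rib_send_hold() and rib_send_rele() bracket every signaled send, so
 * rib_disconnect_channel() can wait on send_rbufs_cv until the count
 * drains to zero before freeing the channel.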
2148 * lock ordering is: 2149 * sendwait_lock -> send_rbufs_lock 2150 */ 2151 2152 void 2153 rib_send_hold(rib_qp_t *qp) 2154 { 2155 mutex_enter(&qp->send_rbufs_lock); 2156 qp->n_send_rbufs++; 2157 mutex_exit(&qp->send_rbufs_lock); 2158 } 2159 2160 void 2161 rib_send_rele(rib_qp_t *qp) 2162 { 2163 mutex_enter(&qp->send_rbufs_lock); 2164 qp->n_send_rbufs--; 2165 if (qp->n_send_rbufs == 0) 2166 cv_signal(&qp->send_rbufs_cv); 2167 mutex_exit(&qp->send_rbufs_lock); 2168 } 2169 2170 /* 2171 * Wait for send completion notification. Only on receiving a 2172 * notification be it a successful or error completion, free the 2173 * send_wid. 2174 */ 2175 static rdma_stat 2176 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 2177 { 2178 clock_t timout, cv_wait_ret; 2179 rdma_stat error = RDMA_SUCCESS; 2180 int i; 2181 2182 /* 2183 * Wait for send to complete 2184 */ 2185 ASSERT(wd != NULL); 2186 mutex_enter(&wd->sendwait_lock); 2187 if (wd->status == (uint_t)SEND_WAIT) { 2188 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 2189 ddi_get_lbolt(); 2190 2191 if (qp->mode == RIB_SERVER) { 2192 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, 2193 &wd->sendwait_lock, timout)) > 0 && 2194 wd->status == (uint_t)SEND_WAIT) 2195 ; 2196 switch (cv_wait_ret) { 2197 case -1: /* timeout */ 2198 DTRACE_PROBE(rpcib__i__srvsendwait__timeout); 2199 2200 wd->cv_sig = 0; /* no signal needed */ 2201 error = RDMA_TIMEDOUT; 2202 break; 2203 default: /* got send completion */ 2204 break; 2205 } 2206 } else { 2207 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, 2208 &wd->sendwait_lock, timout)) > 0 && 2209 wd->status == (uint_t)SEND_WAIT) 2210 ; 2211 switch (cv_wait_ret) { 2212 case -1: /* timeout */ 2213 DTRACE_PROBE(rpcib__i__clntsendwait__timeout); 2214 2215 wd->cv_sig = 0; /* no signal needed */ 2216 error = RDMA_TIMEDOUT; 2217 break; 2218 case 0: /* interrupted */ 2219 DTRACE_PROBE(rpcib__i__clntsendwait__intr); 2220 2221 wd->cv_sig = 0; /* no signal needed */ 2222 error = RDMA_INTR; 2223 break; 2224 default: /* got send completion */ 2225 break; 2226 } 2227 } 2228 } 2229 2230 if (wd->status != (uint_t)SEND_WAIT) { 2231 /* got send completion */ 2232 if (wd->status != RDMA_SUCCESS) { 2233 switch (wd->status) { 2234 case RDMA_CONNLOST: 2235 error = RDMA_CONNLOST; 2236 break; 2237 default: 2238 error = RDMA_FAILED; 2239 break; 2240 } 2241 } 2242 for (i = 0; i < wd->nsbufs; i++) { 2243 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2244 (void *)(uintptr_t)wd->sbufaddr[i]); 2245 } 2246 2247 rib_send_rele(qp); 2248 2249 mutex_exit(&wd->sendwait_lock); 2250 (void) rib_free_sendwait(wd); 2251 2252 } else { 2253 mutex_exit(&wd->sendwait_lock); 2254 } 2255 return (error); 2256 } 2257 2258 static struct send_wid * 2259 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) 2260 { 2261 struct send_wid *wd; 2262 2263 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); 2264 wd->xid = xid; 2265 wd->cv_sig = cv_sig; 2266 wd->qp = qp; 2267 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); 2268 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); 2269 wd->status = (uint_t)SEND_WAIT; 2270 2271 return (wd); 2272 } 2273 2274 static int 2275 rib_free_sendwait(struct send_wid *wdesc) 2276 { 2277 cv_destroy(&wdesc->wait_cv); 2278 mutex_destroy(&wdesc->sendwait_lock); 2279 kmem_free(wdesc, sizeof (*wdesc)); 2280 2281 return (0); 2282 } 2283 2284 static rdma_stat 2285 rib_rem_rep(rib_qp_t *qp, struct reply *rep) 2286 { 2287 mutex_enter(&qp->replylist_lock); 2288 if (rep != NULL) { 2289 (void) rib_remreply(qp, rep); 2290 
mutex_exit(&qp->replylist_lock); 2291 return (RDMA_SUCCESS); 2292 } 2293 mutex_exit(&qp->replylist_lock); 2294 return (RDMA_FAILED); 2295 } 2296 2297 /* 2298 * Send buffers are freed here only in case of error in posting 2299 * on QP. If the post succeeded, the send buffers are freed upon 2300 * send completion in rib_sendwait() or in the scq_handler. 2301 */ 2302 rdma_stat 2303 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2304 int send_sig, int cv_sig, caddr_t *swid) 2305 { 2306 struct send_wid *wdesc; 2307 struct clist *clp; 2308 ibt_status_t ibt_status = IBT_SUCCESS; 2309 rdma_stat ret = RDMA_SUCCESS; 2310 ibt_send_wr_t tx_wr; 2311 int i, nds; 2312 ibt_wr_ds_t sgl[DSEG_MAX]; 2313 uint_t total_msg_size; 2314 rib_qp_t *qp; 2315 2316 qp = ctoqp(conn); 2317 2318 ASSERT(cl != NULL); 2319 2320 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2321 2322 nds = 0; 2323 total_msg_size = 0; 2324 clp = cl; 2325 while (clp != NULL) { 2326 if (nds >= DSEG_MAX) { 2327 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); 2328 return (RDMA_FAILED); 2329 } 2330 sgl[nds].ds_va = clp->w.c_saddr; 2331 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2332 sgl[nds].ds_len = clp->c_len; 2333 total_msg_size += clp->c_len; 2334 clp = clp->c_next; 2335 nds++; 2336 } 2337 2338 if (send_sig) { 2339 /* Set SEND_SIGNAL flag. */ 2340 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2341 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2342 *swid = (caddr_t)wdesc; 2343 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2344 mutex_enter(&wdesc->sendwait_lock); 2345 wdesc->nsbufs = nds; 2346 for (i = 0; i < nds; i++) { 2347 wdesc->sbufaddr[i] = sgl[i].ds_va; 2348 } 2349 } else { 2350 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2351 *swid = NULL; 2352 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2353 } 2354 2355 tx_wr.wr_opcode = IBT_WRC_SEND; 2356 tx_wr.wr_trans = IBT_RC_SRV; 2357 tx_wr.wr_nds = nds; 2358 tx_wr.wr_sgl = sgl; 2359 2360 mutex_enter(&conn->c_lock); 2361 if (conn->c_state == C_CONNECTED) { 2362 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2363 } 2364 if (conn->c_state != C_CONNECTED || 2365 ibt_status != IBT_SUCCESS) { 2366 if (conn->c_state != C_DISCONN_PEND) 2367 conn->c_state = C_ERROR_CONN; 2368 mutex_exit(&conn->c_lock); 2369 if (send_sig) { 2370 for (i = 0; i < nds; i++) { 2371 rib_rbuf_free(conn, SEND_BUFFER, 2372 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2373 } 2374 mutex_exit(&wdesc->sendwait_lock); 2375 (void) rib_free_sendwait(wdesc); 2376 } 2377 return (RDMA_CONNLOST); 2378 } 2379 2380 mutex_exit(&conn->c_lock); 2381 2382 if (send_sig) { 2383 rib_send_hold(qp); 2384 mutex_exit(&wdesc->sendwait_lock); 2385 if (cv_sig) { 2386 /* 2387 * cv_wait for send to complete. 2388 * We can fail due to a timeout or signal or 2389 * unsuccessful send. 2390 */ 2391 ret = rib_sendwait(qp, wdesc); 2392 2393 return (ret); 2394 } 2395 } 2396 2397 return (RDMA_SUCCESS); 2398 } 2399 2400 2401 rdma_stat 2402 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2403 { 2404 rdma_stat ret; 2405 caddr_t wd; 2406 2407 /* send-wait & cv_signal */ 2408 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); 2409 return (ret); 2410 } 2411 2412 /* 2413 * Deprecated/obsolete interface not used currently 2414 * but earlier used for READ-READ protocol. 2415 * Send RPC reply and wait for RDMA_DONE. 
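 * The reply is posted as a signaled send without waiting for the send
 * completion (cv_sig == 0); the caller then blocks on rdma_done_cv for
 * up to REPLY_WAIT_TIME seconds waiting for the peer's RDMA_DONE.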
2416 */ 2417 rdma_stat 2418 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2419 { 2420 rdma_stat ret = RDMA_SUCCESS; 2421 struct rdma_done_list *rd; 2422 clock_t cv_wait_ret; 2423 caddr_t *wid = NULL; 2424 rib_qp_t *qp = ctoqp(conn); 2425 2426 mutex_enter(&qp->rdlist_lock); 2427 rd = rdma_done_add(qp, msgid); 2428 2429 /* No cv_signal (whether send-wait or no-send-wait) */ 2430 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); 2431 2432 if (ret != RDMA_SUCCESS) { 2433 rdma_done_rm(qp, rd); 2434 } else { 2435 /* 2436 * Wait for RDMA_DONE from remote end 2437 */ 2438 cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv, 2439 &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000), 2440 TR_CLOCK_TICK); 2441 2442 rdma_done_rm(qp, rd); 2443 2444 if (cv_wait_ret < 0) { 2445 ret = RDMA_TIMEDOUT; 2446 } 2447 } 2448 2449 mutex_exit(&qp->rdlist_lock); 2450 return (ret); 2451 } 2452 2453 static struct recv_wid * 2454 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2455 { 2456 struct recv_wid *rwid; 2457 2458 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2459 rwid->xid = msgid; 2460 rwid->addr = sgl->ds_va; 2461 rwid->qp = qp; 2462 2463 return (rwid); 2464 } 2465 2466 static void 2467 rib_free_wid(struct recv_wid *rwid) 2468 { 2469 kmem_free(rwid, sizeof (struct recv_wid)); 2470 } 2471 2472 rdma_stat 2473 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2474 { 2475 rib_qp_t *qp = ctoqp(conn); 2476 struct clist *clp = cl; 2477 struct reply *rep; 2478 struct recv_wid *rwid; 2479 int nds; 2480 ibt_wr_ds_t sgl[DSEG_MAX]; 2481 ibt_recv_wr_t recv_wr; 2482 rdma_stat ret; 2483 ibt_status_t ibt_status; 2484 2485 /* 2486 * rdma_clnt_postrecv uses RECV_BUFFER. 2487 */ 2488 2489 nds = 0; 2490 while (cl != NULL) { 2491 if (nds >= DSEG_MAX) { 2492 ret = RDMA_FAILED; 2493 goto done; 2494 } 2495 sgl[nds].ds_va = cl->w.c_saddr; 2496 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2497 sgl[nds].ds_len = cl->c_len; 2498 cl = cl->c_next; 2499 nds++; 2500 } 2501 2502 if (nds != 1) { 2503 ret = RDMA_FAILED; 2504 goto done; 2505 } 2506 2507 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2508 recv_wr.wr_nds = nds; 2509 recv_wr.wr_sgl = sgl; 2510 2511 rwid = rib_create_wid(qp, &sgl[0], msgid); 2512 if (rwid) { 2513 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; 2514 } else { 2515 ret = RDMA_NORESOURCE; 2516 goto done; 2517 } 2518 rep = rib_addreplylist(qp, msgid); 2519 if (!rep) { 2520 rib_free_wid(rwid); 2521 ret = RDMA_NORESOURCE; 2522 goto done; 2523 } 2524 2525 mutex_enter(&conn->c_lock); 2526 2527 if (conn->c_state == C_CONNECTED) { 2528 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2529 } 2530 2531 if (conn->c_state != C_CONNECTED || 2532 ibt_status != IBT_SUCCESS) { 2533 if (conn->c_state != C_DISCONN_PEND) 2534 conn->c_state = C_ERROR_CONN; 2535 mutex_exit(&conn->c_lock); 2536 rib_free_wid(rwid); 2537 (void) rib_rem_rep(qp, rep); 2538 ret = RDMA_CONNLOST; 2539 goto done; 2540 } 2541 mutex_exit(&conn->c_lock); 2542 return (RDMA_SUCCESS); 2543 2544 done: 2545 while (clp != NULL) { 2546 rib_rbuf_free(conn, RECV_BUFFER, 2547 (void *)(uintptr_t)clp->w.c_saddr3); 2548 clp = clp->c_next; 2549 } 2550 return (ret); 2551 } 2552 2553 rdma_stat 2554 rib_svc_post(CONN* conn, struct clist *cl) 2555 { 2556 rib_qp_t *qp = ctoqp(conn); 2557 struct svc_recv *s_recvp; 2558 int nds; 2559 ibt_wr_ds_t sgl[DSEG_MAX]; 2560 ibt_recv_wr_t recv_wr; 2561 ibt_status_t ibt_status; 2562 2563 nds = 0; 2564 while (cl != NULL) { 2565 if (nds >= DSEG_MAX) { 2566 return (RDMA_FAILED); 2567 } 
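		/* One SGL entry per chunk (only a single chunk is accepted below). */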
2568 sgl[nds].ds_va = cl->w.c_saddr; 2569 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2570 sgl[nds].ds_len = cl->c_len; 2571 cl = cl->c_next; 2572 nds++; 2573 } 2574 2575 if (nds != 1) { 2576 rib_rbuf_free(conn, RECV_BUFFER, 2577 (caddr_t)(uintptr_t)sgl[0].ds_va); 2578 2579 return (RDMA_FAILED); 2580 } 2581 2582 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2583 recv_wr.wr_nds = nds; 2584 recv_wr.wr_sgl = sgl; 2585 2586 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2587 /* Use s_recvp's addr as wr id */ 2588 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; 2589 mutex_enter(&conn->c_lock); 2590 if (conn->c_state == C_CONNECTED) { 2591 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2592 } 2593 if (conn->c_state != C_CONNECTED || 2594 ibt_status != IBT_SUCCESS) { 2595 if (conn->c_state != C_DISCONN_PEND) 2596 conn->c_state = C_ERROR_CONN; 2597 mutex_exit(&conn->c_lock); 2598 rib_rbuf_free(conn, RECV_BUFFER, 2599 (caddr_t)(uintptr_t)sgl[0].ds_va); 2600 (void) rib_free_svc_recv(s_recvp); 2601 2602 return (RDMA_CONNLOST); 2603 } 2604 mutex_exit(&conn->c_lock); 2605 2606 return (RDMA_SUCCESS); 2607 } 2608 2609 /* Client */ 2610 rdma_stat 2611 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2612 { 2613 2614 return (rib_clnt_post(conn, cl, msgid)); 2615 } 2616 2617 /* Client */ 2618 rdma_stat 2619 rib_post_resp_remove(CONN* conn, uint32_t msgid) 2620 { 2621 rib_qp_t *qp = ctoqp(conn); 2622 struct reply *rep; 2623 2624 mutex_enter(&qp->replylist_lock); 2625 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2626 if (rep->xid == msgid) { 2627 if (rep->vaddr_cq) { 2628 rib_rbuf_free(conn, RECV_BUFFER, 2629 (caddr_t)(uintptr_t)rep->vaddr_cq); 2630 } 2631 (void) rib_remreply(qp, rep); 2632 break; 2633 } 2634 } 2635 mutex_exit(&qp->replylist_lock); 2636 2637 return (RDMA_SUCCESS); 2638 } 2639 2640 /* Server */ 2641 rdma_stat 2642 rib_post_recv(CONN *conn, struct clist *cl) 2643 { 2644 rib_qp_t *qp = ctoqp(conn); 2645 2646 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2647 mutex_enter(&qp->posted_rbufs_lock); 2648 qp->n_posted_rbufs++; 2649 mutex_exit(&qp->posted_rbufs_lock); 2650 return (RDMA_SUCCESS); 2651 } 2652 return (RDMA_FAILED); 2653 } 2654 2655 /* 2656 * Client side only interface to "recv" the rpc reply buf 2657 * posted earlier by rib_post_resp(conn, cl, msgid). 2658 */ 2659 rdma_stat 2660 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2661 { 2662 struct reply *rep = NULL; 2663 clock_t timout, cv_wait_ret; 2664 rdma_stat ret = RDMA_SUCCESS; 2665 rib_qp_t *qp = ctoqp(conn); 2666 2667 /* 2668 * Find the reply structure for this msgid 2669 */ 2670 mutex_enter(&qp->replylist_lock); 2671 2672 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2673 if (rep->xid == msgid) 2674 break; 2675 } 2676 2677 if (rep != NULL) { 2678 /* 2679 * If message not yet received, wait. 
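		 * The wait is interruptible and bounded by REPLY_WAIT_TIME;
		 * on timeout or interrupt the reply entry is removed below
		 * and an error is returned.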
2680 */ 2681 if (rep->status == (uint_t)REPLY_WAIT) { 2682 timout = ddi_get_lbolt() + 2683 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2684 2685 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2686 &qp->replylist_lock, timout)) > 0 && 2687 rep->status == (uint_t)REPLY_WAIT) 2688 ; 2689 2690 switch (cv_wait_ret) { 2691 case -1: /* timeout */ 2692 ret = RDMA_TIMEDOUT; 2693 break; 2694 case 0: 2695 ret = RDMA_INTR; 2696 break; 2697 default: 2698 break; 2699 } 2700 } 2701 2702 if (rep->status == RDMA_SUCCESS) { 2703 struct clist *cl = NULL; 2704 2705 /* 2706 * Got message successfully 2707 */ 2708 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2709 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); 2710 *clp = cl; 2711 } else { 2712 if (rep->status != (uint_t)REPLY_WAIT) { 2713 /* 2714 * Got error in reply message. Free 2715 * recv buffer here. 2716 */ 2717 ret = rep->status; 2718 rib_rbuf_free(conn, RECV_BUFFER, 2719 (caddr_t)(uintptr_t)rep->vaddr_cq); 2720 } 2721 } 2722 (void) rib_remreply(qp, rep); 2723 } else { 2724 /* 2725 * No matching reply structure found for given msgid on the 2726 * reply wait list. 2727 */ 2728 ret = RDMA_INVAL; 2729 DTRACE_PROBE(rpcib__i__nomatchxid2); 2730 } 2731 2732 /* 2733 * Done. 2734 */ 2735 mutex_exit(&qp->replylist_lock); 2736 return (ret); 2737 } 2738 2739 /* 2740 * RDMA write a buffer to the remote address. 2741 */ 2742 rdma_stat 2743 rib_write(CONN *conn, struct clist *cl, int wait) 2744 { 2745 ibt_send_wr_t tx_wr; 2746 int cv_sig; 2747 ibt_wr_ds_t sgl[DSEG_MAX]; 2748 struct send_wid *wdesc; 2749 ibt_status_t ibt_status; 2750 rdma_stat ret = RDMA_SUCCESS; 2751 rib_qp_t *qp = ctoqp(conn); 2752 uint64_t n_writes = 0; 2753 2754 if (cl == NULL) { 2755 return (RDMA_FAILED); 2756 } 2757 2758 while ((cl != NULL)) { 2759 if (cl->c_len > 0) { 2760 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2761 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; 2762 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = 2763 cl->c_dmemhandle.mrc_rmr; /* rkey */ 2764 sgl[0].ds_va = cl->w.c_saddr; 2765 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2766 sgl[0].ds_len = cl->c_len; 2767 2768 if (wait) { 2769 cv_sig = 1; 2770 } else { 2771 if (n_writes > max_unsignaled_rws) { 2772 n_writes = 0; 2773 cv_sig = 1; 2774 } else { 2775 cv_sig = 0; 2776 } 2777 } 2778 2779 if (cv_sig) { 2780 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2781 wdesc = rib_init_sendwait(0, cv_sig, qp); 2782 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2783 mutex_enter(&wdesc->sendwait_lock); 2784 } else { 2785 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2786 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2787 } 2788 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2789 tx_wr.wr_trans = IBT_RC_SRV; 2790 tx_wr.wr_nds = 1; 2791 tx_wr.wr_sgl = sgl; 2792 2793 mutex_enter(&conn->c_lock); 2794 if (conn->c_state == C_CONNECTED) { 2795 ibt_status = 2796 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2797 } 2798 if (conn->c_state != C_CONNECTED || 2799 ibt_status != IBT_SUCCESS) { 2800 if (conn->c_state != C_DISCONN_PEND) 2801 conn->c_state = C_ERROR_CONN; 2802 mutex_exit(&conn->c_lock); 2803 if (cv_sig) { 2804 mutex_exit(&wdesc->sendwait_lock); 2805 (void) rib_free_sendwait(wdesc); 2806 } 2807 return (RDMA_CONNLOST); 2808 } 2809 2810 mutex_exit(&conn->c_lock); 2811 2812 /* 2813 * Wait for send to complete 2814 */ 2815 if (cv_sig) { 2816 2817 rib_send_hold(qp); 2818 mutex_exit(&wdesc->sendwait_lock); 2819 2820 ret = rib_sendwait(qp, wdesc); 2821 if (ret != 0) 2822 return (ret); 2823 } 2824 n_writes ++; 2825 } 2826 cl = cl->c_next; 2827 } 2828 return (RDMA_SUCCESS); 2829 } 2830 2831 /* 2832 
* RDMA Read a buffer from the remote address. 2833 */ 2834 rdma_stat 2835 rib_read(CONN *conn, struct clist *cl, int wait) 2836 { 2837 ibt_send_wr_t rx_wr; 2838 int cv_sig = 0; 2839 ibt_wr_ds_t sgl; 2840 struct send_wid *wdesc; 2841 ibt_status_t ibt_status = IBT_SUCCESS; 2842 rdma_stat ret = RDMA_SUCCESS; 2843 rib_qp_t *qp = ctoqp(conn); 2844 2845 if (cl == NULL) { 2846 return (RDMA_FAILED); 2847 } 2848 2849 while (cl != NULL) { 2850 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2851 /* 2852 * Remote address is at the head chunk item in list. 2853 */ 2854 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; 2855 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; 2856 2857 sgl.ds_va = cl->u.c_daddr; 2858 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2859 sgl.ds_len = cl->c_len; 2860 2861 /* 2862 * If there are multiple chunks to be read, and 2863 * wait is set, ask for signal only for the last chunk 2864 * and wait only on the last chunk. The completion of 2865 * RDMA_READ on last chunk ensures that reads on all 2866 * previous chunks are also completed. 2867 */ 2868 if (wait && (cl->c_next == NULL)) { 2869 cv_sig = 1; 2870 wdesc = rib_init_sendwait(0, cv_sig, qp); 2871 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2872 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2873 mutex_enter(&wdesc->sendwait_lock); 2874 } else { 2875 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2876 rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2877 } 2878 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2879 rx_wr.wr_trans = IBT_RC_SRV; 2880 rx_wr.wr_nds = 1; 2881 rx_wr.wr_sgl = &sgl; 2882 2883 mutex_enter(&conn->c_lock); 2884 if (conn->c_state == C_CONNECTED) { 2885 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2886 } 2887 if (conn->c_state != C_CONNECTED || 2888 ibt_status != IBT_SUCCESS) { 2889 if (conn->c_state != C_DISCONN_PEND) 2890 conn->c_state = C_ERROR_CONN; 2891 mutex_exit(&conn->c_lock); 2892 if (wait && (cl->c_next == NULL)) { 2893 mutex_exit(&wdesc->sendwait_lock); 2894 (void) rib_free_sendwait(wdesc); 2895 } 2896 return (RDMA_CONNLOST); 2897 } 2898 2899 mutex_exit(&conn->c_lock); 2900 2901 /* 2902 * Wait for send to complete if this is the 2903 * last item in the list. 2904 */ 2905 if (wait && cl->c_next == NULL) { 2906 rib_send_hold(qp); 2907 mutex_exit(&wdesc->sendwait_lock); 2908 2909 ret = rib_sendwait(qp, wdesc); 2910 2911 if (ret != 0) 2912 return (ret); 2913 } 2914 cl = cl->c_next; 2915 } 2916 return (RDMA_SUCCESS); 2917 } 2918 2919 /* 2920 * rib_srv_cm_handler() 2921 * Connection Manager callback to handle RC connection requests. 2922 */ 2923 /* ARGSUSED */ 2924 static ibt_cm_status_t 2925 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2926 ibt_cm_return_args_t *ret_args, void *priv_data, 2927 ibt_priv_data_len_t len) 2928 { 2929 queue_t *q; 2930 rib_qp_t *qp; 2931 rib_hca_t *hca; 2932 rdma_stat status = RDMA_SUCCESS; 2933 int i; 2934 struct clist cl; 2935 rdma_buf_t rdbuf = {0}; 2936 void *buf = NULL; 2937 CONN *conn; 2938 ibt_ip_cm_info_t ipinfo; 2939 struct sockaddr_in *s; 2940 struct sockaddr_in6 *s6; 2941 int sin_size = sizeof (struct sockaddr_in); 2942 int in_size = sizeof (struct in_addr); 2943 int sin6_size = sizeof (struct sockaddr_in6); 2944 2945 ASSERT(any != NULL); 2946 ASSERT(event != NULL); 2947 2948 hca = (rib_hca_t *)any; 2949 2950 /* got a connection request */ 2951 switch (event->cm_type) { 2952 case IBT_CM_EVENT_REQ_RCV: 2953 /* 2954 * If the plugin is in the NO_ACCEPT state, bail out. 
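		 * rib_listen_stop() moves the plugin to NO_ACCEPT while the
		 * listeners are being torn down, so new requests are rejected.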
2955 */ 2956 mutex_enter(&plugin_state_lock); 2957 if (plugin_state == NO_ACCEPT) { 2958 mutex_exit(&plugin_state_lock); 2959 return (IBT_CM_REJECT); 2960 } 2961 mutex_exit(&plugin_state_lock); 2962 2963 /* 2964 * Need to send a MRA MAD to CM so that it does not 2965 * timeout on us. 2966 */ 2967 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2968 event->cm_event.req.req_timeout * 8, NULL, 0); 2969 2970 mutex_enter(&rib_stat->open_hca_lock); 2971 q = rib_stat->q; 2972 mutex_exit(&rib_stat->open_hca_lock); 2973 2974 status = rib_svc_create_chan(hca, (caddr_t)q, 2975 event->cm_event.req.req_prim_hca_port, &qp); 2976 2977 if (status) { 2978 return (IBT_CM_REJECT); 2979 } 2980 2981 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2982 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 2983 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 2984 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2985 2986 /* 2987 * Pre-posts RECV buffers 2988 */ 2989 conn = qptoc(qp); 2990 for (i = 0; i < preposted_rbufs; i++) { 2991 bzero(&rdbuf, sizeof (rdbuf)); 2992 rdbuf.type = RECV_BUFFER; 2993 buf = rib_rbuf_alloc(conn, &rdbuf); 2994 if (buf == NULL) { 2995 /* 2996 * A connection is not established yet. 2997 * Just flush the channel. Buffers 2998 * posted till now will error out with 2999 * IBT_WC_WR_FLUSHED_ERR. 3000 */ 3001 (void) ibt_flush_channel(qp->qp_hdl); 3002 (void) rib_disconnect_channel(conn, NULL); 3003 return (IBT_CM_REJECT); 3004 } 3005 3006 bzero(&cl, sizeof (cl)); 3007 cl.w.c_saddr3 = (caddr_t)rdbuf.addr; 3008 cl.c_len = rdbuf.len; 3009 cl.c_smemhandle.mrc_lmr = 3010 rdbuf.handle.mrc_lmr; /* lkey */ 3011 cl.c_next = NULL; 3012 status = rib_post_recv(conn, &cl); 3013 if (status != RDMA_SUCCESS) { 3014 /* 3015 * A connection is not established yet. 3016 * Just flush the channel. Buffers 3017 * posted till now will error out with 3018 * IBT_WC_WR_FLUSHED_ERR. 
3019 */ 3020 (void) ibt_flush_channel(qp->qp_hdl); 3021 (void) rib_disconnect_channel(conn, NULL); 3022 return (IBT_CM_REJECT); 3023 } 3024 } 3025 (void) rib_add_connlist(conn, &hca->srv_conn_list); 3026 3027 /* 3028 * Get the address translation 3029 */ 3030 rw_enter(&hca->state_lock, RW_READER); 3031 if (hca->state == HCA_DETACHED) { 3032 rw_exit(&hca->state_lock); 3033 return (IBT_CM_REJECT); 3034 } 3035 rw_exit(&hca->state_lock); 3036 3037 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t)); 3038 3039 if (ibt_get_ip_data(event->cm_priv_data_len, 3040 event->cm_priv_data, 3041 &ipinfo) != IBT_SUCCESS) { 3042 3043 return (IBT_CM_REJECT); 3044 } 3045 3046 switch (ipinfo.src_addr.family) { 3047 case AF_INET: 3048 3049 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, 3050 KM_SLEEP); 3051 (void) strcpy(conn->c_netid, RIBNETID_TCP); 3052 3053 conn->c_raddr.maxlen = 3054 conn->c_raddr.len = sin_size; 3055 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3056 3057 s = (struct sockaddr_in *)conn->c_raddr.buf; 3058 s->sin_family = AF_INET; 3059 bcopy((void *)&ipinfo.src_addr.un.ip4addr, 3060 &s->sin_addr, in_size); 3061 3062 conn->c_laddr.maxlen = 3063 conn->c_laddr.len = sin_size; 3064 conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3065 3066 s = (struct sockaddr_in *)conn->c_laddr.buf; 3067 s->sin_family = AF_INET; 3068 bcopy((void *)&ipinfo.dst_addr.un.ip4addr, 3069 &s->sin_addr, in_size); 3070 3071 break; 3072 3073 case AF_INET6: 3074 3075 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, 3076 KM_SLEEP); 3077 (void) strcpy(conn->c_netid, RIBNETID_TCP6); 3078 3079 conn->c_raddr.maxlen = 3080 conn->c_raddr.len = sin6_size; 3081 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3082 3083 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 3084 s6->sin6_family = AF_INET6; 3085 bcopy((void *)&ipinfo.src_addr.un.ip6addr, 3086 &s6->sin6_addr, 3087 sizeof (struct in6_addr)); 3088 3089 conn->c_laddr.maxlen = 3090 conn->c_laddr.len = sin6_size; 3091 conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3092 3093 s6 = (struct sockaddr_in6 *)conn->c_laddr.buf; 3094 s6->sin6_family = AF_INET6; 3095 bcopy((void *)&ipinfo.dst_addr.un.ip6addr, 3096 &s6->sin6_addr, 3097 sizeof (struct in6_addr)); 3098 3099 break; 3100 3101 default: 3102 return (IBT_CM_REJECT); 3103 } 3104 3105 break; 3106 3107 case IBT_CM_EVENT_CONN_CLOSED: 3108 { 3109 CONN *conn; 3110 rib_qp_t *qp; 3111 3112 switch (event->cm_event.closed) { 3113 case IBT_CM_CLOSED_DREP_RCVD: 3114 case IBT_CM_CLOSED_DREQ_TIMEOUT: 3115 case IBT_CM_CLOSED_DUP: 3116 case IBT_CM_CLOSED_ABORT: 3117 case IBT_CM_CLOSED_ALREADY: 3118 /* 3119 * These cases indicate the local end initiated 3120 * the closing of the channel. Nothing to do here. 3121 */ 3122 break; 3123 default: 3124 /* 3125 * Reason for CONN_CLOSED event must be one of 3126 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 3127 * or IBT_CM_CLOSED_STALE. These indicate cases were 3128 * the remote end is closing the channel. 
In these 3129 * cases free the channel and transition to error 3130 * state 3131 */ 3132 qp = ibt_get_chan_private(event->cm_channel); 3133 conn = qptoc(qp); 3134 mutex_enter(&conn->c_lock); 3135 if (conn->c_state == C_DISCONN_PEND) { 3136 mutex_exit(&conn->c_lock); 3137 break; 3138 } 3139 conn->c_state = C_ERROR_CONN; 3140 3141 /* 3142 * Free the conn if c_ref goes down to 0 3143 */ 3144 if (conn->c_ref == 0) { 3145 /* 3146 * Remove from list and free conn 3147 */ 3148 conn->c_state = C_DISCONN_PEND; 3149 mutex_exit(&conn->c_lock); 3150 (void) rib_disconnect_channel(conn, 3151 &hca->srv_conn_list); 3152 } else { 3153 /* 3154 * conn will be freed when c_ref goes to 0. 3155 * Indicate to cleaning thread not to close 3156 * the connection, but just free the channel. 3157 */ 3158 conn->c_flags |= C_CLOSE_NOTNEEDED; 3159 mutex_exit(&conn->c_lock); 3160 } 3161 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect); 3162 break; 3163 } 3164 break; 3165 } 3166 case IBT_CM_EVENT_CONN_EST: 3167 /* 3168 * RTU received, hence connection established. 3169 */ 3170 if (rib_debug > 1) 3171 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3172 "(CONN_EST) channel established"); 3173 break; 3174 3175 default: 3176 if (rib_debug > 2) { 3177 /* Let CM handle the following events. */ 3178 if (event->cm_type == IBT_CM_EVENT_REP_RCV) { 3179 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3180 "server recv'ed IBT_CM_EVENT_REP_RCV\n"); 3181 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) { 3182 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3183 "server recv'ed IBT_CM_EVENT_LAP_RCV\n"); 3184 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) { 3185 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3186 "server recv'ed IBT_CM_EVENT_MRA_RCV\n"); 3187 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) { 3188 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3189 "server recv'ed IBT_CM_EVENT_APR_RCV\n"); 3190 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) { 3191 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3192 "server recv'ed IBT_CM_EVENT_FAILURE\n"); 3193 } 3194 } 3195 return (IBT_CM_DEFAULT); 3196 } 3197 3198 /* accept all other CM messages (i.e. let the CM handle them) */ 3199 return (IBT_CM_ACCEPT); 3200 } 3201 3202 static rdma_stat 3203 rib_register_service(rib_hca_t *hca, int service_type, 3204 uint8_t protocol_num, in_port_t dst_port) 3205 { 3206 ibt_srv_desc_t sdesc; 3207 ibt_hca_portinfo_t *port_infop; 3208 ib_svc_id_t srv_id; 3209 ibt_srv_hdl_t srv_hdl; 3210 uint_t port_size; 3211 uint_t pki, i, num_ports, nbinds; 3212 ibt_status_t ibt_status; 3213 rib_service_t *service; 3214 ib_pkey_t pkey; 3215 3216 /* 3217 * Query all ports for the given HCA 3218 */ 3219 rw_enter(&hca->state_lock, RW_READER); 3220 if (hca->state != HCA_DETACHED) { 3221 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop, 3222 &num_ports, &port_size); 3223 rw_exit(&hca->state_lock); 3224 } else { 3225 rw_exit(&hca->state_lock); 3226 return (RDMA_FAILED); 3227 } 3228 if (ibt_status != IBT_SUCCESS) { 3229 return (RDMA_FAILED); 3230 } 3231 3232 DTRACE_PROBE1(rpcib__i__regservice_numports, 3233 int, num_ports); 3234 3235 for (i = 0; i < num_ports; i++) { 3236 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) { 3237 DTRACE_PROBE1(rpcib__i__regservice__portinactive, 3238 int, i+1); 3239 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) { 3240 DTRACE_PROBE1(rpcib__i__regservice__portactive, 3241 int, i+1); 3242 } 3243 } 3244 3245 /* 3246 * Get all the IP addresses on this system to register the 3247 * given "service type" on all DNS recognized IP addrs. 
3248 * Each service type such as NFS will have all the systems 3249 * IP addresses as its different names. For now the only 3250 * type of service we support in RPCIB is NFS. 3251 */ 3252 rw_enter(&rib_stat->service_list_lock, RW_WRITER); 3253 /* 3254 * Start registering and binding service to active 3255 * on active ports on this HCA. 3256 */ 3257 nbinds = 0; 3258 for (service = rib_stat->service_list; 3259 service && (service->srv_type != service_type); 3260 service = service->next) 3261 ; 3262 3263 if (service == NULL) { 3264 /* 3265 * We use IP addresses as the service names for 3266 * service registration. Register each of them 3267 * with CM to obtain a svc_id and svc_hdl. We do not 3268 * register the service with machine's loopback address. 3269 */ 3270 (void) bzero(&srv_id, sizeof (ib_svc_id_t)); 3271 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t)); 3272 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t)); 3273 sdesc.sd_handler = rib_srv_cm_handler; 3274 sdesc.sd_flags = 0; 3275 ibt_status = ibt_register_service(hca->ibt_clnt_hdl, 3276 &sdesc, ibt_get_ip_sid(protocol_num, dst_port), 3277 1, &srv_hdl, &srv_id); 3278 if ((ibt_status != IBT_SUCCESS) && 3279 (ibt_status != IBT_CM_SERVICE_EXISTS)) { 3280 rw_exit(&rib_stat->service_list_lock); 3281 DTRACE_PROBE1(rpcib__i__regservice__ibtres, 3282 int, ibt_status); 3283 ibt_free_portinfo(port_infop, port_size); 3284 return (RDMA_FAILED); 3285 } 3286 3287 /* 3288 * Allocate and prepare a service entry 3289 */ 3290 service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP); 3291 3292 service->srv_type = service_type; 3293 service->srv_hdl = srv_hdl; 3294 service->srv_id = srv_id; 3295 3296 service->next = rib_stat->service_list; 3297 rib_stat->service_list = service; 3298 DTRACE_PROBE1(rpcib__i__regservice__new__service, 3299 int, service->srv_type); 3300 } else { 3301 srv_hdl = service->srv_hdl; 3302 srv_id = service->srv_id; 3303 DTRACE_PROBE1(rpcib__i__regservice__existing__service, 3304 int, service->srv_type); 3305 } 3306 3307 for (i = 0; i < num_ports; i++) { 3308 ibt_sbind_hdl_t sbp; 3309 rib_hca_service_t *hca_srv; 3310 ib_gid_t gid; 3311 3312 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) 3313 continue; 3314 3315 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) { 3316 pkey = port_infop[i].p_pkey_tbl[pki]; 3317 3318 rw_enter(&hca->bound_services_lock, RW_READER); 3319 gid = port_infop[i].p_sgid_tbl[0]; 3320 for (hca_srv = hca->bound_services; hca_srv; 3321 hca_srv = hca_srv->next) { 3322 if ((hca_srv->srv_id == service->srv_id) && 3323 (hca_srv->gid.gid_prefix == 3324 gid.gid_prefix) && 3325 (hca_srv->gid.gid_guid == gid.gid_guid)) 3326 break; 3327 } 3328 rw_exit(&hca->bound_services_lock); 3329 if (hca_srv != NULL) { 3330 /* 3331 * port is alreay bound the the service 3332 */ 3333 DTRACE_PROBE1( 3334 rpcib__i__regservice__already__bound, 3335 int, i+1); 3336 nbinds++; 3337 continue; 3338 } 3339 3340 if ((pkey & IBSRM_HB) && 3341 (pkey != IB_PKEY_INVALID_FULL)) { 3342 3343 sbp = NULL; 3344 ibt_status = ibt_bind_service(srv_hdl, 3345 gid, NULL, hca, &sbp); 3346 3347 if (ibt_status == IBT_SUCCESS) { 3348 hca_srv = kmem_zalloc( 3349 sizeof (rib_hca_service_t), 3350 KM_SLEEP); 3351 hca_srv->srv_id = srv_id; 3352 hca_srv->gid = gid; 3353 hca_srv->sbind_hdl = sbp; 3354 3355 rw_enter(&hca->bound_services_lock, 3356 RW_WRITER); 3357 hca_srv->next = hca->bound_services; 3358 hca->bound_services = hca_srv; 3359 rw_exit(&hca->bound_services_lock); 3360 nbinds++; 3361 } 3362 3363 DTRACE_PROBE1(rpcib__i__regservice__bindres, 3364 int, 
ibt_status); 3365 } 3366 } 3367 } 3368 rw_exit(&rib_stat->service_list_lock); 3369 3370 ibt_free_portinfo(port_infop, port_size); 3371 3372 if (nbinds == 0) { 3373 return (RDMA_FAILED); 3374 } else { 3375 /* 3376 * Put this plugin into accept state, since atleast 3377 * one registration was successful. 3378 */ 3379 mutex_enter(&plugin_state_lock); 3380 plugin_state = ACCEPT; 3381 mutex_exit(&plugin_state_lock); 3382 return (RDMA_SUCCESS); 3383 } 3384 } 3385 3386 void 3387 rib_listen(struct rdma_svc_data *rd) 3388 { 3389 rdma_stat status; 3390 int n_listening = 0; 3391 rib_hca_t *hca; 3392 3393 mutex_enter(&rib_stat->listen_lock); 3394 /* 3395 * if rd parameter is NULL then it means that rib_stat->q is 3396 * already initialized by a call from RDMA and we just want to 3397 * add a newly attached HCA to the same listening state as other 3398 * HCAs. 3399 */ 3400 if (rd == NULL) { 3401 if (rib_stat->q == NULL) { 3402 mutex_exit(&rib_stat->listen_lock); 3403 return; 3404 } 3405 } else { 3406 rib_stat->q = &rd->q; 3407 } 3408 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 3409 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 3410 /* 3411 * First check if a hca is still attached 3412 */ 3413 rw_enter(&hca->state_lock, RW_READER); 3414 if (hca->state != HCA_INITED) { 3415 rw_exit(&hca->state_lock); 3416 continue; 3417 } 3418 rw_exit(&hca->state_lock); 3419 3420 /* 3421 * Right now the only service type is NFS. Hence 3422 * force feed this value. Ideally to communicate 3423 * the service type it should be passed down in 3424 * rdma_svc_data. 3425 */ 3426 status = rib_register_service(hca, NFS, 3427 IPPROTO_TCP, nfs_rdma_port); 3428 if (status == RDMA_SUCCESS) 3429 n_listening++; 3430 } 3431 rw_exit(&rib_stat->hcas_list_lock); 3432 3433 /* 3434 * Service active on an HCA, check rd->err_code for more 3435 * explainable errors. 3436 */ 3437 if (rd) { 3438 if (n_listening > 0) { 3439 rd->active = 1; 3440 rd->err_code = RDMA_SUCCESS; 3441 } else { 3442 rd->active = 0; 3443 rd->err_code = RDMA_FAILED; 3444 } 3445 } 3446 mutex_exit(&rib_stat->listen_lock); 3447 } 3448 3449 /* XXXX */ 3450 /* ARGSUSED */ 3451 static void 3452 rib_listen_stop(struct rdma_svc_data *svcdata) 3453 { 3454 rib_hca_t *hca; 3455 3456 mutex_enter(&rib_stat->listen_lock); 3457 /* 3458 * KRPC called the RDMATF to stop the listeners, this means 3459 * stop sending incomming or recieved requests to KRPC master 3460 * transport handle for RDMA-IB. This is also means that the 3461 * master transport handle, responsible for us, is going away. 3462 */ 3463 mutex_enter(&plugin_state_lock); 3464 plugin_state = NO_ACCEPT; 3465 if (svcdata != NULL) 3466 svcdata->active = 0; 3467 mutex_exit(&plugin_state_lock); 3468 3469 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 3470 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 3471 /* 3472 * First check if a hca is still attached 3473 */ 3474 rw_enter(&hca->state_lock, RW_READER); 3475 if (hca->state == HCA_DETACHED) { 3476 rw_exit(&hca->state_lock); 3477 continue; 3478 } 3479 rib_close_channels(&hca->srv_conn_list); 3480 rib_stop_services(hca); 3481 rw_exit(&hca->state_lock); 3482 } 3483 rw_exit(&rib_stat->hcas_list_lock); 3484 3485 /* 3486 * Avoid rib_listen() using the stale q field. 3487 * This could happen if a port goes up after all services 3488 * are already unregistered. 3489 */ 3490 rib_stat->q = NULL; 3491 mutex_exit(&rib_stat->listen_lock); 3492 } 3493 3494 /* 3495 * Traverse the HCA's service list to unbind and deregister services. 
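 * Only the per-HCA bindings are undone here (ibt_unbind_service());
 * the rib_service_t entries on rib_stat->service_list stay registered.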
3496 * For each bound service of HCA to be removed, first find the corresponding 3497 * service handle (srv_hdl) and then unbind the service by calling 3498 * ibt_unbind_service(). 3499 */ 3500 static void 3501 rib_stop_services(rib_hca_t *hca) 3502 { 3503 rib_hca_service_t *srv_list, *to_remove; 3504 3505 /* 3506 * unbind and deregister the services for this service type. 3507 * Right now there is only one service type. In future it will 3508 * be passed down to this function. 3509 */ 3510 rw_enter(&hca->bound_services_lock, RW_READER); 3511 srv_list = hca->bound_services; 3512 hca->bound_services = NULL; 3513 rw_exit(&hca->bound_services_lock); 3514 3515 while (srv_list != NULL) { 3516 rib_service_t *sc; 3517 3518 to_remove = srv_list; 3519 srv_list = to_remove->next; 3520 rw_enter(&rib_stat->service_list_lock, RW_READER); 3521 for (sc = rib_stat->service_list; 3522 sc && (sc->srv_id != to_remove->srv_id); 3523 sc = sc->next) 3524 ; 3525 /* 3526 * if sc is NULL then the service doesn't exist anymore, 3527 * probably just removed completely through rib_stat. 3528 */ 3529 if (sc != NULL) 3530 (void) ibt_unbind_service(sc->srv_hdl, 3531 to_remove->sbind_hdl); 3532 rw_exit(&rib_stat->service_list_lock); 3533 kmem_free(to_remove, sizeof (rib_hca_service_t)); 3534 } 3535 } 3536 3537 static struct svc_recv * 3538 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) 3539 { 3540 struct svc_recv *recvp; 3541 3542 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); 3543 recvp->vaddr = sgl->ds_va; 3544 recvp->qp = qp; 3545 recvp->bytes_xfer = 0; 3546 return (recvp); 3547 } 3548 3549 static int 3550 rib_free_svc_recv(struct svc_recv *recvp) 3551 { 3552 kmem_free(recvp, sizeof (*recvp)); 3553 3554 return (0); 3555 } 3556 3557 static struct reply * 3558 rib_addreplylist(rib_qp_t *qp, uint32_t msgid) 3559 { 3560 struct reply *rep; 3561 3562 3563 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); 3564 if (rep == NULL) { 3565 DTRACE_PROBE(rpcib__i__addrreply__nomem); 3566 return (NULL); 3567 } 3568 rep->xid = msgid; 3569 rep->vaddr_cq = NULL; 3570 rep->bytes_xfer = 0; 3571 rep->status = (uint_t)REPLY_WAIT; 3572 rep->prev = NULL; 3573 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); 3574 3575 mutex_enter(&qp->replylist_lock); 3576 if (qp->replylist) { 3577 rep->next = qp->replylist; 3578 qp->replylist->prev = rep; 3579 } 3580 qp->rep_list_size++; 3581 3582 DTRACE_PROBE1(rpcib__i__addrreply__listsize, 3583 int, qp->rep_list_size); 3584 3585 qp->replylist = rep; 3586 mutex_exit(&qp->replylist_lock); 3587 3588 return (rep); 3589 } 3590 3591 static rdma_stat 3592 rib_rem_replylist(rib_qp_t *qp) 3593 { 3594 struct reply *r, *n; 3595 3596 mutex_enter(&qp->replylist_lock); 3597 for (r = qp->replylist; r != NULL; r = n) { 3598 n = r->next; 3599 (void) rib_remreply(qp, r); 3600 } 3601 mutex_exit(&qp->replylist_lock); 3602 3603 return (RDMA_SUCCESS); 3604 } 3605 3606 static int 3607 rib_remreply(rib_qp_t *qp, struct reply *rep) 3608 { 3609 3610 ASSERT(MUTEX_HELD(&qp->replylist_lock)); 3611 if (rep->prev) { 3612 rep->prev->next = rep->next; 3613 } 3614 if (rep->next) { 3615 rep->next->prev = rep->prev; 3616 } 3617 if (qp->replylist == rep) 3618 qp->replylist = rep->next; 3619 3620 cv_destroy(&rep->wait_cv); 3621 qp->rep_list_size--; 3622 3623 DTRACE_PROBE1(rpcib__i__remreply__listsize, 3624 int, qp->rep_list_size); 3625 3626 kmem_free(rep, sizeof (*rep)); 3627 3628 return (0); 3629 } 3630 3631 rdma_stat 3632 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3633 struct mrc *buf_handle) 3634 { 
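	/*
	 * Register the caller's buffer for RDMA; on success buf_handle is
	 * filled in with the memory region handle and its lkey/rkey.
	 */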
3635 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3636 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3637 rdma_stat status; 3638 rib_hca_t *hca = (ctoqp(conn))->hca; 3639 3640 /* 3641 * Note: ALL buffer pools use the same memory type RDMARW. 3642 */ 3643 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3644 if (status == RDMA_SUCCESS) { 3645 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3646 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3647 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3648 } else { 3649 buf_handle->mrc_linfo = NULL; 3650 buf_handle->mrc_lmr = 0; 3651 buf_handle->mrc_rmr = 0; 3652 } 3653 return (status); 3654 } 3655 3656 static rdma_stat 3657 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, 3658 ibt_mr_flags_t spec, 3659 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3660 { 3661 ibt_mr_attr_t mem_attr; 3662 ibt_status_t ibt_status; 3663 mem_attr.mr_vaddr = (uintptr_t)buf; 3664 mem_attr.mr_len = (ib_msglen_t)size; 3665 mem_attr.mr_as = (struct as *)(caddr_t)adsp; 3666 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3667 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3668 IBT_MR_ENABLE_WINDOW_BIND | spec; 3669 3670 rw_enter(&hca->state_lock, RW_READER); 3671 if (hca->state != HCA_DETACHED) { 3672 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3673 &mem_attr, mr_hdlp, mr_descp); 3674 rw_exit(&hca->state_lock); 3675 } else { 3676 rw_exit(&hca->state_lock); 3677 return (RDMA_FAILED); 3678 } 3679 3680 if (ibt_status != IBT_SUCCESS) { 3681 return (RDMA_FAILED); 3682 } 3683 return (RDMA_SUCCESS); 3684 } 3685 3686 rdma_stat 3687 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3688 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 3689 { 3690 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3691 rib_lrc_entry_t *l; 3692 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3693 rdma_stat status; 3694 rib_hca_t *hca = (ctoqp(conn))->hca; 3695 3696 /* 3697 * Non-coherent memory registration. 3698 */ 3699 l = (rib_lrc_entry_t *)lrc; 3700 if (l) { 3701 if (l->registered) { 3702 buf_handle->mrc_linfo = 3703 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3704 buf_handle->mrc_lmr = 3705 (uint32_t)l->lrc_mhandle.mrc_lmr; 3706 buf_handle->mrc_rmr = 3707 (uint32_t)l->lrc_mhandle.mrc_rmr; 3708 *sync_handle = (RIB_SYNCMEM_HANDLE) 3709 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3710 return (RDMA_SUCCESS); 3711 } else { 3712 /* Always register the whole buffer */ 3713 buf = (caddr_t)l->lrc_buf; 3714 buflen = l->lrc_len; 3715 } 3716 } 3717 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3718 3719 if (status == RDMA_SUCCESS) { 3720 if (l) { 3721 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; 3722 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey; 3723 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey; 3724 l->registered = TRUE; 3725 } 3726 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3727 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3728 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3729 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3730 } else { 3731 buf_handle->mrc_linfo = NULL; 3732 buf_handle->mrc_lmr = 0; 3733 buf_handle->mrc_rmr = 0; 3734 } 3735 return (status); 3736 } 3737 3738 /* ARGSUSED */ 3739 rdma_stat 3740 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3741 { 3742 rib_hca_t *hca = (ctoqp(conn))->hca; 3743 /* 3744 * Allow memory deregistration even if HCA is 3745 * getting detached. 
Need all outstanding 3746 * memory registrations to be deregistered 3747 * before HCA_DETACH_EVENT can be accepted. 3748 */ 3749 (void) ibt_deregister_mr(hca->hca_hdl, 3750 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 3751 return (RDMA_SUCCESS); 3752 } 3753 3754 /* ARGSUSED */ 3755 rdma_stat 3756 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3757 RIB_SYNCMEM_HANDLE sync_handle, void *lrc) 3758 { 3759 rib_lrc_entry_t *l; 3760 l = (rib_lrc_entry_t *)lrc; 3761 if (l) 3762 if (l->registered) 3763 return (RDMA_SUCCESS); 3764 3765 (void) rib_deregistermem(conn, buf, buf_handle); 3766 3767 return (RDMA_SUCCESS); 3768 } 3769 3770 /* ARGSUSED */ 3771 rdma_stat 3772 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3773 int len, int cpu) 3774 { 3775 ibt_status_t status; 3776 rib_hca_t *hca = (ctoqp(conn))->hca; 3777 ibt_mr_sync_t mr_segment; 3778 3779 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3780 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 3781 mr_segment.ms_len = (ib_memlen_t)len; 3782 if (cpu) { 3783 /* make incoming data visible to memory */ 3784 mr_segment.ms_flags = IBT_SYNC_WRITE; 3785 } else { 3786 /* make memory changes visible to IO */ 3787 mr_segment.ms_flags = IBT_SYNC_READ; 3788 } 3789 rw_enter(&hca->state_lock, RW_READER); 3790 if (hca->state != HCA_DETACHED) { 3791 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3792 rw_exit(&hca->state_lock); 3793 } else { 3794 rw_exit(&hca->state_lock); 3795 return (RDMA_FAILED); 3796 } 3797 3798 if (status == IBT_SUCCESS) 3799 return (RDMA_SUCCESS); 3800 else { 3801 return (RDMA_FAILED); 3802 } 3803 } 3804 3805 /* 3806 * XXXX ???? 3807 */ 3808 static rdma_stat 3809 rib_getinfo(rdma_info_t *info) 3810 { 3811 /* 3812 * XXXX Hack! 3813 */ 3814 info->addrlen = 16; 3815 info->mts = 1000000; 3816 info->mtu = 1000000; 3817 3818 return (RDMA_SUCCESS); 3819 } 3820 3821 rib_bufpool_t * 3822 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3823 { 3824 rib_bufpool_t *rbp = NULL; 3825 bufpool_t *bp = NULL; 3826 caddr_t buf; 3827 ibt_mr_attr_t mem_attr; 3828 ibt_status_t ibt_status; 3829 int i, j; 3830 3831 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3832 3833 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3834 num * sizeof (void *), KM_SLEEP); 3835 3836 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3837 bp->numelems = num; 3838 3839 3840 switch (ptype) { 3841 case SEND_BUFFER: 3842 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3843 bp->rsize = RPC_MSG_SZ; 3844 break; 3845 case RECV_BUFFER: 3846 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3847 bp->rsize = RPC_BUF_SIZE; 3848 break; 3849 default: 3850 goto fail; 3851 } 3852 3853 /* 3854 * Register the pool. 
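	 * Each element of the pool is registered as its own memory
	 * region, so every buffer gets an lkey/rkey of its own.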
3855 */ 3856 bp->bufsize = num * bp->rsize; 3857 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3858 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3859 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3860 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3861 sizeof (ibt_mr_desc_t), KM_SLEEP); 3862 rw_enter(&hca->state_lock, RW_READER); 3863 3864 if (hca->state == HCA_DETACHED) { 3865 rw_exit(&hca->state_lock); 3866 goto fail; 3867 } 3868 3869 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3870 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3871 mem_attr.mr_vaddr = (uintptr_t)buf; 3872 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3873 mem_attr.mr_as = NULL; 3874 ibt_status = ibt_register_mr(hca->hca_hdl, 3875 hca->pd_hdl, &mem_attr, 3876 &rbp->mr_hdl[i], 3877 &rbp->mr_desc[i]); 3878 if (ibt_status != IBT_SUCCESS) { 3879 for (j = 0; j < i; j++) { 3880 (void) ibt_deregister_mr(hca->hca_hdl, 3881 rbp->mr_hdl[j]); 3882 } 3883 rw_exit(&hca->state_lock); 3884 goto fail; 3885 } 3886 } 3887 rw_exit(&hca->state_lock); 3888 buf = (caddr_t)bp->buf; 3889 for (i = 0; i < num; i++, buf += bp->rsize) { 3890 bp->buflist[i] = (void *)buf; 3891 } 3892 bp->buffree = num - 1; /* no. of free buffers */ 3893 rbp->bpool = bp; 3894 3895 return (rbp); 3896 fail: 3897 if (bp) { 3898 if (bp->buf) 3899 kmem_free(bp->buf, bp->bufsize); 3900 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3901 } 3902 if (rbp) { 3903 if (rbp->mr_hdl) 3904 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3905 if (rbp->mr_desc) 3906 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3907 kmem_free(rbp, sizeof (rib_bufpool_t)); 3908 } 3909 return (NULL); 3910 } 3911 3912 static void 3913 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3914 { 3915 int i; 3916 rib_bufpool_t *rbp = NULL; 3917 bufpool_t *bp; 3918 3919 /* 3920 * Obtain pool address based on type of pool 3921 */ 3922 switch (ptype) { 3923 case SEND_BUFFER: 3924 rbp = hca->send_pool; 3925 break; 3926 case RECV_BUFFER: 3927 rbp = hca->recv_pool; 3928 break; 3929 default: 3930 return; 3931 } 3932 if (rbp == NULL) 3933 return; 3934 3935 bp = rbp->bpool; 3936 3937 /* 3938 * Deregister the pool memory and free it. 3939 */ 3940 for (i = 0; i < bp->numelems; i++) { 3941 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3942 } 3943 } 3944 3945 static void 3946 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3947 { 3948 3949 rib_bufpool_t *rbp = NULL; 3950 bufpool_t *bp; 3951 3952 /* 3953 * Obtain pool address based on type of pool 3954 */ 3955 switch (ptype) { 3956 case SEND_BUFFER: 3957 rbp = hca->send_pool; 3958 break; 3959 case RECV_BUFFER: 3960 rbp = hca->recv_pool; 3961 break; 3962 default: 3963 return; 3964 } 3965 if (rbp == NULL) 3966 return; 3967 3968 bp = rbp->bpool; 3969 3970 /* 3971 * Free the pool memory. 3972 */ 3973 if (rbp->mr_hdl) 3974 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 3975 3976 if (rbp->mr_desc) 3977 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 3978 if (bp->buf) 3979 kmem_free(bp->buf, bp->bufsize); 3980 mutex_destroy(&bp->buflock); 3981 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 3982 kmem_free(rbp, sizeof (rib_bufpool_t)); 3983 } 3984 3985 void 3986 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 3987 { 3988 /* 3989 * Deregister the pool memory and free it. 3990 */ 3991 rib_rbufpool_deregister(hca, ptype); 3992 rib_rbufpool_free(hca, ptype); 3993 } 3994 3995 /* 3996 * Fetch a buffer from the pool of type specified in rdbuf->type. 
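 * RDMA_LONG_BUFFER requests are satisfied from the long-reply buffer
 * cache (rib_get_cache_buf()) rather than from the send/recv pools.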
3997 */ 3998 static rdma_stat 3999 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4000 { 4001 rib_lrc_entry_t *rlep; 4002 4003 if (rdbuf->type == RDMA_LONG_BUFFER) { 4004 rlep = rib_get_cache_buf(conn, rdbuf->len); 4005 rdbuf->rb_private = (caddr_t)rlep; 4006 rdbuf->addr = rlep->lrc_buf; 4007 rdbuf->handle = rlep->lrc_mhandle; 4008 return (RDMA_SUCCESS); 4009 } 4010 4011 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 4012 if (rdbuf->addr) { 4013 switch (rdbuf->type) { 4014 case SEND_BUFFER: 4015 rdbuf->len = RPC_MSG_SZ; /* 1K */ 4016 break; 4017 case RECV_BUFFER: 4018 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 4019 break; 4020 default: 4021 rdbuf->len = 0; 4022 } 4023 return (RDMA_SUCCESS); 4024 } else 4025 return (RDMA_FAILED); 4026 } 4027 4028 /* 4029 * Fetch a buffer of specified type. 4030 * Note that rdbuf->handle is mw's rkey. 4031 */ 4032 static void * 4033 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4034 { 4035 rib_qp_t *qp = ctoqp(conn); 4036 rib_hca_t *hca = qp->hca; 4037 rdma_btype ptype = rdbuf->type; 4038 void *buf; 4039 rib_bufpool_t *rbp = NULL; 4040 bufpool_t *bp; 4041 int i; 4042 4043 /* 4044 * Obtain pool address based on type of pool 4045 */ 4046 switch (ptype) { 4047 case SEND_BUFFER: 4048 rbp = hca->send_pool; 4049 break; 4050 case RECV_BUFFER: 4051 rbp = hca->recv_pool; 4052 break; 4053 default: 4054 return (NULL); 4055 } 4056 if (rbp == NULL) 4057 return (NULL); 4058 4059 bp = rbp->bpool; 4060 4061 mutex_enter(&bp->buflock); 4062 if (bp->buffree < 0) { 4063 mutex_exit(&bp->buflock); 4064 return (NULL); 4065 } 4066 4067 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */ 4068 buf = bp->buflist[bp->buffree]; 4069 rdbuf->addr = buf; 4070 rdbuf->len = bp->rsize; 4071 for (i = bp->numelems - 1; i >= 0; i--) { 4072 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 4073 rdbuf->handle.mrc_rmr = 4074 (uint32_t)rbp->mr_desc[i].md_rkey; 4075 rdbuf->handle.mrc_linfo = 4076 (uintptr_t)rbp->mr_hdl[i]; 4077 rdbuf->handle.mrc_lmr = 4078 (uint32_t)rbp->mr_desc[i].md_lkey; 4079 bp->buffree--; 4080 4081 mutex_exit(&bp->buflock); 4082 4083 return (buf); 4084 } 4085 } 4086 4087 mutex_exit(&bp->buflock); 4088 4089 return (NULL); 4090 } 4091 4092 static void 4093 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 4094 { 4095 4096 if (rdbuf->type == RDMA_LONG_BUFFER) { 4097 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private); 4098 rdbuf->rb_private = NULL; 4099 return; 4100 } 4101 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 4102 } 4103 4104 static void 4105 rib_rbuf_free(CONN *conn, int ptype, void *buf) 4106 { 4107 rib_qp_t *qp = ctoqp(conn); 4108 rib_hca_t *hca = qp->hca; 4109 rib_bufpool_t *rbp = NULL; 4110 bufpool_t *bp; 4111 4112 /* 4113 * Obtain pool address based on type of pool 4114 */ 4115 switch (ptype) { 4116 case SEND_BUFFER: 4117 rbp = hca->send_pool; 4118 break; 4119 case RECV_BUFFER: 4120 rbp = hca->recv_pool; 4121 break; 4122 default: 4123 return; 4124 } 4125 if (rbp == NULL) 4126 return; 4127 4128 bp = rbp->bpool; 4129 4130 mutex_enter(&bp->buflock); 4131 if (++bp->buffree >= bp->numelems) { 4132 /* 4133 * Should never happen 4134 */ 4135 bp->buffree--; 4136 } else { 4137 bp->buflist[bp->buffree] = buf; 4138 } 4139 mutex_exit(&bp->buflock); 4140 } 4141 4142 static rdma_stat 4143 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 4144 { 4145 rw_enter(&connlist->conn_lock, RW_WRITER); 4146 if (connlist->conn_hd) { 4147 cn->c_next = connlist->conn_hd; 4148 connlist->conn_hd->c_prev = cn; 4149 } 4150 connlist->conn_hd = cn; 4151 
rw_exit(&connlist->conn_lock); 4152 4153 return (RDMA_SUCCESS); 4154 } 4155 4156 static rdma_stat 4157 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) 4158 { 4159 rw_enter(&connlist->conn_lock, RW_WRITER); 4160 if (cn->c_prev) { 4161 cn->c_prev->c_next = cn->c_next; 4162 } 4163 if (cn->c_next) { 4164 cn->c_next->c_prev = cn->c_prev; 4165 } 4166 if (connlist->conn_hd == cn) 4167 connlist->conn_hd = cn->c_next; 4168 rw_exit(&connlist->conn_lock); 4169 4170 return (RDMA_SUCCESS); 4171 } 4172 4173 /* ARGSUSED */ 4174 static rdma_stat 4175 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, 4176 int addr_type, void *handle, CONN **conn) 4177 { 4178 rdma_stat status; 4179 rpcib_ping_t rpt; 4180 4181 status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn); 4182 return (status); 4183 } 4184 4185 /* 4186 * rib_find_hca_connection 4187 * 4188 * if there is an existing connection to the specified address then 4189 * it will be returned in conn, otherwise conn will be set to NULL. 4190 * Also cleans up any connection that is in error state. 4191 */ 4192 static int 4193 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, 4194 struct netbuf *d_svcaddr, CONN **conn) 4195 { 4196 CONN *cn; 4197 clock_t cv_stat, timout; 4198 4199 *conn = NULL; 4200 again: 4201 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4202 cn = hca->cl_conn_list.conn_hd; 4203 while (cn != NULL) { 4204 /* 4205 * First, clear up any connection in the ERROR state 4206 */ 4207 mutex_enter(&cn->c_lock); 4208 if (cn->c_state == C_ERROR_CONN) { 4209 if (cn->c_ref == 0) { 4210 /* 4211 * Remove connection from list and destroy it. 4212 */ 4213 cn->c_state = C_DISCONN_PEND; 4214 mutex_exit(&cn->c_lock); 4215 rw_exit(&hca->cl_conn_list.conn_lock); 4216 rib_conn_close((void *)cn); 4217 goto again; 4218 } 4219 mutex_exit(&cn->c_lock); 4220 cn = cn->c_next; 4221 continue; 4222 } 4223 if (cn->c_state == C_DISCONN_PEND) { 4224 mutex_exit(&cn->c_lock); 4225 cn = cn->c_next; 4226 continue; 4227 } 4228 4229 /* 4230 * source address is only checked for if there is one, 4231 * this is the case for retries. 4232 */ 4233 if ((cn->c_raddr.len == d_svcaddr->len) && 4234 (bcmp(d_svcaddr->buf, cn->c_raddr.buf, 4235 d_svcaddr->len) == 0) && 4236 ((s_svcaddr->len == 0) || 4237 ((cn->c_laddr.len == s_svcaddr->len) && 4238 (bcmp(s_svcaddr->buf, cn->c_laddr.buf, 4239 s_svcaddr->len) == 0)))) { 4240 /* 4241 * Our connection. Give up conn list lock 4242 * as we are done traversing the list. 4243 */ 4244 rw_exit(&hca->cl_conn_list.conn_lock); 4245 if (cn->c_state == C_CONNECTED) { 4246 cn->c_ref++; /* sharing a conn */ 4247 mutex_exit(&cn->c_lock); 4248 *conn = cn; 4249 return (RDMA_SUCCESS); 4250 } 4251 if (cn->c_state == C_CONN_PEND) { 4252 /* 4253 * Hold a reference to this conn before 4254 * we give up the lock. 
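				 * cv_timedwait_sig() drops c_lock while we
				 * sleep, so the reference keeps the conn
				 * from being torn down in the meantime.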
4255 */
4256 cn->c_ref++;
4257 timout = ddi_get_lbolt() +
4258 drv_usectohz(CONN_WAIT_TIME * 1000000);
4259 while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4260 &cn->c_lock, timout)) > 0 &&
4261 cn->c_state == C_CONN_PEND)
4262 ;
4263 if (cv_stat == 0) {
4264 cn->c_ref--;
4265 mutex_exit(&cn->c_lock);
4266 return (RDMA_INTR);
4267 }
4268 if (cv_stat < 0) {
4269 cn->c_ref--;
4270 mutex_exit(&cn->c_lock);
4271 return (RDMA_TIMEDOUT);
4272 }
4273 if (cn->c_state == C_CONNECTED) {
4274 *conn = cn;
4275 mutex_exit(&cn->c_lock);
4276 return (RDMA_SUCCESS);
4277 } else {
4278 cn->c_ref--;
4279 mutex_exit(&cn->c_lock);
4280 return (RDMA_TIMEDOUT);
4281 }
4282 }
4283 }
4284 mutex_exit(&cn->c_lock);
4285 cn = cn->c_next;
4286 }
4287 rw_exit(&hca->cl_conn_list.conn_lock);
4288 *conn = NULL;
4289 return (RDMA_FAILED);
4290 }
4291
4292 /*
4293 * Connection management.
4294 * IBTF does not support recycling of channels, so connections are only in
4295 * one of four states: C_CONN_PEND, C_CONNECTED, C_ERROR_CONN, or
4296 * C_DISCONN_PEND. There is no C_IDLE state.
4297 * C_CONN_PEND state: Connection establishment in progress to the server.
4298 * C_CONNECTED state: A connection, when created, is in the C_CONNECTED state.
4299 * It has an RC channel associated with it. ibt_post_send/recv are allowed
4300 * only in this state.
4301 * C_ERROR_CONN state: A connection transitions to this state when WRs on the
4302 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
4303 * happens on the channel or an IBT_HCA_DETACH_EVENT occurs on the HCA.
4304 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
4305 * c_ref drops to 0 (this indicates that RPC has no more references to this
4306 * connection), the connection should be destroyed. A connection transitions
4307 * into this state when it is being destroyed.
4308 */
4309 /* ARGSUSED */
4310 static rdma_stat
4311 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4312 int addr_type, rpcib_ping_t *rpt, CONN **conn)
4313 {
4314 CONN *cn;
4315 int status;
4316 rib_hca_t *hca;
4317 rib_qp_t *qp;
4318 int s_addr_len;
4319 char *s_addr_buf;
4320
4321 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
4322 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
4323 rw_enter(&hca->state_lock, RW_READER);
4324 if (hca->state != HCA_DETACHED) {
4325 status = rib_find_hca_connection(hca, s_svcaddr,
4326 d_svcaddr, conn);
4327 rw_exit(&hca->state_lock);
4328 if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
4329 rw_exit(&rib_stat->hcas_list_lock);
4330 return (status);
4331 }
4332 } else
4333 rw_exit(&hca->state_lock);
4334 }
4335 rw_exit(&rib_stat->hcas_list_lock);
4336
4337 /*
4338 * No existing connection found, establish a new connection.
4339 */
4340 bzero(rpt, sizeof (rpcib_ping_t));
4341
4342 status = rib_ping_srv(addr_type, d_svcaddr, rpt);
4343 if (status != RDMA_SUCCESS) {
4344 return (RDMA_FAILED);
4345 }
4346 hca = rpt->hca;
4347
4348 if (rpt->srcip.family == AF_INET) {
4349 s_addr_len = sizeof (rpt->srcip.un.ip4addr);
4350 s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
4351 } else if (rpt->srcip.family == AF_INET6) {
4352 s_addr_len = sizeof (rpt->srcip.un.ip6addr);
4353 s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
4354 } else {
4355 return (RDMA_FAILED);
4356 }
4357
4358 /*
4359 * Channel to server doesn't exist yet, create one.
4360 */
4361 if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
4362 return (RDMA_FAILED);
4363 }
4364 cn = qptoc(qp);
4365 cn->c_state = C_CONN_PEND;
4366 cn->c_ref = 1;
4367
4368 cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
4369 bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
4370 cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;
4371
4372 if (rpt->srcip.family == AF_INET) {
4373 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
4374 (void) strcpy(cn->c_netid, RIBNETID_TCP);
4375 } else {
4376 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
4377 (void) strcpy(cn->c_netid, RIBNETID_TCP6);
4378 }
4379
4380 /*
4381 * Add to conn list.
4382 * We had given up the READER lock. In the time since then,
4383 * another thread might have created the connection we are
4384 * trying here. But for now, that is quite all right - there
4385 * might be two connections between a pair of hosts instead
4386 * of one. If we really want to close that window,
4387 * then we need to check the list after acquiring the
4388 * WRITER lock.
4389 */
4390 (void) rib_add_connlist(cn, &hca->cl_conn_list);
4391 status = rib_conn_to_srv(hca, qp, rpt);
4392 mutex_enter(&cn->c_lock);
4393 if (status == RDMA_SUCCESS) {
4394 cn->c_state = C_CONNECTED;
4395 *conn = cn;
4396 } else {
4397 cn->c_state = C_ERROR_CONN;
4398 cn->c_ref--;
4399 }
4400 cv_broadcast(&cn->c_cv);
4401 mutex_exit(&cn->c_lock);
4402 return (status);
4403 }
4404
4405 static void
4406 rib_conn_close(void *rarg)
4407 {
4408 CONN *conn = (CONN *)rarg;
4409 rib_qp_t *qp = ctoqp(conn);
4410
4411 mutex_enter(&conn->c_lock);
4412 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4413
4414 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4415 /*
4416 * Live connection in CONNECTED state.
4417 */
4418 if (conn->c_state == C_CONNECTED) {
4419 conn->c_state = C_ERROR_CONN;
4420 }
4421 mutex_exit(&conn->c_lock);
4422
4423 rib_close_a_channel(conn);
4424
4425 mutex_enter(&conn->c_lock);
4426 conn->c_flags &= ~C_CLOSE_PENDING;
4427 cv_signal(&conn->c_cv);
4428 }
4429
4430 mutex_exit(&conn->c_lock);
4431
4432 if (qp->mode == RIB_SERVER)
4433 (void) rib_disconnect_channel(conn,
4434 &qp->hca->srv_conn_list);
4435 else
4436 (void) rib_disconnect_channel(conn,
4437 &qp->hca->cl_conn_list);
4438 }
4439
4440 static void
4441 rib_conn_timeout_call(void *carg)
4442 {
4443 time_t idle_time;
4444 CONN *conn = (CONN *)carg;
4445 rib_hca_t *hca = ctoqp(conn)->hca;
4446 int error;
4447
4448 mutex_enter(&conn->c_lock);
4449 if ((conn->c_ref > 0) ||
4450 (conn->c_state == C_DISCONN_PEND)) {
4451 conn->c_timeout = NULL;
4452 mutex_exit(&conn->c_lock);
4453 return;
4454 }
4455
4456 idle_time = (gethrestime_sec() - conn->c_last_used);
4457
4458 if ((idle_time <= rib_conn_timeout) &&
4459 (conn->c_state != C_ERROR_CONN)) {
4460 /*
4461 * There was activity after the last timeout.
4462 * Extend the conn life, unless the conn is
4463 * already in error state.
4464 */
4465 conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4466 SEC_TO_TICK(rib_conn_timeout - idle_time));
4467 mutex_exit(&conn->c_lock);
4468 return;
4469 }
4470
4471 error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
4472 (void *)conn, DDI_NOSLEEP);
4473
4474 /*
4475 * If taskq dispatch fails above, then reset the timeout
4476 * to try again after 10 secs.
4477 */ 4478 4479 if (error != DDI_SUCCESS) { 4480 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4481 SEC_TO_TICK(RDMA_CONN_REAP_RETRY)); 4482 mutex_exit(&conn->c_lock); 4483 return; 4484 } 4485 4486 conn->c_state = C_DISCONN_PEND; 4487 mutex_exit(&conn->c_lock); 4488 } 4489 4490 static rdma_stat 4491 rib_conn_release(CONN *conn) 4492 { 4493 4494 mutex_enter(&conn->c_lock); 4495 conn->c_ref--; 4496 4497 conn->c_last_used = gethrestime_sec(); 4498 if (conn->c_ref > 0) { 4499 mutex_exit(&conn->c_lock); 4500 return (RDMA_SUCCESS); 4501 } 4502 4503 /* 4504 * If a conn is C_ERROR_CONN, close the channel. 4505 */ 4506 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) { 4507 conn->c_state = C_DISCONN_PEND; 4508 mutex_exit(&conn->c_lock); 4509 rib_conn_close((void *)conn); 4510 return (RDMA_SUCCESS); 4511 } 4512 4513 /* 4514 * c_ref == 0, set a timeout for conn release 4515 */ 4516 4517 if (conn->c_timeout == NULL) { 4518 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4519 SEC_TO_TICK(rib_conn_timeout)); 4520 } 4521 4522 mutex_exit(&conn->c_lock); 4523 return (RDMA_SUCCESS); 4524 } 4525 4526 /* 4527 * Add at front of list 4528 */ 4529 static struct rdma_done_list * 4530 rdma_done_add(rib_qp_t *qp, uint32_t xid) 4531 { 4532 struct rdma_done_list *rd; 4533 4534 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4535 4536 rd = kmem_alloc(sizeof (*rd), KM_SLEEP); 4537 rd->xid = xid; 4538 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL); 4539 4540 rd->prev = NULL; 4541 rd->next = qp->rdlist; 4542 if (qp->rdlist != NULL) 4543 qp->rdlist->prev = rd; 4544 qp->rdlist = rd; 4545 4546 return (rd); 4547 } 4548 4549 static void 4550 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd) 4551 { 4552 struct rdma_done_list *r; 4553 4554 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4555 4556 r = rd->next; 4557 if (r != NULL) { 4558 r->prev = rd->prev; 4559 } 4560 4561 r = rd->prev; 4562 if (r != NULL) { 4563 r->next = rd->next; 4564 } else { 4565 qp->rdlist = rd->next; 4566 } 4567 4568 cv_destroy(&rd->rdma_done_cv); 4569 kmem_free(rd, sizeof (*rd)); 4570 } 4571 4572 static void 4573 rdma_done_rem_list(rib_qp_t *qp) 4574 { 4575 struct rdma_done_list *r, *n; 4576 4577 mutex_enter(&qp->rdlist_lock); 4578 for (r = qp->rdlist; r != NULL; r = n) { 4579 n = r->next; 4580 rdma_done_rm(qp, r); 4581 } 4582 mutex_exit(&qp->rdlist_lock); 4583 } 4584 4585 static void 4586 rdma_done_notify(rib_qp_t *qp, uint32_t xid) 4587 { 4588 struct rdma_done_list *r = qp->rdlist; 4589 4590 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4591 4592 while (r) { 4593 if (r->xid == xid) { 4594 cv_signal(&r->rdma_done_cv); 4595 return; 4596 } else { 4597 r = r->next; 4598 } 4599 } 4600 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid, 4601 int, xid); 4602 } 4603 4604 /* 4605 * Expects conn->c_lock to be held by the caller. 4606 */ 4607 4608 static void 4609 rib_close_a_channel(CONN *conn) 4610 { 4611 rib_qp_t *qp; 4612 qp = ctoqp(conn); 4613 4614 if (qp->qp_hdl == NULL) { 4615 /* channel already freed */ 4616 return; 4617 } 4618 4619 /* 4620 * Call ibt_close_rc_channel in blocking mode 4621 * with no callbacks. 4622 */ 4623 (void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS, 4624 NULL, 0, NULL, NULL, 0); 4625 } 4626 4627 /* 4628 * Goes through all connections and closes the channel 4629 * This will cause all the WRs on those channels to be 4630 * flushed. 
4631 */ 4632 static void 4633 rib_close_channels(rib_conn_list_t *connlist) 4634 { 4635 CONN *conn, *tmp; 4636 4637 rw_enter(&connlist->conn_lock, RW_READER); 4638 conn = connlist->conn_hd; 4639 while (conn != NULL) { 4640 mutex_enter(&conn->c_lock); 4641 tmp = conn->c_next; 4642 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { 4643 4644 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); 4645 4646 /* 4647 * Live connection in CONNECTED state. 4648 */ 4649 if (conn->c_state == C_CONNECTED) 4650 conn->c_state = C_ERROR_CONN; 4651 mutex_exit(&conn->c_lock); 4652 4653 rib_close_a_channel(conn); 4654 4655 mutex_enter(&conn->c_lock); 4656 conn->c_flags &= ~C_CLOSE_PENDING; 4657 /* Signal a pending rib_disconnect_channel() */ 4658 cv_signal(&conn->c_cv); 4659 } 4660 mutex_exit(&conn->c_lock); 4661 conn = tmp; 4662 } 4663 rw_exit(&connlist->conn_lock); 4664 } 4665 4666 /* 4667 * Frees up all connections that are no longer being referenced 4668 */ 4669 static void 4670 rib_purge_connlist(rib_conn_list_t *connlist) 4671 { 4672 CONN *conn; 4673 4674 top: 4675 rw_enter(&connlist->conn_lock, RW_READER); 4676 conn = connlist->conn_hd; 4677 while (conn != NULL) { 4678 mutex_enter(&conn->c_lock); 4679 4680 /* 4681 * At this point connection is either in ERROR 4682 * or DISCONN_PEND state. If in DISCONN_PEND state 4683 * then some other thread is culling that connection. 4684 * If not and if c_ref is 0, then destroy the connection. 4685 */ 4686 if (conn->c_ref == 0 && 4687 conn->c_state != C_DISCONN_PEND) { 4688 /* 4689 * Cull the connection 4690 */ 4691 conn->c_state = C_DISCONN_PEND; 4692 mutex_exit(&conn->c_lock); 4693 rw_exit(&connlist->conn_lock); 4694 (void) rib_disconnect_channel(conn, connlist); 4695 goto top; 4696 } else { 4697 /* 4698 * conn disconnect already scheduled or will 4699 * happen from conn_release when c_ref drops to 0. 4700 */ 4701 mutex_exit(&conn->c_lock); 4702 } 4703 conn = conn->c_next; 4704 } 4705 rw_exit(&connlist->conn_lock); 4706 4707 /* 4708 * At this point, only connections with c_ref != 0 are on the list 4709 */ 4710 } 4711 4712 /* 4713 * Free all the HCA resources and close 4714 * the hca. 
4715 */
4716
4717 static void
4718 rib_free_hca(rib_hca_t *hca)
4719 {
4720 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4721 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4722 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4723 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4724
4725 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4726 kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4727 kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4728 kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4729
4730 rib_rbufpool_destroy(hca, RECV_BUFFER);
4731 rib_rbufpool_destroy(hca, SEND_BUFFER);
4732 rib_destroy_cache(hca);
4733 if (rib_mod.rdma_count == 0)
4734 rdma_unregister_mod(&rib_mod);
4735 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4736 (void) ibt_close_hca(hca->hca_hdl);
4737 hca->hca_hdl = NULL;
4738 }
4739
4740
4741 static void
4742 rib_stop_hca_services(rib_hca_t *hca)
4743 {
4744 rib_stop_services(hca);
4745 rib_close_channels(&hca->cl_conn_list);
4746 rib_close_channels(&hca->srv_conn_list);
4747
4748 rib_purge_connlist(&hca->cl_conn_list);
4749 rib_purge_connlist(&hca->srv_conn_list);
4750
4751 if ((rib_stat->hcas_list == NULL) && stats_enabled) {
4752 kstat_delete_byname_zone("unix", 0, "rpcib_cache",
4753 GLOBAL_ZONEID);
4754 stats_enabled = FALSE;
4755 }
4756
4757 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4758 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4759 if (hca->srv_conn_list.conn_hd == NULL &&
4760 hca->cl_conn_list.conn_hd == NULL) {
4761 /*
4762 * conn_lists are NULL, so destroy
4763 * buffers, close hca and be done.
4764 */
4765 rib_free_hca(hca);
4766 }
4767 rw_exit(&hca->cl_conn_list.conn_lock);
4768 rw_exit(&hca->srv_conn_list.conn_lock);
4769
4770 if (hca->hca_hdl != NULL) {
4771 mutex_enter(&hca->inuse_lock);
4772 while (hca->inuse)
4773 cv_wait(&hca->cb_cv, &hca->inuse_lock);
4774 mutex_exit(&hca->inuse_lock);
4775
4776 rib_free_hca(hca);
4777 }
4778 rw_destroy(&hca->bound_services_lock);
4779
4780 if (hca->cleanup_helper != NULL) {
4781 ddi_taskq_destroy(hca->cleanup_helper);
4782 hca->cleanup_helper = NULL;
4783 }
4784 }
4785
4786 /*
4787 * Cleans and closes up all uses of the HCA
4788 */
4789 static void
4790 rib_detach_hca(rib_hca_t *hca)
4791 {
4792 rib_hca_t **hcap;
4793
4794 /*
4795 * Stop all services on the HCA
4796 * Go through cl_conn_list and close all rc_channels
4797 * Go through srv_conn_list and close all rc_channels
4798 * Free connections whose c_ref has dropped to 0
4799 * Destroy all CQs
4800 * Deregister and release all buffer pool memory after all
4801 * connections are destroyed
4802 * Free the protection domain
4803 * ibt_close_hca()
4804 */
4805 rw_enter(&hca->state_lock, RW_WRITER);
4806 if (hca->state == HCA_DETACHED) {
4807 rw_exit(&hca->state_lock);
4808 return;
4809 }
4810
4811 hca->state = HCA_DETACHED;
4812 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
4813 for (hcap = &rib_stat->hcas_list; *hcap && (*hcap != hca);
4814 hcap = &(*hcap)->next)
4815 ;
4816 ASSERT(*hcap == hca);
4817 *hcap = hca->next;
4818 rib_stat->nhca_inited--;
4819 rib_mod.rdma_count--;
4820 rw_exit(&rib_stat->hcas_list_lock);
4821 rw_exit(&hca->state_lock);
4822
4823 rib_stop_hca_services(hca);
4824
4825 kmem_free(hca, sizeof (*hca));
4826 }
4827
4828 static void
4829 rib_server_side_cache_reclaim(void *argp)
4830 {
4831 cache_avl_struct_t *rcas;
4832 rib_lrc_entry_t *rb;
4833 rib_hca_t *hca = (rib_hca_t *)argp;
4834
4835 rw_enter(&hca->avl_rw_lock, RW_WRITER);
4836 rcas = avl_first(&hca->avl_tree);
4837 if (rcas != NULL)
4838 avl_remove(&hca->avl_tree, rcas);
4839 4840 while (rcas != NULL) { 4841 while (rcas->r.forw != &rcas->r) { 4842 rcas->elements--; 4843 rb = rcas->r.forw; 4844 remque(rb); 4845 if (rb->registered) 4846 (void) rib_deregistermem_via_hca(hca, 4847 rb->lrc_buf, rb->lrc_mhandle); 4848 4849 hca->cache_allocation -= rb->lrc_len; 4850 kmem_free(rb->lrc_buf, rb->lrc_len); 4851 kmem_free(rb, sizeof (rib_lrc_entry_t)); 4852 } 4853 mutex_destroy(&rcas->node_lock); 4854 kmem_cache_free(hca->server_side_cache, rcas); 4855 rcas = avl_first(&hca->avl_tree); 4856 if (rcas != NULL) 4857 avl_remove(&hca->avl_tree, rcas); 4858 } 4859 rw_exit(&hca->avl_rw_lock); 4860 } 4861 4862 static void 4863 rib_server_side_cache_cleanup(void *argp) 4864 { 4865 cache_avl_struct_t *rcas; 4866 rib_lrc_entry_t *rb; 4867 rib_hca_t *hca = (rib_hca_t *)argp; 4868 4869 mutex_enter(&hca->cache_allocation_lock); 4870 if (hca->cache_allocation < cache_limit) { 4871 mutex_exit(&hca->cache_allocation_lock); 4872 return; 4873 } 4874 mutex_exit(&hca->cache_allocation_lock); 4875 4876 rw_enter(&hca->avl_rw_lock, RW_WRITER); 4877 rcas = avl_last(&hca->avl_tree); 4878 if (rcas != NULL) 4879 avl_remove(&hca->avl_tree, rcas); 4880 4881 while (rcas != NULL) { 4882 while (rcas->r.forw != &rcas->r) { 4883 rcas->elements--; 4884 rb = rcas->r.forw; 4885 remque(rb); 4886 if (rb->registered) 4887 (void) rib_deregistermem_via_hca(hca, 4888 rb->lrc_buf, rb->lrc_mhandle); 4889 4890 hca->cache_allocation -= rb->lrc_len; 4891 4892 kmem_free(rb->lrc_buf, rb->lrc_len); 4893 kmem_free(rb, sizeof (rib_lrc_entry_t)); 4894 } 4895 mutex_destroy(&rcas->node_lock); 4896 if (hca->server_side_cache) { 4897 kmem_cache_free(hca->server_side_cache, rcas); 4898 } 4899 4900 if (hca->cache_allocation < cache_limit) { 4901 rw_exit(&hca->avl_rw_lock); 4902 return; 4903 } 4904 4905 rcas = avl_last(&hca->avl_tree); 4906 if (rcas != NULL) 4907 avl_remove(&hca->avl_tree, rcas); 4908 } 4909 rw_exit(&hca->avl_rw_lock); 4910 } 4911 4912 static int 4913 avl_compare(const void *t1, const void *t2) 4914 { 4915 if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len) 4916 return (0); 4917 4918 if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len) 4919 return (-1); 4920 4921 return (1); 4922 } 4923 4924 static void 4925 rib_destroy_cache(rib_hca_t *hca) 4926 { 4927 if (hca->avl_init) { 4928 rib_server_side_cache_reclaim((void *)hca); 4929 if (hca->server_side_cache) { 4930 kmem_cache_destroy(hca->server_side_cache); 4931 hca->server_side_cache = NULL; 4932 } 4933 avl_destroy(&hca->avl_tree); 4934 mutex_destroy(&hca->cache_allocation_lock); 4935 rw_destroy(&hca->avl_rw_lock); 4936 } 4937 hca->avl_init = FALSE; 4938 } 4939 4940 static void 4941 rib_force_cleanup(void *hca) 4942 { 4943 if (((rib_hca_t *)hca)->cleanup_helper != NULL) 4944 (void) ddi_taskq_dispatch( 4945 ((rib_hca_t *)hca)->cleanup_helper, 4946 rib_server_side_cache_cleanup, 4947 (void *)hca, DDI_NOSLEEP); 4948 } 4949 4950 static rib_lrc_entry_t * 4951 rib_get_cache_buf(CONN *conn, uint32_t len) 4952 { 4953 cache_avl_struct_t cas, *rcas; 4954 rib_hca_t *hca = (ctoqp(conn))->hca; 4955 rib_lrc_entry_t *reply_buf; 4956 avl_index_t where = NULL; 4957 uint64_t c_alloc = 0; 4958 4959 if (!hca->avl_init) 4960 goto error_alloc; 4961 4962 cas.len = len; 4963 4964 rw_enter(&hca->avl_rw_lock, RW_READER); 4965 4966 mutex_enter(&hca->cache_allocation_lock); 4967 c_alloc = hca->cache_allocation; 4968 mutex_exit(&hca->cache_allocation_lock); 4969 4970 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas, 4971 &where)) == NULL) { 
4972 /* Am I above the cache limit? */
4973 if ((c_alloc + len) >= cache_limit) {
4974 rib_force_cleanup((void *)hca);
4975 rw_exit(&hca->avl_rw_lock);
4976 mutex_enter(&hca->cache_allocation_lock);
4977 hca->cache_misses_above_the_limit++;
4978 mutex_exit(&hca->cache_allocation_lock);
4979
4980 /* Allocate and register the buffer directly */
4981 goto error_alloc;
4982 }
4983
4984 rw_exit(&hca->avl_rw_lock);
4985 rw_enter(&hca->avl_rw_lock, RW_WRITER);
4986
4987 /* Recheck to make sure no other thread added the entry in */
4988 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
4989 &cas, &where)) == NULL) {
4990 /* Allocate an avl tree entry */
4991 rcas = (cache_avl_struct_t *)
4992 kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
4993
4994 bzero(rcas, sizeof (cache_avl_struct_t));
4995 rcas->elements = 0;
4996 rcas->r.forw = &rcas->r;
4997 rcas->r.back = &rcas->r;
4998 rcas->len = len;
4999 mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
5000 avl_insert(&hca->avl_tree, rcas, where);
5001 }
5002 }
5003
5004 mutex_enter(&rcas->node_lock);
5005
5006 if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
5007 reply_buf = rcas->r.forw;
5008 remque(reply_buf);
5009 rcas->elements--;
5010 mutex_exit(&rcas->node_lock);
5011 rw_exit(&hca->avl_rw_lock);
5012
5013 mutex_enter(&hca->cache_allocation_lock);
5014 hca->cache_hits++;
5015 hca->cache_allocation -= len;
5016 mutex_exit(&hca->cache_allocation_lock);
5017 } else {
5018 /* Am I above the cache limit? */
5019 mutex_exit(&rcas->node_lock);
5020 if ((c_alloc + len) >= cache_limit) {
5021 rib_force_cleanup((void *)hca);
5022 rw_exit(&hca->avl_rw_lock);
5023
5024 mutex_enter(&hca->cache_allocation_lock);
5025 hca->cache_misses_above_the_limit++;
5026 mutex_exit(&hca->cache_allocation_lock);
5027 /* Allocate and register the buffer directly */
5028 goto error_alloc;
5029 }
5030 rw_exit(&hca->avl_rw_lock);
5031 mutex_enter(&hca->cache_allocation_lock);
5032 hca->cache_misses++;
5033 mutex_exit(&hca->cache_allocation_lock);
5034 /* Allocate a reply_buf entry */
5035 reply_buf = (rib_lrc_entry_t *)
5036 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5037 bzero(reply_buf, sizeof (rib_lrc_entry_t));
5038 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5039 reply_buf->lrc_len = len;
5040 reply_buf->registered = FALSE;
5041 reply_buf->avl_node = (void *)rcas;
5042 }
5043
5044 return (reply_buf);
5045
5046 error_alloc:
5047 reply_buf = (rib_lrc_entry_t *)
5048 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5049 bzero(reply_buf, sizeof (rib_lrc_entry_t));
5050 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5051 reply_buf->lrc_len = len;
5052 reply_buf->registered = FALSE;
5053 reply_buf->avl_node = NULL;
5054
5055 return (reply_buf);
5056 }
5057
5058 /*
5059 * Return a pre-registered buffer back to the cache (without
5060 * unregistering the buffer).
5061 */ 5062 5063 static void 5064 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf) 5065 { 5066 cache_avl_struct_t cas, *rcas; 5067 avl_index_t where = NULL; 5068 rib_hca_t *hca = (ctoqp(conn))->hca; 5069 5070 if (!hca->avl_init) 5071 goto error_free; 5072 5073 cas.len = reg_buf->lrc_len; 5074 rw_enter(&hca->avl_rw_lock, RW_READER); 5075 if ((rcas = (cache_avl_struct_t *) 5076 avl_find(&hca->avl_tree, &cas, &where)) == NULL) { 5077 rw_exit(&hca->avl_rw_lock); 5078 goto error_free; 5079 } else { 5080 cas.len = reg_buf->lrc_len; 5081 mutex_enter(&rcas->node_lock); 5082 insque(reg_buf, &rcas->r); 5083 rcas->elements ++; 5084 mutex_exit(&rcas->node_lock); 5085 rw_exit(&hca->avl_rw_lock); 5086 mutex_enter(&hca->cache_allocation_lock); 5087 hca->cache_allocation += cas.len; 5088 mutex_exit(&hca->cache_allocation_lock); 5089 } 5090 5091 return; 5092 5093 error_free: 5094 5095 if (reg_buf->registered) 5096 (void) rib_deregistermem_via_hca(hca, 5097 reg_buf->lrc_buf, reg_buf->lrc_mhandle); 5098 kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len); 5099 kmem_free(reg_buf, sizeof (rib_lrc_entry_t)); 5100 } 5101 5102 static rdma_stat 5103 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf, 5104 uint_t buflen, struct mrc *buf_handle) 5105 { 5106 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 5107 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 5108 rdma_stat status; 5109 5110 5111 /* 5112 * Note: ALL buffer pools use the same memory type RDMARW. 5113 */ 5114 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 5115 if (status == RDMA_SUCCESS) { 5116 buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl; 5117 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 5118 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 5119 } else { 5120 buf_handle->mrc_linfo = NULL; 5121 buf_handle->mrc_lmr = 0; 5122 buf_handle->mrc_rmr = 0; 5123 } 5124 return (status); 5125 } 5126 5127 /* ARGSUSED */ 5128 static rdma_stat 5129 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf, 5130 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle) 5131 { 5132 5133 (void) rib_deregistermem_via_hca(hca, buf, buf_handle); 5134 return (RDMA_SUCCESS); 5135 } 5136 5137 /* ARGSUSED */ 5138 static rdma_stat 5139 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle) 5140 { 5141 5142 (void) ibt_deregister_mr(hca->hca_hdl, 5143 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 5144 return (RDMA_SUCCESS); 5145 } 5146 5147 /* 5148 * Check if the IP interface named by `lifrp' is RDMA-capable. 5149 */ 5150 static boolean_t 5151 rpcib_rdma_capable_interface(struct lifreq *lifrp) 5152 { 5153 char ifname[LIFNAMSIZ]; 5154 char *cp; 5155 5156 if (lifrp->lifr_type == IFT_IB) 5157 return (B_TRUE); 5158 5159 /* 5160 * Strip off the logical interface portion before getting 5161 * intimate with the name. 
5162 */ 5163 (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); 5164 if ((cp = strchr(ifname, ':')) != NULL) 5165 *cp = '\0'; 5166 5167 return (strcmp("lo0", ifname) == 0); 5168 } 5169 5170 static int 5171 rpcib_do_ip_ioctl(int cmd, int len, void *arg) 5172 { 5173 vnode_t *kvp, *vp; 5174 TIUSER *tiptr; 5175 struct strioctl iocb; 5176 k_sigset_t smask; 5177 int err = 0; 5178 5179 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) { 5180 if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE, 5181 &tiptr, CRED()) == 0) { 5182 vp = tiptr->fp->f_vnode; 5183 } else { 5184 VN_RELE(kvp); 5185 return (EPROTO); 5186 } 5187 } else { 5188 return (EPROTO); 5189 } 5190 5191 iocb.ic_cmd = cmd; 5192 iocb.ic_timout = 0; 5193 iocb.ic_len = len; 5194 iocb.ic_dp = (caddr_t)arg; 5195 sigintr(&smask, 0); 5196 err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb); 5197 sigunintr(&smask); 5198 (void) t_kclose(tiptr, 0); 5199 VN_RELE(kvp); 5200 return (err); 5201 } 5202 5203 /* 5204 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'. 5205 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes. 5206 */ 5207 static int 5208 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep) 5209 { 5210 int err; 5211 struct lifnum lifn; 5212 5213 bzero(&lifn, sizeof (struct lifnum)); 5214 lifn.lifn_family = AF_UNSPEC; 5215 5216 err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn); 5217 if (err != 0) 5218 return (err); 5219 5220 /* 5221 * Pad the interface count to account for additional interfaces that 5222 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 5223 */ 5224 lifn.lifn_count += 4; 5225 5226 bzero(lifcp, sizeof (struct lifconf)); 5227 lifcp->lifc_family = AF_UNSPEC; 5228 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 5229 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 5230 5231 err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp); 5232 if (err != 0) { 5233 kmem_free(lifcp->lifc_buf, *bufsizep); 5234 return (err); 5235 } 5236 return (0); 5237 } 5238 5239 static boolean_t 5240 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6) 5241 { 5242 uint_t i, nifs; 5243 uint_t bufsize; 5244 struct lifconf lifc; 5245 struct lifreq *lifrp; 5246 struct sockaddr_in *sinp; 5247 struct sockaddr_in6 *sin6p; 5248 5249 bzero(addrs4, sizeof (rpcib_ipaddrs_t)); 5250 bzero(addrs6, sizeof (rpcib_ipaddrs_t)); 5251 5252 if (rpcib_do_lifconf(&lifc, &bufsize) != 0) 5253 return (B_FALSE); 5254 5255 if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) { 5256 kmem_free(lifc.lifc_buf, bufsize); 5257 return (B_FALSE); 5258 } 5259 5260 /* 5261 * Worst case is that all of the addresses are IB-capable and have 5262 * the same address family, so size our buffers accordingly. 
5263 */ 5264 addrs4->ri_size = nifs * sizeof (struct sockaddr_in); 5265 addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP); 5266 addrs6->ri_size = nifs * sizeof (struct sockaddr_in6); 5267 addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP); 5268 5269 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { 5270 if (!rpcib_rdma_capable_interface(lifrp)) 5271 continue; 5272 5273 if (lifrp->lifr_addr.ss_family == AF_INET) { 5274 sinp = addrs4->ri_list; 5275 bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++], 5276 sizeof (struct sockaddr_in)); 5277 } else if (lifrp->lifr_addr.ss_family == AF_INET6) { 5278 sin6p = addrs6->ri_list; 5279 bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++], 5280 sizeof (struct sockaddr_in6)); 5281 } 5282 } 5283 5284 kmem_free(lifc.lifc_buf, bufsize); 5285 return (B_TRUE); 5286 } 5287 5288 /* ARGSUSED */ 5289 static int 5290 rpcib_cache_kstat_update(kstat_t *ksp, int rw) 5291 { 5292 rib_hca_t *hca; 5293 5294 if (KSTAT_WRITE == rw) { 5295 return (EACCES); 5296 } 5297 5298 rpcib_kstat.cache_limit.value.ui64 = 5299 (uint64_t)cache_limit; 5300 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 5301 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 5302 rpcib_kstat.cache_allocation.value.ui64 += 5303 (uint64_t)hca->cache_allocation; 5304 rpcib_kstat.cache_hits.value.ui64 += 5305 (uint64_t)hca->cache_hits; 5306 rpcib_kstat.cache_misses.value.ui64 += 5307 (uint64_t)hca->cache_misses; 5308 rpcib_kstat.cache_misses_above_the_limit.value.ui64 += 5309 (uint64_t)hca->cache_misses_above_the_limit; 5310 } 5311 rw_exit(&rib_stat->hcas_list_lock); 5312 return (0); 5313 } 5314
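
/*
 * Editorial appendix: the sketches that follow are not part of the driver.
 * They are small, self-contained user-space models, using POSIX threads and
 * libc in place of the kernel DDI/IBTF primitives, of mechanisms the code
 * above implements.  Every identifier introduced here (sketch_*, sk_*,
 * done_*, reap_*, lcache_*, cache_stats_*) is invented for illustration.
 *
 * Sketch 1: a fixed-size buffer pool managed as a stack of free entries,
 * modeled on the buffree index discipline of rib_rbuf_alloc() and
 * rib_rbuf_free().  The real pool also carries the registered memory-region
 * handles and rkeys for each buffer; that part is omitted here.
 */
#include <stdlib.h>
#include <pthread.h>

typedef struct sketch_pool {
	pthread_mutex_t	lock;
	int		numelems;	/* total buffers owned by the pool */
	int		buffree;	/* index of last free entry, -1 if empty */
	size_t		rsize;		/* fixed size of each buffer */
	void		**buflist;	/* stack of free buffer addresses */
} sketch_pool_t;

static sketch_pool_t *
sketch_pool_create(int numelems, size_t rsize)
{
	sketch_pool_t *bp = calloc(1, sizeof (*bp));
	int i;

	(void) pthread_mutex_init(&bp->lock, NULL);
	bp->buflist = calloc(numelems, sizeof (void *));
	bp->numelems = numelems;
	bp->rsize = rsize;
	for (i = 0; i < numelems; i++)
		bp->buflist[i] = malloc(rsize);
	bp->buffree = numelems - 1;	/* everything starts out free */
	return (bp);
}

static void *
sketch_rbuf_alloc(sketch_pool_t *bp)
{
	void *buf = NULL;

	(void) pthread_mutex_lock(&bp->lock);
	if (bp->buffree >= 0)			/* pop the top free entry */
		buf = bp->buflist[bp->buffree--];
	(void) pthread_mutex_unlock(&bp->lock);
	return (buf);				/* NULL: pool exhausted */
}

static void
sketch_rbuf_free(sketch_pool_t *bp, void *buf)
{
	(void) pthread_mutex_lock(&bp->lock);
	if (bp->buffree + 1 < bp->numelems)	/* should always hold */
		bp->buflist[++bp->buffree] = buf;
	(void) pthread_mutex_unlock(&bp->lock);
}

int
main(void)
{
	sketch_pool_t *bp = sketch_pool_create(4, 1024);
	void *b = sketch_rbuf_alloc(bp);

	sketch_rbuf_free(bp, b);
	return (0);
}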
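
/*
 * Sketch 2 (user-space model, invented names): sharing an in-progress
 * connection, as rib_find_hca_connection() does for C_CONN_PEND entries.
 * A second caller takes a reference and waits, with a timeout, for the
 * connecting thread to publish the outcome, instead of racing to set up a
 * second channel to the same peer.
 */
#include <errno.h>
#include <pthread.h>
#include <time.h>

enum sk_state { SK_CONN_PEND, SK_CONNECTED, SK_ERROR_CONN };

typedef struct sk_conn {
	pthread_mutex_t	c_lock;
	pthread_cond_t	c_cv;
	enum sk_state	c_state;
	int		c_ref;
} sk_conn_t;

/* Returns 0 and keeps a reference on success; an errno value on failure. */
int
sk_conn_share(sk_conn_t *cn, int wait_secs)
{
	struct timespec deadline;
	int rc = 0;

	(void) clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += wait_secs;

	(void) pthread_mutex_lock(&cn->c_lock);
	cn->c_ref++;				/* hold it while we wait */
	while (cn->c_state == SK_CONN_PEND && rc == 0)
		rc = pthread_cond_timedwait(&cn->c_cv, &cn->c_lock, &deadline);

	if (cn->c_state == SK_CONNECTED) {	/* connect finished in time */
		(void) pthread_mutex_unlock(&cn->c_lock);
		return (0);
	}
	cn->c_ref--;				/* timed out or went to error */
	(void) pthread_mutex_unlock(&cn->c_lock);
	return (rc != 0 ? rc : ECONNREFUSED);
}

/* The connecting thread publishes the outcome and wakes all waiters. */
void
sk_conn_done(sk_conn_t *cn, int ok)
{
	(void) pthread_mutex_lock(&cn->c_lock);
	cn->c_state = ok ? SK_CONNECTED : SK_ERROR_CONN;
	(void) pthread_cond_broadcast(&cn->c_cv);
	(void) pthread_mutex_unlock(&cn->c_lock);
}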
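
/*
 * Sketch 3 (user-space model, invented names): matching a completion
 * notification to its waiter by XID, as rdma_done_add()/rdma_done_notify()
 * do with qp->rdlist.  Waiters queue an entry at the head of a doubly
 * linked list; the notifier signals only the matching entry.
 */
#include <stdint.h>
#include <stdlib.h>
#include <pthread.h>

typedef struct done_entry {
	struct done_entry *prev, *next;
	uint32_t	xid;
	pthread_cond_t	done_cv;
	int		done;
} done_entry_t;

typedef struct done_list {
	pthread_mutex_t	lock;
	done_entry_t	*head;
} done_list_t;

/* Caller holds list->lock, as rdma_done_add() expects rdlist_lock held. */
done_entry_t *
done_add(done_list_t *list, uint32_t xid)
{
	done_entry_t *rd = calloc(1, sizeof (*rd));

	rd->xid = xid;
	(void) pthread_cond_init(&rd->done_cv, NULL);
	rd->next = list->head;			/* insert at the front */
	if (list->head != NULL)
		list->head->prev = rd;
	list->head = rd;
	return (rd);
}

/* Caller holds list->lock.  Wakes only the entry whose xid matches. */
void
done_notify(done_list_t *list, uint32_t xid)
{
	done_entry_t *r;

	for (r = list->head; r != NULL; r = r->next) {
		if (r->xid == xid) {
			r->done = 1;
			(void) pthread_cond_signal(&r->done_cv);
			return;
		}
	}
	/* No waiter for this xid; the driver only fires a DTrace probe. */
}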
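
/*
 * Sketch 4 (pure logic, invented names): the idle-connection reaping policy
 * applied by rib_conn_release() and rib_conn_timeout_call().  A connection
 * with no references is closed only after it has been idle longer than the
 * configured timeout; an error-state connection is closed right away; a
 * recently used one just gets its timer re-armed.
 */
#include <time.h>

enum reap_action { REAP_KEEP, REAP_EXTEND_TIMER, REAP_CLOSE_NOW };

enum reap_action
reap_decide(int c_ref, int in_error, time_t last_used, time_t now,
    time_t conn_timeout)
{
	time_t idle = now - last_used;

	if (c_ref > 0)
		return (REAP_KEEP);		/* still in use, leave it */
	if (in_error)
		return (REAP_CLOSE_NOW);	/* broken channel, tear down */
	if (idle <= conn_timeout)
		return (REAP_EXTEND_TIMER);	/* recent activity, re-arm */
	return (REAP_CLOSE_NOW);		/* idle past the limit */
}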
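
/*
 * Sketch 5 (user-space model, invented names): a reply-buffer cache keyed by
 * exact length with a byte limit, simplified from rib_get_cache_buf() and
 * rib_free_cache_buf().  The driver keys its buckets with an AVL tree,
 * registers the memory with the HCA, and enforces the limit on the get path
 * by kicking off an asynchronous cleanup; here a single linked list stands in
 * for the tree and the limit is simply enforced when a buffer is returned.
 */
#include <stdlib.h>
#include <pthread.h>

typedef struct cbuf {
	struct cbuf	*next;
	size_t		len;
	char		*data;
} cbuf_t;

typedef struct lcache {
	pthread_mutex_t	lock;
	cbuf_t		*free_bufs;	/* parked buffers, any length */
	size_t		bytes_cached;	/* total bytes parked in the cache */
	size_t		limit;		/* stop parking above this */
} lcache_t;

cbuf_t *
lcache_get(lcache_t *lc, size_t len)
{
	cbuf_t **pp, *cb;

	(void) pthread_mutex_lock(&lc->lock);
	for (pp = &lc->free_bufs; (cb = *pp) != NULL; pp = &cb->next) {
		if (cb->len == len) {		/* hit: unlink and reuse */
			*pp = cb->next;
			lc->bytes_cached -= len;
			(void) pthread_mutex_unlock(&lc->lock);
			return (cb);
		}
	}
	(void) pthread_mutex_unlock(&lc->lock);

	cb = calloc(1, sizeof (*cb));		/* miss: fresh allocation */
	cb->len = len;
	cb->data = malloc(len);
	return (cb);
}

void
lcache_put(lcache_t *lc, cbuf_t *cb)
{
	(void) pthread_mutex_lock(&lc->lock);
	if (lc->bytes_cached + cb->len < lc->limit) {
		cb->next = lc->free_bufs;	/* park it for reuse */
		lc->free_bufs = cb;
		lc->bytes_cached += cb->len;
		(void) pthread_mutex_unlock(&lc->lock);
		return;
	}
	(void) pthread_mutex_unlock(&lc->lock);
	free(cb->data);				/* over the limit: release */
	free(cb);
}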
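
/*
 * Sketch 6 (invented names): aggregating per-HCA cache counters into a single
 * snapshot, loosely modeled on rpcib_cache_kstat_update(), which walks
 * rib_stat->hcas_list under a reader lock and sums each HCA's counters into
 * the named kstat values.
 */
#include <stdint.h>
#include <stddef.h>

struct hca_cache_stats {
	uint64_t cache_allocation;
	uint64_t cache_hits;
	uint64_t cache_misses;
	uint64_t cache_misses_above_the_limit;
};

void
cache_stats_aggregate(const struct hca_cache_stats *hcas, size_t nhcas,
    struct hca_cache_stats *out)
{
	size_t i;

	out->cache_allocation = 0;
	out->cache_hits = 0;
	out->cache_misses = 0;
	out->cache_misses_above_the_limit = 0;
	for (i = 0; i < nhcas; i++) {		/* sum across all HCAs */
		out->cache_allocation += hcas[i].cache_allocation;
		out->cache_hits += hcas[i].cache_hits;
		out->cache_misses += hcas[i].cache_misses;
		out->cache_misses_above_the_limit +=
		    hcas[i].cache_misses_above_the_limit;
	}
}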