1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 2007, The Ohio State University. All rights reserved. 28 * 29 * Portions of this source code is developed by the team members of 30 * The Ohio State University's Network-Based Computing Laboratory (NBCL), 31 * headed by Professor Dhabaleswar K. (DK) Panda. 32 * 33 * Acknowledgements to contributions from developors: 34 * Ranjit Noronha: noronha@cse.ohio-state.edu 35 * Lei Chai : chail@cse.ohio-state.edu 36 * Weikuan Yu : yuw@cse.ohio-state.edu 37 * 38 */ 39 40 /* 41 * The rpcib plugin. Implements the interface for RDMATF's 42 * interaction with IBTF. 43 */ 44 45 #include <sys/param.h> 46 #include <sys/types.h> 47 #include <sys/user.h> 48 #include <sys/systm.h> 49 #include <sys/sysmacros.h> 50 #include <sys/proc.h> 51 #include <sys/socket.h> 52 #include <sys/file.h> 53 #include <sys/stream.h> 54 #include <sys/strsubr.h> 55 #include <sys/stropts.h> 56 #include <sys/errno.h> 57 #include <sys/kmem.h> 58 #include <sys/debug.h> 59 #include <sys/pathname.h> 60 #include <sys/kstat.h> 61 #include <sys/t_lock.h> 62 #include <sys/ddi.h> 63 #include <sys/cmn_err.h> 64 #include <sys/time.h> 65 #include <sys/isa_defs.h> 66 #include <sys/callb.h> 67 #include <sys/sunddi.h> 68 #include <sys/sunndi.h> 69 #include <sys/sdt.h> 70 #include <sys/ib/ibtl/ibti.h> 71 #include <rpc/rpc.h> 72 #include <rpc/ib.h> 73 #include <sys/modctl.h> 74 #include <sys/kstr.h> 75 #include <sys/sockio.h> 76 #include <sys/vnode.h> 77 #include <sys/tiuser.h> 78 #include <net/if.h> 79 #include <net/if_types.h> 80 #include <sys/cred.h> 81 #include <rpc/rpc_rdma.h> 82 #include <nfs/nfs.h> 83 #include <sys/atomic.h> 84 85 #define NFS_RDMA_PORT 20049 86 87 88 /* 89 * Convenience structures for connection management 90 */ 91 typedef struct rpcib_ipaddrs { 92 void *ri_list; /* pointer to list of addresses */ 93 uint_t ri_count; /* number of addresses in list */ 94 uint_t ri_size; /* size of ri_list in bytes */ 95 } rpcib_ipaddrs_t; 96 97 98 typedef struct rpcib_ping { 99 rib_hca_t *hca; 100 ibt_path_info_t path; 101 ibt_ip_addr_t srcip; 102 ibt_ip_addr_t dstip; 103 } rpcib_ping_t; 104 105 /* 106 * Prototype declarations for driver ops 107 */ 108 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t); 109 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t, 110 void *, void **); 111 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t); 112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *); 113 static int rpcib_do_ip_ioctl(int, int, void *); 114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *); 115 static int 
rpcib_cache_kstat_update(kstat_t *, int); 116 static void rib_force_cleanup(void *); 117 118 struct { 119 kstat_named_t cache_limit; 120 kstat_named_t cache_allocation; 121 kstat_named_t cache_hits; 122 kstat_named_t cache_misses; 123 kstat_named_t cache_misses_above_the_limit; 124 } rpcib_kstat = { 125 {"cache_limit", KSTAT_DATA_UINT64 }, 126 {"cache_allocation", KSTAT_DATA_UINT64 }, 127 {"cache_hits", KSTAT_DATA_UINT64 }, 128 {"cache_misses", KSTAT_DATA_UINT64 }, 129 {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 }, 130 }; 131 132 /* rpcib cb_ops */ 133 static struct cb_ops rpcib_cbops = { 134 nulldev, /* open */ 135 nulldev, /* close */ 136 nodev, /* strategy */ 137 nodev, /* print */ 138 nodev, /* dump */ 139 nodev, /* read */ 140 nodev, /* write */ 141 nodev, /* ioctl */ 142 nodev, /* devmap */ 143 nodev, /* mmap */ 144 nodev, /* segmap */ 145 nochpoll, /* poll */ 146 ddi_prop_op, /* prop_op */ 147 NULL, /* stream */ 148 D_MP, /* cb_flag */ 149 CB_REV, /* rev */ 150 nodev, /* int (*cb_aread)() */ 151 nodev /* int (*cb_awrite)() */ 152 }; 153 154 /* 155 * Device options 156 */ 157 static struct dev_ops rpcib_ops = { 158 DEVO_REV, /* devo_rev, */ 159 0, /* refcnt */ 160 rpcib_getinfo, /* info */ 161 nulldev, /* identify */ 162 nulldev, /* probe */ 163 rpcib_attach, /* attach */ 164 rpcib_detach, /* detach */ 165 nodev, /* reset */ 166 &rpcib_cbops, /* driver ops - devctl interfaces */ 167 NULL, /* bus operations */ 168 NULL, /* power */ 169 ddi_quiesce_not_needed, /* quiesce */ 170 }; 171 172 /* 173 * Module linkage information. 174 */ 175 176 static struct modldrv rib_modldrv = { 177 &mod_driverops, /* Driver module */ 178 "RPCIB plugin driver", /* Driver name and version */ 179 &rpcib_ops, /* Driver ops */ 180 }; 181 182 static struct modlinkage rib_modlinkage = { 183 MODREV_1, 184 (void *)&rib_modldrv, 185 NULL 186 }; 187 188 typedef struct rib_lrc_entry { 189 struct rib_lrc_entry *forw; 190 struct rib_lrc_entry *back; 191 char *lrc_buf; 192 193 uint32_t lrc_len; 194 void *avl_node; 195 bool_t registered; 196 197 struct mrc lrc_mhandle; 198 bool_t lrc_on_freed_list; 199 } rib_lrc_entry_t; 200 201 typedef struct cache_struct { 202 rib_lrc_entry_t r; 203 uint32_t len; 204 uint32_t elements; 205 kmutex_t node_lock; 206 avl_node_t avl_link; 207 } cache_avl_struct_t; 208 209 static uint64_t rib_total_buffers = 0; 210 uint64_t cache_limit = 100 * 1024 * 1024; 211 static volatile uint64_t cache_allocation = 0; 212 static uint64_t cache_watermark = 80 * 1024 * 1024; 213 static uint64_t cache_hits = 0; 214 static uint64_t cache_misses = 0; 215 static uint64_t cache_cold_misses = 0; 216 static uint64_t cache_hot_misses = 0; 217 static uint64_t cache_misses_above_the_limit = 0; 218 static bool_t stats_enabled = FALSE; 219 220 static uint64_t max_unsignaled_rws = 5; 221 int nfs_rdma_port = NFS_RDMA_PORT; 222 223 /* 224 * rib_stat: private data pointer used when registering 225 * with the IBTF. It is returned to the consumer 226 * in all callbacks. 227 */ 228 static rpcib_state_t *rib_stat = NULL; 229 230 #define RNR_RETRIES IBT_RNR_RETRY_1 231 #define MAX_PORTS 2 232 233 int preposted_rbufs = RDMA_BUFS_GRANT; 234 int send_threshold = 1; 235 236 /* 237 * State of the plugin. 238 * ACCEPT = accepting new connections and requests. 239 * NO_ACCEPT = not accepting new connection and requests. 240 * This should eventually move to rpcib_state_t structure, since this 241 * will tell in which state the plugin is for a particular type of service 242 * like NFS, NLM or v4 Callback deamon. 
The plugin might be in accept 243 * state for one and in no_accept state for the other. 244 */ 245 int plugin_state; 246 kmutex_t plugin_state_lock; 247 248 ldi_ident_t rpcib_li; 249 250 /* 251 * RPCIB RDMATF operations 252 */ 253 #if defined(MEASURE_POOL_DEPTH) 254 static void rib_posted_rbufs(uint32_t x) { return; } 255 #endif 256 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle); 257 static rdma_stat rib_disconnect(CONN *conn); 258 static void rib_listen(struct rdma_svc_data *rd); 259 static void rib_listen_stop(struct rdma_svc_data *rd); 260 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, 261 uint_t buflen, struct mrc *buf_handle); 262 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf, 263 struct mrc buf_handle); 264 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, 265 caddr_t buf, uint_t buflen, struct mrc *buf_handle); 266 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, 267 struct mrc buf_handle); 268 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, 269 uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, 270 void *lrc); 271 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf, 272 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *); 273 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, 274 caddr_t buf, int len, int cpu); 275 276 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf); 277 278 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf); 279 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *); 280 281 static void rib_rbuf_free(CONN *conn, int ptype, void *buf); 282 283 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid); 284 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid); 285 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid); 286 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid); 287 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl); 288 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid); 289 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait); 290 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait); 291 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *); 292 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **); 293 static rdma_stat rib_conn_release(CONN *conn); 294 static rdma_stat rib_getinfo(rdma_info_t *info); 295 296 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len); 297 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf); 298 static void rib_destroy_cache(rib_hca_t *hca); 299 static void rib_server_side_cache_reclaim(void *argp); 300 static int avl_compare(const void *t1, const void *t2); 301 302 static void rib_stop_services(rib_hca_t *); 303 static void rib_close_channels(rib_conn_list_t *); 304 305 /* 306 * RPCIB addressing operations 307 */ 308 309 /* 310 * RDMA operations the RPCIB module exports 311 */ 312 static rdmaops_t rib_ops = { 313 rib_reachable, 314 rib_conn_get, 315 rib_conn_release, 316 rib_listen, 317 rib_listen_stop, 318 rib_registermem, 319 rib_deregistermem, 320 rib_registermemsync, 321 rib_deregistermemsync, 322 rib_syncmem, 323 rib_reg_buf_alloc, 324 rib_reg_buf_free, 325 rib_send, 326 rib_send_resp, 327 rib_post_resp, 328 rib_post_resp_remove, 329 rib_post_recv, 330 rib_recv, 331 rib_read, 332 rib_write, 
333 rib_getinfo, 334 }; 335 336 /* 337 * RDMATF RPCIB plugin details 338 */ 339 static rdma_mod_t rib_mod = { 340 "ibtf", /* api name */ 341 RDMATF_VERS_1, 342 0, 343 &rib_ops, /* rdma op vector for ibtf */ 344 }; 345 346 static rdma_stat open_hcas(rpcib_state_t *); 347 static rdma_stat rib_qp_init(rib_qp_t *, int); 348 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *); 349 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *); 350 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *); 351 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *); 352 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num); 353 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t, 354 ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *); 355 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t, 356 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t); 357 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *); 358 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *, 359 rib_qp_t **); 360 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t, 361 rib_qp_t **); 362 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *); 363 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *); 364 static int rib_free_sendwait(struct send_wid *); 365 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid); 366 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd); 367 static void rdma_done_rem_list(rib_qp_t *); 368 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid); 369 370 static void rib_async_handler(void *, 371 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *); 372 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *); 373 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *); 374 static int rib_free_svc_recv(struct svc_recv *); 375 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t); 376 static void rib_free_wid(struct recv_wid *); 377 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *); 378 static void rib_detach_hca(rib_hca_t *); 379 380 /* 381 * Registration with IBTF as a consumer 382 */ 383 static struct ibt_clnt_modinfo_s rib_modinfo = { 384 IBTI_V_CURR, 385 IBT_GENERIC, 386 rib_async_handler, /* async event handler */ 387 NULL, /* Memory Region Handler */ 388 "nfs/ib" 389 }; 390 391 /* 392 * Global strucuture 393 */ 394 395 typedef struct rpcib_s { 396 dev_info_t *rpcib_dip; 397 kmutex_t rpcib_mutex; 398 } rpcib_t; 399 400 rpcib_t rpcib; 401 402 /* 403 * /etc/system controlled variable to control 404 * debugging in rpcib kernel module. 405 * Set it to values greater that 1 to control 406 * the amount of debugging messages required. 407 */ 408 int rib_debug = 0; 409 410 int 411 _init(void) 412 { 413 int error; 414 415 error = mod_install((struct modlinkage *)&rib_modlinkage); 416 if (error != 0) { 417 /* 418 * Could not load module 419 */ 420 return (error); 421 } 422 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL); 423 return (0); 424 } 425 426 int 427 _fini() 428 { 429 int status; 430 431 /* 432 * Remove module 433 */ 434 if ((status = mod_remove(&rib_modlinkage)) != 0) { 435 return (status); 436 } 437 mutex_destroy(&plugin_state_lock); 438 return (0); 439 } 440 441 int 442 _info(struct modinfo *modinfop) 443 { 444 return (mod_info(&rib_modlinkage, modinfop)); 445 } 446 447 /* 448 * rpcib_getinfo() 449 * Given the device number, return the devinfo pointer or the 450 * instance number. 
451 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach. 452 */ 453 454 /*ARGSUSED*/ 455 static int 456 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 457 { 458 int ret = DDI_SUCCESS; 459 460 switch (cmd) { 461 case DDI_INFO_DEVT2DEVINFO: 462 if (rpcib.rpcib_dip != NULL) 463 *result = rpcib.rpcib_dip; 464 else { 465 *result = NULL; 466 ret = DDI_FAILURE; 467 } 468 break; 469 470 case DDI_INFO_DEVT2INSTANCE: 471 *result = NULL; 472 break; 473 474 default: 475 ret = DDI_FAILURE; 476 } 477 return (ret); 478 } 479 480 static int 481 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 482 { 483 ibt_status_t ibt_status; 484 rdma_stat r_status; 485 486 switch (cmd) { 487 case DDI_ATTACH: 488 break; 489 case DDI_RESUME: 490 return (DDI_SUCCESS); 491 default: 492 return (DDI_FAILURE); 493 } 494 495 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL); 496 497 mutex_enter(&rpcib.rpcib_mutex); 498 if (rpcib.rpcib_dip != NULL) { 499 mutex_exit(&rpcib.rpcib_mutex); 500 return (DDI_FAILURE); 501 } 502 rpcib.rpcib_dip = dip; 503 mutex_exit(&rpcib.rpcib_mutex); 504 /* 505 * Create the "rpcib" minor-node. 506 */ 507 if (ddi_create_minor_node(dip, 508 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { 509 /* Error message, no cmn_err as they print on console */ 510 return (DDI_FAILURE); 511 } 512 513 if (rib_stat == NULL) { 514 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); 515 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); 516 } 517 518 rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids); 519 if (rib_stat->hca_count < 1) { 520 mutex_destroy(&rib_stat->open_hca_lock); 521 kmem_free(rib_stat, sizeof (*rib_stat)); 522 rib_stat = NULL; 523 return (DDI_FAILURE); 524 } 525 526 ibt_status = ibt_attach(&rib_modinfo, dip, 527 (void *)rib_stat, &rib_stat->ibt_clnt_hdl); 528 529 if (ibt_status != IBT_SUCCESS) { 530 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); 531 mutex_destroy(&rib_stat->open_hca_lock); 532 kmem_free(rib_stat, sizeof (*rib_stat)); 533 rib_stat = NULL; 534 return (DDI_FAILURE); 535 } 536 537 mutex_enter(&rib_stat->open_hca_lock); 538 if (open_hcas(rib_stat) != RDMA_SUCCESS) { 539 mutex_exit(&rib_stat->open_hca_lock); 540 goto open_fail; 541 } 542 mutex_exit(&rib_stat->open_hca_lock); 543 544 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 545 DDI_PROP_SUCCESS) { 546 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update " 547 "failed."); 548 goto register_fail; 549 } 550 551 /* 552 * Register with rdmatf 553 */ 554 rib_mod.rdma_count = rib_stat->nhca_inited; 555 r_status = rdma_register_mod(&rib_mod); 556 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { 557 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, " 558 "status = %d", r_status); 559 goto register_fail; 560 } 561 562 return (DDI_SUCCESS); 563 564 register_fail: 565 rib_detach_hca(rib_stat->hca); 566 open_fail: 567 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); 568 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 569 mutex_destroy(&rib_stat->open_hca_lock); 570 kmem_free(rib_stat, sizeof (*rib_stat)); 571 rib_stat = NULL; 572 return (DDI_FAILURE); 573 } 574 575 /*ARGSUSED*/ 576 static int 577 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 578 { 579 switch (cmd) { 580 581 case DDI_DETACH: 582 break; 583 584 case DDI_SUSPEND: 585 default: 586 return (DDI_FAILURE); 587 } 588 589 /* 590 * Detach the hca and free resources 591 */ 592 mutex_enter(&plugin_state_lock); 593 plugin_state = 
NO_ACCEPT; 594 mutex_exit(&plugin_state_lock); 595 rib_detach_hca(rib_stat->hca); 596 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); 597 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 598 mutex_destroy(&rib_stat->open_hca_lock); 599 if (rib_stat->hcas) { 600 kmem_free(rib_stat->hcas, rib_stat->hca_count * 601 sizeof (rib_hca_t)); 602 rib_stat->hcas = NULL; 603 } 604 kmem_free(rib_stat, sizeof (*rib_stat)); 605 rib_stat = NULL; 606 607 mutex_enter(&rpcib.rpcib_mutex); 608 rpcib.rpcib_dip = NULL; 609 mutex_exit(&rpcib.rpcib_mutex); 610 mutex_destroy(&rpcib.rpcib_mutex); 611 return (DDI_SUCCESS); 612 } 613 614 615 static void rib_rbufpool_free(rib_hca_t *, int); 616 static void rib_rbufpool_deregister(rib_hca_t *, int); 617 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype); 618 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); 619 static rdma_stat rib_rem_replylist(rib_qp_t *); 620 static int rib_remreply(rib_qp_t *, struct reply *); 621 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); 622 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); 623 624 625 /* 626 * One CQ pair per HCA 627 */ 628 static rdma_stat 629 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, 630 rib_cq_t **cqp, rpcib_state_t *ribstat) 631 { 632 rib_cq_t *cq; 633 ibt_cq_attr_t cq_attr; 634 uint32_t real_size; 635 ibt_status_t status; 636 rdma_stat error = RDMA_SUCCESS; 637 638 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); 639 cq->rib_hca = hca; 640 cq_attr.cq_size = cq_size; 641 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 642 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, 643 &real_size); 644 if (status != IBT_SUCCESS) { 645 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed," 646 " status=%d", status); 647 error = RDMA_FAILED; 648 goto fail; 649 } 650 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat); 651 652 /* 653 * Enable CQ callbacks. CQ Callbacks are single shot 654 * (e.g. you have to call ibt_enable_cq_notify() 655 * after each callback to get another one). 
656 */ 657 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION); 658 if (status != IBT_SUCCESS) { 659 cmn_err(CE_WARN, "rib_create_cq: " 660 "enable_cq_notify failed, status %d", status); 661 error = RDMA_FAILED; 662 goto fail; 663 } 664 *cqp = cq; 665 666 return (error); 667 fail: 668 if (cq->rib_cq_hdl) 669 (void) ibt_free_cq(cq->rib_cq_hdl); 670 if (cq) 671 kmem_free(cq, sizeof (rib_cq_t)); 672 return (error); 673 } 674 675 static rdma_stat 676 open_hcas(rpcib_state_t *ribstat) 677 { 678 rib_hca_t *hca; 679 ibt_status_t ibt_status; 680 rdma_stat status; 681 ibt_hca_portinfo_t *pinfop; 682 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; 683 uint_t size, cq_size; 684 int i; 685 kstat_t *ksp; 686 cache_avl_struct_t example_avl_node; 687 char rssc_name[32]; 688 689 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); 690 691 if (ribstat->hcas == NULL) 692 ribstat->hcas = kmem_zalloc(ribstat->hca_count * 693 sizeof (rib_hca_t), KM_SLEEP); 694 695 /* 696 * Open a hca and setup for RDMA 697 */ 698 for (i = 0; i < ribstat->hca_count; i++) { 699 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, 700 ribstat->hca_guids[i], 701 &ribstat->hcas[i].hca_hdl); 702 if (ibt_status != IBT_SUCCESS) { 703 continue; 704 } 705 ribstat->hcas[i].hca_guid = ribstat->hca_guids[i]; 706 hca = &(ribstat->hcas[i]); 707 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; 708 hca->state = HCA_INITED; 709 710 /* 711 * query HCA info 712 */ 713 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs); 714 if (ibt_status != IBT_SUCCESS) { 715 goto fail1; 716 } 717 718 /* 719 * One PD (Protection Domain) per HCA. 720 * A qp is allowed to access a memory region 721 * only when it's in the same PD as that of 722 * the memory region. 723 */ 724 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl); 725 if (ibt_status != IBT_SUCCESS) { 726 goto fail1; 727 } 728 729 /* 730 * query HCA ports 731 */ 732 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 733 0, &pinfop, &hca->hca_nports, &size); 734 if (ibt_status != IBT_SUCCESS) { 735 goto fail2; 736 } 737 hca->hca_ports = pinfop; 738 hca->hca_pinfosz = size; 739 pinfop = NULL; 740 741 cq_size = DEF_CQ_SIZE; /* default cq size */ 742 /* 743 * Create 2 pairs of cq's (1 pair for client 744 * and the other pair for server) on this hca. 745 * If number of qp's gets too large, then several 746 * cq's will be needed. 747 */ 748 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, 749 &hca->svc_rcq, ribstat); 750 if (status != RDMA_SUCCESS) { 751 goto fail3; 752 } 753 754 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, 755 &hca->svc_scq, ribstat); 756 if (status != RDMA_SUCCESS) { 757 goto fail3; 758 } 759 760 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, 761 &hca->clnt_rcq, ribstat); 762 if (status != RDMA_SUCCESS) { 763 goto fail3; 764 } 765 766 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, 767 &hca->clnt_scq, ribstat); 768 if (status != RDMA_SUCCESS) { 769 goto fail3; 770 } 771 772 /* 773 * Create buffer pools. 774 * Note rib_rbuf_create also allocates memory windows. 
775 */ 776 hca->recv_pool = rib_rbufpool_create(hca, 777 RECV_BUFFER, MAX_BUFS); 778 if (hca->recv_pool == NULL) { 779 goto fail3; 780 } 781 782 hca->send_pool = rib_rbufpool_create(hca, 783 SEND_BUFFER, MAX_BUFS); 784 if (hca->send_pool == NULL) { 785 rib_rbufpool_destroy(hca, RECV_BUFFER); 786 goto fail3; 787 } 788 789 if (hca->server_side_cache == NULL) { 790 (void) sprintf(rssc_name, 791 "rib_server_side_cache_%04d", i); 792 hca->server_side_cache = kmem_cache_create( 793 rssc_name, 794 sizeof (cache_avl_struct_t), 0, 795 NULL, 796 NULL, 797 rib_server_side_cache_reclaim, 798 hca, NULL, 0); 799 } 800 801 avl_create(&hca->avl_tree, 802 avl_compare, 803 sizeof (cache_avl_struct_t), 804 (uint_t)(uintptr_t)&example_avl_node.avl_link- 805 (uint_t)(uintptr_t)&example_avl_node); 806 807 rw_init(&hca->avl_rw_lock, 808 NULL, RW_DRIVER, hca->iblock); 809 mutex_init(&hca->cache_allocation, 810 NULL, MUTEX_DRIVER, NULL); 811 hca->avl_init = TRUE; 812 813 /* Create kstats for the cache */ 814 ASSERT(INGLOBALZONE(curproc)); 815 816 if (!stats_enabled) { 817 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc", 818 KSTAT_TYPE_NAMED, 819 sizeof (rpcib_kstat) / sizeof (kstat_named_t), 820 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, 821 GLOBAL_ZONEID); 822 if (ksp) { 823 ksp->ks_data = (void *) &rpcib_kstat; 824 ksp->ks_update = rpcib_cache_kstat_update; 825 kstat_install(ksp); 826 stats_enabled = TRUE; 827 } 828 } 829 if (NULL == hca->reg_cache_clean_up) { 830 hca->reg_cache_clean_up = ddi_taskq_create(NULL, 831 "REG_CACHE_CLEANUP", 1, TASKQ_DEFAULTPRI, 0); 832 } 833 834 /* 835 * Initialize the registered service list and 836 * the lock 837 */ 838 hca->service_list = NULL; 839 rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock); 840 841 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 842 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); 843 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, 844 hca->iblock); 845 rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, 846 hca->iblock); 847 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); 848 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); 849 hca->inuse = TRUE; 850 /* 851 * XXX One hca only. Add multi-hca functionality if needed 852 * later. 853 */ 854 ribstat->hca = hca; 855 ribstat->nhca_inited++; 856 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 857 break; 858 859 fail3: 860 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 861 fail2: 862 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 863 fail1: 864 (void) ibt_close_hca(hca->hca_hdl); 865 866 } 867 if (ribstat->hca != NULL) 868 return (RDMA_SUCCESS); 869 else 870 return (RDMA_FAILED); 871 } 872 873 /* 874 * Callback routines 875 */ 876 877 /* 878 * SCQ handlers 879 */ 880 /* ARGSUSED */ 881 static void 882 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 883 { 884 ibt_status_t ibt_status; 885 ibt_wc_t wc; 886 int i; 887 888 /* 889 * Re-enable cq notify here to avoid missing any 890 * completion queue notification. 891 */ 892 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 893 894 ibt_status = IBT_SUCCESS; 895 while (ibt_status != IBT_CQ_EMPTY) { 896 bzero(&wc, sizeof (wc)); 897 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 898 if (ibt_status != IBT_SUCCESS) 899 return; 900 901 /* 902 * Got a send completion 903 */ 904 if (wc.wc_id != NULL) { /* XXX can it be otherwise ???? 
*/ 905 struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id; 906 CONN *conn = qptoc(wd->qp); 907 908 mutex_enter(&wd->sendwait_lock); 909 switch (wc.wc_status) { 910 case IBT_WC_SUCCESS: 911 wd->status = RDMA_SUCCESS; 912 break; 913 case IBT_WC_WR_FLUSHED_ERR: 914 wd->status = RDMA_FAILED; 915 break; 916 default: 917 /* 918 * RC Send Q Error Code Local state Remote State 919 * ==================== =========== ============ 920 * IBT_WC_BAD_RESPONSE_ERR ERROR None 921 * IBT_WC_LOCAL_LEN_ERR ERROR None 922 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None 923 * IBT_WC_LOCAL_PROTECT_ERR ERROR None 924 * IBT_WC_MEM_WIN_BIND_ERR ERROR None 925 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR 926 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR 927 * IBT_WC_REMOTE_OP_ERR ERROR ERROR 928 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None 929 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None 930 * IBT_WC_WR_FLUSHED_ERR None None 931 */ 932 /* 933 * Channel in error state. Set connection to 934 * ERROR and cleanup will happen either from 935 * conn_release or from rib_conn_get 936 */ 937 wd->status = RDMA_FAILED; 938 mutex_enter(&conn->c_lock); 939 if (conn->c_state != C_DISCONN_PEND) 940 conn->c_state = C_ERROR_CONN; 941 mutex_exit(&conn->c_lock); 942 break; 943 } 944 945 if (wd->cv_sig == 1) { 946 /* 947 * Notify poster 948 */ 949 cv_signal(&wd->wait_cv); 950 mutex_exit(&wd->sendwait_lock); 951 } else { 952 /* 953 * Poster not waiting for notification. 954 * Free the send buffers and send_wid 955 */ 956 for (i = 0; i < wd->nsbufs; i++) { 957 rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER, 958 (void *)(uintptr_t)wd->sbufaddr[i]); 959 } 960 mutex_exit(&wd->sendwait_lock); 961 (void) rib_free_sendwait(wd); 962 } 963 } 964 } 965 } 966 967 /* ARGSUSED */ 968 static void 969 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 970 { 971 ibt_status_t ibt_status; 972 ibt_wc_t wc; 973 int i; 974 975 /* 976 * Re-enable cq notify here to avoid missing any 977 * completion queue notification. 978 */ 979 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 980 981 ibt_status = IBT_SUCCESS; 982 while (ibt_status != IBT_CQ_EMPTY) { 983 bzero(&wc, sizeof (wc)); 984 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 985 if (ibt_status != IBT_SUCCESS) 986 return; 987 988 /* 989 * Got a send completion 990 */ 991 if (wc.wc_id != NULL) { /* XXX NULL possible ???? */ 992 struct send_wid *wd = 993 (struct send_wid *)(uintptr_t)wc.wc_id; 994 mutex_enter(&wd->sendwait_lock); 995 if (wd->cv_sig == 1) { 996 /* 997 * Update completion status and notify poster 998 */ 999 if (wc.wc_status == IBT_WC_SUCCESS) 1000 wd->status = RDMA_SUCCESS; 1001 else 1002 wd->status = RDMA_FAILED; 1003 cv_signal(&wd->wait_cv); 1004 mutex_exit(&wd->sendwait_lock); 1005 } else { 1006 /* 1007 * Poster not waiting for notification. 1008 * Free the send buffers and send_wid 1009 */ 1010 for (i = 0; i < wd->nsbufs; i++) { 1011 rib_rbuf_free(qptoc(wd->qp), 1012 SEND_BUFFER, 1013 (void *)(uintptr_t)wd->sbufaddr[i]); 1014 } 1015 mutex_exit(&wd->sendwait_lock); 1016 (void) rib_free_sendwait(wd); 1017 } 1018 } 1019 } 1020 } 1021 1022 /* 1023 * RCQ handler 1024 */ 1025 /* ARGSUSED */ 1026 static void 1027 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1028 { 1029 rib_qp_t *qp; 1030 ibt_status_t ibt_status; 1031 ibt_wc_t wc; 1032 struct recv_wid *rwid; 1033 1034 /* 1035 * Re-enable cq notify here to avoid missing any 1036 * completion queue notification. 
1037 */ 1038 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1039 1040 ibt_status = IBT_SUCCESS; 1041 while (ibt_status != IBT_CQ_EMPTY) { 1042 bzero(&wc, sizeof (wc)); 1043 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1044 if (ibt_status != IBT_SUCCESS) 1045 return; 1046 1047 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id; 1048 qp = rwid->qp; 1049 if (wc.wc_status == IBT_WC_SUCCESS) { 1050 XDR inxdrs, *xdrs; 1051 uint_t xid, vers, op, find_xid = 0; 1052 struct reply *r; 1053 CONN *conn = qptoc(qp); 1054 uint32_t rdma_credit = 0; 1055 1056 xdrs = &inxdrs; 1057 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr, 1058 wc.wc_bytes_xfer, XDR_DECODE); 1059 /* 1060 * Treat xid as opaque (xid is the first entity 1061 * in the rpc rdma message). 1062 */ 1063 xid = *(uint32_t *)(uintptr_t)rwid->addr; 1064 1065 /* Skip xid and set the xdr position accordingly. */ 1066 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1067 (void) xdr_u_int(xdrs, &vers); 1068 (void) xdr_u_int(xdrs, &rdma_credit); 1069 (void) xdr_u_int(xdrs, &op); 1070 XDR_DESTROY(xdrs); 1071 1072 if (vers != RPCRDMA_VERS) { 1073 /* 1074 * Invalid RPC/RDMA version. Cannot 1075 * interoperate. Set connection to 1076 * ERROR state and bail out. 1077 */ 1078 mutex_enter(&conn->c_lock); 1079 if (conn->c_state != C_DISCONN_PEND) 1080 conn->c_state = C_ERROR_CONN; 1081 mutex_exit(&conn->c_lock); 1082 rib_rbuf_free(conn, RECV_BUFFER, 1083 (void *)(uintptr_t)rwid->addr); 1084 rib_free_wid(rwid); 1085 continue; 1086 } 1087 1088 mutex_enter(&qp->replylist_lock); 1089 for (r = qp->replylist; r != NULL; r = r->next) { 1090 if (r->xid == xid) { 1091 find_xid = 1; 1092 switch (op) { 1093 case RDMA_MSG: 1094 case RDMA_NOMSG: 1095 case RDMA_MSGP: 1096 r->status = RDMA_SUCCESS; 1097 r->vaddr_cq = rwid->addr; 1098 r->bytes_xfer = 1099 wc.wc_bytes_xfer; 1100 cv_signal(&r->wait_cv); 1101 break; 1102 default: 1103 rib_rbuf_free(qptoc(qp), 1104 RECV_BUFFER, 1105 (void *)(uintptr_t) 1106 rwid->addr); 1107 break; 1108 } 1109 break; 1110 } 1111 } 1112 mutex_exit(&qp->replylist_lock); 1113 if (find_xid == 0) { 1114 /* RPC caller not waiting for reply */ 1115 1116 DTRACE_PROBE1(rpcib__i__nomatchxid1, 1117 int, xid); 1118 1119 rib_rbuf_free(qptoc(qp), RECV_BUFFER, 1120 (void *)(uintptr_t)rwid->addr); 1121 } 1122 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) { 1123 CONN *conn = qptoc(qp); 1124 1125 /* 1126 * Connection being flushed. Just free 1127 * the posted buffer 1128 */ 1129 rib_rbuf_free(conn, RECV_BUFFER, 1130 (void *)(uintptr_t)rwid->addr); 1131 } else { 1132 CONN *conn = qptoc(qp); 1133 /* 1134 * RC Recv Q Error Code Local state Remote State 1135 * ==================== =========== ============ 1136 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd 1137 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd 1138 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd 1139 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd 1140 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd 1141 * IBT_WC_WR_FLUSHED_ERR None None 1142 */ 1143 /* 1144 * Channel in error state. Set connection 1145 * in ERROR state. 
1146 */ 1147 mutex_enter(&conn->c_lock); 1148 if (conn->c_state != C_DISCONN_PEND) 1149 conn->c_state = C_ERROR_CONN; 1150 mutex_exit(&conn->c_lock); 1151 rib_rbuf_free(conn, RECV_BUFFER, 1152 (void *)(uintptr_t)rwid->addr); 1153 } 1154 rib_free_wid(rwid); 1155 } 1156 } 1157 1158 /* Server side */ 1159 /* ARGSUSED */ 1160 static void 1161 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1162 { 1163 rdma_recv_data_t *rdp; 1164 rib_qp_t *qp; 1165 ibt_status_t ibt_status; 1166 ibt_wc_t wc; 1167 struct svc_recv *s_recvp; 1168 CONN *conn; 1169 mblk_t *mp; 1170 1171 /* 1172 * Re-enable cq notify here to avoid missing any 1173 * completion queue notification. 1174 */ 1175 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1176 1177 ibt_status = IBT_SUCCESS; 1178 while (ibt_status != IBT_CQ_EMPTY) { 1179 bzero(&wc, sizeof (wc)); 1180 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1181 if (ibt_status != IBT_SUCCESS) 1182 return; 1183 1184 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id; 1185 qp = s_recvp->qp; 1186 conn = qptoc(qp); 1187 mutex_enter(&qp->posted_rbufs_lock); 1188 qp->n_posted_rbufs--; 1189 #if defined(MEASURE_POOL_DEPTH) 1190 rib_posted_rbufs(preposted_rbufs - qp->n_posted_rbufs); 1191 #endif 1192 if (qp->n_posted_rbufs == 0) 1193 cv_signal(&qp->posted_rbufs_cv); 1194 mutex_exit(&qp->posted_rbufs_lock); 1195 1196 if (wc.wc_status == IBT_WC_SUCCESS) { 1197 XDR inxdrs, *xdrs; 1198 uint_t xid, vers, op; 1199 uint32_t rdma_credit; 1200 1201 xdrs = &inxdrs; 1202 /* s_recvp->vaddr stores data */ 1203 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr, 1204 wc.wc_bytes_xfer, XDR_DECODE); 1205 1206 /* 1207 * Treat xid as opaque (xid is the first entity 1208 * in the rpc rdma message). 1209 */ 1210 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr; 1211 /* Skip xid and set the xdr position accordingly. */ 1212 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1213 if (!xdr_u_int(xdrs, &vers) || 1214 !xdr_u_int(xdrs, &rdma_credit) || 1215 !xdr_u_int(xdrs, &op)) { 1216 rib_rbuf_free(conn, RECV_BUFFER, 1217 (void *)(uintptr_t)s_recvp->vaddr); 1218 XDR_DESTROY(xdrs); 1219 (void) rib_free_svc_recv(s_recvp); 1220 continue; 1221 } 1222 XDR_DESTROY(xdrs); 1223 1224 if (vers != RPCRDMA_VERS) { 1225 /* 1226 * Invalid RPC/RDMA version. 1227 * Drop rpc rdma message. 1228 */ 1229 rib_rbuf_free(conn, RECV_BUFFER, 1230 (void *)(uintptr_t)s_recvp->vaddr); 1231 (void) rib_free_svc_recv(s_recvp); 1232 continue; 1233 } 1234 /* 1235 * Is this for RDMA_DONE? 1236 */ 1237 if (op == RDMA_DONE) { 1238 rib_rbuf_free(conn, RECV_BUFFER, 1239 (void *)(uintptr_t)s_recvp->vaddr); 1240 /* 1241 * Wake up the thread waiting on 1242 * a RDMA_DONE for xid 1243 */ 1244 mutex_enter(&qp->rdlist_lock); 1245 rdma_done_notify(qp, xid); 1246 mutex_exit(&qp->rdlist_lock); 1247 (void) rib_free_svc_recv(s_recvp); 1248 continue; 1249 } 1250 1251 mutex_enter(&plugin_state_lock); 1252 if (plugin_state == ACCEPT) { 1253 while ((mp = allocb(sizeof (*rdp), BPRI_LO)) 1254 == NULL) 1255 (void) strwaitbuf( 1256 sizeof (*rdp), BPRI_LO); 1257 /* 1258 * Plugin is in accept state, hence the master 1259 * transport queue for this is still accepting 1260 * requests. Hence we can call svc_queuereq to 1261 * queue this recieved msg. 
1262 */ 1263 rdp = (rdma_recv_data_t *)mp->b_rptr; 1264 rdp->conn = conn; 1265 rdp->rpcmsg.addr = 1266 (caddr_t)(uintptr_t)s_recvp->vaddr; 1267 rdp->rpcmsg.type = RECV_BUFFER; 1268 rdp->rpcmsg.len = wc.wc_bytes_xfer; 1269 rdp->status = wc.wc_status; 1270 mutex_enter(&conn->c_lock); 1271 conn->c_ref++; 1272 mutex_exit(&conn->c_lock); 1273 mp->b_wptr += sizeof (*rdp); 1274 svc_queuereq((queue_t *)rib_stat->q, mp); 1275 mutex_exit(&plugin_state_lock); 1276 } else { 1277 /* 1278 * The master transport for this is going 1279 * away and the queue is not accepting anymore 1280 * requests for krpc, so don't do anything, just 1281 * free the msg. 1282 */ 1283 mutex_exit(&plugin_state_lock); 1284 rib_rbuf_free(conn, RECV_BUFFER, 1285 (void *)(uintptr_t)s_recvp->vaddr); 1286 } 1287 } else { 1288 rib_rbuf_free(conn, RECV_BUFFER, 1289 (void *)(uintptr_t)s_recvp->vaddr); 1290 } 1291 (void) rib_free_svc_recv(s_recvp); 1292 } 1293 } 1294 1295 /* 1296 * Handles DR event of IBT_HCA_DETACH_EVENT. 1297 */ 1298 /* ARGSUSED */ 1299 static void 1300 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 1301 ibt_async_code_t code, ibt_async_event_t *event) 1302 { 1303 1304 switch (code) { 1305 case IBT_HCA_ATTACH_EVENT: 1306 /* ignore */ 1307 break; 1308 case IBT_HCA_DETACH_EVENT: 1309 { 1310 ASSERT(rib_stat->hca->hca_hdl == hca_hdl); 1311 rib_detach_hca(rib_stat->hca); 1312 #ifdef DEBUG 1313 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); 1314 #endif 1315 break; 1316 } 1317 #ifdef DEBUG 1318 case IBT_EVENT_PATH_MIGRATED: 1319 cmn_err(CE_NOTE, "rib_async_handler(): " 1320 "IBT_EVENT_PATH_MIGRATED\n"); 1321 break; 1322 case IBT_EVENT_SQD: 1323 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n"); 1324 break; 1325 case IBT_EVENT_COM_EST: 1326 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n"); 1327 break; 1328 case IBT_ERROR_CATASTROPHIC_CHAN: 1329 cmn_err(CE_NOTE, "rib_async_handler(): " 1330 "IBT_ERROR_CATASTROPHIC_CHAN\n"); 1331 break; 1332 case IBT_ERROR_INVALID_REQUEST_CHAN: 1333 cmn_err(CE_NOTE, "rib_async_handler(): " 1334 "IBT_ERROR_INVALID_REQUEST_CHAN\n"); 1335 break; 1336 case IBT_ERROR_ACCESS_VIOLATION_CHAN: 1337 cmn_err(CE_NOTE, "rib_async_handler(): " 1338 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n"); 1339 break; 1340 case IBT_ERROR_PATH_MIGRATE_REQ: 1341 cmn_err(CE_NOTE, "rib_async_handler(): " 1342 "IBT_ERROR_PATH_MIGRATE_REQ\n"); 1343 break; 1344 case IBT_ERROR_CQ: 1345 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n"); 1346 break; 1347 case IBT_ERROR_PORT_DOWN: 1348 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); 1349 break; 1350 case IBT_EVENT_PORT_UP: 1351 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); 1352 break; 1353 case IBT_ASYNC_OPAQUE1: 1354 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); 1355 break; 1356 case IBT_ASYNC_OPAQUE2: 1357 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n"); 1358 break; 1359 case IBT_ASYNC_OPAQUE3: 1360 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n"); 1361 break; 1362 case IBT_ASYNC_OPAQUE4: 1363 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n"); 1364 break; 1365 #endif 1366 default: 1367 break; 1368 } 1369 } 1370 1371 /* 1372 * Client's reachable function. 
1373 */ 1374 static rdma_stat 1375 rib_reachable(int addr_type, struct netbuf *raddr, void **handle) 1376 { 1377 rdma_stat status; 1378 rpcib_ping_t rpt; 1379 1380 /* 1381 * First check if a hca is still attached 1382 */ 1383 rw_enter(&rib_stat->hca->state_lock, RW_READER); 1384 if (rib_stat->hca->state != HCA_INITED) { 1385 rw_exit(&rib_stat->hca->state_lock); 1386 return (RDMA_FAILED); 1387 } 1388 1389 bzero(&rpt, sizeof (rpcib_ping_t)); 1390 status = rib_ping_srv(addr_type, raddr, &rpt); 1391 rw_exit(&rib_stat->hca->state_lock); 1392 1393 if (status == RDMA_SUCCESS) { 1394 *handle = (void *)rpt.hca; 1395 return (RDMA_SUCCESS); 1396 } else { 1397 *handle = NULL; 1398 DTRACE_PROBE(rpcib__i__pingfailed); 1399 return (RDMA_FAILED); 1400 } 1401 } 1402 1403 /* Client side qp creation */ 1404 static rdma_stat 1405 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) 1406 { 1407 rib_qp_t *kqp = NULL; 1408 CONN *conn; 1409 rdma_clnt_cred_ctrl_t *cc_info; 1410 1411 ASSERT(qp != NULL); 1412 *qp = NULL; 1413 1414 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1415 conn = qptoc(kqp); 1416 kqp->hca = hca; 1417 kqp->rdmaconn.c_rdmamod = &rib_mod; 1418 kqp->rdmaconn.c_private = (caddr_t)kqp; 1419 1420 kqp->mode = RIB_CLIENT; 1421 kqp->chan_flags = IBT_BLOCKING; 1422 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); 1423 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); 1424 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; 1425 /* 1426 * Initialize 1427 */ 1428 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1429 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1430 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1431 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); 1432 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1433 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1434 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1435 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1436 /* 1437 * Initialize the client credit control 1438 * portion of the rdmaconn struct. 
1439 */ 1440 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT; 1441 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1442 cc_info->clnt_cc_granted_ops = 0; 1443 cc_info->clnt_cc_in_flight_ops = 0; 1444 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL); 1445 1446 *qp = kqp; 1447 return (RDMA_SUCCESS); 1448 } 1449 1450 /* Server side qp creation */ 1451 static rdma_stat 1452 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) 1453 { 1454 rib_qp_t *kqp = NULL; 1455 ibt_chan_sizes_t chan_sizes; 1456 ibt_rc_chan_alloc_args_t qp_attr; 1457 ibt_status_t ibt_status; 1458 rdma_srv_cred_ctrl_t *cc_info; 1459 1460 *qp = NULL; 1461 1462 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1463 kqp->hca = hca; 1464 kqp->port_num = port; 1465 kqp->rdmaconn.c_rdmamod = &rib_mod; 1466 kqp->rdmaconn.c_private = (caddr_t)kqp; 1467 1468 /* 1469 * Create the qp handle 1470 */ 1471 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1472 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; 1473 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; 1474 qp_attr.rc_pd = hca->pd_hdl; 1475 qp_attr.rc_hca_port_num = port; 1476 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1477 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1478 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1479 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1480 qp_attr.rc_clone_chan = NULL; 1481 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1482 qp_attr.rc_flags = IBT_WR_SIGNALED; 1483 1484 rw_enter(&hca->state_lock, RW_READER); 1485 if (hca->state != HCA_DETACHED) { 1486 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1487 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl, 1488 &chan_sizes); 1489 } else { 1490 rw_exit(&hca->state_lock); 1491 goto fail; 1492 } 1493 rw_exit(&hca->state_lock); 1494 1495 if (ibt_status != IBT_SUCCESS) { 1496 DTRACE_PROBE1(rpcib__i_svccreatechanfail, 1497 int, ibt_status); 1498 goto fail; 1499 } 1500 1501 kqp->mode = RIB_SERVER; 1502 kqp->chan_flags = IBT_BLOCKING; 1503 kqp->q = q; /* server ONLY */ 1504 1505 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1506 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1507 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1508 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1509 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1510 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1511 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1512 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1513 /* 1514 * Set the private data area to qp to be used in callbacks 1515 */ 1516 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); 1517 kqp->rdmaconn.c_state = C_CONNECTED; 1518 1519 /* 1520 * Initialize the server credit control 1521 * portion of the rdmaconn struct. 
1522 */ 1523 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV; 1524 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc; 1525 cc_info->srv_cc_buffers_granted = preposted_rbufs; 1526 cc_info->srv_cc_cur_buffers_used = 0; 1527 cc_info->srv_cc_posted = preposted_rbufs; 1528 1529 *qp = kqp; 1530 1531 return (RDMA_SUCCESS); 1532 fail: 1533 if (kqp) 1534 kmem_free(kqp, sizeof (rib_qp_t)); 1535 1536 return (RDMA_FAILED); 1537 } 1538 1539 /* ARGSUSED */ 1540 ibt_cm_status_t 1541 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event, 1542 ibt_cm_return_args_t *ret_args, void *priv_data, 1543 ibt_priv_data_len_t len) 1544 { 1545 rpcib_state_t *ribstat; 1546 rib_hca_t *hca; 1547 1548 ribstat = (rpcib_state_t *)clnt_hdl; 1549 hca = (rib_hca_t *)ribstat->hca; 1550 1551 switch (event->cm_type) { 1552 1553 /* got a connection close event */ 1554 case IBT_CM_EVENT_CONN_CLOSED: 1555 { 1556 CONN *conn; 1557 rib_qp_t *qp; 1558 1559 /* check reason why connection was closed */ 1560 switch (event->cm_event.closed) { 1561 case IBT_CM_CLOSED_DREP_RCVD: 1562 case IBT_CM_CLOSED_DREQ_TIMEOUT: 1563 case IBT_CM_CLOSED_DUP: 1564 case IBT_CM_CLOSED_ABORT: 1565 case IBT_CM_CLOSED_ALREADY: 1566 /* 1567 * These cases indicate the local end initiated 1568 * the closing of the channel. Nothing to do here. 1569 */ 1570 break; 1571 default: 1572 /* 1573 * Reason for CONN_CLOSED event must be one of 1574 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 1575 * or IBT_CM_CLOSED_STALE. These indicate cases were 1576 * the remote end is closing the channel. In these 1577 * cases free the channel and transition to error 1578 * state 1579 */ 1580 qp = ibt_get_chan_private(event->cm_channel); 1581 conn = qptoc(qp); 1582 mutex_enter(&conn->c_lock); 1583 if (conn->c_state == C_DISCONN_PEND) { 1584 mutex_exit(&conn->c_lock); 1585 break; 1586 } 1587 1588 conn->c_state = C_ERROR_CONN; 1589 1590 /* 1591 * Free the rc_channel. Channel has already 1592 * transitioned to ERROR state and WRs have been 1593 * FLUSHED_ERR already. 1594 */ 1595 (void) ibt_free_channel(qp->qp_hdl); 1596 qp->qp_hdl = NULL; 1597 1598 /* 1599 * Free the conn if c_ref is down to 0 already 1600 */ 1601 if (conn->c_ref == 0) { 1602 /* 1603 * Remove from list and free conn 1604 */ 1605 conn->c_state = C_DISCONN_PEND; 1606 mutex_exit(&conn->c_lock); 1607 (void) rib_disconnect_channel(conn, 1608 &hca->cl_conn_list); 1609 } else { 1610 mutex_exit(&conn->c_lock); 1611 } 1612 #ifdef DEBUG 1613 if (rib_debug) 1614 cmn_err(CE_NOTE, "rib_clnt_cm_handler: " 1615 "(CONN_CLOSED) channel disconnected"); 1616 #endif 1617 break; 1618 } 1619 break; 1620 } 1621 default: 1622 break; 1623 } 1624 return (IBT_CM_ACCEPT); 1625 } 1626 1627 /* 1628 * Connect to the server. 
1629 */ 1630 rdma_stat 1631 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp) 1632 { 1633 ibt_chan_open_args_t chan_args; /* channel args */ 1634 ibt_chan_sizes_t chan_sizes; 1635 ibt_rc_chan_alloc_args_t qp_attr; 1636 ibt_status_t ibt_status; 1637 ibt_rc_returns_t ret_args; /* conn reject info */ 1638 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */ 1639 ibt_ip_cm_info_t ipcm_info; 1640 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ]; 1641 1642 1643 (void) bzero(&chan_args, sizeof (chan_args)); 1644 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1645 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t)); 1646 1647 ipcm_info.src_addr.family = rptp->srcip.family; 1648 switch (ipcm_info.src_addr.family) { 1649 case AF_INET: 1650 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr; 1651 break; 1652 case AF_INET6: 1653 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr; 1654 break; 1655 } 1656 1657 ipcm_info.dst_addr.family = rptp->srcip.family; 1658 switch (ipcm_info.dst_addr.family) { 1659 case AF_INET: 1660 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr; 1661 break; 1662 case AF_INET6: 1663 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr; 1664 break; 1665 } 1666 1667 ipcm_info.src_port = (in_port_t)nfs_rdma_port; 1668 1669 ibt_status = ibt_format_ip_private_data(&ipcm_info, 1670 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt); 1671 1672 if (ibt_status != IBT_SUCCESS) { 1673 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n"); 1674 return (-1); 1675 } 1676 1677 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num; 1678 /* Alloc a RC channel */ 1679 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; 1680 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; 1681 qp_attr.rc_pd = hca->pd_hdl; 1682 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1683 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1684 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1685 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1686 qp_attr.rc_clone_chan = NULL; 1687 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1688 qp_attr.rc_flags = IBT_WR_SIGNALED; 1689 1690 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port); 1691 chan_args.oc_path = &rptp->path; 1692 1693 chan_args.oc_cm_handler = rib_clnt_cm_handler; 1694 chan_args.oc_cm_clnt_private = (void *)rib_stat; 1695 chan_args.oc_rdma_ra_out = 4; 1696 chan_args.oc_rdma_ra_in = 4; 1697 chan_args.oc_path_retry_cnt = 2; 1698 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; 1699 chan_args.oc_priv_data = cmp_ip_pvt; 1700 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ; 1701 1702 refresh: 1703 rw_enter(&hca->state_lock, RW_READER); 1704 if (hca->state != HCA_DETACHED) { 1705 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1706 IBT_ACHAN_NO_FLAGS, 1707 &qp_attr, &qp->qp_hdl, 1708 &chan_sizes); 1709 } else { 1710 rw_exit(&hca->state_lock); 1711 return (RDMA_FAILED); 1712 } 1713 rw_exit(&hca->state_lock); 1714 1715 if (ibt_status != IBT_SUCCESS) { 1716 DTRACE_PROBE1(rpcib__i_conntosrv, 1717 int, ibt_status); 1718 return (RDMA_FAILED); 1719 } 1720 1721 /* Connect to the Server */ 1722 (void) bzero(&ret_args, sizeof (ret_args)); 1723 mutex_enter(&qp->cb_lock); 1724 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS, 1725 IBT_BLOCKING, &chan_args, &ret_args); 1726 if (ibt_status != IBT_SUCCESS) { 1727 DTRACE_PROBE2(rpcib__i_openrctosrv, 1728 int, ibt_status, int, ret_args.rc_status); 1729 1730 (void) ibt_free_channel(qp->qp_hdl); 1731 qp->qp_hdl = NULL; 1732 mutex_exit(&qp->cb_lock); 1733 if (refresh-- && ibt_status == 
IBT_CM_FAILURE && 1734 ret_args.rc_status == IBT_CM_CONN_STALE) { 1735 /* 1736 * Got IBT_CM_CONN_STALE probably because of stale 1737 * data on the passive end of a channel that existed 1738 * prior to reboot. Retry establishing a channel 1739 * REFRESH_ATTEMPTS times, during which time the 1740 * stale conditions on the server might clear up. 1741 */ 1742 goto refresh; 1743 } 1744 return (RDMA_FAILED); 1745 } 1746 mutex_exit(&qp->cb_lock); 1747 /* 1748 * Set the private data area to qp to be used in callbacks 1749 */ 1750 ibt_set_chan_private(qp->qp_hdl, (void *)qp); 1751 return (RDMA_SUCCESS); 1752 } 1753 1754 rdma_stat 1755 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) 1756 { 1757 uint_t i; 1758 ibt_status_t ibt_status; 1759 uint8_t num_paths_p; 1760 ibt_ip_path_attr_t ipattr; 1761 ibt_path_ip_src_t srcip; 1762 rpcib_ipaddrs_t addrs4; 1763 rpcib_ipaddrs_t addrs6; 1764 struct sockaddr_in *sinp; 1765 struct sockaddr_in6 *sin6p; 1766 rdma_stat retval = RDMA_SUCCESS; 1767 1768 ASSERT(raddr->buf != NULL); 1769 1770 bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); 1771 1772 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || 1773 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { 1774 retval = RDMA_FAILED; 1775 goto done; 1776 } 1777 1778 /* Prep the destination address */ 1779 switch (addr_type) { 1780 case AF_INET: 1781 sinp = (struct sockaddr_in *)raddr->buf; 1782 rptp->dstip.family = AF_INET; 1783 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr; 1784 sinp = addrs4.ri_list; 1785 1786 ipattr.ipa_dst_ip = &rptp->dstip; 1787 ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; 1788 ipattr.ipa_ndst = 1; 1789 ipattr.ipa_max_paths = 1; 1790 ipattr.ipa_src_ip.family = rptp->dstip.family; 1791 for (i = 0; i < addrs4.ri_count; i++) { 1792 num_paths_p = 0; 1793 ipattr.ipa_src_ip.un.ip4addr = sinp[i].sin_addr.s_addr; 1794 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1795 1796 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1797 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, 1798 &num_paths_p, &srcip); 1799 if (ibt_status == IBT_SUCCESS && 1800 num_paths_p != 0 && 1801 rptp->path.pi_hca_guid == rib_stat->hca->hca_guid) { 1802 rptp->hca = rib_stat->hca; 1803 rptp->srcip.family = AF_INET; 1804 rptp->srcip.un.ip4addr = 1805 srcip.ip_primary.un.ip4addr; 1806 goto done; 1807 } 1808 } 1809 retval = RDMA_FAILED; 1810 break; 1811 1812 case AF_INET6: 1813 sin6p = (struct sockaddr_in6 *)raddr->buf; 1814 rptp->dstip.family = AF_INET6; 1815 rptp->dstip.un.ip6addr = sin6p->sin6_addr; 1816 sin6p = addrs6.ri_list; 1817 1818 ipattr.ipa_dst_ip = &rptp->dstip; 1819 ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; 1820 ipattr.ipa_ndst = 1; 1821 ipattr.ipa_max_paths = 1; 1822 ipattr.ipa_src_ip.family = rptp->dstip.family; 1823 for (i = 0; i < addrs6.ri_count; i++) { 1824 num_paths_p = 0; 1825 ipattr.ipa_src_ip.un.ip6addr = sin6p[i].sin6_addr; 1826 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1827 1828 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1829 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, 1830 &num_paths_p, &srcip); 1831 if (ibt_status == IBT_SUCCESS && 1832 num_paths_p != 0 && 1833 rptp->path.pi_hca_guid == rib_stat->hca->hca_guid) { 1834 rptp->hca = rib_stat->hca; 1835 rptp->srcip.family = AF_INET6; 1836 rptp->srcip.un.ip6addr = 1837 srcip.ip_primary.un.ip6addr; 1838 goto done; 1839 } 1840 } 1841 retval = RDMA_FAILED; 1842 break; 1843 1844 default: 1845 retval = RDMA_INVAL; 1846 break; 1847 } 1848 done: 1849 1850 if (addrs4.ri_size > 0) 1851 kmem_free(addrs4.ri_list, addrs4.ri_size); 1852 if 
(addrs6.ri_size > 0) 1853 kmem_free(addrs6.ri_list, addrs6.ri_size); 1854 return (retval); 1855 } 1856 1857 /* 1858 * Close channel, remove from connection list and 1859 * free up resources allocated for that channel. 1860 */ 1861 rdma_stat 1862 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list) 1863 { 1864 rib_qp_t *qp = ctoqp(conn); 1865 rib_hca_t *hca; 1866 1867 /* 1868 * c_ref == 0 and connection is in C_DISCONN_PEND 1869 */ 1870 hca = qp->hca; 1871 if (conn_list != NULL) 1872 (void) rib_rm_conn(conn, conn_list); 1873 1874 if (qp->qp_hdl != NULL) { 1875 /* 1876 * If the channel has not been establised, 1877 * ibt_flush_channel is called to flush outstanding WRs 1878 * on the Qs. Otherwise, ibt_close_rc_channel() is 1879 * called. The channel is then freed. 1880 */ 1881 if (conn_list != NULL) 1882 (void) ibt_close_rc_channel(qp->qp_hdl, 1883 IBT_BLOCKING, NULL, 0, NULL, NULL, 0); 1884 else 1885 (void) ibt_flush_channel(qp->qp_hdl); 1886 1887 mutex_enter(&qp->posted_rbufs_lock); 1888 while (qp->n_posted_rbufs) 1889 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock); 1890 mutex_exit(&qp->posted_rbufs_lock); 1891 (void) ibt_free_channel(qp->qp_hdl); 1892 qp->qp_hdl = NULL; 1893 } 1894 1895 ASSERT(qp->rdlist == NULL); 1896 1897 if (qp->replylist != NULL) { 1898 (void) rib_rem_replylist(qp); 1899 } 1900 1901 cv_destroy(&qp->cb_conn_cv); 1902 cv_destroy(&qp->posted_rbufs_cv); 1903 mutex_destroy(&qp->cb_lock); 1904 1905 mutex_destroy(&qp->replylist_lock); 1906 mutex_destroy(&qp->posted_rbufs_lock); 1907 mutex_destroy(&qp->rdlist_lock); 1908 1909 cv_destroy(&conn->c_cv); 1910 mutex_destroy(&conn->c_lock); 1911 1912 if (conn->c_raddr.buf != NULL) { 1913 kmem_free(conn->c_raddr.buf, conn->c_raddr.len); 1914 } 1915 if (conn->c_laddr.buf != NULL) { 1916 kmem_free(conn->c_laddr.buf, conn->c_laddr.len); 1917 } 1918 1919 /* 1920 * Credit control cleanup. 1921 */ 1922 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) { 1923 rdma_clnt_cred_ctrl_t *cc_info; 1924 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1925 cv_destroy(&cc_info->clnt_cc_cv); 1926 } 1927 1928 kmem_free(qp, sizeof (rib_qp_t)); 1929 1930 /* 1931 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL, 1932 * then the hca is no longer being used. 1933 */ 1934 if (conn_list != NULL) { 1935 rw_enter(&hca->state_lock, RW_READER); 1936 if (hca->state == HCA_DETACHED) { 1937 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 1938 if (hca->srv_conn_list.conn_hd == NULL) { 1939 rw_enter(&hca->cl_conn_list.conn_lock, 1940 RW_READER); 1941 1942 if (hca->cl_conn_list.conn_hd == NULL) { 1943 mutex_enter(&hca->inuse_lock); 1944 hca->inuse = FALSE; 1945 cv_signal(&hca->cb_cv); 1946 mutex_exit(&hca->inuse_lock); 1947 } 1948 rw_exit(&hca->cl_conn_list.conn_lock); 1949 } 1950 rw_exit(&hca->srv_conn_list.conn_lock); 1951 } 1952 rw_exit(&hca->state_lock); 1953 } 1954 1955 return (RDMA_SUCCESS); 1956 } 1957 1958 /* 1959 * Wait for send completion notification. Only on receiving a 1960 * notification be it a successful or error completion, free the 1961 * send_wid. 
1962 */ 1963 static rdma_stat 1964 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 1965 { 1966 clock_t timout, cv_wait_ret; 1967 rdma_stat error = RDMA_SUCCESS; 1968 int i; 1969 1970 /* 1971 * Wait for send to complete 1972 */ 1973 ASSERT(wd != NULL); 1974 mutex_enter(&wd->sendwait_lock); 1975 if (wd->status == (uint_t)SEND_WAIT) { 1976 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 1977 ddi_get_lbolt(); 1978 1979 if (qp->mode == RIB_SERVER) { 1980 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, 1981 &wd->sendwait_lock, timout)) > 0 && 1982 wd->status == (uint_t)SEND_WAIT) 1983 ; 1984 switch (cv_wait_ret) { 1985 case -1: /* timeout */ 1986 DTRACE_PROBE(rpcib__i__srvsendwait__timeout); 1987 1988 wd->cv_sig = 0; /* no signal needed */ 1989 error = RDMA_TIMEDOUT; 1990 break; 1991 default: /* got send completion */ 1992 break; 1993 } 1994 } else { 1995 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, 1996 &wd->sendwait_lock, timout)) > 0 && 1997 wd->status == (uint_t)SEND_WAIT) 1998 ; 1999 switch (cv_wait_ret) { 2000 case -1: /* timeout */ 2001 DTRACE_PROBE(rpcib__i__clntsendwait__timeout); 2002 2003 wd->cv_sig = 0; /* no signal needed */ 2004 error = RDMA_TIMEDOUT; 2005 break; 2006 case 0: /* interrupted */ 2007 DTRACE_PROBE(rpcib__i__clntsendwait__intr); 2008 2009 wd->cv_sig = 0; /* no signal needed */ 2010 error = RDMA_INTR; 2011 break; 2012 default: /* got send completion */ 2013 break; 2014 } 2015 } 2016 } 2017 2018 if (wd->status != (uint_t)SEND_WAIT) { 2019 /* got send completion */ 2020 if (wd->status != RDMA_SUCCESS) { 2021 if (wd->status != RDMA_CONNLOST) { 2022 error = RDMA_FAILED; 2023 } else { 2024 error = RDMA_CONNLOST; 2025 } 2026 } 2027 for (i = 0; i < wd->nsbufs; i++) { 2028 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2029 (void *)(uintptr_t)wd->sbufaddr[i]); 2030 } 2031 mutex_exit(&wd->sendwait_lock); 2032 (void) rib_free_sendwait(wd); 2033 } else { 2034 mutex_exit(&wd->sendwait_lock); 2035 } 2036 return (error); 2037 } 2038 2039 static struct send_wid * 2040 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) 2041 { 2042 struct send_wid *wd; 2043 2044 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); 2045 wd->xid = xid; 2046 wd->cv_sig = cv_sig; 2047 wd->qp = qp; 2048 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); 2049 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); 2050 wd->status = (uint_t)SEND_WAIT; 2051 2052 return (wd); 2053 } 2054 2055 static int 2056 rib_free_sendwait(struct send_wid *wdesc) 2057 { 2058 cv_destroy(&wdesc->wait_cv); 2059 mutex_destroy(&wdesc->sendwait_lock); 2060 kmem_free(wdesc, sizeof (*wdesc)); 2061 2062 return (0); 2063 } 2064 2065 static rdma_stat 2066 rib_rem_rep(rib_qp_t *qp, struct reply *rep) 2067 { 2068 mutex_enter(&qp->replylist_lock); 2069 if (rep != NULL) { 2070 (void) rib_remreply(qp, rep); 2071 mutex_exit(&qp->replylist_lock); 2072 return (RDMA_SUCCESS); 2073 } 2074 mutex_exit(&qp->replylist_lock); 2075 return (RDMA_FAILED); 2076 } 2077 2078 /* 2079 * Send buffers are freed here only in case of error in posting 2080 * on QP. If the post succeeded, the send buffers are freed upon 2081 * send completion in rib_sendwait() or in the scq_handler. 
2082 */ 2083 rdma_stat 2084 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2085 int send_sig, int cv_sig, caddr_t *swid) 2086 { 2087 struct send_wid *wdesc; 2088 struct clist *clp; 2089 ibt_status_t ibt_status = IBT_SUCCESS; 2090 rdma_stat ret = RDMA_SUCCESS; 2091 ibt_send_wr_t tx_wr; 2092 int i, nds; 2093 ibt_wr_ds_t sgl[DSEG_MAX]; 2094 uint_t total_msg_size; 2095 rib_qp_t *qp; 2096 2097 qp = ctoqp(conn); 2098 2099 ASSERT(cl != NULL); 2100 2101 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2102 2103 nds = 0; 2104 total_msg_size = 0; 2105 clp = cl; 2106 while (clp != NULL) { 2107 if (nds >= DSEG_MAX) { 2108 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); 2109 return (RDMA_FAILED); 2110 } 2111 sgl[nds].ds_va = clp->w.c_saddr; 2112 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2113 sgl[nds].ds_len = clp->c_len; 2114 total_msg_size += clp->c_len; 2115 clp = clp->c_next; 2116 nds++; 2117 } 2118 2119 if (send_sig) { 2120 /* Set SEND_SIGNAL flag. */ 2121 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2122 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2123 *swid = (caddr_t)wdesc; 2124 } else { 2125 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2126 wdesc = rib_init_sendwait(msgid, 0, qp); 2127 *swid = (caddr_t)wdesc; 2128 } 2129 wdesc->nsbufs = nds; 2130 for (i = 0; i < nds; i++) { 2131 wdesc->sbufaddr[i] = sgl[i].ds_va; 2132 } 2133 2134 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2135 tx_wr.wr_opcode = IBT_WRC_SEND; 2136 tx_wr.wr_trans = IBT_RC_SRV; 2137 tx_wr.wr_nds = nds; 2138 tx_wr.wr_sgl = sgl; 2139 2140 mutex_enter(&conn->c_lock); 2141 if (conn->c_state == C_CONNECTED) { 2142 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2143 } 2144 if (conn->c_state != C_CONNECTED || 2145 ibt_status != IBT_SUCCESS) { 2146 if (conn->c_state != C_DISCONN_PEND) 2147 conn->c_state = C_ERROR_CONN; 2148 mutex_exit(&conn->c_lock); 2149 for (i = 0; i < nds; i++) { 2150 rib_rbuf_free(conn, SEND_BUFFER, 2151 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2152 } 2153 (void) rib_free_sendwait(wdesc); 2154 return (RDMA_CONNLOST); 2155 } 2156 mutex_exit(&conn->c_lock); 2157 2158 if (send_sig) { 2159 if (cv_sig) { 2160 /* 2161 * cv_wait for send to complete. 2162 * We can fail due to a timeout or signal or 2163 * unsuccessful send. 2164 */ 2165 ret = rib_sendwait(qp, wdesc); 2166 2167 return (ret); 2168 } 2169 } 2170 2171 return (RDMA_SUCCESS); 2172 } 2173 2174 2175 rdma_stat 2176 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2177 { 2178 rdma_stat ret; 2179 caddr_t wd; 2180 2181 /* send-wait & cv_signal */ 2182 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); 2183 return (ret); 2184 } 2185 2186 /* 2187 * Server interface (svc_rdma_ksend). 2188 * Send RPC reply and wait for RDMA_DONE. 
2189 */ 2190 rdma_stat 2191 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2192 { 2193 rdma_stat ret = RDMA_SUCCESS; 2194 struct rdma_done_list *rd; 2195 clock_t timout, cv_wait_ret; 2196 caddr_t *wid = NULL; 2197 rib_qp_t *qp = ctoqp(conn); 2198 2199 mutex_enter(&qp->rdlist_lock); 2200 rd = rdma_done_add(qp, msgid); 2201 2202 /* No cv_signal (whether send-wait or no-send-wait) */ 2203 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); 2204 2205 if (ret != RDMA_SUCCESS) { 2206 rdma_done_rm(qp, rd); 2207 } else { 2208 /* 2209 * Wait for RDMA_DONE from remote end 2210 */ 2211 timout = 2212 drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt(); 2213 cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, 2214 &qp->rdlist_lock, 2215 timout); 2216 2217 rdma_done_rm(qp, rd); 2218 2219 if (cv_wait_ret < 0) { 2220 ret = RDMA_TIMEDOUT; 2221 } 2222 } 2223 2224 mutex_exit(&qp->rdlist_lock); 2225 return (ret); 2226 } 2227 2228 static struct recv_wid * 2229 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2230 { 2231 struct recv_wid *rwid; 2232 2233 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2234 rwid->xid = msgid; 2235 rwid->addr = sgl->ds_va; 2236 rwid->qp = qp; 2237 2238 return (rwid); 2239 } 2240 2241 static void 2242 rib_free_wid(struct recv_wid *rwid) 2243 { 2244 kmem_free(rwid, sizeof (struct recv_wid)); 2245 } 2246 2247 rdma_stat 2248 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2249 { 2250 rib_qp_t *qp = ctoqp(conn); 2251 struct clist *clp = cl; 2252 struct reply *rep; 2253 struct recv_wid *rwid; 2254 int nds; 2255 ibt_wr_ds_t sgl[DSEG_MAX]; 2256 ibt_recv_wr_t recv_wr; 2257 rdma_stat ret; 2258 ibt_status_t ibt_status; 2259 2260 /* 2261 * rdma_clnt_postrecv uses RECV_BUFFER. 2262 */ 2263 2264 nds = 0; 2265 while (cl != NULL) { 2266 if (nds >= DSEG_MAX) { 2267 ret = RDMA_FAILED; 2268 goto done; 2269 } 2270 sgl[nds].ds_va = cl->w.c_saddr; 2271 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2272 sgl[nds].ds_len = cl->c_len; 2273 cl = cl->c_next; 2274 nds++; 2275 } 2276 2277 if (nds != 1) { 2278 ret = RDMA_FAILED; 2279 goto done; 2280 } 2281 2282 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2283 recv_wr.wr_nds = nds; 2284 recv_wr.wr_sgl = sgl; 2285 2286 rwid = rib_create_wid(qp, &sgl[0], msgid); 2287 if (rwid) { 2288 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; 2289 } else { 2290 ret = RDMA_NORESOURCE; 2291 goto done; 2292 } 2293 rep = rib_addreplylist(qp, msgid); 2294 if (!rep) { 2295 rib_free_wid(rwid); 2296 ret = RDMA_NORESOURCE; 2297 goto done; 2298 } 2299 2300 mutex_enter(&conn->c_lock); 2301 2302 if (conn->c_state == C_CONNECTED) { 2303 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2304 } 2305 2306 if (conn->c_state != C_CONNECTED || 2307 ibt_status != IBT_SUCCESS) { 2308 if (conn->c_state != C_DISCONN_PEND) 2309 conn->c_state = C_ERROR_CONN; 2310 mutex_exit(&conn->c_lock); 2311 rib_free_wid(rwid); 2312 (void) rib_rem_rep(qp, rep); 2313 ret = RDMA_CONNLOST; 2314 goto done; 2315 } 2316 mutex_exit(&conn->c_lock); 2317 return (RDMA_SUCCESS); 2318 2319 done: 2320 while (clp != NULL) { 2321 rib_rbuf_free(conn, RECV_BUFFER, 2322 (void *)(uintptr_t)clp->w.c_saddr3); 2323 clp = clp->c_next; 2324 } 2325 return (ret); 2326 } 2327 2328 rdma_stat 2329 rib_svc_post(CONN* conn, struct clist *cl) 2330 { 2331 rib_qp_t *qp = ctoqp(conn); 2332 struct svc_recv *s_recvp; 2333 int nds; 2334 ibt_wr_ds_t sgl[DSEG_MAX]; 2335 ibt_recv_wr_t recv_wr; 2336 ibt_status_t ibt_status; 2337 2338 nds = 0; 2339 while (cl != NULL) { 2340 if (nds >= DSEG_MAX) { 
2341 return (RDMA_FAILED); 2342 } 2343 sgl[nds].ds_va = cl->w.c_saddr; 2344 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2345 sgl[nds].ds_len = cl->c_len; 2346 cl = cl->c_next; 2347 nds++; 2348 } 2349 2350 if (nds != 1) { 2351 rib_rbuf_free(conn, RECV_BUFFER, 2352 (caddr_t)(uintptr_t)sgl[0].ds_va); 2353 2354 return (RDMA_FAILED); 2355 } 2356 2357 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2358 recv_wr.wr_nds = nds; 2359 recv_wr.wr_sgl = sgl; 2360 2361 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2362 /* Use s_recvp's addr as wr id */ 2363 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; 2364 mutex_enter(&conn->c_lock); 2365 if (conn->c_state == C_CONNECTED) { 2366 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2367 } 2368 if (conn->c_state != C_CONNECTED || 2369 ibt_status != IBT_SUCCESS) { 2370 if (conn->c_state != C_DISCONN_PEND) 2371 conn->c_state = C_ERROR_CONN; 2372 mutex_exit(&conn->c_lock); 2373 rib_rbuf_free(conn, RECV_BUFFER, 2374 (caddr_t)(uintptr_t)sgl[0].ds_va); 2375 (void) rib_free_svc_recv(s_recvp); 2376 2377 return (RDMA_CONNLOST); 2378 } 2379 mutex_exit(&conn->c_lock); 2380 2381 return (RDMA_SUCCESS); 2382 } 2383 2384 /* Client */ 2385 rdma_stat 2386 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2387 { 2388 2389 return (rib_clnt_post(conn, cl, msgid)); 2390 } 2391 2392 /* Client */ 2393 rdma_stat 2394 rib_post_resp_remove(CONN* conn, uint32_t msgid) 2395 { 2396 rib_qp_t *qp = ctoqp(conn); 2397 struct reply *rep; 2398 2399 mutex_enter(&qp->replylist_lock); 2400 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2401 if (rep->xid == msgid) { 2402 if (rep->vaddr_cq) { 2403 rib_rbuf_free(conn, RECV_BUFFER, 2404 (caddr_t)(uintptr_t)rep->vaddr_cq); 2405 } 2406 (void) rib_remreply(qp, rep); 2407 break; 2408 } 2409 } 2410 mutex_exit(&qp->replylist_lock); 2411 2412 return (RDMA_SUCCESS); 2413 } 2414 2415 /* Server */ 2416 rdma_stat 2417 rib_post_recv(CONN *conn, struct clist *cl) 2418 { 2419 rib_qp_t *qp = ctoqp(conn); 2420 2421 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2422 mutex_enter(&qp->posted_rbufs_lock); 2423 qp->n_posted_rbufs++; 2424 mutex_exit(&qp->posted_rbufs_lock); 2425 return (RDMA_SUCCESS); 2426 } 2427 return (RDMA_FAILED); 2428 } 2429 2430 /* 2431 * Client side only interface to "recv" the rpc reply buf 2432 * posted earlier by rib_post_resp(conn, cl, msgid). 2433 */ 2434 rdma_stat 2435 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2436 { 2437 struct reply *rep = NULL; 2438 clock_t timout, cv_wait_ret; 2439 rdma_stat ret = RDMA_SUCCESS; 2440 rib_qp_t *qp = ctoqp(conn); 2441 2442 /* 2443 * Find the reply structure for this msgid 2444 */ 2445 mutex_enter(&qp->replylist_lock); 2446 2447 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2448 if (rep->xid == msgid) 2449 break; 2450 } 2451 2452 if (rep != NULL) { 2453 /* 2454 * If message not yet received, wait. 
2455 */ 2456 if (rep->status == (uint_t)REPLY_WAIT) { 2457 timout = ddi_get_lbolt() + 2458 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2459 2460 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2461 &qp->replylist_lock, timout)) > 0 && 2462 rep->status == (uint_t)REPLY_WAIT) 2463 ; 2464 2465 switch (cv_wait_ret) { 2466 case -1: /* timeout */ 2467 ret = RDMA_TIMEDOUT; 2468 break; 2469 case 0: 2470 ret = RDMA_INTR; 2471 break; 2472 default: 2473 break; 2474 } 2475 } 2476 2477 if (rep->status == RDMA_SUCCESS) { 2478 struct clist *cl = NULL; 2479 2480 /* 2481 * Got message successfully 2482 */ 2483 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2484 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); 2485 *clp = cl; 2486 } else { 2487 if (rep->status != (uint_t)REPLY_WAIT) { 2488 /* 2489 * Got error in reply message. Free 2490 * recv buffer here. 2491 */ 2492 ret = rep->status; 2493 rib_rbuf_free(conn, RECV_BUFFER, 2494 (caddr_t)(uintptr_t)rep->vaddr_cq); 2495 } 2496 } 2497 (void) rib_remreply(qp, rep); 2498 } else { 2499 /* 2500 * No matching reply structure found for given msgid on the 2501 * reply wait list. 2502 */ 2503 ret = RDMA_INVAL; 2504 DTRACE_PROBE(rpcib__i__nomatchxid2); 2505 } 2506 2507 /* 2508 * Done. 2509 */ 2510 mutex_exit(&qp->replylist_lock); 2511 return (ret); 2512 } 2513 2514 /* 2515 * RDMA write a buffer to the remote address. 2516 */ 2517 rdma_stat 2518 rib_write(CONN *conn, struct clist *cl, int wait) 2519 { 2520 ibt_send_wr_t tx_wr; 2521 int cv_sig; 2522 int i; 2523 ibt_wr_ds_t sgl[DSEG_MAX]; 2524 struct send_wid *wdesc; 2525 ibt_status_t ibt_status; 2526 rdma_stat ret = RDMA_SUCCESS; 2527 rib_qp_t *qp = ctoqp(conn); 2528 uint64_t n_writes = 0; 2529 bool_t force_wait = FALSE; 2530 2531 if (cl == NULL) { 2532 return (RDMA_FAILED); 2533 } 2534 2535 while ((cl != NULL)) { 2536 if (cl->c_len > 0) { 2537 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2538 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; 2539 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = 2540 cl->c_dmemhandle.mrc_rmr; /* rkey */ 2541 sgl[0].ds_va = cl->w.c_saddr; 2542 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2543 sgl[0].ds_len = cl->c_len; 2544 2545 if (wait) { 2546 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2547 cv_sig = 1; 2548 } else { 2549 if (n_writes > max_unsignaled_rws) { 2550 n_writes = 0; 2551 force_wait = TRUE; 2552 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2553 cv_sig = 1; 2554 } else { 2555 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2556 cv_sig = 0; 2557 } 2558 } 2559 2560 wdesc = rib_init_sendwait(0, cv_sig, qp); 2561 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2562 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2563 tx_wr.wr_trans = IBT_RC_SRV; 2564 tx_wr.wr_nds = 1; 2565 tx_wr.wr_sgl = sgl; 2566 2567 mutex_enter(&conn->c_lock); 2568 if (conn->c_state == C_CONNECTED) { 2569 ibt_status = 2570 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2571 } 2572 if (conn->c_state != C_CONNECTED || 2573 ibt_status != IBT_SUCCESS) { 2574 if (conn->c_state != C_DISCONN_PEND) 2575 conn->c_state = C_ERROR_CONN; 2576 mutex_exit(&conn->c_lock); 2577 (void) rib_free_sendwait(wdesc); 2578 return (RDMA_CONNLOST); 2579 } 2580 mutex_exit(&conn->c_lock); 2581 2582 /* 2583 * Wait for send to complete 2584 */ 2585 if (wait || force_wait) { 2586 force_wait = FALSE; 2587 ret = rib_sendwait(qp, wdesc); 2588 if (ret != 0) { 2589 return (ret); 2590 } 2591 } else { 2592 mutex_enter(&wdesc->sendwait_lock); 2593 for (i = 0; i < wdesc->nsbufs; i++) { 2594 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2595 (void *)(uintptr_t) 2596 wdesc->sbufaddr[i]); 2597 } 2598 
mutex_exit(&wdesc->sendwait_lock); 2599 (void) rib_free_sendwait(wdesc); 2600 } 2601 n_writes ++; 2602 } 2603 cl = cl->c_next; 2604 } 2605 return (RDMA_SUCCESS); 2606 } 2607 2608 /* 2609 * RDMA Read a buffer from the remote address. 2610 */ 2611 rdma_stat 2612 rib_read(CONN *conn, struct clist *cl, int wait) 2613 { 2614 ibt_send_wr_t rx_wr; 2615 int cv_sig; 2616 int i; 2617 ibt_wr_ds_t sgl; 2618 struct send_wid *wdesc; 2619 ibt_status_t ibt_status = IBT_SUCCESS; 2620 rdma_stat ret = RDMA_SUCCESS; 2621 rib_qp_t *qp = ctoqp(conn); 2622 2623 if (cl == NULL) { 2624 return (RDMA_FAILED); 2625 } 2626 2627 while (cl != NULL) { 2628 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2629 /* 2630 * Remote address is at the head chunk item in list. 2631 */ 2632 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; 2633 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; 2634 2635 sgl.ds_va = cl->u.c_daddr; 2636 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2637 sgl.ds_len = cl->c_len; 2638 2639 if (wait) { 2640 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2641 cv_sig = 1; 2642 } else { 2643 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2644 cv_sig = 0; 2645 } 2646 2647 wdesc = rib_init_sendwait(0, cv_sig, qp); 2648 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2649 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2650 rx_wr.wr_trans = IBT_RC_SRV; 2651 rx_wr.wr_nds = 1; 2652 rx_wr.wr_sgl = &sgl; 2653 2654 mutex_enter(&conn->c_lock); 2655 if (conn->c_state == C_CONNECTED) { 2656 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2657 } 2658 if (conn->c_state != C_CONNECTED || 2659 ibt_status != IBT_SUCCESS) { 2660 if (conn->c_state != C_DISCONN_PEND) 2661 conn->c_state = C_ERROR_CONN; 2662 mutex_exit(&conn->c_lock); 2663 (void) rib_free_sendwait(wdesc); 2664 return (RDMA_CONNLOST); 2665 } 2666 mutex_exit(&conn->c_lock); 2667 2668 /* 2669 * Wait for send to complete if this is the 2670 * last item in the list. 2671 */ 2672 if (wait && cl->c_next == NULL) { 2673 ret = rib_sendwait(qp, wdesc); 2674 if (ret != 0) { 2675 return (ret); 2676 } 2677 } else { 2678 mutex_enter(&wdesc->sendwait_lock); 2679 for (i = 0; i < wdesc->nsbufs; i++) { 2680 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2681 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2682 } 2683 mutex_exit(&wdesc->sendwait_lock); 2684 (void) rib_free_sendwait(wdesc); 2685 } 2686 cl = cl->c_next; 2687 } 2688 return (RDMA_SUCCESS); 2689 } 2690 2691 /* 2692 * rib_srv_cm_handler() 2693 * Connection Manager callback to handle RC connection requests. 2694 */ 2695 /* ARGSUSED */ 2696 static ibt_cm_status_t 2697 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2698 ibt_cm_return_args_t *ret_args, void *priv_data, 2699 ibt_priv_data_len_t len) 2700 { 2701 queue_t *q; 2702 rib_qp_t *qp; 2703 rpcib_state_t *ribstat; 2704 rib_hca_t *hca; 2705 rdma_stat status = RDMA_SUCCESS; 2706 int i; 2707 struct clist cl; 2708 rdma_buf_t rdbuf = {0}; 2709 void *buf = NULL; 2710 CONN *conn; 2711 ibt_ip_cm_info_t ipinfo; 2712 struct sockaddr_in *s; 2713 struct sockaddr_in6 *s6; 2714 int sin_size = sizeof (struct sockaddr_in); 2715 int in_size = sizeof (struct in_addr); 2716 int sin6_size = sizeof (struct sockaddr_in6); 2717 2718 ASSERT(any != NULL); 2719 ASSERT(event != NULL); 2720 2721 ribstat = (rpcib_state_t *)any; 2722 hca = (rib_hca_t *)ribstat->hca; 2723 ASSERT(hca != NULL); 2724 2725 /* got a connection request */ 2726 switch (event->cm_type) { 2727 case IBT_CM_EVENT_REQ_RCV: 2728 /* 2729 * If the plugin is in the NO_ACCEPT state, bail out. 
2730 */ 2731 mutex_enter(&plugin_state_lock); 2732 if (plugin_state == NO_ACCEPT) { 2733 mutex_exit(&plugin_state_lock); 2734 return (IBT_CM_REJECT); 2735 } 2736 mutex_exit(&plugin_state_lock); 2737 2738 /* 2739 * Need to send a MRA MAD to CM so that it does not 2740 * timeout on us. 2741 */ 2742 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2743 event->cm_event.req.req_timeout * 8, NULL, 0); 2744 2745 mutex_enter(&rib_stat->open_hca_lock); 2746 q = rib_stat->q; 2747 mutex_exit(&rib_stat->open_hca_lock); 2748 2749 status = rib_svc_create_chan(hca, (caddr_t)q, 2750 event->cm_event.req.req_prim_hca_port, &qp); 2751 2752 if (status) { 2753 return (IBT_CM_REJECT); 2754 } 2755 2756 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2757 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 2758 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 2759 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2760 2761 /* 2762 * Pre-posts RECV buffers 2763 */ 2764 conn = qptoc(qp); 2765 for (i = 0; i < preposted_rbufs; i++) { 2766 bzero(&rdbuf, sizeof (rdbuf)); 2767 rdbuf.type = RECV_BUFFER; 2768 buf = rib_rbuf_alloc(conn, &rdbuf); 2769 if (buf == NULL) { 2770 (void) rib_disconnect_channel(conn, NULL); 2771 return (IBT_CM_REJECT); 2772 } 2773 2774 bzero(&cl, sizeof (cl)); 2775 cl.w.c_saddr3 = (caddr_t)rdbuf.addr; 2776 cl.c_len = rdbuf.len; 2777 cl.c_smemhandle.mrc_lmr = 2778 rdbuf.handle.mrc_lmr; /* lkey */ 2779 cl.c_next = NULL; 2780 status = rib_post_recv(conn, &cl); 2781 if (status != RDMA_SUCCESS) { 2782 (void) rib_disconnect_channel(conn, NULL); 2783 return (IBT_CM_REJECT); 2784 } 2785 } 2786 (void) rib_add_connlist(conn, &hca->srv_conn_list); 2787 2788 /* 2789 * Get the address translation 2790 */ 2791 rw_enter(&hca->state_lock, RW_READER); 2792 if (hca->state == HCA_DETACHED) { 2793 rw_exit(&hca->state_lock); 2794 return (IBT_CM_REJECT); 2795 } 2796 rw_exit(&hca->state_lock); 2797 2798 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t)); 2799 2800 if (ibt_get_ip_data(event->cm_priv_data_len, 2801 event->cm_priv_data, 2802 &ipinfo) != IBT_SUCCESS) { 2803 2804 return (IBT_CM_REJECT); 2805 } 2806 2807 switch (ipinfo.src_addr.family) { 2808 case AF_INET: 2809 2810 conn->c_raddr.maxlen = 2811 conn->c_raddr.len = sin_size; 2812 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 2813 2814 s = (struct sockaddr_in *)conn->c_raddr.buf; 2815 s->sin_family = AF_INET; 2816 2817 bcopy((void *)&ipinfo.src_addr.un.ip4addr, 2818 &s->sin_addr, in_size); 2819 2820 break; 2821 2822 case AF_INET6: 2823 2824 conn->c_raddr.maxlen = 2825 conn->c_raddr.len = sin6_size; 2826 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 2827 2828 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 2829 s6->sin6_family = AF_INET6; 2830 bcopy((void *)&ipinfo.src_addr.un.ip6addr, 2831 &s6->sin6_addr, 2832 sizeof (struct in6_addr)); 2833 2834 break; 2835 2836 default: 2837 return (IBT_CM_REJECT); 2838 } 2839 2840 break; 2841 2842 case IBT_CM_EVENT_CONN_CLOSED: 2843 { 2844 CONN *conn; 2845 rib_qp_t *qp; 2846 2847 switch (event->cm_event.closed) { 2848 case IBT_CM_CLOSED_DREP_RCVD: 2849 case IBT_CM_CLOSED_DREQ_TIMEOUT: 2850 case IBT_CM_CLOSED_DUP: 2851 case IBT_CM_CLOSED_ABORT: 2852 case IBT_CM_CLOSED_ALREADY: 2853 /* 2854 * These cases indicate the local end initiated 2855 * the closing of the channel. Nothing to do here. 2856 */ 2857 break; 2858 default: 2859 /* 2860 * Reason for CONN_CLOSED event must be one of 2861 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 2862 * or IBT_CM_CLOSED_STALE. 
These indicate cases where 2863 * the remote end is closing the channel. In these 2864 * cases, free the channel and transition to error 2865 * state. 2866 */ 2867 qp = ibt_get_chan_private(event->cm_channel); 2868 conn = qptoc(qp); 2869 mutex_enter(&conn->c_lock); 2870 if (conn->c_state == C_DISCONN_PEND) { 2871 mutex_exit(&conn->c_lock); 2872 break; 2873 } 2874 conn->c_state = C_ERROR_CONN; 2875 2876 /* 2877 * Free the rc_channel. Channel has already 2878 * transitioned to ERROR state and WRs have been 2879 * FLUSHED_ERR already. 2880 */ 2881 (void) ibt_free_channel(qp->qp_hdl); 2882 qp->qp_hdl = NULL; 2883 2884 /* 2885 * Free the conn if c_ref goes down to 0 2886 */ 2887 if (conn->c_ref == 0) { 2888 /* 2889 * Remove from list and free conn 2890 */ 2891 conn->c_state = C_DISCONN_PEND; 2892 mutex_exit(&conn->c_lock); 2893 (void) rib_disconnect_channel(conn, 2894 &hca->srv_conn_list); 2895 } else { 2896 mutex_exit(&conn->c_lock); 2897 } 2898 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect); 2899 break; 2900 } 2901 break; 2902 } 2903 case IBT_CM_EVENT_CONN_EST: 2904 /* 2905 * RTU received, hence connection established. 2906 */ 2907 if (rib_debug > 1) 2908 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2909 "(CONN_EST) channel established"); 2910 break; 2911 2912 default: 2913 if (rib_debug > 2) { 2914 /* Let CM handle the following events. */ 2915 if (event->cm_type == IBT_CM_EVENT_REP_RCV) { 2916 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2917 "server recv'ed IBT_CM_EVENT_REP_RCV\n"); 2918 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) { 2919 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2920 "server recv'ed IBT_CM_EVENT_LAP_RCV\n"); 2921 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) { 2922 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2923 "server recv'ed IBT_CM_EVENT_MRA_RCV\n"); 2924 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) { 2925 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2926 "server recv'ed IBT_CM_EVENT_APR_RCV\n"); 2927 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) { 2928 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2929 "server recv'ed IBT_CM_EVENT_FAILURE\n"); 2930 } 2931 } 2932 return (IBT_CM_DEFAULT); 2933 } 2934 2935 /* accept all other CM messages (i.e.
let the CM handle them) */ 2936 return (IBT_CM_ACCEPT); 2937 } 2938 2939 static rdma_stat 2940 rib_register_service(rib_hca_t *hca, int service_type) 2941 { 2942 ibt_srv_desc_t sdesc; 2943 ibt_hca_portinfo_t *port_infop; 2944 ib_svc_id_t srv_id; 2945 ibt_srv_hdl_t srv_hdl; 2946 uint_t port_size; 2947 uint_t pki, i, num_ports, nbinds; 2948 ibt_status_t ibt_status; 2949 rib_service_t *new_service; 2950 ib_pkey_t pkey; 2951 2952 /* 2953 * Query all ports for the given HCA 2954 */ 2955 rw_enter(&hca->state_lock, RW_READER); 2956 if (hca->state != HCA_DETACHED) { 2957 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop, 2958 &num_ports, &port_size); 2959 rw_exit(&hca->state_lock); 2960 } else { 2961 rw_exit(&hca->state_lock); 2962 return (RDMA_FAILED); 2963 } 2964 if (ibt_status != IBT_SUCCESS) { 2965 return (RDMA_FAILED); 2966 } 2967 2968 DTRACE_PROBE1(rpcib__i__regservice_numports, 2969 int, num_ports); 2970 2971 for (i = 0; i < num_ports; i++) { 2972 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) { 2973 DTRACE_PROBE1(rpcib__i__regservice__portinactive, 2974 int, i+1); 2975 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) { 2976 DTRACE_PROBE1(rpcib__i__regservice__portactive, 2977 int, i+1); 2978 } 2979 } 2980 2981 /* 2982 * Get all the IP addresses on this system to register the 2983 * given "service type" on all DNS recognized IP addrs. 2984 * Each service type such as NFS will have all the systems 2985 * IP addresses as its different names. For now the only 2986 * type of service we support in RPCIB is NFS. 2987 */ 2988 rw_enter(&hca->service_list_lock, RW_WRITER); 2989 /* 2990 * Start registering and binding service to active 2991 * on active ports on this HCA. 2992 */ 2993 nbinds = 0; 2994 new_service = NULL; 2995 2996 /* 2997 * We use IP addresses as the service names for 2998 * service registration. Register each of them 2999 * with CM to obtain a svc_id and svc_hdl. We do not 3000 * register the service with machine's loopback address. 
3001 */ 3002 (void) bzero(&srv_id, sizeof (ib_svc_id_t)); 3003 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t)); 3004 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t)); 3005 3006 sdesc.sd_handler = rib_srv_cm_handler; 3007 sdesc.sd_flags = 0; 3008 ibt_status = ibt_register_service(hca->ibt_clnt_hdl, 3009 &sdesc, ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port), 3010 1, &srv_hdl, &srv_id); 3011 3012 for (i = 0; i < num_ports; i++) { 3013 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) 3014 continue; 3015 3016 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) { 3017 pkey = port_infop[i].p_pkey_tbl[pki]; 3018 if ((pkey & IBSRM_HB) && 3019 (pkey != IB_PKEY_INVALID_FULL)) { 3020 3021 /* 3022 * Allocate and prepare a service entry 3023 */ 3024 new_service = 3025 kmem_zalloc(1 * sizeof (rib_service_t), 3026 KM_SLEEP); 3027 3028 new_service->srv_type = service_type; 3029 new_service->srv_hdl = srv_hdl; 3030 new_service->srv_next = NULL; 3031 3032 ibt_status = ibt_bind_service(srv_hdl, 3033 port_infop[i].p_sgid_tbl[0], 3034 NULL, rib_stat, NULL); 3035 3036 DTRACE_PROBE1(rpcib__i__regservice__bindres, 3037 int, ibt_status); 3038 3039 if (ibt_status != IBT_SUCCESS) { 3040 kmem_free(new_service, 3041 sizeof (rib_service_t)); 3042 new_service = NULL; 3043 continue; 3044 } 3045 3046 /* 3047 * Add to the service list for this HCA 3048 */ 3049 new_service->srv_next = hca->service_list; 3050 hca->service_list = new_service; 3051 new_service = NULL; 3052 nbinds++; 3053 } 3054 } 3055 } 3056 rw_exit(&hca->service_list_lock); 3057 3058 ibt_free_portinfo(port_infop, port_size); 3059 3060 if (nbinds == 0) { 3061 return (RDMA_FAILED); 3062 } else { 3063 /* 3064 * Put this plugin into accept state, since at least 3065 * one registration was successful. 3066 */ 3067 mutex_enter(&plugin_state_lock); 3068 plugin_state = ACCEPT; 3069 mutex_exit(&plugin_state_lock); 3070 return (RDMA_SUCCESS); 3071 } 3072 } 3073 3074 void 3075 rib_listen(struct rdma_svc_data *rd) 3076 { 3077 rdma_stat status = RDMA_SUCCESS; 3078 3079 rd->active = 0; 3080 rd->err_code = RDMA_FAILED; 3081 3082 /* 3083 * First check if a hca is still attached 3084 */ 3085 rw_enter(&rib_stat->hca->state_lock, RW_READER); 3086 if (rib_stat->hca->state != HCA_INITED) { 3087 rw_exit(&rib_stat->hca->state_lock); 3088 return; 3089 } 3090 rw_exit(&rib_stat->hca->state_lock); 3091 3092 rib_stat->q = &rd->q; 3093 /* 3094 * Right now the only service type is NFS. Hence force-feed this 3095 * value. Ideally, to communicate the service type, it should be 3096 * passed down in rdma_svc_data. 3097 */ 3098 rib_stat->service_type = NFS; 3099 status = rib_register_service(rib_stat->hca, NFS); 3100 if (status != RDMA_SUCCESS) { 3101 rd->err_code = status; 3102 return; 3103 } 3104 /* 3105 * Service is active on an HCA; check rd->err_code for more 3106 * detailed error information. 3107 */ 3108 rd->active = 1; 3109 rd->err_code = status; 3110 } 3111 3112 /* XXXX */ 3113 /* ARGSUSED */ 3114 static void 3115 rib_listen_stop(struct rdma_svc_data *svcdata) 3116 { 3117 rib_hca_t *hca; 3118 3119 /* 3120 * KRPC called the RDMATF to stop the listeners; this means we 3121 * stop sending incoming or received requests to the KRPC master 3122 * transport handle for RDMA-IB. This also means that the 3123 * master transport handle, responsible for us, is going away.
3124 */ 3125 mutex_enter(&plugin_state_lock); 3126 plugin_state = NO_ACCEPT; 3127 if (svcdata != NULL) 3128 svcdata->active = 0; 3129 mutex_exit(&plugin_state_lock); 3130 3131 /* 3132 * First check if a hca is still attached 3133 */ 3134 hca = rib_stat->hca; 3135 rw_enter(&hca->state_lock, RW_READER); 3136 if (hca->state != HCA_INITED) { 3137 rw_exit(&hca->state_lock); 3138 return; 3139 } 3140 rib_close_channels(&hca->srv_conn_list); 3141 rib_stop_services(hca); 3142 rw_exit(&hca->state_lock); 3143 } 3144 3145 /* 3146 * Traverse the HCA's service list to unbind and deregister services. 3147 * Instead of unbinding the service for a service handle by 3148 * calling ibt_unbind_service() for each port/pkey, we unbind 3149 * all the services for the service handle by making only one 3150 * call to ibt_unbind_all_services(). Then, we deregister the 3151 * service for the service handle. 3152 * 3153 * When traversing the entries in service_list, we compare the 3154 * srv_hdl of the current entry with that of the next. If they 3155 * are different or if the next entry is NULL, the current entry 3156 * marks the last binding of the service handle. In this case, 3157 * call ibt_unbind_all_services() and deregister the service for 3158 * the service handle. If they are the same, the current and the 3159 * next entries are bound to the same service handle. In this 3160 * case, move on to the next entry. 3161 */ 3162 static void 3163 rib_stop_services(rib_hca_t *hca) 3164 { 3165 rib_service_t *srv_list, *to_remove; 3166 3167 /* 3168 * unbind and deregister the services for this service type. 3169 * Right now there is only one service type. In future it will 3170 * be passed down to this function. 3171 */ 3172 rw_enter(&hca->service_list_lock, RW_WRITER); 3173 srv_list = hca->service_list; 3174 while (srv_list != NULL) { 3175 to_remove = srv_list; 3176 srv_list = to_remove->srv_next; 3177 if (srv_list == NULL || bcmp(to_remove->srv_hdl, 3178 srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) { 3179 3180 (void) ibt_unbind_all_services(to_remove->srv_hdl); 3181 (void) ibt_deregister_service(hca->ibt_clnt_hdl, 3182 to_remove->srv_hdl); 3183 } 3184 3185 kmem_free(to_remove, sizeof (rib_service_t)); 3186 } 3187 hca->service_list = NULL; 3188 rw_exit(&hca->service_list_lock); 3189 } 3190 3191 static struct svc_recv * 3192 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) 3193 { 3194 struct svc_recv *recvp; 3195 3196 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); 3197 recvp->vaddr = sgl->ds_va; 3198 recvp->qp = qp; 3199 recvp->bytes_xfer = 0; 3200 return (recvp); 3201 } 3202 3203 static int 3204 rib_free_svc_recv(struct svc_recv *recvp) 3205 { 3206 kmem_free(recvp, sizeof (*recvp)); 3207 3208 return (0); 3209 } 3210 3211 static struct reply * 3212 rib_addreplylist(rib_qp_t *qp, uint32_t msgid) 3213 { 3214 struct reply *rep; 3215 3216 3217 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); 3218 if (rep == NULL) { 3219 DTRACE_PROBE(rpcib__i__addrreply__nomem); 3220 return (NULL); 3221 } 3222 rep->xid = msgid; 3223 rep->vaddr_cq = NULL; 3224 rep->bytes_xfer = 0; 3225 rep->status = (uint_t)REPLY_WAIT; 3226 rep->prev = NULL; 3227 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); 3228 3229 mutex_enter(&qp->replylist_lock); 3230 if (qp->replylist) { 3231 rep->next = qp->replylist; 3232 qp->replylist->prev = rep; 3233 } 3234 qp->rep_list_size++; 3235 3236 DTRACE_PROBE1(rpcib__i__addrreply__listsize, 3237 int, qp->rep_list_size); 3238 3239 qp->replylist = rep; 3240 mutex_exit(&qp->replylist_lock); 3241 3242 return 
(rep); 3243 } 3244 3245 static rdma_stat 3246 rib_rem_replylist(rib_qp_t *qp) 3247 { 3248 struct reply *r, *n; 3249 3250 mutex_enter(&qp->replylist_lock); 3251 for (r = qp->replylist; r != NULL; r = n) { 3252 n = r->next; 3253 (void) rib_remreply(qp, r); 3254 } 3255 mutex_exit(&qp->replylist_lock); 3256 3257 return (RDMA_SUCCESS); 3258 } 3259 3260 static int 3261 rib_remreply(rib_qp_t *qp, struct reply *rep) 3262 { 3263 3264 ASSERT(MUTEX_HELD(&qp->replylist_lock)); 3265 if (rep->prev) { 3266 rep->prev->next = rep->next; 3267 } 3268 if (rep->next) { 3269 rep->next->prev = rep->prev; 3270 } 3271 if (qp->replylist == rep) 3272 qp->replylist = rep->next; 3273 3274 cv_destroy(&rep->wait_cv); 3275 qp->rep_list_size--; 3276 3277 DTRACE_PROBE1(rpcib__i__remreply__listsize, 3278 int, qp->rep_list_size); 3279 3280 kmem_free(rep, sizeof (*rep)); 3281 3282 return (0); 3283 } 3284 3285 rdma_stat 3286 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3287 struct mrc *buf_handle) 3288 { 3289 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3290 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3291 rdma_stat status; 3292 rib_hca_t *hca = (ctoqp(conn))->hca; 3293 3294 /* 3295 * Note: ALL buffer pools use the same memory type RDMARW. 3296 */ 3297 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3298 if (status == RDMA_SUCCESS) { 3299 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3300 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3301 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3302 } else { 3303 buf_handle->mrc_linfo = NULL; 3304 buf_handle->mrc_lmr = 0; 3305 buf_handle->mrc_rmr = 0; 3306 } 3307 return (status); 3308 } 3309 3310 static rdma_stat 3311 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, 3312 ibt_mr_flags_t spec, 3313 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3314 { 3315 ibt_mr_attr_t mem_attr; 3316 ibt_status_t ibt_status; 3317 mem_attr.mr_vaddr = (uintptr_t)buf; 3318 mem_attr.mr_len = (ib_msglen_t)size; 3319 mem_attr.mr_as = (struct as *)(caddr_t)adsp; 3320 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3321 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3322 IBT_MR_ENABLE_WINDOW_BIND | spec; 3323 3324 rw_enter(&hca->state_lock, RW_READER); 3325 if (hca->state == HCA_INITED) { 3326 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3327 &mem_attr, mr_hdlp, mr_descp); 3328 rw_exit(&hca->state_lock); 3329 } else { 3330 rw_exit(&hca->state_lock); 3331 return (RDMA_FAILED); 3332 } 3333 3334 if (ibt_status != IBT_SUCCESS) { 3335 return (RDMA_FAILED); 3336 } 3337 return (RDMA_SUCCESS); 3338 } 3339 3340 rdma_stat 3341 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3342 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 3343 { 3344 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3345 rib_lrc_entry_t *l; 3346 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3347 rdma_stat status; 3348 rib_hca_t *hca = (ctoqp(conn))->hca; 3349 3350 /* 3351 * Non-coherent memory registration. 
3352 */ 3353 l = (rib_lrc_entry_t *)lrc; 3354 if (l) { 3355 if (l->registered) { 3356 buf_handle->mrc_linfo = 3357 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3358 buf_handle->mrc_lmr = 3359 (uint32_t)l->lrc_mhandle.mrc_lmr; 3360 buf_handle->mrc_rmr = 3361 (uint32_t)l->lrc_mhandle.mrc_rmr; 3362 *sync_handle = (RIB_SYNCMEM_HANDLE) 3363 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3364 return (RDMA_SUCCESS); 3365 } else { 3366 /* Always register the whole buffer */ 3367 buf = (caddr_t)l->lrc_buf; 3368 buflen = l->lrc_len; 3369 } 3370 } 3371 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3372 3373 if (status == RDMA_SUCCESS) { 3374 if (l) { 3375 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; 3376 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey; 3377 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey; 3378 l->registered = TRUE; 3379 } 3380 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3381 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3382 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3383 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3384 } else { 3385 buf_handle->mrc_linfo = NULL; 3386 buf_handle->mrc_lmr = 0; 3387 buf_handle->mrc_rmr = 0; 3388 } 3389 return (status); 3390 } 3391 3392 /* ARGSUSED */ 3393 rdma_stat 3394 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3395 { 3396 rib_hca_t *hca = (ctoqp(conn))->hca; 3397 /* 3398 * Allow memory deregistration even if HCA is 3399 * getting detached. Need all outstanding 3400 * memory registrations to be deregistered 3401 * before HCA_DETACH_EVENT can be accepted. 3402 */ 3403 (void) ibt_deregister_mr(hca->hca_hdl, 3404 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 3405 return (RDMA_SUCCESS); 3406 } 3407 3408 /* ARGSUSED */ 3409 rdma_stat 3410 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3411 RIB_SYNCMEM_HANDLE sync_handle, void *lrc) 3412 { 3413 rib_lrc_entry_t *l; 3414 l = (rib_lrc_entry_t *)lrc; 3415 if (l) 3416 if (l->registered) 3417 return (RDMA_SUCCESS); 3418 3419 (void) rib_deregistermem(conn, buf, buf_handle); 3420 3421 return (RDMA_SUCCESS); 3422 } 3423 3424 /* ARGSUSED */ 3425 rdma_stat 3426 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3427 int len, int cpu) 3428 { 3429 ibt_status_t status; 3430 rib_hca_t *hca = (ctoqp(conn))->hca; 3431 ibt_mr_sync_t mr_segment; 3432 3433 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3434 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 3435 mr_segment.ms_len = (ib_memlen_t)len; 3436 if (cpu) { 3437 /* make incoming data visible to memory */ 3438 mr_segment.ms_flags = IBT_SYNC_WRITE; 3439 } else { 3440 /* make memory changes visible to IO */ 3441 mr_segment.ms_flags = IBT_SYNC_READ; 3442 } 3443 rw_enter(&hca->state_lock, RW_READER); 3444 if (hca->state == HCA_INITED) { 3445 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3446 rw_exit(&hca->state_lock); 3447 } else { 3448 rw_exit(&hca->state_lock); 3449 return (RDMA_FAILED); 3450 } 3451 3452 if (status == IBT_SUCCESS) 3453 return (RDMA_SUCCESS); 3454 else { 3455 return (RDMA_FAILED); 3456 } 3457 } 3458 3459 /* 3460 * XXXX ???? 3461 */ 3462 static rdma_stat 3463 rib_getinfo(rdma_info_t *info) 3464 { 3465 /* 3466 * XXXX Hack! 
3467 */ 3468 info->addrlen = 16; 3469 info->mts = 1000000; 3470 info->mtu = 1000000; 3471 3472 return (RDMA_SUCCESS); 3473 } 3474 3475 rib_bufpool_t * 3476 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3477 { 3478 rib_bufpool_t *rbp = NULL; 3479 bufpool_t *bp = NULL; 3480 caddr_t buf; 3481 ibt_mr_attr_t mem_attr; 3482 ibt_status_t ibt_status; 3483 int i, j; 3484 3485 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3486 3487 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3488 num * sizeof (void *), KM_SLEEP); 3489 3490 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3491 bp->numelems = num; 3492 3493 3494 switch (ptype) { 3495 case SEND_BUFFER: 3496 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3497 bp->rsize = RPC_MSG_SZ; 3498 break; 3499 case RECV_BUFFER: 3500 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3501 bp->rsize = RPC_BUF_SIZE; 3502 break; 3503 default: 3504 goto fail; 3505 } 3506 3507 /* 3508 * Register the pool. 3509 */ 3510 bp->bufsize = num * bp->rsize; 3511 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3512 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3513 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3514 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3515 sizeof (ibt_mr_desc_t), KM_SLEEP); 3516 rw_enter(&hca->state_lock, RW_READER); 3517 3518 if (hca->state != HCA_INITED) { 3519 rw_exit(&hca->state_lock); 3520 goto fail; 3521 } 3522 3523 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3524 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3525 mem_attr.mr_vaddr = (uintptr_t)buf; 3526 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3527 mem_attr.mr_as = NULL; 3528 ibt_status = ibt_register_mr(hca->hca_hdl, 3529 hca->pd_hdl, &mem_attr, 3530 &rbp->mr_hdl[i], 3531 &rbp->mr_desc[i]); 3532 if (ibt_status != IBT_SUCCESS) { 3533 for (j = 0; j < i; j++) { 3534 (void) ibt_deregister_mr(hca->hca_hdl, 3535 rbp->mr_hdl[j]); 3536 } 3537 rw_exit(&hca->state_lock); 3538 goto fail; 3539 } 3540 } 3541 rw_exit(&hca->state_lock); 3542 buf = (caddr_t)bp->buf; 3543 for (i = 0; i < num; i++, buf += bp->rsize) { 3544 bp->buflist[i] = (void *)buf; 3545 } 3546 bp->buffree = num - 1; /* no. of free buffers */ 3547 rbp->bpool = bp; 3548 3549 return (rbp); 3550 fail: 3551 if (bp) { 3552 if (bp->buf) 3553 kmem_free(bp->buf, bp->bufsize); 3554 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3555 } 3556 if (rbp) { 3557 if (rbp->mr_hdl) 3558 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3559 if (rbp->mr_desc) 3560 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3561 kmem_free(rbp, sizeof (rib_bufpool_t)); 3562 } 3563 return (NULL); 3564 } 3565 3566 static void 3567 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3568 { 3569 int i; 3570 rib_bufpool_t *rbp = NULL; 3571 bufpool_t *bp; 3572 3573 /* 3574 * Obtain pool address based on type of pool 3575 */ 3576 switch (ptype) { 3577 case SEND_BUFFER: 3578 rbp = hca->send_pool; 3579 break; 3580 case RECV_BUFFER: 3581 rbp = hca->recv_pool; 3582 break; 3583 default: 3584 return; 3585 } 3586 if (rbp == NULL) 3587 return; 3588 3589 bp = rbp->bpool; 3590 3591 /* 3592 * Deregister the pool memory and free it. 
3593 */ 3594 for (i = 0; i < bp->numelems; i++) { 3595 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3596 } 3597 } 3598 3599 static void 3600 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3601 { 3602 3603 rib_bufpool_t *rbp = NULL; 3604 bufpool_t *bp; 3605 3606 /* 3607 * Obtain pool address based on type of pool 3608 */ 3609 switch (ptype) { 3610 case SEND_BUFFER: 3611 rbp = hca->send_pool; 3612 break; 3613 case RECV_BUFFER: 3614 rbp = hca->recv_pool; 3615 break; 3616 default: 3617 return; 3618 } 3619 if (rbp == NULL) 3620 return; 3621 3622 bp = rbp->bpool; 3623 3624 /* 3625 * Free the pool memory. 3626 */ 3627 if (rbp->mr_hdl) 3628 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 3629 3630 if (rbp->mr_desc) 3631 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 3632 if (bp->buf) 3633 kmem_free(bp->buf, bp->bufsize); 3634 mutex_destroy(&bp->buflock); 3635 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 3636 kmem_free(rbp, sizeof (rib_bufpool_t)); 3637 } 3638 3639 void 3640 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 3641 { 3642 /* 3643 * Deregister the pool memory and free it. 3644 */ 3645 rib_rbufpool_deregister(hca, ptype); 3646 rib_rbufpool_free(hca, ptype); 3647 } 3648 3649 /* 3650 * Fetch a buffer from the pool of type specified in rdbuf->type. 3651 */ 3652 static rdma_stat 3653 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 3654 { 3655 rib_lrc_entry_t *rlep; 3656 3657 if (rdbuf->type == RDMA_LONG_BUFFER) { 3658 rlep = rib_get_cache_buf(conn, rdbuf->len); 3659 rdbuf->rb_private = (caddr_t)rlep; 3660 rdbuf->addr = rlep->lrc_buf; 3661 rdbuf->handle = rlep->lrc_mhandle; 3662 return (RDMA_SUCCESS); 3663 } 3664 3665 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 3666 if (rdbuf->addr) { 3667 switch (rdbuf->type) { 3668 case SEND_BUFFER: 3669 rdbuf->len = RPC_MSG_SZ; /* 1K */ 3670 break; 3671 case RECV_BUFFER: 3672 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 3673 break; 3674 default: 3675 rdbuf->len = 0; 3676 } 3677 return (RDMA_SUCCESS); 3678 } else 3679 return (RDMA_FAILED); 3680 } 3681 3682 #if defined(MEASURE_POOL_DEPTH) 3683 static void rib_recv_bufs(uint32_t x) { 3684 3685 } 3686 3687 static void rib_send_bufs(uint32_t x) { 3688 3689 } 3690 #endif 3691 3692 /* 3693 * Fetch a buffer of specified type. 3694 * Note that rdbuf->handle is mw's rkey. 3695 */ 3696 static void * 3697 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 3698 { 3699 rib_qp_t *qp = ctoqp(conn); 3700 rib_hca_t *hca = qp->hca; 3701 rdma_btype ptype = rdbuf->type; 3702 void *buf; 3703 rib_bufpool_t *rbp = NULL; 3704 bufpool_t *bp; 3705 int i; 3706 3707 /* 3708 * Obtain pool address based on type of pool 3709 */ 3710 switch (ptype) { 3711 case SEND_BUFFER: 3712 rbp = hca->send_pool; 3713 break; 3714 case RECV_BUFFER: 3715 rbp = hca->recv_pool; 3716 break; 3717 default: 3718 return (NULL); 3719 } 3720 if (rbp == NULL) 3721 return (NULL); 3722 3723 bp = rbp->bpool; 3724 3725 mutex_enter(&bp->buflock); 3726 if (bp->buffree < 0) { 3727 mutex_exit(&bp->buflock); 3728 return (NULL); 3729 } 3730 3731 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. 
*/ 3732 buf = bp->buflist[bp->buffree]; 3733 rdbuf->addr = buf; 3734 rdbuf->len = bp->rsize; 3735 for (i = bp->numelems - 1; i >= 0; i--) { 3736 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 3737 rdbuf->handle.mrc_rmr = 3738 (uint32_t)rbp->mr_desc[i].md_rkey; 3739 rdbuf->handle.mrc_linfo = 3740 (uintptr_t)rbp->mr_hdl[i]; 3741 rdbuf->handle.mrc_lmr = 3742 (uint32_t)rbp->mr_desc[i].md_lkey; 3743 #if defined(MEASURE_POOL_DEPTH) 3744 if (ptype == SEND_BUFFER) 3745 rib_send_bufs(MAX_BUFS - (bp->buffree+1)); 3746 if (ptype == RECV_BUFFER) 3747 rib_recv_bufs(MAX_BUFS - (bp->buffree+1)); 3748 #endif 3749 bp->buffree--; 3750 3751 mutex_exit(&bp->buflock); 3752 3753 return (buf); 3754 } 3755 } 3756 3757 mutex_exit(&bp->buflock); 3758 3759 return (NULL); 3760 } 3761 3762 static void 3763 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 3764 { 3765 3766 if (rdbuf->type == RDMA_LONG_BUFFER) { 3767 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private); 3768 rdbuf->rb_private = NULL; 3769 return; 3770 } 3771 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 3772 } 3773 3774 static void 3775 rib_rbuf_free(CONN *conn, int ptype, void *buf) 3776 { 3777 rib_qp_t *qp = ctoqp(conn); 3778 rib_hca_t *hca = qp->hca; 3779 rib_bufpool_t *rbp = NULL; 3780 bufpool_t *bp; 3781 3782 /* 3783 * Obtain pool address based on type of pool 3784 */ 3785 switch (ptype) { 3786 case SEND_BUFFER: 3787 rbp = hca->send_pool; 3788 break; 3789 case RECV_BUFFER: 3790 rbp = hca->recv_pool; 3791 break; 3792 default: 3793 return; 3794 } 3795 if (rbp == NULL) 3796 return; 3797 3798 bp = rbp->bpool; 3799 3800 mutex_enter(&bp->buflock); 3801 if (++bp->buffree >= bp->numelems) { 3802 /* 3803 * Should never happen 3804 */ 3805 bp->buffree--; 3806 } else { 3807 bp->buflist[bp->buffree] = buf; 3808 } 3809 mutex_exit(&bp->buflock); 3810 } 3811 3812 static rdma_stat 3813 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 3814 { 3815 rw_enter(&connlist->conn_lock, RW_WRITER); 3816 if (connlist->conn_hd) { 3817 cn->c_next = connlist->conn_hd; 3818 connlist->conn_hd->c_prev = cn; 3819 } 3820 connlist->conn_hd = cn; 3821 rw_exit(&connlist->conn_lock); 3822 3823 return (RDMA_SUCCESS); 3824 } 3825 3826 static rdma_stat 3827 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) 3828 { 3829 rw_enter(&connlist->conn_lock, RW_WRITER); 3830 if (cn->c_prev) { 3831 cn->c_prev->c_next = cn->c_next; 3832 } 3833 if (cn->c_next) { 3834 cn->c_next->c_prev = cn->c_prev; 3835 } 3836 if (connlist->conn_hd == cn) 3837 connlist->conn_hd = cn->c_next; 3838 rw_exit(&connlist->conn_lock); 3839 3840 return (RDMA_SUCCESS); 3841 } 3842 3843 /* 3844 * Connection management. 3845 * IBTF does not support recycling of channels. So connections are only 3846 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or 3847 * C_DISCONN_PEND state. No C_IDLE state. 3848 * C_CONN_PEND state: Connection establishment in progress to the server. 3849 * C_CONNECTED state: A connection when created is in C_CONNECTED state. 3850 * It has an RC channel associated with it. ibt_post_send/recv are allowed 3851 * only in this state. 3852 * C_ERROR_CONN state: A connection transitions to this state when WRs on the 3853 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event 3854 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA. 
3855 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when 3856 * c_ref drops to 0 (this indicates that RPC has no more references to this 3857 * connection), the connection should be destroyed. A connection transitions 3858 * into this state when it is being destroyed. 3859 */ 3860 /* ARGSUSED */ 3861 static rdma_stat 3862 rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn) 3863 { 3864 CONN *cn; 3865 int status = RDMA_SUCCESS; 3866 rib_hca_t *hca = rib_stat->hca; 3867 rib_qp_t *qp; 3868 clock_t cv_stat, timout; 3869 rpcib_ping_t rpt; 3870 3871 if (hca == NULL) 3872 return (RDMA_FAILED); 3873 3874 rw_enter(&rib_stat->hca->state_lock, RW_READER); 3875 if (hca->state == HCA_DETACHED) { 3876 rw_exit(&rib_stat->hca->state_lock); 3877 return (RDMA_FAILED); 3878 } 3879 rw_exit(&rib_stat->hca->state_lock); 3880 3881 again: 3882 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 3883 cn = hca->cl_conn_list.conn_hd; 3884 while (cn != NULL) { 3885 /* 3886 * First, clear up any connection in the ERROR state 3887 */ 3888 mutex_enter(&cn->c_lock); 3889 if (cn->c_state == C_ERROR_CONN) { 3890 if (cn->c_ref == 0) { 3891 /* 3892 * Remove connection from list and destroy it. 3893 */ 3894 cn->c_state = C_DISCONN_PEND; 3895 mutex_exit(&cn->c_lock); 3896 rw_exit(&hca->cl_conn_list.conn_lock); 3897 (void) rib_disconnect_channel(cn, 3898 &hca->cl_conn_list); 3899 goto again; 3900 } 3901 mutex_exit(&cn->c_lock); 3902 cn = cn->c_next; 3903 continue; 3904 } 3905 if (cn->c_state == C_DISCONN_PEND) { 3906 mutex_exit(&cn->c_lock); 3907 cn = cn->c_next; 3908 continue; 3909 } 3910 if ((cn->c_raddr.len == svcaddr->len) && 3911 bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) { 3912 /* 3913 * Our connection. Give up conn list lock 3914 * as we are done traversing the list. 3915 */ 3916 rw_exit(&hca->cl_conn_list.conn_lock); 3917 if (cn->c_state == C_CONNECTED) { 3918 cn->c_ref++; /* sharing a conn */ 3919 mutex_exit(&cn->c_lock); 3920 *conn = cn; 3921 return (status); 3922 } 3923 if (cn->c_state == C_CONN_PEND) { 3924 /* 3925 * Hold a reference to this conn before 3926 * we give up the lock. 3927 */ 3928 cn->c_ref++; 3929 timout = ddi_get_lbolt() + 3930 drv_usectohz(CONN_WAIT_TIME * 1000000); 3931 while ((cv_stat = cv_timedwait_sig(&cn->c_cv, 3932 &cn->c_lock, timout)) > 0 && 3933 cn->c_state == C_CONN_PEND) 3934 ; 3935 if (cv_stat == 0) { 3936 cn->c_ref--; 3937 mutex_exit(&cn->c_lock); 3938 return (RDMA_INTR); 3939 } 3940 if (cv_stat < 0) { 3941 cn->c_ref--; 3942 mutex_exit(&cn->c_lock); 3943 return (RDMA_TIMEDOUT); 3944 } 3945 if (cn->c_state == C_CONNECTED) { 3946 *conn = cn; 3947 mutex_exit(&cn->c_lock); 3948 return (status); 3949 } else { 3950 cn->c_ref--; 3951 mutex_exit(&cn->c_lock); 3952 return (RDMA_TIMEDOUT); 3953 } 3954 } 3955 } 3956 mutex_exit(&cn->c_lock); 3957 cn = cn->c_next; 3958 } 3959 rw_exit(&hca->cl_conn_list.conn_lock); 3960 3961 bzero(&rpt, sizeof (rpcib_ping_t)); 3962 3963 status = rib_ping_srv(addr_type, svcaddr, &rpt); 3964 if (status != RDMA_SUCCESS) { 3965 return (RDMA_FAILED); 3966 } 3967 3968 /* 3969 * Channel to server doesn't exist yet, create one. 3970 */ 3971 if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) { 3972 return (RDMA_FAILED); 3973 } 3974 cn = qptoc(qp); 3975 cn->c_state = C_CONN_PEND; 3976 cn->c_ref = 1; 3977 3978 /* 3979 * Add to conn list. 3980 * We had given up the READER lock. In the time since then, 3981 * another thread might have created the connection we are 3982 * trying here. 
But for now, that is quite all right - there 3983 might be two connections between a pair of hosts instead 3984 of one. If we really want to close that window, 3985 then we need to check the list after acquiring the 3986 WRITER lock. 3987 */ 3988 (void) rib_add_connlist(cn, &hca->cl_conn_list); 3989 status = rib_conn_to_srv(hca, qp, &rpt); 3990 mutex_enter(&cn->c_lock); 3991 if (status == RDMA_SUCCESS) { 3992 cn->c_state = C_CONNECTED; 3993 *conn = cn; 3994 } else { 3995 cn->c_state = C_ERROR_CONN; 3996 cn->c_ref--; 3997 } 3998 cv_broadcast(&cn->c_cv); 3999 mutex_exit(&cn->c_lock); 4000 return (status); 4001 } 4002 4003 static rdma_stat 4004 rib_conn_release(CONN *conn) 4005 { 4006 rib_qp_t *qp = ctoqp(conn); 4007 4008 mutex_enter(&conn->c_lock); 4009 conn->c_ref--; 4010 4011 /* 4012 * If a conn is C_ERROR_CONN, close the channel. 4013 * If it's CONNECTED, keep it that way. 4014 */ 4015 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) { 4016 conn->c_state = C_DISCONN_PEND; 4017 mutex_exit(&conn->c_lock); 4018 if (qp->mode == RIB_SERVER) 4019 (void) rib_disconnect_channel(conn, 4020 &qp->hca->srv_conn_list); 4021 else 4022 (void) rib_disconnect_channel(conn, 4023 &qp->hca->cl_conn_list); 4024 return (RDMA_SUCCESS); 4025 } 4026 mutex_exit(&conn->c_lock); 4027 return (RDMA_SUCCESS); 4028 } 4029 4030 /* 4031 * Add at front of list 4032 */ 4033 static struct rdma_done_list * 4034 rdma_done_add(rib_qp_t *qp, uint32_t xid) 4035 { 4036 struct rdma_done_list *rd; 4037 4038 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4039 4040 rd = kmem_alloc(sizeof (*rd), KM_SLEEP); 4041 rd->xid = xid; 4042 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL); 4043 4044 rd->prev = NULL; 4045 rd->next = qp->rdlist; 4046 if (qp->rdlist != NULL) 4047 qp->rdlist->prev = rd; 4048 qp->rdlist = rd; 4049 4050 return (rd); 4051 } 4052 4053 static void 4054 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd) 4055 { 4056 struct rdma_done_list *r; 4057 4058 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4059 4060 r = rd->next; 4061 if (r != NULL) { 4062 r->prev = rd->prev; 4063 } 4064 4065 r = rd->prev; 4066 if (r != NULL) { 4067 r->next = rd->next; 4068 } else { 4069 qp->rdlist = rd->next; 4070 } 4071 4072 cv_destroy(&rd->rdma_done_cv); 4073 kmem_free(rd, sizeof (*rd)); 4074 } 4075 4076 static void 4077 rdma_done_rem_list(rib_qp_t *qp) 4078 { 4079 struct rdma_done_list *r, *n; 4080 4081 mutex_enter(&qp->rdlist_lock); 4082 for (r = qp->rdlist; r != NULL; r = n) { 4083 n = r->next; 4084 rdma_done_rm(qp, r); 4085 } 4086 mutex_exit(&qp->rdlist_lock); 4087 } 4088 4089 static void 4090 rdma_done_notify(rib_qp_t *qp, uint32_t xid) 4091 { 4092 struct rdma_done_list *r = qp->rdlist; 4093 4094 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4095 4096 while (r) { 4097 if (r->xid == xid) { 4098 cv_signal(&r->rdma_done_cv); 4099 return; 4100 } else { 4101 r = r->next; 4102 } 4103 } 4104 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid, 4105 int, xid); 4106 } 4107 4108 4109 /* 4110 * Goes through all connections and closes the channel. 4111 * This will cause all the WRs on those channels to be 4112 * flushed. 4113 */ 4114 static void 4115 rib_close_channels(rib_conn_list_t *connlist) 4116 { 4117 CONN *conn; 4118 rib_qp_t *qp; 4119 4120 rw_enter(&connlist->conn_lock, RW_READER); 4121 conn = connlist->conn_hd; 4122 while (conn != NULL) { 4123 mutex_enter(&conn->c_lock); 4124 qp = ctoqp(conn); 4125 if (conn->c_state == C_CONNECTED) { 4126 /* 4127 * Live connection in CONNECTED state. 4128 * Call ibt_close_rc_channel in nonblocking mode 4129 * with no callbacks.
/*
 * Goes through all connections and closes the channel. This will
 * cause all the WRs on those channels to be flushed.
 */
static void
rib_close_channels(rib_conn_list_t *connlist)
{
	CONN		*conn;
	rib_qp_t	*qp;

	rw_enter(&connlist->conn_lock, RW_READER);
	conn = connlist->conn_hd;
	while (conn != NULL) {
		mutex_enter(&conn->c_lock);
		qp = ctoqp(conn);
		if (conn->c_state == C_CONNECTED) {
			/*
			 * Live connection in CONNECTED state.
			 * Call ibt_close_rc_channel in nonblocking mode
			 * with no callbacks.
			 */
			conn->c_state = C_ERROR_CONN;
			(void) ibt_close_rc_channel(qp->qp_hdl,
			    IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
			(void) ibt_free_channel(qp->qp_hdl);
			qp->qp_hdl = NULL;
		} else {
			if (conn->c_state == C_ERROR_CONN &&
			    qp->qp_hdl != NULL) {
				/*
				 * Connection in ERROR state but
				 * channel is not yet freed.
				 */
				(void) ibt_close_rc_channel(qp->qp_hdl,
				    IBT_NOCALLBACKS, NULL, 0, NULL,
				    NULL, 0);
				(void) ibt_free_channel(qp->qp_hdl);
				qp->qp_hdl = NULL;
			}
		}
		mutex_exit(&conn->c_lock);
		conn = conn->c_next;
	}
	rw_exit(&connlist->conn_lock);
}

/*
 * Frees up all connections that are no longer being referenced
 */
static void
rib_purge_connlist(rib_conn_list_t *connlist)
{
	CONN		*conn;

top:
	rw_enter(&connlist->conn_lock, RW_READER);
	conn = connlist->conn_hd;
	while (conn != NULL) {
		mutex_enter(&conn->c_lock);

		/*
		 * At this point the connection is either in ERROR
		 * or DISCONN_PEND state. If in DISCONN_PEND state,
		 * then some other thread is culling that connection.
		 * If not, and if c_ref is 0, then destroy the connection.
		 */
		if (conn->c_ref == 0 &&
		    conn->c_state != C_DISCONN_PEND) {
			/*
			 * Cull the connection
			 */
			conn->c_state = C_DISCONN_PEND;
			mutex_exit(&conn->c_lock);
			rw_exit(&connlist->conn_lock);
			(void) rib_disconnect_channel(conn, connlist);
			goto top;
		} else {
			/*
			 * conn disconnect already scheduled or will
			 * happen from conn_release when c_ref drops to 0.
			 */
			mutex_exit(&conn->c_lock);
		}
		conn = conn->c_next;
	}
	rw_exit(&connlist->conn_lock);

	/*
	 * At this point, only connections with c_ref != 0 are on the list
	 */
}
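/*
 * Connection teardown follows a small state machine: a connection that
 * hits an error is marked C_ERROR_CONN, and whichever thread then finds
 * it unreferenced (c_ref == 0), either rib_conn_release() or
 * rib_purge_connlist() above, moves it to C_DISCONN_PEND before calling
 * rib_disconnect_channel(), so the channel is torn down exactly once.
 * Connections that are still referenced stay on their list until the
 * last user releases them.
 */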
/*
 * Cleans up and closes all uses of the HCA
 */
static void
rib_detach_hca(rib_hca_t *hca)
{

	/*
	 * Stop all services on the HCA
	 * Go through cl_conn_list and close all rc_channels
	 * Go through svr_conn_list and close all rc_channels
	 * Free connections whose c_ref has dropped to 0
	 * Destroy all CQs
	 * Deregister and release all buffer pool memory after all
	 * connections are destroyed
	 * Free the protection domain
	 * ibt_close_hca()
	 */
	rw_enter(&hca->state_lock, RW_WRITER);
	if (hca->state == HCA_DETACHED) {
		rw_exit(&hca->state_lock);
		return;
	}

	hca->state = HCA_DETACHED;
	rib_stat->nhca_inited--;

	rib_stop_services(hca);
	rib_close_channels(&hca->cl_conn_list);
	rib_close_channels(&hca->srv_conn_list);

	rib_mod.rdma_count--;

	rw_exit(&hca->state_lock);

	/*
	 * The purge below will free all data structures used by the CQ
	 * handlers. We don't want to receive completions after the purge,
	 * so we free the CQs now.
	 */
	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);

	rib_purge_connlist(&hca->cl_conn_list);
	rib_purge_connlist(&hca->srv_conn_list);

	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
	if (stats_enabled) {
		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
		    GLOBAL_ZONEID);
	}

	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	if (hca->srv_conn_list.conn_hd == NULL &&
	    hca->cl_conn_list.conn_hd == NULL) {
		/*
		 * conn_lists are NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_rbufpool_destroy(hca, RECV_BUFFER);
		rib_rbufpool_destroy(hca, SEND_BUFFER);
		rib_destroy_cache(hca);
		rdma_unregister_mod(&rib_mod);
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
		(void) ibt_close_hca(hca->hca_hdl);
		hca->hca_hdl = NULL;
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	rw_exit(&hca->srv_conn_list.conn_lock);

	if (hca->hca_hdl != NULL) {
		mutex_enter(&hca->inuse_lock);
		while (hca->inuse)
			cv_wait(&hca->cb_cv, &hca->inuse_lock);
		mutex_exit(&hca->inuse_lock);

		rdma_unregister_mod(&rib_mod);

		/*
		 * conn_lists are now NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_rbufpool_destroy(hca, RECV_BUFFER);
		rib_rbufpool_destroy(hca, SEND_BUFFER);
		rib_destroy_cache(hca);
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
		(void) ibt_close_hca(hca->hca_hdl);
		hca->hca_hdl = NULL;
	}
}

static void
rib_server_side_cache_reclaim(void *argp)
{
	cache_avl_struct_t	*rcas;
	rib_lrc_entry_t		*rb;
	rib_hca_t		*hca = (rib_hca_t *)argp;

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_first(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rib_total_buffers--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);
			cache_allocation -= rb->lrc_len;
			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		kmem_cache_free(hca->server_side_cache, rcas);
		rcas = avl_first(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}
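/*
 * rib_server_side_cache_cleanup() below is the lighter-weight companion
 * of the reclaim routine above: instead of emptying the whole AVL tree,
 * it starts at avl_last() (the largest buffer length, since avl_compare()
 * orders nodes by length) and keeps freeing cached buffers until
 * cache_allocation drops back below cache_limit.  It runs asynchronously,
 * dispatched onto the reg_cache_clean_up taskq by rib_force_cleanup().
 */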
static void
rib_server_side_cache_cleanup(void *argp)
{
	cache_avl_struct_t	*rcas;
	rib_lrc_entry_t		*rb;
	rib_hca_t		*hca = (rib_hca_t *)argp;

	rw_enter(&hca->avl_rw_lock, RW_READER);
	if (cache_allocation < cache_limit) {
		rw_exit(&hca->avl_rw_lock);
		return;
	}
	rw_exit(&hca->avl_rw_lock);

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_last(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rib_total_buffers--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);
			cache_allocation -= rb->lrc_len;
			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		if (hca->server_side_cache) {
			kmem_cache_free(hca->server_side_cache, rcas);
		}
		if (cache_allocation < cache_limit) {
			rw_exit(&hca->avl_rw_lock);
			return;
		}

		rcas = avl_last(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}

static int
avl_compare(const void *t1, const void *t2)
{
	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
		return (0);

	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
		return (-1);

	return (1);
}

static void
rib_destroy_cache(rib_hca_t *hca)
{
	if (hca->reg_cache_clean_up != NULL) {
		ddi_taskq_destroy(hca->reg_cache_clean_up);
		hca->reg_cache_clean_up = NULL;
	}
	if (hca->avl_init) {
		rib_server_side_cache_reclaim((void *)hca);
		if (hca->server_side_cache) {
			kmem_cache_destroy(hca->server_side_cache);
			hca->server_side_cache = NULL;
		}
		avl_destroy(&hca->avl_tree);
		mutex_destroy(&hca->cache_allocation);
		rw_destroy(&hca->avl_rw_lock);
	}
	hca->avl_init = FALSE;
}

static void
rib_force_cleanup(void *hca)
{
	if (((rib_hca_t *)hca)->reg_cache_clean_up != NULL)
		(void) ddi_taskq_dispatch(
		    ((rib_hca_t *)hca)->reg_cache_clean_up,
		    rib_server_side_cache_cleanup,
		    (void *)hca, DDI_NOSLEEP);
}
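/*
 * rib_get_cache_buf() below looks up (or creates) the AVL node for the
 * requested length and hands back a cached, possibly pre-registered
 * buffer; on a miss it falls back to a plain kmem allocation.  A consumer
 * pairs it with rib_free_cache_buf() once the buffer is no longer needed.
 * Illustrative sketch only; the registration step shown is an assumption
 * about how a caller would fill in lrc_mhandle:
 *
 *	rib_lrc_entry_t *lrc = rib_get_cache_buf(conn, len);
 *
 *	if (!lrc->registered) {
 *		(register lrc->lrc_buf, store the handle in lrc->lrc_mhandle,
 *		set lrc->registered = TRUE)
 *	}
 *	(use lrc->lrc_buf)
 *	rib_free_cache_buf(conn, lrc);
 */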
static rib_lrc_entry_t *
rib_get_cache_buf(CONN *conn, uint32_t len)
{
	cache_avl_struct_t	cas, *rcas;
	rib_hca_t		*hca = (ctoqp(conn))->hca;
	rib_lrc_entry_t		*reply_buf;
	avl_index_t		where = NULL;
	uint64_t		c_alloc = 0;

	if (!hca->avl_init)
		goto error_alloc;

	cas.len = len;

	rw_enter(&hca->avl_rw_lock, RW_READER);

	mutex_enter(&hca->cache_allocation);
	c_alloc = cache_allocation;
	mutex_exit(&hca->cache_allocation);

	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
	    &where)) == NULL) {
		/* Are we above the cache limit? */
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);
			cache_misses_above_the_limit++;

			/* Allocate and register the buffer directly */
			goto error_alloc;
		}

		rw_exit(&hca->avl_rw_lock);
		rw_enter(&hca->avl_rw_lock, RW_WRITER);

		/* Recheck to make sure no other thread added the entry in */
		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
		    &cas, &where)) == NULL) {
			/* Allocate an avl tree entry */
			rcas = (cache_avl_struct_t *)
			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);

			bzero(rcas, sizeof (cache_avl_struct_t));
			rcas->elements = 0;
			rcas->r.forw = &rcas->r;
			rcas->r.back = &rcas->r;
			rcas->len = len;
			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
			avl_insert(&hca->avl_tree, rcas, where);
		}
	}

	mutex_enter(&rcas->node_lock);

	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
		rib_total_buffers--;
		cache_hits++;
		reply_buf = rcas->r.forw;
		remque(reply_buf);
		rcas->elements--;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);
		mutex_enter(&hca->cache_allocation);
		cache_allocation -= len;
		mutex_exit(&hca->cache_allocation);
	} else {
		/* Are we above the cache limit? */
		mutex_exit(&rcas->node_lock);
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);
			cache_misses_above_the_limit++;
			/* Allocate and register the buffer directly */
			goto error_alloc;
		}
		rw_exit(&hca->avl_rw_lock);
		cache_misses++;
		/* Allocate a reply_buf entry */
		reply_buf = (rib_lrc_entry_t *)
		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
		bzero(reply_buf, sizeof (rib_lrc_entry_t));
		reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
		reply_buf->lrc_len = len;
		reply_buf->registered = FALSE;
		reply_buf->avl_node = (void *)rcas;
	}

	return (reply_buf);

error_alloc:
	reply_buf = (rib_lrc_entry_t *)
	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
	bzero(reply_buf, sizeof (rib_lrc_entry_t));
	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
	reply_buf->lrc_len = len;
	reply_buf->registered = FALSE;
	reply_buf->avl_node = NULL;

	return (reply_buf);
}

/*
 * Return a pre-registered buffer to the cache (without
 * deregistering it).
 */
static void
rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
{
	cache_avl_struct_t	cas, *rcas;
	avl_index_t		where = NULL;
	rib_hca_t		*hca = (ctoqp(conn))->hca;

	if (!hca->avl_init)
		goto error_free;

	cas.len = reg_buf->lrc_len;
	rw_enter(&hca->avl_rw_lock, RW_READER);
	if ((rcas = (cache_avl_struct_t *)
	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
		rw_exit(&hca->avl_rw_lock);
		goto error_free;
	} else {
		rib_total_buffers++;
		cas.len = reg_buf->lrc_len;
		mutex_enter(&rcas->node_lock);
		insque(reg_buf, &rcas->r);
		rcas->elements++;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);
		mutex_enter(&hca->cache_allocation);
		cache_allocation += cas.len;
		mutex_exit(&hca->cache_allocation);
	}

	return;

error_free:

	if (reg_buf->registered)
		(void) rib_deregistermem_via_hca(hca,
		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
}
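/*
 * The *_via_hca helpers below handle memory registration for callers that
 * have a rib_hca_t but no CONN.  The 64-bit mrc_linfo field of struct mrc
 * carries the ibt_mr_hdl_t across the RDMATF interface (with the local
 * and remote keys in mrc_lmr and mrc_rmr), and rib_deregistermem_via_hca()
 * casts it back to an ibt_mr_hdl_t for ibt_deregister_mr().
 */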
static rdma_stat
rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
	uint_t buflen, struct mrc *buf_handle)
{
	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
	rdma_stat	status;

	/*
	 * Note: ALL buffer pools use the same memory type RDMARW.
	 */
	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
	} else {
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
	return (status);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
    struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
{
	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
	return (RDMA_SUCCESS);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
{
	(void) ibt_deregister_mr(hca->hca_hdl,
	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
	return (RDMA_SUCCESS);
}

/*
 * Check if the IP interface named by `lifrp' is RDMA-capable.
 */
static boolean_t
rpcib_rdma_capable_interface(struct lifreq *lifrp)
{
	char ifname[LIFNAMSIZ];
	char *cp;

	if (lifrp->lifr_type == IFT_IB)
		return (B_TRUE);

	/*
	 * Strip off the logical interface portion before getting
	 * intimate with the name.
	 */
	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
	if ((cp = strchr(ifname, ':')) != NULL)
		*cp = '\0';

	return (strcmp("lo0", ifname) == 0);
}

static int
rpcib_do_ip_ioctl(int cmd, int len, void *arg)
{
	vnode_t		*kvp, *vp;
	TIUSER		*tiptr;
	struct strioctl	iocb;
	k_sigset_t	smask;
	int		err = 0;

	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) {
		if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE,
		    &tiptr, CRED()) == 0) {
			vp = tiptr->fp->f_vnode;
		} else {
			VN_RELE(kvp);
			return (EPROTO);
		}
	} else {
		return (EPROTO);
	}

	iocb.ic_cmd = cmd;
	iocb.ic_timout = 0;
	iocb.ic_len = len;
	iocb.ic_dp = (caddr_t)arg;
	sigintr(&smask, 0);
	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
	sigunintr(&smask);
	(void) t_kclose(tiptr, 0);
	VN_RELE(kvp);
	return (err);
}
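/*
 * rpcib_do_ip_ioctl() above opens /dev/udp, obtains a TLI endpoint with
 * t_kopen(), and pushes the request down as a transparent I_STR ioctl.
 * For example, the system's interface count could be fetched as in the
 * illustrative sketch below (rpcib_do_lifconf() below does exactly this
 * as its first step):
 *
 *	struct lifnum lifn;
 *
 *	bzero(&lifn, sizeof (struct lifnum));
 *	lifn.lifn_family = AF_UNSPEC;
 *	if (rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum),
 *	    &lifn) == 0)
 *		(lifn.lifn_count now holds the interface count)
 */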
/*
 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
 */
static int
rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
{
	int		err;
	struct lifnum	lifn;

	bzero(&lifn, sizeof (struct lifnum));
	lifn.lifn_family = AF_UNSPEC;

	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
	if (err != 0)
		return (err);

	/*
	 * Pad the interface count to account for additional interfaces that
	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
	 */
	lifn.lifn_count += 4;

	bzero(lifcp, sizeof (struct lifconf));
	lifcp->lifc_family = AF_UNSPEC;
	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);

	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
	if (err != 0) {
		kmem_free(lifcp->lifc_buf, *bufsizep);
		return (err);
	}
	return (0);
}

static boolean_t
rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
{
	uint_t			i, nifs;
	uint_t			bufsize;
	struct lifconf		lifc;
	struct lifreq		*lifrp;
	struct sockaddr_in	*sinp;
	struct sockaddr_in6	*sin6p;

	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
	bzero(addrs6, sizeof (rpcib_ipaddrs_t));

	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
		return (B_FALSE);

	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
		kmem_free(lifc.lifc_buf, bufsize);
		return (B_FALSE);
	}

	/*
	 * Worst case is that all of the addresses are IB-capable and have
	 * the same address family, so size our buffers accordingly.
	 */
	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);

	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
		if (!rpcib_rdma_capable_interface(lifrp))
			continue;

		if (lifrp->lifr_addr.ss_family == AF_INET) {
			sinp = addrs4->ri_list;
			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
			    sizeof (struct sockaddr_in));
		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
			sin6p = addrs6->ri_list;
			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
			    sizeof (struct sockaddr_in6));
		}
	}

	kmem_free(lifc.lifc_buf, bufsize);
	return (B_TRUE);
}

/* ARGSUSED */
static int
rpcib_cache_kstat_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE) {
		return (EACCES);
	}
	rpcib_kstat.cache_limit.value.ui64 =
	    (uint64_t)cache_limit;
	rpcib_kstat.cache_allocation.value.ui64 =
	    (uint64_t)cache_allocation;
	rpcib_kstat.cache_hits.value.ui64 =
	    (uint64_t)cache_hits;
	rpcib_kstat.cache_misses.value.ui64 =
	    (uint64_t)cache_misses;
	rpcib_kstat.cache_misses_above_the_limit.value.ui64 =
	    (uint64_t)cache_misses_above_the_limit;
	return (0);
}
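/*
 * The counters filled in by rpcib_cache_kstat_update() belong to the
 * kstat deleted above in rib_detach_hca() ("unix", instance 0,
 * "rpcib_cache").  Once the cache setup code (not shown in this part of
 * the file) has installed it and set stats_enabled, the cache behavior
 * should be observable from userland, for example with:
 *
 *	# kstat -m unix -n rpcib_cache
 *
 * (Illustrative usage note only.)
 */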