/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2007, The Ohio State University. All rights reserved.
 *
 * Portions of this source code are developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * Acknowledgements to contributions from developers:
 *    Ranjit Noronha: noronha@cse.ohio-state.edu
 *    Lei Chai      : chail@cse.ohio-state.edu
 *    Weikuan Yu    : yuw@cse.ohio-state.edu
 *
 */

/*
 * The rpcib plugin. Implements the interface for RDMATF's
 * interaction with IBTF.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/file.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/pathname.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/isa_defs.h>
#include <sys/callb.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/sdt.h>
#include <sys/ib/ibtl/ibti.h>
#include <rpc/rpc.h>
#include <rpc/ib.h>
#include <sys/modctl.h>
#include <sys/kstr.h>
#include <sys/sockio.h>
#include <sys/vnode.h>
#include <sys/tiuser.h>
#include <net/if.h>
#include <net/if_types.h>
#include <sys/cred.h>
#include <rpc/rpc_rdma.h>
#include <nfs/nfs.h>
#include <sys/atomic.h>

#define NFS_RDMA_PORT   2050

/*
 * Convenience structure used by rpcib_get_ib_addresses()
 */
typedef struct rpcib_ipaddrs {
    void    *ri_list;   /* pointer to list of addresses */
    uint_t  ri_count;   /* number of addresses in list */
    uint_t  ri_size;    /* size of ri_list in bytes */
} rpcib_ipaddrs_t;

/*
 * Prototype declarations for driver ops
 */
static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
    void *, void **);
static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
static int rpcib_do_ip_ioctl(int, int, void *);
static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
static int rpcib_cache_kstat_update(kstat_t *, int);
static void rib_force_cleanup(void *);

struct {
    kstat_named_t   cache_limit;
    kstat_named_t   cache_allocation;
    kstat_named_t   cache_hits;
    kstat_named_t   cache_misses;
    kstat_named_t   cache_misses_above_the_limit;
} rpcib_kstat = {
    {"cache_limit",                  KSTAT_DATA_UINT64 },
    {"cache_allocation",             KSTAT_DATA_UINT64 },
    {"cache_hits",                   KSTAT_DATA_UINT64 },
    {"cache_misses",                 KSTAT_DATA_UINT64 },
    {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
};

/* rpcib cb_ops */
static struct cb_ops rpcib_cbops = {
    nulldev,            /* open */
    nulldev,            /* close */
    nodev,              /* strategy */
    nodev,              /* print */
    nodev,              /* dump */
    nodev,              /* read */
    nodev,              /* write */
    nodev,              /* ioctl */
    nodev,              /* devmap */
    nodev,              /* mmap */
    nodev,              /* segmap */
    nochpoll,           /* poll */
    ddi_prop_op,        /* prop_op */
    NULL,               /* stream */
    D_MP,               /* cb_flag */
    CB_REV,             /* rev */
    nodev,              /* int (*cb_aread)() */
    nodev               /* int (*cb_awrite)() */
};

/*
 * Device options
 */
static struct dev_ops rpcib_ops = {
    DEVO_REV,               /* devo_rev, */
    0,                      /* refcnt */
    rpcib_getinfo,          /* info */
    nulldev,                /* identify */
    nulldev,                /* probe */
    rpcib_attach,           /* attach */
    rpcib_detach,           /* detach */
    nodev,                  /* reset */
    &rpcib_cbops,           /* driver ops - devctl interfaces */
    NULL,                   /* bus operations */
    NULL,                   /* power */
    ddi_quiesce_not_needed, /* quiesce */
};

/*
 * Module linkage information.
 */

static struct modldrv rib_modldrv = {
    &mod_driverops,         /* Driver module */
    "RPCIB plugin driver",  /* Driver name and version */
    &rpcib_ops,             /* Driver ops */
};

static struct modlinkage rib_modlinkage = {
    MODREV_1,
    (void *)&rib_modldrv,
    NULL
};

typedef struct rib_lrc_entry {
    struct rib_lrc_entry *forw;
    struct rib_lrc_entry *back;
    char *lrc_buf;

    uint32_t lrc_len;
    void *avl_node;
    bool_t registered;

    struct mrc lrc_mhandle;
    bool_t lrc_on_freed_list;
} rib_lrc_entry_t;

typedef struct cache_struct {
    rib_lrc_entry_t r;
    uint32_t        len;
    uint32_t        elements;
    kmutex_t        node_lock;
    avl_node_t      avl_link;
} cache_avl_struct_t;

static uint64_t rib_total_buffers = 0;
uint64_t cache_limit = 100 * 1024 * 1024;
static volatile uint64_t cache_allocation = 0;
static uint64_t cache_watermark = 80 * 1024 * 1024;
static uint64_t cache_hits = 0;
static uint64_t cache_misses = 0;
static uint64_t cache_cold_misses = 0;
static uint64_t cache_hot_misses = 0;
static uint64_t cache_misses_above_the_limit = 0;
static bool_t stats_enabled = FALSE;

static uint64_t max_unsignaled_rws = 5;

/*
 * rib_stat: private data pointer used when registering
 * with the IBTF. It is returned to the consumer
 * in all callbacks.
 */
static rpcib_state_t *rib_stat = NULL;

#define RNR_RETRIES IBT_RNR_RETRY_1
#define MAX_PORTS   2

int preposted_rbufs = RDMA_BUFS_GRANT;
int send_threshold = 1;

/*
 * State of the plugin.
 * ACCEPT = accepting new connections and requests.
 * NO_ACCEPT = not accepting new connections and requests.
 * This should eventually move to the rpcib_state_t structure, since this
 * will tell in which state the plugin is for a particular type of service
 * like NFS, NLM or the v4 callback daemon. The plugin might be in accept
 * state for one and in no_accept state for the other.
 */
int plugin_state;
kmutex_t plugin_state_lock;

ldi_ident_t rpcib_li;
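/*
 * Illustrative sketch (not part of the original code): anything that hands
 * inbound work to KRPC is expected to gate on plugin_state while holding
 * plugin_state_lock, as rib_svc_rcq_handler() does further below. Roughly:
 *
 *	mutex_enter(&plugin_state_lock);
 *	if (plugin_state == ACCEPT) {
 *		... queue the request, e.g. via svc_queuereq() ...
 *	} else {
 *		... drop the message and free its receive buffer ...
 *	}
 *	mutex_exit(&plugin_state_lock);
 */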
/*
 * RPCIB RDMATF operations
 */
#if defined(MEASURE_POOL_DEPTH)
static void rib_posted_rbufs(uint32_t x) { return; }
#endif
static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
static rdma_stat rib_disconnect(CONN *conn);
static void rib_listen(struct rdma_svc_data *rd);
static void rib_listen_stop(struct rdma_svc_data *rd);
static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf,
    uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
    struct mrc buf_handle);
static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
    caddr_t buf, uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
    struct mrc buf_handle);
static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf,
    uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
    void *lrc);
static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
    struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
    caddr_t buf, int len, int cpu);

static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);

static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);

static void rib_rbuf_free(CONN *conn, int ptype, void *buf);

static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **);
static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
static rdma_stat rib_conn_release(CONN *conn);
static rdma_stat rib_getinfo(rdma_info_t *info);

static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
static void rib_destroy_cache(rib_hca_t *hca);
static void rib_server_side_cache_reclaim(void *argp);
static int avl_compare(const void *t1, const void *t2);

static void rib_stop_services(rib_hca_t *);
static void rib_close_channels(rib_conn_list_t *);

/*
 * RPCIB addressing operations
 */

/*
 * RDMA operations the RPCIB module exports
 */
static rdmaops_t rib_ops = {
    rib_reachable,
    rib_conn_get,
    rib_conn_release,
    rib_listen,
    rib_listen_stop,
    rib_registermem,
    rib_deregistermem,
    rib_registermemsync,
    rib_deregistermemsync,
    rib_syncmem,
    rib_reg_buf_alloc,
    rib_reg_buf_free,
    rib_send,
    rib_send_resp,
    rib_post_resp,
    rib_post_resp_remove,
    rib_post_recv,
    rib_recv,
    rib_read,
    rib_write,
    rib_getinfo,
};
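/*
 * Illustrative note (an assumption, not stated in the original comments):
 * the generic RDMA transport framework reaches rpcib only through the
 * rib_ops vector above, which is registered via rdma_register_mod(&rib_mod)
 * in rpcib_attach(). A client-side call sequence through this vector might
 * look roughly like:
 *
 *	(void) rib_reachable(AF_INET, raddr, &handle);
 *	(void) rib_conn_get(raddr, AF_INET, handle, &conn);
 *	(void) rib_send(conn, cl, msgid);
 *	(void) rib_conn_release(conn);
 */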
/*
 * RDMATF RPCIB plugin details
 */
static rdma_mod_t rib_mod = {
    "ibtf",     /* api name */
    RDMATF_VERS_1,
    0,
    &rib_ops,   /* rdma op vector for ibtf */
};

static rdma_stat open_hcas(rpcib_state_t *);
static rdma_stat rib_qp_init(rib_qp_t *, int);
static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
    ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
    ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *,
    ibt_ip_addr_t *, ibt_ip_addr_t *);
static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
    rib_qp_t **);
static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
    rib_qp_t **);
static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
static int rib_free_sendwait(struct send_wid *);
static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
static void rdma_done_rem_list(rib_qp_t *);
static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);

static void rib_async_handler(void *,
    ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
static int rib_free_svc_recv(struct svc_recv *);
static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
static void rib_free_wid(struct recv_wid *);
static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
static void rib_detach_hca(rib_hca_t *);
static rdma_stat rib_chk_srv_ibaddr(struct netbuf *, int,
    ibt_path_info_t *, ibt_ip_addr_t *, ibt_ip_addr_t *);

/*
 * Registration with IBTF as a consumer
 */
static struct ibt_clnt_modinfo_s rib_modinfo = {
    IBTI_V_CURR,
    IBT_GENERIC,
    rib_async_handler,  /* async event handler */
    NULL,               /* Memory Region Handler */
    "nfs/ib"
};

/*
 * Global structure
 */

typedef struct rpcib_s {
    dev_info_t  *rpcib_dip;
    kmutex_t    rpcib_mutex;
} rpcib_t;

rpcib_t rpcib;

/*
 * /etc/system controlled variable used to control
 * debugging in the rpcib kernel module.
 * Set it to a value greater than 1 to increase
 * the amount of debugging output.
 */
int rib_debug = 0;
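/*
 * For illustration only: on Solaris this tunable would typically be set
 * from /etc/system using the module:variable syntax, e.g.
 *
 *	set rpcib:rib_debug = 1
 *
 * (/etc/system changes take effect after a reboot.)
 */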
int
_init(void)
{
    int error;

    error = mod_install((struct modlinkage *)&rib_modlinkage);
    if (error != 0) {
        /*
         * Could not load module
         */
        return (error);
    }
    mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
    return (0);
}

int
_fini()
{
    int status;

    /*
     * Remove module
     */
    if ((status = mod_remove(&rib_modlinkage)) != 0) {
        return (status);
    }
    mutex_destroy(&plugin_state_lock);
    return (0);
}

int
_info(struct modinfo *modinfop)
{
    return (mod_info(&rib_modlinkage, modinfop));
}

/*
 * rpcib_getinfo()
 * Given the device number, return the devinfo pointer or the
 * instance number.
 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
 */

/*ARGSUSED*/
static int
rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
    int ret = DDI_SUCCESS;

    switch (cmd) {
    case DDI_INFO_DEVT2DEVINFO:
        if (rpcib.rpcib_dip != NULL)
            *result = rpcib.rpcib_dip;
        else {
            *result = NULL;
            ret = DDI_FAILURE;
        }
        break;

    case DDI_INFO_DEVT2INSTANCE:
        *result = NULL;
        break;

    default:
        ret = DDI_FAILURE;
    }
    return (ret);
}

static int
rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
    ibt_status_t    ibt_status;
    rdma_stat       r_status;

    switch (cmd) {
    case DDI_ATTACH:
        break;
    case DDI_RESUME:
        return (DDI_SUCCESS);
    default:
        return (DDI_FAILURE);
    }

    mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);

    mutex_enter(&rpcib.rpcib_mutex);
    if (rpcib.rpcib_dip != NULL) {
        mutex_exit(&rpcib.rpcib_mutex);
        return (DDI_FAILURE);
    }
    rpcib.rpcib_dip = dip;
    mutex_exit(&rpcib.rpcib_mutex);
    /*
     * Create the "rpcib" minor-node.
     */
    if (ddi_create_minor_node(dip,
        "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
        /* Error message, no cmn_err as they print on console */
        return (DDI_FAILURE);
    }

    if (rib_stat == NULL) {
        rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
        mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
    }

    rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids);
    if (rib_stat->hca_count < 1) {
        mutex_destroy(&rib_stat->open_hca_lock);
        kmem_free(rib_stat, sizeof (*rib_stat));
        rib_stat = NULL;
        return (DDI_FAILURE);
    }

    ibt_status = ibt_attach(&rib_modinfo, dip,
        (void *)rib_stat, &rib_stat->ibt_clnt_hdl);

    if (ibt_status != IBT_SUCCESS) {
        ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
        mutex_destroy(&rib_stat->open_hca_lock);
        kmem_free(rib_stat, sizeof (*rib_stat));
        rib_stat = NULL;
        return (DDI_FAILURE);
    }

    mutex_enter(&rib_stat->open_hca_lock);
    if (open_hcas(rib_stat) != RDMA_SUCCESS) {
        mutex_exit(&rib_stat->open_hca_lock);
        goto open_fail;
    }
    mutex_exit(&rib_stat->open_hca_lock);

    if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
        DDI_PROP_SUCCESS) {
        cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
            "failed.");
        goto register_fail;
    }

    /*
     * Register with rdmatf
     */
    rib_mod.rdma_count = rib_stat->nhca_inited;
    r_status = rdma_register_mod(&rib_mod);
    if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
        cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
            "status = %d", r_status);
        goto register_fail;
    }

    return (DDI_SUCCESS);

register_fail:
    rib_detach_hca(rib_stat->hca);
open_fail:
    ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
    (void) ibt_detach(rib_stat->ibt_clnt_hdl);
    mutex_destroy(&rib_stat->open_hca_lock);
    kmem_free(rib_stat, sizeof (*rib_stat));
    rib_stat = NULL;
    return (DDI_FAILURE);
}

/*ARGSUSED*/
static int
rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
    switch (cmd) {

    case DDI_DETACH:
        break;

    case DDI_SUSPEND:
    default:
        return (DDI_FAILURE);
    }

    /*
     * Detach the hca and free resources
     */
    mutex_enter(&plugin_state_lock);
    plugin_state = NO_ACCEPT;
    mutex_exit(&plugin_state_lock);
    rib_detach_hca(rib_stat->hca);
    ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
    (void) ibt_detach(rib_stat->ibt_clnt_hdl);
    mutex_destroy(&rib_stat->open_hca_lock);
    if (rib_stat->hcas) {
        kmem_free(rib_stat->hcas, rib_stat->hca_count *
            sizeof (rib_hca_t));
        rib_stat->hcas = NULL;
    }
    kmem_free(rib_stat, sizeof (*rib_stat));
    rib_stat = NULL;

    mutex_enter(&rpcib.rpcib_mutex);
    rpcib.rpcib_dip = NULL;
    mutex_exit(&rpcib.rpcib_mutex);
    mutex_destroy(&rpcib.rpcib_mutex);
    return (DDI_SUCCESS);
}


static void rib_rbufpool_free(rib_hca_t *, int);
static void rib_rbufpool_deregister(rib_hca_t *, int);
static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
static rdma_stat rib_rem_replylist(rib_qp_t *);
static int rib_remreply(rib_qp_t *, struct reply *);
static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);


/*
 * One CQ pair per HCA
 */
static rdma_stat
rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
    rib_cq_t **cqp, rpcib_state_t *ribstat)
{
    rib_cq_t        *cq;
    ibt_cq_attr_t   cq_attr;
    uint32_t        real_size;
    ibt_status_t    status;
    rdma_stat       error = RDMA_SUCCESS;

    cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
    cq->rib_hca = hca;
    cq_attr.cq_size = cq_size;
    cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
    status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
        &real_size);
    if (status != IBT_SUCCESS) {
        cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
            " status=%d", status);
        error = RDMA_FAILED;
        goto fail;
    }
    ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat);

    /*
     * Enable CQ callbacks. CQ callbacks are single shot
     * (i.e. you have to call ibt_enable_cq_notify()
     * after each callback to get another one).
     */
    status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
    if (status != IBT_SUCCESS) {
        cmn_err(CE_WARN, "rib_create_cq: "
            "enable_cq_notify failed, status %d", status);
        error = RDMA_FAILED;
        goto fail;
    }
    *cqp = cq;

    return (error);
fail:
    if (cq->rib_cq_hdl)
        (void) ibt_free_cq(cq->rib_cq_hdl);
    if (cq)
        kmem_free(cq, sizeof (rib_cq_t));
    return (error);
}

static rdma_stat
open_hcas(rpcib_state_t *ribstat)
{
    rib_hca_t           *hca;
    ibt_status_t        ibt_status;
    rdma_stat           status;
    ibt_hca_portinfo_t  *pinfop;
    ibt_pd_flags_t      pd_flags = IBT_PD_NO_FLAGS;
    uint_t              size, cq_size;
    int                 i;
    kstat_t             *ksp;
    cache_avl_struct_t  example_avl_node;
    char                rssc_name[32];

    ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));

    if (ribstat->hcas == NULL)
        ribstat->hcas = kmem_zalloc(ribstat->hca_count *
            sizeof (rib_hca_t), KM_SLEEP);

    /*
     * Open a hca and set up for RDMA
     */
    for (i = 0; i < ribstat->hca_count; i++) {
        ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
            ribstat->hca_guids[i],
            &ribstat->hcas[i].hca_hdl);
        if (ibt_status != IBT_SUCCESS) {
            continue;
        }
        ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
        hca = &(ribstat->hcas[i]);
        hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
        hca->state = HCA_INITED;

        /*
         * query HCA info
         */
        ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
        if (ibt_status != IBT_SUCCESS) {
            goto fail1;
        }

        /*
         * One PD (Protection Domain) per HCA.
         * A qp is allowed to access a memory region
         * only when it's in the same PD as that of
         * the memory region.
         */
        ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
        if (ibt_status != IBT_SUCCESS) {
            goto fail1;
        }

        /*
         * query HCA ports
         */
        ibt_status = ibt_query_hca_ports(hca->hca_hdl,
            0, &pinfop, &hca->hca_nports, &size);
        if (ibt_status != IBT_SUCCESS) {
            goto fail2;
        }
        hca->hca_ports = pinfop;
        hca->hca_pinfosz = size;
        pinfop = NULL;

        cq_size = DEF_CQ_SIZE;  /* default cq size */
        /*
         * Create 2 pairs of cq's (1 pair for client
         * and the other pair for server) on this hca.
         * If number of qp's gets too large, then several
         * cq's will be needed.
         */
        status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
            &hca->svc_rcq, ribstat);
        if (status != RDMA_SUCCESS) {
            goto fail3;
        }

        status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
            &hca->svc_scq, ribstat);
        if (status != RDMA_SUCCESS) {
            goto fail3;
        }

        status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
            &hca->clnt_rcq, ribstat);
        if (status != RDMA_SUCCESS) {
            goto fail3;
        }

        status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
            &hca->clnt_scq, ribstat);
        if (status != RDMA_SUCCESS) {
            goto fail3;
        }

        /*
         * Create buffer pools.
         * Note rib_rbuf_create also allocates memory windows.
         */
        hca->recv_pool = rib_rbufpool_create(hca,
            RECV_BUFFER, MAX_BUFS);
        if (hca->recv_pool == NULL) {
            goto fail3;
        }

        hca->send_pool = rib_rbufpool_create(hca,
            SEND_BUFFER, MAX_BUFS);
        if (hca->send_pool == NULL) {
            rib_rbufpool_destroy(hca, RECV_BUFFER);
            goto fail3;
        }

        if (hca->server_side_cache == NULL) {
            (void) sprintf(rssc_name,
                "rib_server_side_cache_%04d", i);
            hca->server_side_cache = kmem_cache_create(
                rssc_name,
                sizeof (cache_avl_struct_t), 0,
                NULL,
                NULL,
                rib_server_side_cache_reclaim,
                hca, NULL, 0);
        }

        avl_create(&hca->avl_tree,
            avl_compare,
            sizeof (cache_avl_struct_t),
            (uint_t)(uintptr_t)&example_avl_node.avl_link-
            (uint_t)(uintptr_t)&example_avl_node);

        rw_init(&hca->avl_rw_lock,
            NULL, RW_DRIVER, hca->iblock);
        mutex_init(&hca->cache_allocation,
            NULL, MUTEX_DRIVER, NULL);
        hca->avl_init = TRUE;

        /* Create kstats for the cache */
        ASSERT(INGLOBALZONE(curproc));

        if (!stats_enabled) {
            ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
                KSTAT_TYPE_NAMED,
                sizeof (rpcib_kstat) / sizeof (kstat_named_t),
                KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
                GLOBAL_ZONEID);
            if (ksp) {
                ksp->ks_data = (void *) &rpcib_kstat;
                ksp->ks_update = rpcib_cache_kstat_update;
                kstat_install(ksp);
                stats_enabled = TRUE;
            }
        }
        if (NULL == hca->reg_cache_clean_up) {
            hca->reg_cache_clean_up = ddi_taskq_create(NULL,
                "REG_CACHE_CLEANUP", 1, TASKQ_DEFAULTPRI, 0);
        }

        /*
         * Initialize the registered service list and
         * the lock
         */
        hca->service_list = NULL;
        rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);

        mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
        cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
        rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
            hca->iblock);
        rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
            hca->iblock);
        rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
        mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
        hca->inuse = TRUE;
        /*
         * XXX One hca only. Add multi-hca functionality if needed
         * later.
         */
        ribstat->hca = hca;
        ribstat->nhca_inited++;
        ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
        break;

fail3:
        ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
fail2:
        (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
fail1:
        (void) ibt_close_hca(hca->hca_hdl);

    }
    if (ribstat->hca != NULL)
        return (RDMA_SUCCESS);
    else
        return (RDMA_FAILED);
}

/*
 * Callback routines
 */

/*
 * SCQ handlers
 */
/* ARGSUSED */
static void
rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
    ibt_status_t    ibt_status;
    ibt_wc_t        wc;
    int             i;

    /*
     * Re-enable cq notify here to avoid missing any
     * completion queue notification.
     */
    (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

    ibt_status = IBT_SUCCESS;
    while (ibt_status != IBT_CQ_EMPTY) {
        bzero(&wc, sizeof (wc));
        ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
        if (ibt_status != IBT_SUCCESS)
            return;

        /*
         * Got a send completion
         */
        if (wc.wc_id != NULL) { /* XXX can it be otherwise ???? */
            struct send_wid *wd =
                (struct send_wid *)(uintptr_t)wc.wc_id;
            CONN *conn = qptoc(wd->qp);

            mutex_enter(&wd->sendwait_lock);
            switch (wc.wc_status) {
            case IBT_WC_SUCCESS:
                wd->status = RDMA_SUCCESS;
                break;
            case IBT_WC_WR_FLUSHED_ERR:
                wd->status = RDMA_FAILED;
                break;
            default:
/*
 *    RC Send Q Error Code              Local state     Remote State
 *    ====================              ===========     ============
 *    IBT_WC_BAD_RESPONSE_ERR           ERROR           None
 *    IBT_WC_LOCAL_LEN_ERR              ERROR           None
 *    IBT_WC_LOCAL_CHAN_OP_ERR          ERROR           None
 *    IBT_WC_LOCAL_PROTECT_ERR          ERROR           None
 *    IBT_WC_MEM_WIN_BIND_ERR           ERROR           None
 *    IBT_WC_REMOTE_INVALID_REQ_ERR     ERROR           ERROR
 *    IBT_WC_REMOTE_ACCESS_ERR          ERROR           ERROR
 *    IBT_WC_REMOTE_OP_ERR              ERROR           ERROR
 *    IBT_WC_RNR_NAK_TIMEOUT_ERR        ERROR           None
 *    IBT_WC_TRANS_TIMEOUT_ERR          ERROR           None
 *    IBT_WC_WR_FLUSHED_ERR             None            None
 */
                /*
                 * Channel in error state. Set connection to
                 * ERROR and cleanup will happen either from
                 * conn_release or from rib_conn_get
                 */
                wd->status = RDMA_FAILED;
                mutex_enter(&conn->c_lock);
                if (conn->c_state != C_DISCONN_PEND)
                    conn->c_state = C_ERROR_CONN;
                mutex_exit(&conn->c_lock);
                break;
            }

            if (wd->cv_sig == 1) {
                /*
                 * Notify poster
                 */
                cv_signal(&wd->wait_cv);
                mutex_exit(&wd->sendwait_lock);
            } else {
                /*
                 * Poster not waiting for notification.
                 * Free the send buffers and send_wid
                 */
                for (i = 0; i < wd->nsbufs; i++) {
                    rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
                        (void *)(uintptr_t)wd->sbufaddr[i]);
                }
                mutex_exit(&wd->sendwait_lock);
                (void) rib_free_sendwait(wd);
            }
        }
    }
}

/* ARGSUSED */
static void
rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
    ibt_status_t    ibt_status;
    ibt_wc_t        wc;
    int             i;

    /*
     * Re-enable cq notify here to avoid missing any
     * completion queue notification.
     */
    (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

    ibt_status = IBT_SUCCESS;
    while (ibt_status != IBT_CQ_EMPTY) {
        bzero(&wc, sizeof (wc));
        ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
        if (ibt_status != IBT_SUCCESS)
            return;

        /*
         * Got a send completion
         */
        if (wc.wc_id != NULL) { /* XXX NULL possible ???? */
            struct send_wid *wd =
                (struct send_wid *)(uintptr_t)wc.wc_id;
            mutex_enter(&wd->sendwait_lock);
            if (wd->cv_sig == 1) {
                /*
                 * Update completion status and notify poster
                 */
                if (wc.wc_status == IBT_WC_SUCCESS)
                    wd->status = RDMA_SUCCESS;
                else
                    wd->status = RDMA_FAILED;
                cv_signal(&wd->wait_cv);
                mutex_exit(&wd->sendwait_lock);
            } else {
                /*
                 * Poster not waiting for notification.
                 * Free the send buffers and send_wid
                 */
                for (i = 0; i < wd->nsbufs; i++) {
                    rib_rbuf_free(qptoc(wd->qp),
                        SEND_BUFFER,
                        (void *)(uintptr_t)wd->sbufaddr[i]);
                }
                mutex_exit(&wd->sendwait_lock);
                (void) rib_free_sendwait(wd);
            }
        }
    }
}

/*
 * RCQ handler
 */
/* ARGSUSED */
static void
rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
    rib_qp_t        *qp;
    ibt_status_t    ibt_status;
    ibt_wc_t        wc;
    struct recv_wid *rwid;

    /*
     * Re-enable cq notify here to avoid missing any
     * completion queue notification.
     */
    (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

    ibt_status = IBT_SUCCESS;
    while (ibt_status != IBT_CQ_EMPTY) {
        bzero(&wc, sizeof (wc));
        ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
        if (ibt_status != IBT_SUCCESS)
            return;

        rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
        qp = rwid->qp;
        if (wc.wc_status == IBT_WC_SUCCESS) {
            XDR inxdrs, *xdrs;
            uint_t xid, vers, op, find_xid = 0;
            struct reply *r;
            CONN *conn = qptoc(qp);
            uint32_t rdma_credit = 0;

            xdrs = &inxdrs;
            xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
                wc.wc_bytes_xfer, XDR_DECODE);
            /*
             * Treat xid as opaque (xid is the first entity
             * in the rpc rdma message).
             */
            xid = *(uint32_t *)(uintptr_t)rwid->addr;

            /* Skip xid and set the xdr position accordingly. */
            XDR_SETPOS(xdrs, sizeof (uint32_t));
            (void) xdr_u_int(xdrs, &vers);
            (void) xdr_u_int(xdrs, &rdma_credit);
            (void) xdr_u_int(xdrs, &op);
            XDR_DESTROY(xdrs);

            if (vers != RPCRDMA_VERS) {
                /*
                 * Invalid RPC/RDMA version. Cannot
                 * interoperate. Set connection to
                 * ERROR state and bail out.
                 */
                mutex_enter(&conn->c_lock);
                if (conn->c_state != C_DISCONN_PEND)
                    conn->c_state = C_ERROR_CONN;
                mutex_exit(&conn->c_lock);
                rib_rbuf_free(conn, RECV_BUFFER,
                    (void *)(uintptr_t)rwid->addr);
                rib_free_wid(rwid);
                continue;
            }

            mutex_enter(&qp->replylist_lock);
            for (r = qp->replylist; r != NULL; r = r->next) {
                if (r->xid == xid) {
                    find_xid = 1;
                    switch (op) {
                    case RDMA_MSG:
                    case RDMA_NOMSG:
                    case RDMA_MSGP:
                        r->status = RDMA_SUCCESS;
                        r->vaddr_cq = rwid->addr;
                        r->bytes_xfer =
                            wc.wc_bytes_xfer;
                        cv_signal(&r->wait_cv);
                        break;
                    default:
                        rib_rbuf_free(qptoc(qp),
                            RECV_BUFFER,
                            (void *)(uintptr_t)
                            rwid->addr);
                        break;
                    }
                    break;
                }
            }
            mutex_exit(&qp->replylist_lock);
            if (find_xid == 0) {
                /* RPC caller not waiting for reply */

                DTRACE_PROBE1(rpcib__i__nomatchxid1,
                    int, xid);

                rib_rbuf_free(qptoc(qp), RECV_BUFFER,
                    (void *)(uintptr_t)rwid->addr);
            }
        } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
            CONN *conn = qptoc(qp);

            /*
             * Connection being flushed. Just free
             * the posted buffer
             */
            rib_rbuf_free(conn, RECV_BUFFER,
                (void *)(uintptr_t)rwid->addr);
        } else {
            CONN *conn = qptoc(qp);
/*
 *  RC Recv Q Error Code                Local state     Remote State
 *  ====================                ===========     ============
 *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
 *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
 *  IBT_WC_WR_FLUSHED_ERR               None            None
 */
            /*
             * Channel in error state. Set connection
             * in ERROR state.
             */
            mutex_enter(&conn->c_lock);
            if (conn->c_state != C_DISCONN_PEND)
                conn->c_state = C_ERROR_CONN;
            mutex_exit(&conn->c_lock);
            rib_rbuf_free(conn, RECV_BUFFER,
                (void *)(uintptr_t)rwid->addr);
        }
        rib_free_wid(rwid);
    }
}

/* Server side */
/* ARGSUSED */
static void
rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
    rdma_recv_data_t *rdp;
    rib_qp_t        *qp;
    ibt_status_t    ibt_status;
    ibt_wc_t        wc;
    struct svc_recv *s_recvp;
    CONN            *conn;
    mblk_t          *mp;

    /*
     * Re-enable cq notify here to avoid missing any
     * completion queue notification.
     */
    (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

    ibt_status = IBT_SUCCESS;
    while (ibt_status != IBT_CQ_EMPTY) {
        bzero(&wc, sizeof (wc));
        ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
        if (ibt_status != IBT_SUCCESS)
            return;

        s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
        qp = s_recvp->qp;
        conn = qptoc(qp);
        mutex_enter(&qp->posted_rbufs_lock);
        qp->n_posted_rbufs--;
#if defined(MEASURE_POOL_DEPTH)
        rib_posted_rbufs(preposted_rbufs - qp->n_posted_rbufs);
#endif
        if (qp->n_posted_rbufs == 0)
            cv_signal(&qp->posted_rbufs_cv);
        mutex_exit(&qp->posted_rbufs_lock);

        if (wc.wc_status == IBT_WC_SUCCESS) {
            XDR inxdrs, *xdrs;
            uint_t xid, vers, op;
            uint32_t rdma_credit;

            xdrs = &inxdrs;
            /* s_recvp->vaddr stores data */
            xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
                wc.wc_bytes_xfer, XDR_DECODE);

            /*
             * Treat xid as opaque (xid is the first entity
             * in the rpc rdma message).
             */
            xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
            /* Skip xid and set the xdr position accordingly. */
            XDR_SETPOS(xdrs, sizeof (uint32_t));
            if (!xdr_u_int(xdrs, &vers) ||
                !xdr_u_int(xdrs, &rdma_credit) ||
                !xdr_u_int(xdrs, &op)) {
                rib_rbuf_free(conn, RECV_BUFFER,
                    (void *)(uintptr_t)s_recvp->vaddr);
                XDR_DESTROY(xdrs);
                (void) rib_free_svc_recv(s_recvp);
                continue;
            }
            XDR_DESTROY(xdrs);

            if (vers != RPCRDMA_VERS) {
                /*
                 * Invalid RPC/RDMA version.
                 * Drop rpc rdma message.
                 */
                rib_rbuf_free(conn, RECV_BUFFER,
                    (void *)(uintptr_t)s_recvp->vaddr);
                (void) rib_free_svc_recv(s_recvp);
                continue;
            }
            /*
             * Is this for RDMA_DONE?
             */
            if (op == RDMA_DONE) {
                rib_rbuf_free(conn, RECV_BUFFER,
                    (void *)(uintptr_t)s_recvp->vaddr);
                /*
                 * Wake up the thread waiting on
                 * a RDMA_DONE for xid
                 */
                mutex_enter(&qp->rdlist_lock);
                rdma_done_notify(qp, xid);
                mutex_exit(&qp->rdlist_lock);
                (void) rib_free_svc_recv(s_recvp);
                continue;
            }

            mutex_enter(&plugin_state_lock);
            if (plugin_state == ACCEPT) {
                while ((mp = allocb(sizeof (*rdp), BPRI_LO))
                    == NULL)
                    (void) strwaitbuf(
                        sizeof (*rdp), BPRI_LO);
                /*
                 * Plugin is in accept state, hence the master
                 * transport queue for this is still accepting
                 * requests. Hence we can call svc_queuereq to
                 * queue this received msg.
                 */
                rdp = (rdma_recv_data_t *)mp->b_rptr;
                rdp->conn = conn;
                rdp->rpcmsg.addr =
                    (caddr_t)(uintptr_t)s_recvp->vaddr;
                rdp->rpcmsg.type = RECV_BUFFER;
                rdp->rpcmsg.len = wc.wc_bytes_xfer;
                rdp->status = wc.wc_status;
                mutex_enter(&conn->c_lock);
                conn->c_ref++;
                mutex_exit(&conn->c_lock);
                mp->b_wptr += sizeof (*rdp);
                svc_queuereq((queue_t *)rib_stat->q, mp);
                mutex_exit(&plugin_state_lock);
            } else {
                /*
                 * The master transport for this is going
                 * away and the queue is not accepting anymore
                 * requests for krpc, so don't do anything, just
                 * free the msg.
                 */
                mutex_exit(&plugin_state_lock);
                rib_rbuf_free(conn, RECV_BUFFER,
                    (void *)(uintptr_t)s_recvp->vaddr);
            }
        } else {
            rib_rbuf_free(conn, RECV_BUFFER,
                (void *)(uintptr_t)s_recvp->vaddr);
        }
        (void) rib_free_svc_recv(s_recvp);
    }
}

/*
 * Handles DR event of IBT_HCA_DETACH_EVENT.
 */
/* ARGSUSED */
static void
rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
    ibt_async_code_t code, ibt_async_event_t *event)
{

    switch (code) {
    case IBT_HCA_ATTACH_EVENT:
        /* ignore */
        break;
    case IBT_HCA_DETACH_EVENT:
    {
        ASSERT(rib_stat->hca->hca_hdl == hca_hdl);
        rib_detach_hca(rib_stat->hca);
#ifdef DEBUG
        cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
#endif
        break;
    }
#ifdef DEBUG
    case IBT_EVENT_PATH_MIGRATED:
        cmn_err(CE_NOTE, "rib_async_handler(): "
            "IBT_EVENT_PATH_MIGRATED\n");
        break;
    case IBT_EVENT_SQD:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
        break;
    case IBT_EVENT_COM_EST:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
        break;
    case IBT_ERROR_CATASTROPHIC_CHAN:
        cmn_err(CE_NOTE, "rib_async_handler(): "
            "IBT_ERROR_CATASTROPHIC_CHAN\n");
        break;
    case IBT_ERROR_INVALID_REQUEST_CHAN:
        cmn_err(CE_NOTE, "rib_async_handler(): "
            "IBT_ERROR_INVALID_REQUEST_CHAN\n");
        break;
    case IBT_ERROR_ACCESS_VIOLATION_CHAN:
        cmn_err(CE_NOTE, "rib_async_handler(): "
            "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
        break;
    case IBT_ERROR_PATH_MIGRATE_REQ:
        cmn_err(CE_NOTE, "rib_async_handler(): "
            "IBT_ERROR_PATH_MIGRATE_REQ\n");
        break;
    case IBT_ERROR_CQ:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
        break;
    case IBT_ERROR_PORT_DOWN:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
        break;
    case IBT_EVENT_PORT_UP:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
        break;
    case IBT_ASYNC_OPAQUE1:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
        break;
    case IBT_ASYNC_OPAQUE2:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
        break;
    case IBT_ASYNC_OPAQUE3:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
        break;
    case IBT_ASYNC_OPAQUE4:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
        break;
#endif
    default:
        break;
    }
}

/*
 * Client's reachable function.
 */
static rdma_stat
rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
{
    rib_hca_t   *hca;
    rdma_stat   status;

    /*
     * First check if a hca is still attached
     */
    *handle = NULL;
    rw_enter(&rib_stat->hca->state_lock, RW_READER);
    if (rib_stat->hca->state != HCA_INITED) {
        rw_exit(&rib_stat->hca->state_lock);
        return (RDMA_FAILED);
    }
    status = rib_ping_srv(addr_type, raddr, &hca);
    rw_exit(&rib_stat->hca->state_lock);

    if (status == RDMA_SUCCESS) {
        *handle = (void *)hca;
        return (RDMA_SUCCESS);
    } else {
        *handle = NULL;
        DTRACE_PROBE(rpcib__i__pingfailed);
        return (RDMA_FAILED);
    }
}

/* Client side qp creation */
static rdma_stat
rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
{
    rib_qp_t    *kqp = NULL;
    CONN        *conn;
    rdma_clnt_cred_ctrl_t *cc_info;

    ASSERT(qp != NULL);
    *qp = NULL;

    kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
    conn = qptoc(kqp);
    kqp->hca = hca;
    kqp->rdmaconn.c_rdmamod = &rib_mod;
    kqp->rdmaconn.c_private = (caddr_t)kqp;

    kqp->mode = RIB_CLIENT;
    kqp->chan_flags = IBT_BLOCKING;
    conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
    bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
    conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
    /*
     * Initialize
     */
    cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
    cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
    mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
    mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
    mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
    mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
    cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
    mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
    /*
     * Initialize the client credit control
     * portion of the rdmaconn struct.
     */
    kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
    cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
    cc_info->clnt_cc_granted_ops = 0;
    cc_info->clnt_cc_in_flight_ops = 0;
    cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);

    *qp = kqp;
    return (RDMA_SUCCESS);
}

/* Server side qp creation */
static rdma_stat
rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
{
    rib_qp_t            *kqp = NULL;
    ibt_chan_sizes_t    chan_sizes;
    ibt_rc_chan_alloc_args_t    qp_attr;
    ibt_status_t        ibt_status;
    rdma_srv_cred_ctrl_t *cc_info;

    *qp = NULL;

    kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
    kqp->hca = hca;
    kqp->port_num = port;
    kqp->rdmaconn.c_rdmamod = &rib_mod;
    kqp->rdmaconn.c_private = (caddr_t)kqp;

    /*
     * Create the qp handle
     */
    bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
    qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
    qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
    qp_attr.rc_pd = hca->pd_hdl;
    qp_attr.rc_hca_port_num = port;
    qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
    qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
    qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
    qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
    qp_attr.rc_clone_chan = NULL;
    qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
    qp_attr.rc_flags = IBT_WR_SIGNALED;

    rw_enter(&hca->state_lock, RW_READER);
    if (hca->state != HCA_DETACHED) {
        ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
            IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
            &chan_sizes);
    } else {
        rw_exit(&hca->state_lock);
        goto fail;
    }
    rw_exit(&hca->state_lock);

    if (ibt_status != IBT_SUCCESS) {
        DTRACE_PROBE1(rpcib__i_svccreatechanfail,
            int, ibt_status);
        goto fail;
    }

    kqp->mode = RIB_SERVER;
    kqp->chan_flags = IBT_BLOCKING;
    kqp->q = q; /* server ONLY */

    cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
    cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
    mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
    mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
    mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
    mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
    cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
    mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
    /*
     * Set the private data area to qp to be used in callbacks
     */
    ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
    kqp->rdmaconn.c_state = C_CONNECTED;

    /*
     * Initialize the server credit control
     * portion of the rdmaconn struct.
     */
    kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
    cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
    cc_info->srv_cc_buffers_granted = preposted_rbufs;
    cc_info->srv_cc_cur_buffers_used = 0;
    cc_info->srv_cc_posted = preposted_rbufs;

    *qp = kqp;

    return (RDMA_SUCCESS);
fail:
    if (kqp)
        kmem_free(kqp, sizeof (rib_qp_t));

    return (RDMA_FAILED);
}

/* ARGSUSED */
ibt_cm_status_t
rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
    ibt_cm_return_args_t *ret_args, void *priv_data,
    ibt_priv_data_len_t len)
{
    rpcib_state_t   *ribstat;
    rib_hca_t       *hca;

    ribstat = (rpcib_state_t *)clnt_hdl;
    hca = (rib_hca_t *)ribstat->hca;

    switch (event->cm_type) {

    /* got a connection close event */
    case IBT_CM_EVENT_CONN_CLOSED:
    {
        CONN    *conn;
        rib_qp_t *qp;

        /* check reason why connection was closed */
        switch (event->cm_event.closed) {
        case IBT_CM_CLOSED_DREP_RCVD:
        case IBT_CM_CLOSED_DREQ_TIMEOUT:
        case IBT_CM_CLOSED_DUP:
        case IBT_CM_CLOSED_ABORT:
        case IBT_CM_CLOSED_ALREADY:
            /*
             * These cases indicate the local end initiated
             * the closing of the channel. Nothing to do here.
             */
            break;
        default:
            /*
             * Reason for CONN_CLOSED event must be one of
             * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
             * or IBT_CM_CLOSED_STALE. These indicate cases where
             * the remote end is closing the channel. In these
             * cases free the channel and transition to error
             * state
             */
            qp = ibt_get_chan_private(event->cm_channel);
            conn = qptoc(qp);
            mutex_enter(&conn->c_lock);
            if (conn->c_state == C_DISCONN_PEND) {
                mutex_exit(&conn->c_lock);
                break;
            }

            conn->c_state = C_ERROR_CONN;

            /*
             * Free the rc_channel. Channel has already
             * transitioned to ERROR state and WRs have been
             * FLUSHED_ERR already.
             */
            (void) ibt_free_channel(qp->qp_hdl);
            qp->qp_hdl = NULL;

            /*
             * Free the conn if c_ref is down to 0 already
             */
            if (conn->c_ref == 0) {
                /*
                 * Remove from list and free conn
                 */
                conn->c_state = C_DISCONN_PEND;
                mutex_exit(&conn->c_lock);
                (void) rib_disconnect_channel(conn,
                    &hca->cl_conn_list);
            } else {
                mutex_exit(&conn->c_lock);
            }
#ifdef DEBUG
            if (rib_debug)
                cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
                    "(CONN_CLOSED) channel disconnected");
#endif
            break;
        }
        break;
    }
    default:
        break;
    }
    return (IBT_CM_ACCEPT);
}

/* Check server ib address */
rdma_stat
rib_chk_srv_ibaddr(struct netbuf *raddr,
    int addr_type, ibt_path_info_t *path, ibt_ip_addr_t *s_ip,
    ibt_ip_addr_t *d_ip)
{
    struct sockaddr_in  *sin4;
    struct sockaddr_in6 *sin6;
    ibt_status_t        ibt_status;
    ibt_ip_path_attr_t  ipattr;
    uint8_t             npaths = 0;
    ibt_path_ip_src_t   srcip;

    ASSERT(raddr->buf != NULL);

    (void) bzero(path, sizeof (ibt_path_info_t));

    switch (addr_type) {
    case AF_INET:
        sin4 = (struct sockaddr_in *)raddr->buf;
        d_ip->family = AF_INET;
        d_ip->un.ip4addr = sin4->sin_addr.s_addr;
        break;

    case AF_INET6:
        sin6 = (struct sockaddr_in6 *)raddr->buf;
        d_ip->family = AF_INET6;
        d_ip->un.ip6addr = sin6->sin6_addr;
        break;

    default:
        return (RDMA_INVAL);
    }

    bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
    bzero(&srcip, sizeof (ibt_path_ip_src_t));

    ipattr.ipa_dst_ip    = d_ip;
    ipattr.ipa_hca_guid  = rib_stat->hca->hca_guid;
    ipattr.ipa_ndst      = 1;
    ipattr.ipa_max_paths = 1;
    npaths = 0;

    ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
        IBT_PATH_NO_FLAGS,
        &ipattr,
        path,
        &npaths,
        &srcip);

    if (ibt_status != IBT_SUCCESS ||
        npaths < 1 ||
        path->pi_hca_guid != rib_stat->hca->hca_guid) {

        bzero(s_ip, sizeof (ibt_path_ip_src_t));
        return (RDMA_FAILED);
    }

    if (srcip.ip_primary.family == AF_INET) {
        s_ip->family = AF_INET;
        s_ip->un.ip4addr = srcip.ip_primary.un.ip4addr;
    } else {
        s_ip->family = AF_INET6;
        s_ip->un.ip6addr = srcip.ip_primary.un.ip6addr;
    }

    return (RDMA_SUCCESS);
}

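/*
 * Illustrative note (an assumption about the calling sequence, not stated
 * in the original comments): a client connection is typically built up in
 * three steps: rib_chk_srv_ibaddr() resolves an IP-to-IB path for the
 * server address, rib_clnt_create_chan() allocates the rib_qp_t, and
 * rib_conn_to_srv() below allocates and opens the RC channel over the
 * resolved path. A hypothetical caller might do:
 *
 *	if (rib_chk_srv_ibaddr(raddr, AF_INET, &path, &s_ip, &d_ip) ==
 *	    RDMA_SUCCESS &&
 *	    rib_clnt_create_chan(hca, raddr, &qp) == RDMA_SUCCESS)
 *		status = rib_conn_to_srv(hca, qp, &path, &s_ip, &d_ip);
 */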
/*
 * Connect to the server.
 */
rdma_stat
rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path,
    ibt_ip_addr_t *s_ip, ibt_ip_addr_t *d_ip)
{
    ibt_chan_open_args_t    chan_args;  /* channel args */
    ibt_chan_sizes_t        chan_sizes;
    ibt_rc_chan_alloc_args_t    qp_attr;
    ibt_status_t            ibt_status;
    ibt_rc_returns_t        ret_args;   /* conn reject info */
    int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */
    ibt_ip_cm_info_t        ipcm_info;
    uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];


    (void) bzero(&chan_args, sizeof (chan_args));
    (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
    (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));

    switch (ipcm_info.src_addr.family = s_ip->family) {
    case AF_INET:
        ipcm_info.src_addr.un.ip4addr = s_ip->un.ip4addr;
        break;
    case AF_INET6:
        ipcm_info.src_addr.un.ip6addr = s_ip->un.ip6addr;
        break;
    }

    switch (ipcm_info.dst_addr.family = d_ip->family) {
    case AF_INET:
        ipcm_info.dst_addr.un.ip4addr = d_ip->un.ip4addr;
        break;
    case AF_INET6:
        ipcm_info.dst_addr.un.ip6addr = d_ip->un.ip6addr;
        break;
    }

    ipcm_info.src_port = NFS_RDMA_PORT;

    ibt_status = ibt_format_ip_private_data(&ipcm_info,
        IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);

    if (ibt_status != IBT_SUCCESS) {
        cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
        return (-1);
    }

    qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num;
    /* Alloc a RC channel */
    qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
    qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
    qp_attr.rc_pd = hca->pd_hdl;
    qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
    qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
    qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
    qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
    qp_attr.rc_clone_chan = NULL;
    qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
    qp_attr.rc_flags = IBT_WR_SIGNALED;

    path->pi_sid = ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT);
    chan_args.oc_path = path;
    chan_args.oc_cm_handler = rib_clnt_cm_handler;
    chan_args.oc_cm_clnt_private = (void *)rib_stat;
    chan_args.oc_rdma_ra_out = 4;
    chan_args.oc_rdma_ra_in = 4;
    chan_args.oc_path_retry_cnt = 2;
    chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
    chan_args.oc_priv_data = cmp_ip_pvt;
    chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;

refresh:
    rw_enter(&hca->state_lock, RW_READER);
    if (hca->state != HCA_DETACHED) {
        ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
            IBT_ACHAN_NO_FLAGS,
            &qp_attr, &qp->qp_hdl,
            &chan_sizes);
    } else {
        rw_exit(&hca->state_lock);
        return (RDMA_FAILED);
    }
    rw_exit(&hca->state_lock);

    if (ibt_status != IBT_SUCCESS) {
        DTRACE_PROBE1(rpcib__i_conntosrv,
            int, ibt_status);
        return (RDMA_FAILED);
    }

    /* Connect to the Server */
    (void) bzero(&ret_args, sizeof (ret_args));
    mutex_enter(&qp->cb_lock);
    ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
        IBT_BLOCKING, &chan_args, &ret_args);
    if (ibt_status != IBT_SUCCESS) {
        DTRACE_PROBE2(rpcib__i_openrctosrv,
            int, ibt_status, int, ret_args.rc_status);

        (void) ibt_free_channel(qp->qp_hdl);
        qp->qp_hdl = NULL;
        mutex_exit(&qp->cb_lock);
        if (refresh-- && ibt_status == IBT_CM_FAILURE &&
            ret_args.rc_status == IBT_CM_CONN_STALE) {
            /*
             * Got IBT_CM_CONN_STALE probably because of stale
             * data on the passive end of a channel that existed
             * prior to reboot. Retry establishing a channel
             * REFRESH_ATTEMPTS times, during which time the
             * stale conditions on the server might clear up.
             */
            goto refresh;
        }
        return (RDMA_FAILED);
    }
    mutex_exit(&qp->cb_lock);
    /*
     * Set the private data area to qp to be used in callbacks
     */
    ibt_set_chan_private(qp->qp_hdl, (void *)qp);
    return (RDMA_SUCCESS);
}

rdma_stat
rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca)
{
    uint_t              i;
    ibt_path_info_t     path;
    ibt_status_t        ibt_status;
    uint8_t             num_paths_p;
    ibt_ip_path_attr_t  ipattr;
    ibt_ip_addr_t       dstip;
    ibt_path_ip_src_t   srcip;
    rpcib_ipaddrs_t     addrs4;
    rpcib_ipaddrs_t     addrs6;
    struct sockaddr_in  *sinp;
    struct sockaddr_in6 *sin6p;
    rdma_stat           retval = RDMA_SUCCESS;

    *hca = NULL;
    ASSERT(raddr->buf != NULL);

    bzero(&path, sizeof (ibt_path_info_t));
    bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
    bzero(&srcip, sizeof (ibt_path_ip_src_t));

    if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
        (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
        retval = RDMA_FAILED;
        goto done;
    }

    /* Prep the destination address */
    switch (addr_type) {
    case AF_INET:
        sinp = (struct sockaddr_in *)raddr->buf;
        dstip.family = AF_INET;
        dstip.un.ip4addr = sinp->sin_addr.s_addr;
        sinp = addrs4.ri_list;

        for (i = 0; i < addrs4.ri_count; i++) {
            num_paths_p = 0;
            ipattr.ipa_dst_ip    = &dstip;
            ipattr.ipa_hca_guid  = rib_stat->hca->hca_guid;
            ipattr.ipa_ndst      = 1;
            ipattr.ipa_max_paths = 1;
            ipattr.ipa_src_ip.family = dstip.family;
            ipattr.ipa_src_ip.un.ip4addr = sinp[i].sin_addr.s_addr;

            ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
                IBT_PATH_NO_FLAGS, &ipattr, &path, &num_paths_p,
                &srcip);
            if (ibt_status == IBT_SUCCESS &&
                num_paths_p != 0 &&
                path.pi_hca_guid == rib_stat->hca->hca_guid) {
                *hca = rib_stat->hca;
                goto done;
            }
        }
        retval = RDMA_FAILED;
        break;

    case AF_INET6:
        sin6p = (struct sockaddr_in6 *)raddr->buf;
        dstip.family = AF_INET6;
        dstip.un.ip6addr = sin6p->sin6_addr;
        sin6p = addrs6.ri_list;

        for (i = 0; i < addrs6.ri_count; i++) {
            num_paths_p = 0;
            ipattr.ipa_dst_ip    = &dstip;
            ipattr.ipa_hca_guid  = rib_stat->hca->hca_guid;
            ipattr.ipa_ndst      = 1;
            ipattr.ipa_max_paths = 1;
            ipattr.ipa_src_ip.family = dstip.family;
            ipattr.ipa_src_ip.un.ip6addr = sin6p[i].sin6_addr;

            ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
                IBT_PATH_NO_FLAGS, &ipattr, &path, &num_paths_p,
                &srcip);
            if (ibt_status == IBT_SUCCESS &&
                num_paths_p != 0 &&
                path.pi_hca_guid == rib_stat->hca->hca_guid) {
                *hca = rib_stat->hca;
                goto done;
            }
        }
        retval = RDMA_FAILED;
        break;

    default:
        retval = RDMA_INVAL;
        break;
    }
done:
    if (addrs4.ri_size > 0)
        kmem_free(addrs4.ri_list, addrs4.ri_size);
    if (addrs6.ri_size > 0)
        kmem_free(addrs6.ri_list, addrs6.ri_size);
    return (retval);
}
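/*
 * Illustrative note: rib_ping_srv() is the path probe used by
 * rib_reachable() above. It walks the local IB-capable source addresses
 * obtained from rpcib_get_ib_addresses() and asks IBTF, via
 * ibt_get_ip_paths(), whether a path to the destination exists on the
 * attached HCA. On a hit it simply returns that rib_hca_t; no connection
 * is established here.
 */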
/*
 * Close channel, remove from connection list and
 * free up resources allocated for that channel.
 */
rdma_stat
rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
{
    rib_qp_t    *qp = ctoqp(conn);
    rib_hca_t   *hca;

    /*
     * c_ref == 0 and connection is in C_DISCONN_PEND
     */
    hca = qp->hca;
    if (conn_list != NULL)
        (void) rib_rm_conn(conn, conn_list);

    if (qp->qp_hdl != NULL) {
        /*
         * If the channel has not been established,
         * ibt_flush_channel is called to flush outstanding WRs
         * on the Qs. Otherwise, ibt_close_rc_channel() is
         * called. The channel is then freed.
         */
        if (conn_list != NULL)
            (void) ibt_close_rc_channel(qp->qp_hdl,
                IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
        else
            (void) ibt_flush_channel(qp->qp_hdl);

        mutex_enter(&qp->posted_rbufs_lock);
        while (qp->n_posted_rbufs)
            cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
        mutex_exit(&qp->posted_rbufs_lock);
        (void) ibt_free_channel(qp->qp_hdl);
        qp->qp_hdl = NULL;
    }

    ASSERT(qp->rdlist == NULL);

    if (qp->replylist != NULL) {
        (void) rib_rem_replylist(qp);
    }

    cv_destroy(&qp->cb_conn_cv);
    cv_destroy(&qp->posted_rbufs_cv);
    mutex_destroy(&qp->cb_lock);

    mutex_destroy(&qp->replylist_lock);
    mutex_destroy(&qp->posted_rbufs_lock);
    mutex_destroy(&qp->rdlist_lock);

    cv_destroy(&conn->c_cv);
    mutex_destroy(&conn->c_lock);

    if (conn->c_raddr.buf != NULL) {
        kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
    }
    if (conn->c_laddr.buf != NULL) {
        kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
    }

    /*
     * Credit control cleanup.
     */
    if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
        rdma_clnt_cred_ctrl_t *cc_info;
        cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
        cv_destroy(&cc_info->clnt_cc_cv);
    }

    kmem_free(qp, sizeof (rib_qp_t));

    /*
     * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
     * then the hca is no longer being used.
     */
    if (conn_list != NULL) {
        rw_enter(&hca->state_lock, RW_READER);
        if (hca->state == HCA_DETACHED) {
            rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
            if (hca->srv_conn_list.conn_hd == NULL) {
                rw_enter(&hca->cl_conn_list.conn_lock,
                    RW_READER);

                if (hca->cl_conn_list.conn_hd == NULL) {
                    mutex_enter(&hca->inuse_lock);
                    hca->inuse = FALSE;
                    cv_signal(&hca->cb_cv);
                    mutex_exit(&hca->inuse_lock);
                }
                rw_exit(&hca->cl_conn_list.conn_lock);
            }
            rw_exit(&hca->srv_conn_list.conn_lock);
        }
        rw_exit(&hca->state_lock);
    }

    return (RDMA_SUCCESS);
}

/*
 * Wait for send completion notification. Only on receiving a
 * notification, be it a successful or error completion, free the
 * send_wid.
 */
2018 */ 2019 static rdma_stat 2020 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 2021 { 2022 clock_t timout, cv_wait_ret; 2023 rdma_stat error = RDMA_SUCCESS; 2024 int i; 2025 2026 /* 2027 * Wait for send to complete 2028 */ 2029 ASSERT(wd != NULL); 2030 mutex_enter(&wd->sendwait_lock); 2031 if (wd->status == (uint_t)SEND_WAIT) { 2032 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 2033 ddi_get_lbolt(); 2034 2035 if (qp->mode == RIB_SERVER) { 2036 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, 2037 &wd->sendwait_lock, timout)) > 0 && 2038 wd->status == (uint_t)SEND_WAIT) 2039 ; 2040 switch (cv_wait_ret) { 2041 case -1: /* timeout */ 2042 DTRACE_PROBE(rpcib__i__srvsendwait__timeout); 2043 2044 wd->cv_sig = 0; /* no signal needed */ 2045 error = RDMA_TIMEDOUT; 2046 break; 2047 default: /* got send completion */ 2048 break; 2049 } 2050 } else { 2051 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, 2052 &wd->sendwait_lock, timout)) > 0 && 2053 wd->status == (uint_t)SEND_WAIT) 2054 ; 2055 switch (cv_wait_ret) { 2056 case -1: /* timeout */ 2057 DTRACE_PROBE(rpcib__i__clntsendwait__timeout); 2058 2059 wd->cv_sig = 0; /* no signal needed */ 2060 error = RDMA_TIMEDOUT; 2061 break; 2062 case 0: /* interrupted */ 2063 DTRACE_PROBE(rpcib__i__clntsendwait__intr); 2064 2065 wd->cv_sig = 0; /* no signal needed */ 2066 error = RDMA_INTR; 2067 break; 2068 default: /* got send completion */ 2069 break; 2070 } 2071 } 2072 } 2073 2074 if (wd->status != (uint_t)SEND_WAIT) { 2075 /* got send completion */ 2076 if (wd->status != RDMA_SUCCESS) { 2077 error = wd->status; 2078 if (wd->status != RDMA_CONNLOST) 2079 error = RDMA_FAILED; 2080 } 2081 for (i = 0; i < wd->nsbufs; i++) { 2082 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2083 (void *)(uintptr_t)wd->sbufaddr[i]); 2084 } 2085 mutex_exit(&wd->sendwait_lock); 2086 (void) rib_free_sendwait(wd); 2087 } else { 2088 mutex_exit(&wd->sendwait_lock); 2089 } 2090 return (error); 2091 } 2092 2093 static struct send_wid * 2094 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) 2095 { 2096 struct send_wid *wd; 2097 2098 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); 2099 wd->xid = xid; 2100 wd->cv_sig = cv_sig; 2101 wd->qp = qp; 2102 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); 2103 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); 2104 wd->status = (uint_t)SEND_WAIT; 2105 2106 return (wd); 2107 } 2108 2109 static int 2110 rib_free_sendwait(struct send_wid *wdesc) 2111 { 2112 cv_destroy(&wdesc->wait_cv); 2113 mutex_destroy(&wdesc->sendwait_lock); 2114 kmem_free(wdesc, sizeof (*wdesc)); 2115 2116 return (0); 2117 } 2118 2119 static rdma_stat 2120 rib_rem_rep(rib_qp_t *qp, struct reply *rep) 2121 { 2122 mutex_enter(&qp->replylist_lock); 2123 if (rep != NULL) { 2124 (void) rib_remreply(qp, rep); 2125 mutex_exit(&qp->replylist_lock); 2126 return (RDMA_SUCCESS); 2127 } 2128 mutex_exit(&qp->replylist_lock); 2129 return (RDMA_FAILED); 2130 } 2131 2132 /* 2133 * Send buffers are freed here only in case of error in posting 2134 * on QP. If the post succeeded, the send buffers are freed upon 2135 * send completion in rib_sendwait() or in the scq_handler. 
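 *
 * In other words, once ibt_post_send() succeeds, ownership of the
 * SEND_BUFFERs listed in the clist passes to the work request and the
 * completion path; only the failure path below gives them back, roughly:
 *
 *	if (conn->c_state != C_CONNECTED || ibt_status != IBT_SUCCESS) {
 *		for (i = 0; i < nds; i++)
 *			rib_rbuf_free(conn, SEND_BUFFER,
 *			    (void *)(uintptr_t)wdesc->sbufaddr[i]);
 *		(void) rib_free_sendwait(wdesc);
 *		return (RDMA_CONNLOST);
 *	}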
2136 */ 2137 rdma_stat 2138 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2139 int send_sig, int cv_sig, caddr_t *swid) 2140 { 2141 struct send_wid *wdesc; 2142 struct clist *clp; 2143 ibt_status_t ibt_status = IBT_SUCCESS; 2144 rdma_stat ret = RDMA_SUCCESS; 2145 ibt_send_wr_t tx_wr; 2146 int i, nds; 2147 ibt_wr_ds_t sgl[DSEG_MAX]; 2148 uint_t total_msg_size; 2149 rib_qp_t *qp; 2150 2151 qp = ctoqp(conn); 2152 2153 ASSERT(cl != NULL); 2154 2155 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2156 2157 nds = 0; 2158 total_msg_size = 0; 2159 clp = cl; 2160 while (clp != NULL) { 2161 if (nds >= DSEG_MAX) { 2162 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); 2163 return (RDMA_FAILED); 2164 } 2165 sgl[nds].ds_va = clp->w.c_saddr; 2166 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2167 sgl[nds].ds_len = clp->c_len; 2168 total_msg_size += clp->c_len; 2169 clp = clp->c_next; 2170 nds++; 2171 } 2172 2173 if (send_sig) { 2174 /* Set SEND_SIGNAL flag. */ 2175 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2176 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2177 *swid = (caddr_t)wdesc; 2178 } else { 2179 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2180 wdesc = rib_init_sendwait(msgid, 0, qp); 2181 *swid = (caddr_t)wdesc; 2182 } 2183 wdesc->nsbufs = nds; 2184 for (i = 0; i < nds; i++) { 2185 wdesc->sbufaddr[i] = sgl[i].ds_va; 2186 } 2187 2188 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2189 tx_wr.wr_opcode = IBT_WRC_SEND; 2190 tx_wr.wr_trans = IBT_RC_SRV; 2191 tx_wr.wr_nds = nds; 2192 tx_wr.wr_sgl = sgl; 2193 2194 mutex_enter(&conn->c_lock); 2195 if (conn->c_state == C_CONNECTED) { 2196 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2197 } 2198 if (conn->c_state != C_CONNECTED || 2199 ibt_status != IBT_SUCCESS) { 2200 if (conn->c_state != C_DISCONN_PEND) 2201 conn->c_state = C_ERROR_CONN; 2202 mutex_exit(&conn->c_lock); 2203 for (i = 0; i < nds; i++) { 2204 rib_rbuf_free(conn, SEND_BUFFER, 2205 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2206 } 2207 2208 (void) rib_free_sendwait(wdesc); 2209 2210 return (RDMA_CONNLOST); 2211 } 2212 mutex_exit(&conn->c_lock); 2213 2214 if (send_sig) { 2215 if (cv_sig) { 2216 /* 2217 * cv_wait for send to complete. 2218 * We can fail due to a timeout or signal or 2219 * unsuccessful send. 2220 */ 2221 ret = rib_sendwait(qp, wdesc); 2222 2223 return (ret); 2224 } 2225 } 2226 2227 return (RDMA_SUCCESS); 2228 } 2229 2230 2231 rdma_stat 2232 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2233 { 2234 rdma_stat ret; 2235 caddr_t wd; 2236 2237 /* send-wait & cv_signal */ 2238 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); 2239 return (ret); 2240 } 2241 2242 /* 2243 * Server interface (svc_rdma_ksend). 2244 * Send RPC reply and wait for RDMA_DONE. 
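 *
 * A sketch of the flow implemented below (not extra logic): queue an
 * rdma_done entry keyed by the RPC xid, post the reply without a
 * cv_signal on send completion, then block on rdma_done_cv until the
 * client's RDMA_DONE arrives or REPLY_WAIT_TIME expires:
 *
 *	rd = rdma_done_add(qp, msgid);
 *	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
 *	if (ret == RDMA_SUCCESS &&
 *	    cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock, timout) < 0)
 *		ret = RDMA_TIMEDOUT;
 *	rdma_done_rm(qp, rd);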
2245 */ 2246 rdma_stat 2247 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2248 { 2249 rdma_stat ret = RDMA_SUCCESS; 2250 struct rdma_done_list *rd; 2251 clock_t timout, cv_wait_ret; 2252 caddr_t *wid = NULL; 2253 rib_qp_t *qp = ctoqp(conn); 2254 2255 mutex_enter(&qp->rdlist_lock); 2256 rd = rdma_done_add(qp, msgid); 2257 2258 /* No cv_signal (whether send-wait or no-send-wait) */ 2259 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); 2260 2261 if (ret != RDMA_SUCCESS) { 2262 rdma_done_rm(qp, rd); 2263 } else { 2264 /* 2265 * Wait for RDMA_DONE from remote end 2266 */ 2267 timout = 2268 drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt(); 2269 cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, 2270 &qp->rdlist_lock, 2271 timout); 2272 2273 rdma_done_rm(qp, rd); 2274 2275 if (cv_wait_ret < 0) { 2276 ret = RDMA_TIMEDOUT; 2277 } 2278 } 2279 2280 mutex_exit(&qp->rdlist_lock); 2281 return (ret); 2282 } 2283 2284 static struct recv_wid * 2285 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2286 { 2287 struct recv_wid *rwid; 2288 2289 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2290 rwid->xid = msgid; 2291 rwid->addr = sgl->ds_va; 2292 rwid->qp = qp; 2293 2294 return (rwid); 2295 } 2296 2297 static void 2298 rib_free_wid(struct recv_wid *rwid) 2299 { 2300 kmem_free(rwid, sizeof (struct recv_wid)); 2301 } 2302 2303 rdma_stat 2304 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2305 { 2306 rib_qp_t *qp = ctoqp(conn); 2307 struct clist *clp = cl; 2308 struct reply *rep; 2309 struct recv_wid *rwid; 2310 int nds; 2311 ibt_wr_ds_t sgl[DSEG_MAX]; 2312 ibt_recv_wr_t recv_wr; 2313 rdma_stat ret; 2314 ibt_status_t ibt_status; 2315 2316 /* 2317 * rdma_clnt_postrecv uses RECV_BUFFER. 2318 */ 2319 2320 nds = 0; 2321 while (cl != NULL) { 2322 if (nds >= DSEG_MAX) { 2323 ret = RDMA_FAILED; 2324 goto done; 2325 } 2326 sgl[nds].ds_va = cl->w.c_saddr; 2327 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2328 sgl[nds].ds_len = cl->c_len; 2329 cl = cl->c_next; 2330 nds++; 2331 } 2332 2333 if (nds != 1) { 2334 ret = RDMA_FAILED; 2335 goto done; 2336 } 2337 2338 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2339 recv_wr.wr_nds = nds; 2340 recv_wr.wr_sgl = sgl; 2341 2342 rwid = rib_create_wid(qp, &sgl[0], msgid); 2343 if (rwid) { 2344 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; 2345 } else { 2346 ret = RDMA_NORESOURCE; 2347 goto done; 2348 } 2349 rep = rib_addreplylist(qp, msgid); 2350 if (!rep) { 2351 rib_free_wid(rwid); 2352 ret = RDMA_NORESOURCE; 2353 goto done; 2354 } 2355 2356 mutex_enter(&conn->c_lock); 2357 2358 if (conn->c_state == C_CONNECTED) { 2359 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2360 } 2361 2362 if (conn->c_state != C_CONNECTED || 2363 ibt_status != IBT_SUCCESS) { 2364 if (conn->c_state != C_DISCONN_PEND) 2365 conn->c_state = C_ERROR_CONN; 2366 mutex_exit(&conn->c_lock); 2367 rib_free_wid(rwid); 2368 (void) rib_rem_rep(qp, rep); 2369 ret = RDMA_CONNLOST; 2370 goto done; 2371 } 2372 mutex_exit(&conn->c_lock); 2373 return (RDMA_SUCCESS); 2374 2375 done: 2376 while (clp != NULL) { 2377 rib_rbuf_free(conn, RECV_BUFFER, 2378 (void *)(uintptr_t)clp->w.c_saddr3); 2379 clp = clp->c_next; 2380 } 2381 return (ret); 2382 } 2383 2384 rdma_stat 2385 rib_svc_post(CONN* conn, struct clist *cl) 2386 { 2387 rib_qp_t *qp = ctoqp(conn); 2388 struct svc_recv *s_recvp; 2389 int nds; 2390 ibt_wr_ds_t sgl[DSEG_MAX]; 2391 ibt_recv_wr_t recv_wr; 2392 ibt_status_t ibt_status; 2393 2394 nds = 0; 2395 while (cl != NULL) { 2396 if (nds >= DSEG_MAX) { 
2397 return (RDMA_FAILED); 2398 } 2399 sgl[nds].ds_va = cl->w.c_saddr; 2400 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2401 sgl[nds].ds_len = cl->c_len; 2402 cl = cl->c_next; 2403 nds++; 2404 } 2405 2406 if (nds != 1) { 2407 rib_rbuf_free(conn, RECV_BUFFER, 2408 (caddr_t)(uintptr_t)sgl[0].ds_va); 2409 2410 return (RDMA_FAILED); 2411 } 2412 2413 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2414 recv_wr.wr_nds = nds; 2415 recv_wr.wr_sgl = sgl; 2416 2417 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2418 /* Use s_recvp's addr as wr id */ 2419 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; 2420 mutex_enter(&conn->c_lock); 2421 if (conn->c_state == C_CONNECTED) { 2422 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2423 } 2424 if (conn->c_state != C_CONNECTED || 2425 ibt_status != IBT_SUCCESS) { 2426 if (conn->c_state != C_DISCONN_PEND) 2427 conn->c_state = C_ERROR_CONN; 2428 mutex_exit(&conn->c_lock); 2429 rib_rbuf_free(conn, RECV_BUFFER, 2430 (caddr_t)(uintptr_t)sgl[0].ds_va); 2431 (void) rib_free_svc_recv(s_recvp); 2432 2433 return (RDMA_CONNLOST); 2434 } 2435 mutex_exit(&conn->c_lock); 2436 2437 return (RDMA_SUCCESS); 2438 } 2439 2440 /* Client */ 2441 rdma_stat 2442 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2443 { 2444 2445 return (rib_clnt_post(conn, cl, msgid)); 2446 } 2447 2448 /* Client */ 2449 rdma_stat 2450 rib_post_resp_remove(CONN* conn, uint32_t msgid) 2451 { 2452 rib_qp_t *qp = ctoqp(conn); 2453 struct reply *rep; 2454 2455 mutex_enter(&qp->replylist_lock); 2456 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2457 if (rep->xid == msgid) { 2458 if (rep->vaddr_cq) { 2459 rib_rbuf_free(conn, RECV_BUFFER, 2460 (caddr_t)(uintptr_t)rep->vaddr_cq); 2461 } 2462 (void) rib_remreply(qp, rep); 2463 break; 2464 } 2465 } 2466 mutex_exit(&qp->replylist_lock); 2467 2468 return (RDMA_SUCCESS); 2469 } 2470 2471 /* Server */ 2472 rdma_stat 2473 rib_post_recv(CONN *conn, struct clist *cl) 2474 { 2475 rib_qp_t *qp = ctoqp(conn); 2476 2477 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2478 mutex_enter(&qp->posted_rbufs_lock); 2479 qp->n_posted_rbufs++; 2480 mutex_exit(&qp->posted_rbufs_lock); 2481 return (RDMA_SUCCESS); 2482 } 2483 return (RDMA_FAILED); 2484 } 2485 2486 /* 2487 * Client side only interface to "recv" the rpc reply buf 2488 * posted earlier by rib_post_resp(conn, cl, msgid). 2489 */ 2490 rdma_stat 2491 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2492 { 2493 struct reply *rep = NULL; 2494 clock_t timout, cv_wait_ret; 2495 rdma_stat ret = RDMA_SUCCESS; 2496 rib_qp_t *qp = ctoqp(conn); 2497 2498 /* 2499 * Find the reply structure for this msgid 2500 */ 2501 mutex_enter(&qp->replylist_lock); 2502 2503 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2504 if (rep->xid == msgid) 2505 break; 2506 } 2507 2508 if (rep != NULL) { 2509 /* 2510 * If message not yet received, wait. 
2511 */ 2512 if (rep->status == (uint_t)REPLY_WAIT) { 2513 timout = ddi_get_lbolt() + 2514 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2515 2516 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2517 &qp->replylist_lock, timout)) > 0 && 2518 rep->status == (uint_t)REPLY_WAIT) 2519 ; 2520 2521 switch (cv_wait_ret) { 2522 case -1: /* timeout */ 2523 ret = RDMA_TIMEDOUT; 2524 break; 2525 case 0: 2526 ret = RDMA_INTR; 2527 break; 2528 default: 2529 break; 2530 } 2531 } 2532 2533 if (rep->status == RDMA_SUCCESS) { 2534 struct clist *cl = NULL; 2535 2536 /* 2537 * Got message successfully 2538 */ 2539 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2540 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); 2541 *clp = cl; 2542 } else { 2543 if (rep->status != (uint_t)REPLY_WAIT) { 2544 /* 2545 * Got error in reply message. Free 2546 * recv buffer here. 2547 */ 2548 ret = rep->status; 2549 rib_rbuf_free(conn, RECV_BUFFER, 2550 (caddr_t)(uintptr_t)rep->vaddr_cq); 2551 } 2552 } 2553 (void) rib_remreply(qp, rep); 2554 } else { 2555 /* 2556 * No matching reply structure found for given msgid on the 2557 * reply wait list. 2558 */ 2559 ret = RDMA_INVAL; 2560 DTRACE_PROBE(rpcib__i__nomatchxid2); 2561 } 2562 2563 /* 2564 * Done. 2565 */ 2566 mutex_exit(&qp->replylist_lock); 2567 return (ret); 2568 } 2569 2570 /* 2571 * RDMA write a buffer to the remote address. 2572 */ 2573 rdma_stat 2574 rib_write(CONN *conn, struct clist *cl, int wait) 2575 { 2576 ibt_send_wr_t tx_wr; 2577 int cv_sig; 2578 int i; 2579 ibt_wr_ds_t sgl[DSEG_MAX]; 2580 struct send_wid *wdesc; 2581 ibt_status_t ibt_status; 2582 rdma_stat ret = RDMA_SUCCESS; 2583 rib_qp_t *qp = ctoqp(conn); 2584 uint64_t n_writes = 0; 2585 bool_t force_wait = FALSE; 2586 2587 if (cl == NULL) { 2588 return (RDMA_FAILED); 2589 } 2590 2591 2592 while ((cl != NULL)) { 2593 if (cl->c_len > 0) { 2594 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2595 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; 2596 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = 2597 cl->c_dmemhandle.mrc_rmr; /* rkey */ 2598 sgl[0].ds_va = cl->w.c_saddr; 2599 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2600 sgl[0].ds_len = cl->c_len; 2601 2602 if (wait) { 2603 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2604 cv_sig = 1; 2605 } else { 2606 if (n_writes > max_unsignaled_rws) { 2607 n_writes = 0; 2608 force_wait = TRUE; 2609 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2610 cv_sig = 1; 2611 } else { 2612 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2613 cv_sig = 0; 2614 } 2615 } 2616 2617 wdesc = rib_init_sendwait(0, cv_sig, qp); 2618 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2619 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2620 tx_wr.wr_trans = IBT_RC_SRV; 2621 tx_wr.wr_nds = 1; 2622 tx_wr.wr_sgl = sgl; 2623 2624 mutex_enter(&conn->c_lock); 2625 if (conn->c_state == C_CONNECTED) { 2626 ibt_status = 2627 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2628 } 2629 if (conn->c_state != C_CONNECTED || 2630 ibt_status != IBT_SUCCESS) { 2631 if (conn->c_state != C_DISCONN_PEND) 2632 conn->c_state = C_ERROR_CONN; 2633 mutex_exit(&conn->c_lock); 2634 (void) rib_free_sendwait(wdesc); 2635 return (RDMA_CONNLOST); 2636 } 2637 mutex_exit(&conn->c_lock); 2638 2639 /* 2640 * Wait for send to complete 2641 */ 2642 if (wait || force_wait) { 2643 force_wait = FALSE; 2644 ret = rib_sendwait(qp, wdesc); 2645 if (ret != 0) { 2646 return (ret); 2647 } 2648 } else { 2649 mutex_enter(&wdesc->sendwait_lock); 2650 for (i = 0; i < wdesc->nsbufs; i++) { 2651 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2652 (void *)(uintptr_t) 2653 wdesc->sbufaddr[i]); 2654 } 2655 
mutex_exit(&wdesc->sendwait_lock); 2656 (void) rib_free_sendwait(wdesc); 2657 } 2658 n_writes ++; 2659 } 2660 cl = cl->c_next; 2661 } 2662 return (RDMA_SUCCESS); 2663 } 2664 2665 /* 2666 * RDMA Read a buffer from the remote address. 2667 */ 2668 rdma_stat 2669 rib_read(CONN *conn, struct clist *cl, int wait) 2670 { 2671 ibt_send_wr_t rx_wr; 2672 int cv_sig; 2673 int i; 2674 ibt_wr_ds_t sgl; 2675 struct send_wid *wdesc; 2676 ibt_status_t ibt_status = IBT_SUCCESS; 2677 rdma_stat ret = RDMA_SUCCESS; 2678 rib_qp_t *qp = ctoqp(conn); 2679 2680 if (cl == NULL) { 2681 return (RDMA_FAILED); 2682 } 2683 2684 while (cl != NULL) { 2685 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2686 /* 2687 * Remote address is at the head chunk item in list. 2688 */ 2689 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; 2690 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; 2691 2692 sgl.ds_va = cl->u.c_daddr; 2693 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2694 sgl.ds_len = cl->c_len; 2695 2696 if (wait) { 2697 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2698 cv_sig = 1; 2699 } else { 2700 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2701 cv_sig = 0; 2702 } 2703 2704 wdesc = rib_init_sendwait(0, cv_sig, qp); 2705 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2706 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2707 rx_wr.wr_trans = IBT_RC_SRV; 2708 rx_wr.wr_nds = 1; 2709 rx_wr.wr_sgl = &sgl; 2710 2711 mutex_enter(&conn->c_lock); 2712 if (conn->c_state == C_CONNECTED) { 2713 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2714 } 2715 if (conn->c_state != C_CONNECTED || 2716 ibt_status != IBT_SUCCESS) { 2717 if (conn->c_state != C_DISCONN_PEND) 2718 conn->c_state = C_ERROR_CONN; 2719 mutex_exit(&conn->c_lock); 2720 (void) rib_free_sendwait(wdesc); 2721 return (RDMA_CONNLOST); 2722 } 2723 mutex_exit(&conn->c_lock); 2724 2725 /* 2726 * Wait for send to complete if this is the 2727 * last item in the list. 2728 */ 2729 if (wait && cl->c_next == NULL) { 2730 ret = rib_sendwait(qp, wdesc); 2731 if (ret != 0) { 2732 return (ret); 2733 } 2734 } else { 2735 mutex_enter(&wdesc->sendwait_lock); 2736 for (i = 0; i < wdesc->nsbufs; i++) { 2737 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2738 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2739 } 2740 mutex_exit(&wdesc->sendwait_lock); 2741 (void) rib_free_sendwait(wdesc); 2742 } 2743 cl = cl->c_next; 2744 } 2745 return (RDMA_SUCCESS); 2746 } 2747 2748 /* 2749 * rib_srv_cm_handler() 2750 * Connection Manager callback to handle RC connection requests. 2751 */ 2752 /* ARGSUSED */ 2753 static ibt_cm_status_t 2754 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2755 ibt_cm_return_args_t *ret_args, void *priv_data, 2756 ibt_priv_data_len_t len) 2757 { 2758 queue_t *q; 2759 rib_qp_t *qp; 2760 rpcib_state_t *ribstat; 2761 rib_hca_t *hca; 2762 rdma_stat status = RDMA_SUCCESS; 2763 int i; 2764 struct clist cl; 2765 rdma_buf_t rdbuf = {0}; 2766 void *buf = NULL; 2767 CONN *conn; 2768 ibt_ip_cm_info_t ipinfo; 2769 struct sockaddr_in *s; 2770 struct sockaddr_in6 *s6; 2771 int sin_size = sizeof (struct sockaddr_in); 2772 int in_size = sizeof (struct in_addr); 2773 int sin6_size = sizeof (struct sockaddr_in6); 2774 2775 ASSERT(any != NULL); 2776 ASSERT(event != NULL); 2777 2778 ribstat = (rpcib_state_t *)any; 2779 hca = (rib_hca_t *)ribstat->hca; 2780 ASSERT(hca != NULL); 2781 2782 /* got a connection request */ 2783 switch (event->cm_type) { 2784 case IBT_CM_EVENT_REQ_RCV: 2785 /* 2786 * If the plugin is in the NO_ACCEPT state, bail out. 
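 *
 * plugin_state is moved to ACCEPT by rib_register_service() once at
 * least one service binding succeeds, and back to NO_ACCEPT by
 * rib_listen_stop(); while it is NO_ACCEPT every incoming
 * IBT_CM_EVENT_REQ_RCV is rejected here with IBT_CM_REJECT.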
2787 */ 2788 mutex_enter(&plugin_state_lock); 2789 if (plugin_state == NO_ACCEPT) { 2790 mutex_exit(&plugin_state_lock); 2791 return (IBT_CM_REJECT); 2792 } 2793 mutex_exit(&plugin_state_lock); 2794 2795 /* 2796 * Need to send a MRA MAD to CM so that it does not 2797 * timeout on us. 2798 */ 2799 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2800 event->cm_event.req.req_timeout * 8, NULL, 0); 2801 2802 mutex_enter(&rib_stat->open_hca_lock); 2803 q = rib_stat->q; 2804 mutex_exit(&rib_stat->open_hca_lock); 2805 2806 status = rib_svc_create_chan(hca, (caddr_t)q, 2807 event->cm_event.req.req_prim_hca_port, &qp); 2808 2809 if (status) { 2810 return (IBT_CM_REJECT); 2811 } 2812 2813 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2814 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 2815 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 2816 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2817 2818 /* 2819 * Pre-posts RECV buffers 2820 */ 2821 conn = qptoc(qp); 2822 for (i = 0; i < preposted_rbufs; i++) { 2823 bzero(&rdbuf, sizeof (rdbuf)); 2824 rdbuf.type = RECV_BUFFER; 2825 buf = rib_rbuf_alloc(conn, &rdbuf); 2826 if (buf == NULL) { 2827 (void) rib_disconnect_channel(conn, NULL); 2828 return (IBT_CM_REJECT); 2829 } 2830 2831 bzero(&cl, sizeof (cl)); 2832 cl.w.c_saddr3 = (caddr_t)rdbuf.addr; 2833 cl.c_len = rdbuf.len; 2834 cl.c_smemhandle.mrc_lmr = 2835 rdbuf.handle.mrc_lmr; /* lkey */ 2836 cl.c_next = NULL; 2837 status = rib_post_recv(conn, &cl); 2838 if (status != RDMA_SUCCESS) { 2839 (void) rib_disconnect_channel(conn, NULL); 2840 return (IBT_CM_REJECT); 2841 } 2842 } 2843 (void) rib_add_connlist(conn, &hca->srv_conn_list); 2844 2845 /* 2846 * Get the address translation 2847 */ 2848 rw_enter(&hca->state_lock, RW_READER); 2849 if (hca->state == HCA_DETACHED) { 2850 rw_exit(&hca->state_lock); 2851 return (IBT_CM_REJECT); 2852 } 2853 rw_exit(&hca->state_lock); 2854 2855 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t)); 2856 2857 if (ibt_get_ip_data(event->cm_priv_data_len, 2858 event->cm_priv_data, 2859 &ipinfo) != IBT_SUCCESS) { 2860 2861 return (IBT_CM_REJECT); 2862 } 2863 2864 switch (ipinfo.src_addr.family) { 2865 case AF_INET: 2866 2867 conn->c_raddr.maxlen = 2868 conn->c_raddr.len = sin_size; 2869 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 2870 2871 s = (struct sockaddr_in *)conn->c_raddr.buf; 2872 s->sin_family = AF_INET; 2873 2874 bcopy((void *)&ipinfo.src_addr.un.ip4addr, 2875 &s->sin_addr, in_size); 2876 2877 break; 2878 2879 case AF_INET6: 2880 2881 conn->c_raddr.maxlen = 2882 conn->c_raddr.len = sin6_size; 2883 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 2884 2885 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 2886 s6->sin6_family = AF_INET6; 2887 bcopy((void *)&ipinfo.src_addr.un.ip6addr, 2888 &s6->sin6_addr, 2889 sizeof (struct in6_addr)); 2890 2891 break; 2892 2893 default: 2894 return (IBT_CM_REJECT); 2895 } 2896 2897 break; 2898 2899 case IBT_CM_EVENT_CONN_CLOSED: 2900 { 2901 CONN *conn; 2902 rib_qp_t *qp; 2903 2904 switch (event->cm_event.closed) { 2905 case IBT_CM_CLOSED_DREP_RCVD: 2906 case IBT_CM_CLOSED_DREQ_TIMEOUT: 2907 case IBT_CM_CLOSED_DUP: 2908 case IBT_CM_CLOSED_ABORT: 2909 case IBT_CM_CLOSED_ALREADY: 2910 /* 2911 * These cases indicate the local end initiated 2912 * the closing of the channel. Nothing to do here. 2913 */ 2914 break; 2915 default: 2916 /* 2917 * Reason for CONN_CLOSED event must be one of 2918 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 2919 * or IBT_CM_CLOSED_STALE. 
These indicate cases were 2920 * the remote end is closing the channel. In these 2921 * cases free the channel and transition to error 2922 * state 2923 */ 2924 qp = ibt_get_chan_private(event->cm_channel); 2925 conn = qptoc(qp); 2926 mutex_enter(&conn->c_lock); 2927 if (conn->c_state == C_DISCONN_PEND) { 2928 mutex_exit(&conn->c_lock); 2929 break; 2930 } 2931 conn->c_state = C_ERROR_CONN; 2932 2933 /* 2934 * Free the rc_channel. Channel has already 2935 * transitioned to ERROR state and WRs have been 2936 * FLUSHED_ERR already. 2937 */ 2938 (void) ibt_free_channel(qp->qp_hdl); 2939 qp->qp_hdl = NULL; 2940 2941 /* 2942 * Free the conn if c_ref goes down to 0 2943 */ 2944 if (conn->c_ref == 0) { 2945 /* 2946 * Remove from list and free conn 2947 */ 2948 conn->c_state = C_DISCONN_PEND; 2949 mutex_exit(&conn->c_lock); 2950 (void) rib_disconnect_channel(conn, 2951 &hca->srv_conn_list); 2952 } else { 2953 mutex_exit(&conn->c_lock); 2954 } 2955 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect); 2956 break; 2957 } 2958 break; 2959 } 2960 case IBT_CM_EVENT_CONN_EST: 2961 /* 2962 * RTU received, hence connection established. 2963 */ 2964 if (rib_debug > 1) 2965 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2966 "(CONN_EST) channel established"); 2967 break; 2968 2969 default: 2970 if (rib_debug > 2) { 2971 /* Let CM handle the following events. */ 2972 if (event->cm_type == IBT_CM_EVENT_REP_RCV) { 2973 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2974 "server recv'ed IBT_CM_EVENT_REP_RCV\n"); 2975 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) { 2976 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2977 "server recv'ed IBT_CM_EVENT_LAP_RCV\n"); 2978 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) { 2979 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2980 "server recv'ed IBT_CM_EVENT_MRA_RCV\n"); 2981 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) { 2982 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2983 "server recv'ed IBT_CM_EVENT_APR_RCV\n"); 2984 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) { 2985 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2986 "server recv'ed IBT_CM_EVENT_FAILURE\n"); 2987 } 2988 } 2989 return (IBT_CM_DEFAULT); 2990 } 2991 2992 /* accept all other CM messages (i.e. 
let the CM handle them) */ 2993 return (IBT_CM_ACCEPT); 2994 } 2995 2996 static rdma_stat 2997 rib_register_service(rib_hca_t *hca, int service_type) 2998 { 2999 ibt_srv_desc_t sdesc; 3000 ibt_hca_portinfo_t *port_infop; 3001 ib_svc_id_t srv_id; 3002 ibt_srv_hdl_t srv_hdl; 3003 uint_t port_size; 3004 uint_t pki, i, num_ports, nbinds; 3005 ibt_status_t ibt_status; 3006 rib_service_t *new_service; 3007 ib_pkey_t pkey; 3008 3009 /* 3010 * Query all ports for the given HCA 3011 */ 3012 rw_enter(&hca->state_lock, RW_READER); 3013 if (hca->state != HCA_DETACHED) { 3014 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop, 3015 &num_ports, &port_size); 3016 rw_exit(&hca->state_lock); 3017 } else { 3018 rw_exit(&hca->state_lock); 3019 return (RDMA_FAILED); 3020 } 3021 if (ibt_status != IBT_SUCCESS) { 3022 return (RDMA_FAILED); 3023 } 3024 3025 DTRACE_PROBE1(rpcib__i__regservice_numports, 3026 int, num_ports); 3027 3028 for (i = 0; i < num_ports; i++) { 3029 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) { 3030 DTRACE_PROBE1(rpcib__i__regservice__portinactive, 3031 int, i+1); 3032 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) { 3033 DTRACE_PROBE1(rpcib__i__regservice__portactive, 3034 int, i+1); 3035 } 3036 } 3037 3038 /* 3039 * Get all the IP addresses on this system to register the 3040 * given "service type" on all DNS recognized IP addrs. 3041 * Each service type such as NFS will have all the systems 3042 * IP addresses as its different names. For now the only 3043 * type of service we support in RPCIB is NFS. 3044 */ 3045 rw_enter(&hca->service_list_lock, RW_WRITER); 3046 /* 3047 * Start registering and binding service to active 3048 * on active ports on this HCA. 3049 */ 3050 nbinds = 0; 3051 new_service = NULL; 3052 3053 /* 3054 * We use IP addresses as the service names for 3055 * service registration. Register each of them 3056 * with CM to obtain a svc_id and svc_hdl. We do not 3057 * register the service with machine's loopback address. 
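 *
 * In the code below the service id itself is derived from the
 * well-known NFS/RDMA port rather than from each individual address;
 * a sketch of the registration and per-port binding performed here:
 *
 *	ibt_status = ibt_register_service(hca->ibt_clnt_hdl, &sdesc,
 *	    ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT), 1,
 *	    &srv_hdl, &srv_id);
 *	for each active port and full-membership pkey:
 *		ibt_status = ibt_bind_service(srv_hdl,
 *		    port_infop[i].p_sgid_tbl[0], NULL, rib_stat, NULL);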
3058 */ 3059 (void) bzero(&srv_id, sizeof (ib_svc_id_t)); 3060 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t)); 3061 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t)); 3062 3063 sdesc.sd_handler = rib_srv_cm_handler; 3064 sdesc.sd_flags = 0; 3065 ibt_status = ibt_register_service(hca->ibt_clnt_hdl, 3066 &sdesc, ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT), 3067 1, &srv_hdl, &srv_id); 3068 3069 for (i = 0; i < num_ports; i++) { 3070 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) 3071 continue; 3072 3073 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) { 3074 pkey = port_infop[i].p_pkey_tbl[pki]; 3075 if ((pkey & IBSRM_HB) && 3076 (pkey != IB_PKEY_INVALID_FULL)) { 3077 3078 /* 3079 * Allocate and prepare a service entry 3080 */ 3081 new_service = 3082 kmem_zalloc(1 * sizeof (rib_service_t), 3083 KM_SLEEP); 3084 3085 new_service->srv_type = service_type; 3086 new_service->srv_hdl = srv_hdl; 3087 new_service->srv_next = NULL; 3088 3089 ibt_status = ibt_bind_service(srv_hdl, 3090 port_infop[i].p_sgid_tbl[0], 3091 NULL, rib_stat, NULL); 3092 3093 DTRACE_PROBE1(rpcib__i__regservice__bindres, 3094 int, ibt_status); 3095 3096 if (ibt_status != IBT_SUCCESS) { 3097 kmem_free(new_service, 3098 sizeof (rib_service_t)); 3099 new_service = NULL; 3100 continue; 3101 } 3102 3103 /* 3104 * Add to the service list for this HCA 3105 */ 3106 new_service->srv_next = hca->service_list; 3107 hca->service_list = new_service; 3108 new_service = NULL; 3109 nbinds++; 3110 } 3111 } 3112 } 3113 rw_exit(&hca->service_list_lock); 3114 3115 ibt_free_portinfo(port_infop, port_size); 3116 3117 if (nbinds == 0) { 3118 return (RDMA_FAILED); 3119 } else { 3120 /* 3121 * Put this plugin into accept state, since atleast 3122 * one registration was successful. 3123 */ 3124 mutex_enter(&plugin_state_lock); 3125 plugin_state = ACCEPT; 3126 mutex_exit(&plugin_state_lock); 3127 return (RDMA_SUCCESS); 3128 } 3129 } 3130 3131 void 3132 rib_listen(struct rdma_svc_data *rd) 3133 { 3134 rdma_stat status = RDMA_SUCCESS; 3135 3136 rd->active = 0; 3137 rd->err_code = RDMA_FAILED; 3138 3139 /* 3140 * First check if a hca is still attached 3141 */ 3142 rw_enter(&rib_stat->hca->state_lock, RW_READER); 3143 if (rib_stat->hca->state != HCA_INITED) { 3144 rw_exit(&rib_stat->hca->state_lock); 3145 return; 3146 } 3147 rw_exit(&rib_stat->hca->state_lock); 3148 3149 rib_stat->q = &rd->q; 3150 /* 3151 * Right now the only service type is NFS. Hence force feed this 3152 * value. Ideally to communicate the service type it should be 3153 * passed down in rdma_svc_data. 3154 */ 3155 rib_stat->service_type = NFS; 3156 status = rib_register_service(rib_stat->hca, NFS); 3157 if (status != RDMA_SUCCESS) { 3158 rd->err_code = status; 3159 return; 3160 } 3161 /* 3162 * Service active on an HCA, check rd->err_code for more 3163 * explainable errors. 3164 */ 3165 rd->active = 1; 3166 rd->err_code = status; 3167 } 3168 3169 /* XXXX */ 3170 /* ARGSUSED */ 3171 static void 3172 rib_listen_stop(struct rdma_svc_data *svcdata) 3173 { 3174 rib_hca_t *hca; 3175 3176 /* 3177 * KRPC called the RDMATF to stop the listeners, this means 3178 * stop sending incomming or recieved requests to KRPC master 3179 * transport handle for RDMA-IB. This is also means that the 3180 * master transport handle, responsible for us, is going away. 
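 *
 * Concretely, the steps below are: flip plugin_state to NO_ACCEPT (so
 * rib_srv_cm_handler() rejects any further connection requests), mark
 * the svc data inactive, then close all server-side channels and
 * unbind/deregister the CM services, roughly:
 *
 *	plugin_state = NO_ACCEPT;
 *	rib_close_channels(&hca->srv_conn_list);
 *	rib_stop_services(hca);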
3181 */ 3182 mutex_enter(&plugin_state_lock); 3183 plugin_state = NO_ACCEPT; 3184 if (svcdata != NULL) 3185 svcdata->active = 0; 3186 mutex_exit(&plugin_state_lock); 3187 3188 /* 3189 * First check if a hca is still attached 3190 */ 3191 hca = rib_stat->hca; 3192 rw_enter(&hca->state_lock, RW_READER); 3193 if (hca->state != HCA_INITED) { 3194 rw_exit(&hca->state_lock); 3195 return; 3196 } 3197 rib_close_channels(&hca->srv_conn_list); 3198 rib_stop_services(hca); 3199 rw_exit(&hca->state_lock); 3200 } 3201 3202 /* 3203 * Traverse the HCA's service list to unbind and deregister services. 3204 * Instead of unbinding the service for a service handle by 3205 * calling ibt_unbind_service() for each port/pkey, we unbind 3206 * all the services for the service handle by making only one 3207 * call to ibt_unbind_all_services(). Then, we deregister the 3208 * service for the service handle. 3209 * 3210 * When traversing the entries in service_list, we compare the 3211 * srv_hdl of the current entry with that of the next. If they 3212 * are different or if the next entry is NULL, the current entry 3213 * marks the last binding of the service handle. In this case, 3214 * call ibt_unbind_all_services() and deregister the service for 3215 * the service handle. If they are the same, the current and the 3216 * next entries are bound to the same service handle. In this 3217 * case, move on to the next entry. 3218 */ 3219 static void 3220 rib_stop_services(rib_hca_t *hca) 3221 { 3222 rib_service_t *srv_list, *to_remove; 3223 3224 /* 3225 * unbind and deregister the services for this service type. 3226 * Right now there is only one service type. In future it will 3227 * be passed down to this function. 3228 */ 3229 rw_enter(&hca->service_list_lock, RW_WRITER); 3230 srv_list = hca->service_list; 3231 while (srv_list != NULL) { 3232 to_remove = srv_list; 3233 srv_list = to_remove->srv_next; 3234 if (srv_list == NULL || bcmp(to_remove->srv_hdl, 3235 srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) { 3236 3237 (void) ibt_unbind_all_services(to_remove->srv_hdl); 3238 (void) ibt_deregister_service(hca->ibt_clnt_hdl, 3239 to_remove->srv_hdl); 3240 } 3241 3242 kmem_free(to_remove, sizeof (rib_service_t)); 3243 } 3244 hca->service_list = NULL; 3245 rw_exit(&hca->service_list_lock); 3246 } 3247 3248 static struct svc_recv * 3249 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) 3250 { 3251 struct svc_recv *recvp; 3252 3253 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); 3254 recvp->vaddr = sgl->ds_va; 3255 recvp->qp = qp; 3256 recvp->bytes_xfer = 0; 3257 return (recvp); 3258 } 3259 3260 static int 3261 rib_free_svc_recv(struct svc_recv *recvp) 3262 { 3263 kmem_free(recvp, sizeof (*recvp)); 3264 3265 return (0); 3266 } 3267 3268 static struct reply * 3269 rib_addreplylist(rib_qp_t *qp, uint32_t msgid) 3270 { 3271 struct reply *rep; 3272 3273 3274 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); 3275 if (rep == NULL) { 3276 DTRACE_PROBE(rpcib__i__addrreply__nomem); 3277 return (NULL); 3278 } 3279 rep->xid = msgid; 3280 rep->vaddr_cq = NULL; 3281 rep->bytes_xfer = 0; 3282 rep->status = (uint_t)REPLY_WAIT; 3283 rep->prev = NULL; 3284 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); 3285 3286 mutex_enter(&qp->replylist_lock); 3287 if (qp->replylist) { 3288 rep->next = qp->replylist; 3289 qp->replylist->prev = rep; 3290 } 3291 qp->rep_list_size++; 3292 3293 DTRACE_PROBE1(rpcib__i__addrreply__listsize, 3294 int, qp->rep_list_size); 3295 3296 qp->replylist = rep; 3297 mutex_exit(&qp->replylist_lock); 3298 3299 return 
(rep); 3300 } 3301 3302 static rdma_stat 3303 rib_rem_replylist(rib_qp_t *qp) 3304 { 3305 struct reply *r, *n; 3306 3307 mutex_enter(&qp->replylist_lock); 3308 for (r = qp->replylist; r != NULL; r = n) { 3309 n = r->next; 3310 (void) rib_remreply(qp, r); 3311 } 3312 mutex_exit(&qp->replylist_lock); 3313 3314 return (RDMA_SUCCESS); 3315 } 3316 3317 static int 3318 rib_remreply(rib_qp_t *qp, struct reply *rep) 3319 { 3320 3321 ASSERT(MUTEX_HELD(&qp->replylist_lock)); 3322 if (rep->prev) { 3323 rep->prev->next = rep->next; 3324 } 3325 if (rep->next) { 3326 rep->next->prev = rep->prev; 3327 } 3328 if (qp->replylist == rep) 3329 qp->replylist = rep->next; 3330 3331 cv_destroy(&rep->wait_cv); 3332 qp->rep_list_size--; 3333 3334 DTRACE_PROBE1(rpcib__i__remreply__listsize, 3335 int, qp->rep_list_size); 3336 3337 kmem_free(rep, sizeof (*rep)); 3338 3339 return (0); 3340 } 3341 3342 rdma_stat 3343 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3344 struct mrc *buf_handle) 3345 { 3346 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3347 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3348 rdma_stat status; 3349 rib_hca_t *hca = (ctoqp(conn))->hca; 3350 3351 /* 3352 * Note: ALL buffer pools use the same memory type RDMARW. 3353 */ 3354 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3355 if (status == RDMA_SUCCESS) { 3356 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3357 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3358 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3359 } else { 3360 buf_handle->mrc_linfo = NULL; 3361 buf_handle->mrc_lmr = 0; 3362 buf_handle->mrc_rmr = 0; 3363 } 3364 return (status); 3365 } 3366 3367 static rdma_stat 3368 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, 3369 ibt_mr_flags_t spec, 3370 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3371 { 3372 ibt_mr_attr_t mem_attr; 3373 ibt_status_t ibt_status; 3374 mem_attr.mr_vaddr = (uintptr_t)buf; 3375 mem_attr.mr_len = (ib_msglen_t)size; 3376 mem_attr.mr_as = (struct as *)(caddr_t)adsp; 3377 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3378 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3379 IBT_MR_ENABLE_WINDOW_BIND | spec; 3380 3381 rw_enter(&hca->state_lock, RW_READER); 3382 if (hca->state == HCA_INITED) { 3383 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3384 &mem_attr, mr_hdlp, mr_descp); 3385 rw_exit(&hca->state_lock); 3386 } else { 3387 rw_exit(&hca->state_lock); 3388 return (RDMA_FAILED); 3389 } 3390 3391 if (ibt_status != IBT_SUCCESS) { 3392 return (RDMA_FAILED); 3393 } 3394 return (RDMA_SUCCESS); 3395 } 3396 3397 rdma_stat 3398 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3399 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 3400 { 3401 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3402 rib_lrc_entry_t *l; 3403 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3404 rdma_stat status; 3405 rib_hca_t *hca = (ctoqp(conn))->hca; 3406 3407 /* 3408 * Non-coherent memory registration. 
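 *
 * When a server-side buffer cache entry (rib_lrc_entry_t, passed in
 * through the opaque lrc argument) is supplied and has already been
 * registered, its cached lkey/rkey/handle are simply reused; otherwise
 * the whole cache buffer is registered once with rib_reg_mem() and the
 * resulting handle is remembered in the entry for later calls.  A
 * sketch of the reuse path below:
 *
 *	if (l != NULL && l->registered) {
 *		buf_handle->mrc_linfo = (uintptr_t)l->lrc_mhandle.mrc_linfo;
 *		return (RDMA_SUCCESS);
 *	}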
3409 */ 3410 l = (rib_lrc_entry_t *)lrc; 3411 if (l) { 3412 if (l->registered) { 3413 buf_handle->mrc_linfo = 3414 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3415 buf_handle->mrc_lmr = 3416 (uint32_t)l->lrc_mhandle.mrc_lmr; 3417 buf_handle->mrc_rmr = 3418 (uint32_t)l->lrc_mhandle.mrc_rmr; 3419 *sync_handle = (RIB_SYNCMEM_HANDLE) 3420 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3421 return (RDMA_SUCCESS); 3422 } else { 3423 /* Always register the whole buffer */ 3424 buf = (caddr_t)l->lrc_buf; 3425 buflen = l->lrc_len; 3426 } 3427 } 3428 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3429 3430 if (status == RDMA_SUCCESS) { 3431 if (l) { 3432 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; 3433 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey; 3434 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey; 3435 l->registered = TRUE; 3436 } 3437 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3438 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3439 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3440 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3441 } else { 3442 buf_handle->mrc_linfo = NULL; 3443 buf_handle->mrc_lmr = 0; 3444 buf_handle->mrc_rmr = 0; 3445 } 3446 return (status); 3447 } 3448 3449 /* ARGSUSED */ 3450 rdma_stat 3451 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3452 { 3453 rib_hca_t *hca = (ctoqp(conn))->hca; 3454 /* 3455 * Allow memory deregistration even if HCA is 3456 * getting detached. Need all outstanding 3457 * memory registrations to be deregistered 3458 * before HCA_DETACH_EVENT can be accepted. 3459 */ 3460 (void) ibt_deregister_mr(hca->hca_hdl, 3461 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 3462 return (RDMA_SUCCESS); 3463 } 3464 3465 /* ARGSUSED */ 3466 rdma_stat 3467 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3468 RIB_SYNCMEM_HANDLE sync_handle, void *lrc) 3469 { 3470 rib_lrc_entry_t *l; 3471 l = (rib_lrc_entry_t *)lrc; 3472 if (l) 3473 if (l->registered) 3474 return (RDMA_SUCCESS); 3475 3476 (void) rib_deregistermem(conn, buf, buf_handle); 3477 3478 return (RDMA_SUCCESS); 3479 } 3480 3481 /* ARGSUSED */ 3482 rdma_stat 3483 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3484 int len, int cpu) 3485 { 3486 ibt_status_t status; 3487 rib_hca_t *hca = (ctoqp(conn))->hca; 3488 ibt_mr_sync_t mr_segment; 3489 3490 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3491 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 3492 mr_segment.ms_len = (ib_memlen_t)len; 3493 if (cpu) { 3494 /* make incoming data visible to memory */ 3495 mr_segment.ms_flags = IBT_SYNC_WRITE; 3496 } else { 3497 /* make memory changes visible to IO */ 3498 mr_segment.ms_flags = IBT_SYNC_READ; 3499 } 3500 rw_enter(&hca->state_lock, RW_READER); 3501 if (hca->state == HCA_INITED) { 3502 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3503 rw_exit(&hca->state_lock); 3504 } else { 3505 rw_exit(&hca->state_lock); 3506 return (RDMA_FAILED); 3507 } 3508 3509 if (status == IBT_SUCCESS) 3510 return (RDMA_SUCCESS); 3511 else { 3512 return (RDMA_FAILED); 3513 } 3514 } 3515 3516 /* 3517 * XXXX ???? 3518 */ 3519 static rdma_stat 3520 rib_getinfo(rdma_info_t *info) 3521 { 3522 /* 3523 * XXXX Hack! 
3524 */ 3525 info->addrlen = 16; 3526 info->mts = 1000000; 3527 info->mtu = 1000000; 3528 3529 return (RDMA_SUCCESS); 3530 } 3531 3532 rib_bufpool_t * 3533 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3534 { 3535 rib_bufpool_t *rbp = NULL; 3536 bufpool_t *bp = NULL; 3537 caddr_t buf; 3538 ibt_mr_attr_t mem_attr; 3539 ibt_status_t ibt_status; 3540 int i, j; 3541 3542 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3543 3544 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3545 num * sizeof (void *), KM_SLEEP); 3546 3547 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3548 bp->numelems = num; 3549 3550 3551 switch (ptype) { 3552 case SEND_BUFFER: 3553 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3554 bp->rsize = RPC_MSG_SZ; 3555 break; 3556 case RECV_BUFFER: 3557 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3558 bp->rsize = RPC_BUF_SIZE; 3559 break; 3560 default: 3561 goto fail; 3562 } 3563 3564 /* 3565 * Register the pool. 3566 */ 3567 bp->bufsize = num * bp->rsize; 3568 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3569 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3570 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3571 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3572 sizeof (ibt_mr_desc_t), KM_SLEEP); 3573 rw_enter(&hca->state_lock, RW_READER); 3574 3575 if (hca->state != HCA_INITED) { 3576 rw_exit(&hca->state_lock); 3577 goto fail; 3578 } 3579 3580 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3581 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3582 mem_attr.mr_vaddr = (uintptr_t)buf; 3583 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3584 mem_attr.mr_as = NULL; 3585 ibt_status = ibt_register_mr(hca->hca_hdl, 3586 hca->pd_hdl, &mem_attr, 3587 &rbp->mr_hdl[i], 3588 &rbp->mr_desc[i]); 3589 if (ibt_status != IBT_SUCCESS) { 3590 for (j = 0; j < i; j++) { 3591 (void) ibt_deregister_mr(hca->hca_hdl, 3592 rbp->mr_hdl[j]); 3593 } 3594 rw_exit(&hca->state_lock); 3595 goto fail; 3596 } 3597 } 3598 rw_exit(&hca->state_lock); 3599 buf = (caddr_t)bp->buf; 3600 for (i = 0; i < num; i++, buf += bp->rsize) { 3601 bp->buflist[i] = (void *)buf; 3602 } 3603 bp->buffree = num - 1; /* no. of free buffers */ 3604 rbp->bpool = bp; 3605 3606 return (rbp); 3607 fail: 3608 if (bp) { 3609 if (bp->buf) 3610 kmem_free(bp->buf, bp->bufsize); 3611 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3612 } 3613 if (rbp) { 3614 if (rbp->mr_hdl) 3615 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3616 if (rbp->mr_desc) 3617 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3618 kmem_free(rbp, sizeof (rib_bufpool_t)); 3619 } 3620 return (NULL); 3621 } 3622 3623 static void 3624 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3625 { 3626 int i; 3627 rib_bufpool_t *rbp = NULL; 3628 bufpool_t *bp; 3629 3630 /* 3631 * Obtain pool address based on type of pool 3632 */ 3633 switch (ptype) { 3634 case SEND_BUFFER: 3635 rbp = hca->send_pool; 3636 break; 3637 case RECV_BUFFER: 3638 rbp = hca->recv_pool; 3639 break; 3640 default: 3641 return; 3642 } 3643 if (rbp == NULL) 3644 return; 3645 3646 bp = rbp->bpool; 3647 3648 /* 3649 * Deregister the pool memory and free it. 
3650 */ 3651 for (i = 0; i < bp->numelems; i++) { 3652 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3653 } 3654 } 3655 3656 static void 3657 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3658 { 3659 3660 rib_bufpool_t *rbp = NULL; 3661 bufpool_t *bp; 3662 3663 /* 3664 * Obtain pool address based on type of pool 3665 */ 3666 switch (ptype) { 3667 case SEND_BUFFER: 3668 rbp = hca->send_pool; 3669 break; 3670 case RECV_BUFFER: 3671 rbp = hca->recv_pool; 3672 break; 3673 default: 3674 return; 3675 } 3676 if (rbp == NULL) 3677 return; 3678 3679 bp = rbp->bpool; 3680 3681 /* 3682 * Free the pool memory. 3683 */ 3684 if (rbp->mr_hdl) 3685 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 3686 3687 if (rbp->mr_desc) 3688 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 3689 if (bp->buf) 3690 kmem_free(bp->buf, bp->bufsize); 3691 mutex_destroy(&bp->buflock); 3692 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 3693 kmem_free(rbp, sizeof (rib_bufpool_t)); 3694 } 3695 3696 void 3697 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 3698 { 3699 /* 3700 * Deregister the pool memory and free it. 3701 */ 3702 rib_rbufpool_deregister(hca, ptype); 3703 rib_rbufpool_free(hca, ptype); 3704 } 3705 3706 /* 3707 * Fetch a buffer from the pool of type specified in rdbuf->type. 3708 */ 3709 static rdma_stat 3710 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 3711 { 3712 rib_lrc_entry_t *rlep; 3713 3714 if (rdbuf->type == RDMA_LONG_BUFFER) { 3715 rlep = rib_get_cache_buf(conn, rdbuf->len); 3716 rdbuf->rb_private = (caddr_t)rlep; 3717 rdbuf->addr = rlep->lrc_buf; 3718 rdbuf->handle = rlep->lrc_mhandle; 3719 return (RDMA_SUCCESS); 3720 } 3721 3722 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 3723 if (rdbuf->addr) { 3724 switch (rdbuf->type) { 3725 case SEND_BUFFER: 3726 rdbuf->len = RPC_MSG_SZ; /* 1K */ 3727 break; 3728 case RECV_BUFFER: 3729 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 3730 break; 3731 default: 3732 rdbuf->len = 0; 3733 } 3734 return (RDMA_SUCCESS); 3735 } else 3736 return (RDMA_FAILED); 3737 } 3738 3739 #if defined(MEASURE_POOL_DEPTH) 3740 static void rib_recv_bufs(uint32_t x) { 3741 3742 } 3743 3744 static void rib_send_bufs(uint32_t x) { 3745 3746 } 3747 #endif 3748 3749 /* 3750 * Fetch a buffer of specified type. 3751 * Note that rdbuf->handle is mw's rkey. 3752 */ 3753 static void * 3754 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 3755 { 3756 rib_qp_t *qp = ctoqp(conn); 3757 rib_hca_t *hca = qp->hca; 3758 rdma_btype ptype = rdbuf->type; 3759 void *buf; 3760 rib_bufpool_t *rbp = NULL; 3761 bufpool_t *bp; 3762 int i; 3763 3764 /* 3765 * Obtain pool address based on type of pool 3766 */ 3767 switch (ptype) { 3768 case SEND_BUFFER: 3769 rbp = hca->send_pool; 3770 break; 3771 case RECV_BUFFER: 3772 rbp = hca->recv_pool; 3773 break; 3774 default: 3775 return (NULL); 3776 } 3777 if (rbp == NULL) 3778 return (NULL); 3779 3780 bp = rbp->bpool; 3781 3782 mutex_enter(&bp->buflock); 3783 if (bp->buffree < 0) { 3784 mutex_exit(&bp->buflock); 3785 return (NULL); 3786 } 3787 3788 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. 
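 *
 * Until then, the matching registration for the buffer handed out
 * below is found by a linear scan of the pool's mr_desc[] entries,
 * comparing the buffer address against md_vaddr, roughly:
 *
 *	for (i = bp->numelems - 1; i >= 0; i--)
 *		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr)
 *			rdbuf->handle.mrc_rmr =
 *			    (uint32_t)rbp->mr_desc[i].md_rkey;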
*/ 3789 buf = bp->buflist[bp->buffree]; 3790 rdbuf->addr = buf; 3791 rdbuf->len = bp->rsize; 3792 for (i = bp->numelems - 1; i >= 0; i--) { 3793 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 3794 rdbuf->handle.mrc_rmr = 3795 (uint32_t)rbp->mr_desc[i].md_rkey; 3796 rdbuf->handle.mrc_linfo = 3797 (uintptr_t)rbp->mr_hdl[i]; 3798 rdbuf->handle.mrc_lmr = 3799 (uint32_t)rbp->mr_desc[i].md_lkey; 3800 #if defined(MEASURE_POOL_DEPTH) 3801 if (ptype == SEND_BUFFER) 3802 rib_send_bufs(MAX_BUFS - (bp->buffree+1)); 3803 if (ptype == RECV_BUFFER) 3804 rib_recv_bufs(MAX_BUFS - (bp->buffree+1)); 3805 #endif 3806 bp->buffree--; 3807 3808 mutex_exit(&bp->buflock); 3809 3810 return (buf); 3811 } 3812 } 3813 3814 mutex_exit(&bp->buflock); 3815 3816 return (NULL); 3817 } 3818 3819 static void 3820 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 3821 { 3822 3823 if (rdbuf->type == RDMA_LONG_BUFFER) { 3824 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private); 3825 rdbuf->rb_private = NULL; 3826 return; 3827 } 3828 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 3829 } 3830 3831 static void 3832 rib_rbuf_free(CONN *conn, int ptype, void *buf) 3833 { 3834 rib_qp_t *qp = ctoqp(conn); 3835 rib_hca_t *hca = qp->hca; 3836 rib_bufpool_t *rbp = NULL; 3837 bufpool_t *bp; 3838 3839 /* 3840 * Obtain pool address based on type of pool 3841 */ 3842 switch (ptype) { 3843 case SEND_BUFFER: 3844 rbp = hca->send_pool; 3845 break; 3846 case RECV_BUFFER: 3847 rbp = hca->recv_pool; 3848 break; 3849 default: 3850 return; 3851 } 3852 if (rbp == NULL) 3853 return; 3854 3855 bp = rbp->bpool; 3856 3857 mutex_enter(&bp->buflock); 3858 if (++bp->buffree >= bp->numelems) { 3859 /* 3860 * Should never happen 3861 */ 3862 bp->buffree--; 3863 } else { 3864 bp->buflist[bp->buffree] = buf; 3865 } 3866 mutex_exit(&bp->buflock); 3867 } 3868 3869 static rdma_stat 3870 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 3871 { 3872 rw_enter(&connlist->conn_lock, RW_WRITER); 3873 if (connlist->conn_hd) { 3874 cn->c_next = connlist->conn_hd; 3875 connlist->conn_hd->c_prev = cn; 3876 } 3877 connlist->conn_hd = cn; 3878 rw_exit(&connlist->conn_lock); 3879 3880 return (RDMA_SUCCESS); 3881 } 3882 3883 static rdma_stat 3884 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) 3885 { 3886 rw_enter(&connlist->conn_lock, RW_WRITER); 3887 if (cn->c_prev) { 3888 cn->c_prev->c_next = cn->c_next; 3889 } 3890 if (cn->c_next) { 3891 cn->c_next->c_prev = cn->c_prev; 3892 } 3893 if (connlist->conn_hd == cn) 3894 connlist->conn_hd = cn->c_next; 3895 rw_exit(&connlist->conn_lock); 3896 3897 return (RDMA_SUCCESS); 3898 } 3899 3900 /* 3901 * Connection management. 3902 * IBTF does not support recycling of channels. So connections are only 3903 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or 3904 * C_DISCONN_PEND state. No C_IDLE state. 3905 * C_CONN_PEND state: Connection establishment in progress to the server. 3906 * C_CONNECTED state: A connection when created is in C_CONNECTED state. 3907 * It has an RC channel associated with it. ibt_post_send/recv are allowed 3908 * only in this state. 3909 * C_ERROR_CONN state: A connection transitions to this state when WRs on the 3910 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event 3911 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA. 
3912 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when 3913 * c_ref drops to 0 (this indicates that RPC has no more references to this 3914 * connection), the connection should be destroyed. A connection transitions 3915 * into this state when it is being destroyed. 3916 */ 3917 /* ARGSUSED */ 3918 static rdma_stat 3919 rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn) 3920 { 3921 CONN *cn; 3922 int status = RDMA_SUCCESS; 3923 rib_hca_t *hca = rib_stat->hca; 3924 rib_qp_t *qp; 3925 clock_t cv_stat, timout; 3926 ibt_path_info_t path; 3927 ibt_ip_addr_t s_ip, d_ip; 3928 3929 if (hca == NULL) 3930 return (RDMA_FAILED); 3931 3932 rw_enter(&rib_stat->hca->state_lock, RW_READER); 3933 if (hca->state == HCA_DETACHED) { 3934 rw_exit(&rib_stat->hca->state_lock); 3935 return (RDMA_FAILED); 3936 } 3937 rw_exit(&rib_stat->hca->state_lock); 3938 3939 again: 3940 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 3941 cn = hca->cl_conn_list.conn_hd; 3942 while (cn != NULL) { 3943 /* 3944 * First, clear up any connection in the ERROR state 3945 */ 3946 mutex_enter(&cn->c_lock); 3947 if (cn->c_state == C_ERROR_CONN) { 3948 if (cn->c_ref == 0) { 3949 /* 3950 * Remove connection from list and destroy it. 3951 */ 3952 cn->c_state = C_DISCONN_PEND; 3953 mutex_exit(&cn->c_lock); 3954 rw_exit(&hca->cl_conn_list.conn_lock); 3955 (void) rib_disconnect_channel(cn, 3956 &hca->cl_conn_list); 3957 goto again; 3958 } 3959 mutex_exit(&cn->c_lock); 3960 cn = cn->c_next; 3961 continue; 3962 } 3963 if (cn->c_state == C_DISCONN_PEND) { 3964 mutex_exit(&cn->c_lock); 3965 cn = cn->c_next; 3966 continue; 3967 } 3968 if ((cn->c_raddr.len == svcaddr->len) && 3969 bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) { 3970 /* 3971 * Our connection. Give up conn list lock 3972 * as we are done traversing the list. 3973 */ 3974 rw_exit(&hca->cl_conn_list.conn_lock); 3975 if (cn->c_state == C_CONNECTED) { 3976 cn->c_ref++; /* sharing a conn */ 3977 mutex_exit(&cn->c_lock); 3978 *conn = cn; 3979 return (status); 3980 } 3981 if (cn->c_state == C_CONN_PEND) { 3982 /* 3983 * Hold a reference to this conn before 3984 * we give up the lock. 3985 */ 3986 cn->c_ref++; 3987 timout = ddi_get_lbolt() + 3988 drv_usectohz(CONN_WAIT_TIME * 1000000); 3989 while ((cv_stat = cv_timedwait_sig(&cn->c_cv, 3990 &cn->c_lock, timout)) > 0 && 3991 cn->c_state == C_CONN_PEND) 3992 ; 3993 if (cv_stat == 0) { 3994 cn->c_ref--; 3995 mutex_exit(&cn->c_lock); 3996 return (RDMA_INTR); 3997 } 3998 if (cv_stat < 0) { 3999 cn->c_ref--; 4000 mutex_exit(&cn->c_lock); 4001 return (RDMA_TIMEDOUT); 4002 } 4003 if (cn->c_state == C_CONNECTED) { 4004 *conn = cn; 4005 mutex_exit(&cn->c_lock); 4006 return (status); 4007 } else { 4008 cn->c_ref--; 4009 mutex_exit(&cn->c_lock); 4010 return (RDMA_TIMEDOUT); 4011 } 4012 } 4013 } 4014 mutex_exit(&cn->c_lock); 4015 cn = cn->c_next; 4016 } 4017 rw_exit(&hca->cl_conn_list.conn_lock); 4018 4019 bzero(&path, sizeof (ibt_path_info_t)); 4020 bzero(&s_ip, sizeof (ibt_ip_addr_t)); 4021 bzero(&d_ip, sizeof (ibt_ip_addr_t)); 4022 4023 status = rib_chk_srv_ibaddr(svcaddr, addr_type, &path, &s_ip, &d_ip); 4024 if (status != RDMA_SUCCESS) { 4025 return (RDMA_FAILED); 4026 } 4027 4028 /* 4029 * Channel to server doesn't exist yet, create one. 4030 */ 4031 if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) { 4032 return (RDMA_FAILED); 4033 } 4034 cn = qptoc(qp); 4035 cn->c_state = C_CONN_PEND; 4036 cn->c_ref = 1; 4037 4038 /* 4039 * Add to conn list. 
4040 * We had given up the READER lock. In the time since then, 4041 * another thread might have created the connection we are 4042 * trying here. But for now, that is quiet alright - there 4043 * might be two connections between a pair of hosts instead 4044 * of one. If we really want to close that window, 4045 * then need to check the list after acquiring the 4046 * WRITER lock. 4047 */ 4048 (void) rib_add_connlist(cn, &hca->cl_conn_list); 4049 status = rib_conn_to_srv(hca, qp, &path, &s_ip, &d_ip); 4050 mutex_enter(&cn->c_lock); 4051 if (status == RDMA_SUCCESS) { 4052 cn->c_state = C_CONNECTED; 4053 *conn = cn; 4054 } else { 4055 cn->c_state = C_ERROR_CONN; 4056 cn->c_ref--; 4057 } 4058 cv_broadcast(&cn->c_cv); 4059 mutex_exit(&cn->c_lock); 4060 return (status); 4061 } 4062 4063 static rdma_stat 4064 rib_conn_release(CONN *conn) 4065 { 4066 rib_qp_t *qp = ctoqp(conn); 4067 4068 mutex_enter(&conn->c_lock); 4069 conn->c_ref--; 4070 4071 /* 4072 * If a conn is C_ERROR_CONN, close the channel. 4073 * If it's CONNECTED, keep it that way. 4074 */ 4075 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) { 4076 conn->c_state = C_DISCONN_PEND; 4077 mutex_exit(&conn->c_lock); 4078 if (qp->mode == RIB_SERVER) 4079 (void) rib_disconnect_channel(conn, 4080 &qp->hca->srv_conn_list); 4081 else 4082 (void) rib_disconnect_channel(conn, 4083 &qp->hca->cl_conn_list); 4084 return (RDMA_SUCCESS); 4085 } 4086 mutex_exit(&conn->c_lock); 4087 return (RDMA_SUCCESS); 4088 } 4089 4090 /* 4091 * Add at front of list 4092 */ 4093 static struct rdma_done_list * 4094 rdma_done_add(rib_qp_t *qp, uint32_t xid) 4095 { 4096 struct rdma_done_list *rd; 4097 4098 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4099 4100 rd = kmem_alloc(sizeof (*rd), KM_SLEEP); 4101 rd->xid = xid; 4102 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL); 4103 4104 rd->prev = NULL; 4105 rd->next = qp->rdlist; 4106 if (qp->rdlist != NULL) 4107 qp->rdlist->prev = rd; 4108 qp->rdlist = rd; 4109 4110 return (rd); 4111 } 4112 4113 static void 4114 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd) 4115 { 4116 struct rdma_done_list *r; 4117 4118 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4119 4120 r = rd->next; 4121 if (r != NULL) { 4122 r->prev = rd->prev; 4123 } 4124 4125 r = rd->prev; 4126 if (r != NULL) { 4127 r->next = rd->next; 4128 } else { 4129 qp->rdlist = rd->next; 4130 } 4131 4132 cv_destroy(&rd->rdma_done_cv); 4133 kmem_free(rd, sizeof (*rd)); 4134 } 4135 4136 static void 4137 rdma_done_rem_list(rib_qp_t *qp) 4138 { 4139 struct rdma_done_list *r, *n; 4140 4141 mutex_enter(&qp->rdlist_lock); 4142 for (r = qp->rdlist; r != NULL; r = n) { 4143 n = r->next; 4144 rdma_done_rm(qp, r); 4145 } 4146 mutex_exit(&qp->rdlist_lock); 4147 } 4148 4149 static void 4150 rdma_done_notify(rib_qp_t *qp, uint32_t xid) 4151 { 4152 struct rdma_done_list *r = qp->rdlist; 4153 4154 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4155 4156 while (r) { 4157 if (r->xid == xid) { 4158 cv_signal(&r->rdma_done_cv); 4159 return; 4160 } else { 4161 r = r->next; 4162 } 4163 } 4164 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid, 4165 int, xid); 4166 } 4167 4168 4169 /* 4170 * Goes through all connections and closes the channel 4171 * This will cause all the WRs on those channels to be 4172 * flushed. 
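 *
 * Channels are closed with IBT_NOCALLBACKS and then freed while the
 * connection itself is left on the list in C_ERROR_CONN state; the
 * CONN is reclaimed later by rib_purge_connlist() or by
 * rib_conn_release() once its c_ref drops to zero.  A sketch of the
 * per-connection work below:
 *
 *	conn->c_state = C_ERROR_CONN;
 *	(void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
 *	    NULL, 0, NULL, NULL, 0);
 *	(void) ibt_free_channel(qp->qp_hdl);
 *	qp->qp_hdl = NULL;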
4173 */ 4174 static void 4175 rib_close_channels(rib_conn_list_t *connlist) 4176 { 4177 CONN *conn; 4178 rib_qp_t *qp; 4179 4180 rw_enter(&connlist->conn_lock, RW_READER); 4181 conn = connlist->conn_hd; 4182 while (conn != NULL) { 4183 mutex_enter(&conn->c_lock); 4184 qp = ctoqp(conn); 4185 if (conn->c_state == C_CONNECTED) { 4186 /* 4187 * Live connection in CONNECTED state. 4188 * Call ibt_close_rc_channel in nonblocking mode 4189 * with no callbacks. 4190 */ 4191 conn->c_state = C_ERROR_CONN; 4192 (void) ibt_close_rc_channel(qp->qp_hdl, 4193 IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0); 4194 (void) ibt_free_channel(qp->qp_hdl); 4195 qp->qp_hdl = NULL; 4196 } else { 4197 if (conn->c_state == C_ERROR_CONN && 4198 qp->qp_hdl != NULL) { 4199 /* 4200 * Connection in ERROR state but 4201 * channel is not yet freed. 4202 */ 4203 (void) ibt_close_rc_channel(qp->qp_hdl, 4204 IBT_NOCALLBACKS, NULL, 0, NULL, 4205 NULL, 0); 4206 (void) ibt_free_channel(qp->qp_hdl); 4207 qp->qp_hdl = NULL; 4208 } 4209 } 4210 mutex_exit(&conn->c_lock); 4211 conn = conn->c_next; 4212 } 4213 rw_exit(&connlist->conn_lock); 4214 } 4215 4216 /* 4217 * Frees up all connections that are no longer being referenced 4218 */ 4219 static void 4220 rib_purge_connlist(rib_conn_list_t *connlist) 4221 { 4222 CONN *conn; 4223 4224 top: 4225 rw_enter(&connlist->conn_lock, RW_READER); 4226 conn = connlist->conn_hd; 4227 while (conn != NULL) { 4228 mutex_enter(&conn->c_lock); 4229 4230 /* 4231 * At this point connection is either in ERROR 4232 * or DISCONN_PEND state. If in DISCONN_PEND state 4233 * then some other thread is culling that connection. 4234 * If not and if c_ref is 0, then destroy the connection. 4235 */ 4236 if (conn->c_ref == 0 && 4237 conn->c_state != C_DISCONN_PEND) { 4238 /* 4239 * Cull the connection 4240 */ 4241 conn->c_state = C_DISCONN_PEND; 4242 mutex_exit(&conn->c_lock); 4243 rw_exit(&connlist->conn_lock); 4244 (void) rib_disconnect_channel(conn, connlist); 4245 goto top; 4246 } else { 4247 /* 4248 * conn disconnect already scheduled or will 4249 * happen from conn_release when c_ref drops to 0. 4250 */ 4251 mutex_exit(&conn->c_lock); 4252 } 4253 conn = conn->c_next; 4254 } 4255 rw_exit(&connlist->conn_lock); 4256 4257 /* 4258 * At this point, only connections with c_ref != 0 are on the list 4259 */ 4260 } 4261 4262 /* 4263 * Cleans and closes up all uses of the HCA 4264 */ 4265 static void 4266 rib_detach_hca(rib_hca_t *hca) 4267 { 4268 4269 /* 4270 * Stop all services on the HCA 4271 * Go through cl_conn_list and close all rc_channels 4272 * Go through svr_conn_list and close all rc_channels 4273 * Free connections whose c_ref has dropped to 0 4274 * Destroy all CQs 4275 * Deregister and released all buffer pool memory after all 4276 * connections are destroyed 4277 * Free the protection domain 4278 * ibt_close_hca() 4279 */ 4280 rw_enter(&hca->state_lock, RW_WRITER); 4281 if (hca->state == HCA_DETACHED) { 4282 rw_exit(&hca->state_lock); 4283 return; 4284 } 4285 4286 hca->state = HCA_DETACHED; 4287 rib_stat->nhca_inited--; 4288 4289 rib_stop_services(hca); 4290 rib_close_channels(&hca->cl_conn_list); 4291 rib_close_channels(&hca->srv_conn_list); 4292 4293 rib_mod.rdma_count--; 4294 4295 rw_exit(&hca->state_lock); 4296 4297 /* 4298 * purge will free all datastructures used by CQ handlers. We don't 4299 * want to receive completions after purge, so we'll free the CQs now. 
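 *
 * So the ordering below is: free the four CQs first (client and server
 * send/receive), then purge both connection lists, and only then
 * release the per-CQ bookkeeping, e.g.:
 *
 *	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
 *	...
 *	rib_purge_connlist(&hca->cl_conn_list);
 *	rib_purge_connlist(&hca->srv_conn_list);
 *	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));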
	 */
	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);

	rib_purge_connlist(&hca->cl_conn_list);
	rib_purge_connlist(&hca->srv_conn_list);

	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
	if (stats_enabled) {
		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
		    GLOBAL_ZONEID);
	}

	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	if (hca->srv_conn_list.conn_hd == NULL &&
	    hca->cl_conn_list.conn_hd == NULL) {
		/*
		 * conn_lists are NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_rbufpool_destroy(hca, RECV_BUFFER);
		rib_rbufpool_destroy(hca, SEND_BUFFER);
		rib_destroy_cache(hca);
		rdma_unregister_mod(&rib_mod);
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
		(void) ibt_close_hca(hca->hca_hdl);
		hca->hca_hdl = NULL;
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	rw_exit(&hca->srv_conn_list.conn_lock);

	if (hca->hca_hdl != NULL) {
		mutex_enter(&hca->inuse_lock);
		while (hca->inuse)
			cv_wait(&hca->cb_cv, &hca->inuse_lock);
		mutex_exit(&hca->inuse_lock);

		rdma_unregister_mod(&rib_mod);

		/*
		 * conn_lists are now NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_rbufpool_destroy(hca, RECV_BUFFER);
		rib_rbufpool_destroy(hca, SEND_BUFFER);
		rib_destroy_cache(hca);
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
		(void) ibt_close_hca(hca->hca_hdl);
		hca->hca_hdl = NULL;
	}
}

static void
rib_server_side_cache_reclaim(void *argp)
{
	cache_avl_struct_t	*rcas;
	rib_lrc_entry_t		*rb;
	rib_hca_t *hca = (rib_hca_t *)argp;

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_first(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rib_total_buffers--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);
			cache_allocation -= rb->lrc_len;
			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		kmem_cache_free(hca->server_side_cache, rcas);
		rcas = avl_first(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}

static void
rib_server_side_cache_cleanup(void *argp)
{
	cache_avl_struct_t	*rcas;
	rib_lrc_entry_t		*rb;
	rib_hca_t *hca = (rib_hca_t *)argp;

	rw_enter(&hca->avl_rw_lock, RW_READER);
	if (cache_allocation < cache_limit) {
		rw_exit(&hca->avl_rw_lock);
		return;
	}
	rw_exit(&hca->avl_rw_lock);

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_last(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rib_total_buffers--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);
			cache_allocation -= rb->lrc_len;
			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		if (hca->server_side_cache) {
			kmem_cache_free(hca->server_side_cache, rcas);
		}
		if (cache_allocation < cache_limit) {
			rw_exit(&hca->avl_rw_lock);
			return;
		}

		rcas = avl_last(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}

static int
avl_compare(const void *t1, const void *t2)
{
	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
		return (0);

	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
		return (-1);

	return (1);
}

static void
rib_destroy_cache(rib_hca_t *hca)
{
	if (hca->reg_cache_clean_up != NULL) {
		ddi_taskq_destroy(hca->reg_cache_clean_up);
		hca->reg_cache_clean_up = NULL;
	}
	if (hca->avl_init) {
		rib_server_side_cache_reclaim((void *)hca);
		if (hca->server_side_cache) {
			kmem_cache_destroy(hca->server_side_cache);
			hca->server_side_cache = NULL;
		}
		avl_destroy(&hca->avl_tree);
		mutex_destroy(&hca->cache_allocation);
		rw_destroy(&hca->avl_rw_lock);
	}
	hca->avl_init = FALSE;
}

static void
rib_force_cleanup(void *hca)
{
	if (((rib_hca_t *)hca)->reg_cache_clean_up != NULL)
		(void) ddi_taskq_dispatch(
		    ((rib_hca_t *)hca)->reg_cache_clean_up,
		    rib_server_side_cache_cleanup,
		    (void *)hca, DDI_NOSLEEP);
}

static rib_lrc_entry_t *
rib_get_cache_buf(CONN *conn, uint32_t len)
{
	cache_avl_struct_t	cas, *rcas;
	rib_hca_t		*hca = (ctoqp(conn))->hca;
	rib_lrc_entry_t		*reply_buf;
	avl_index_t		where = NULL;
	uint64_t		c_alloc = 0;

	if (!hca->avl_init)
		goto error_alloc;

	cas.len = len;

	rw_enter(&hca->avl_rw_lock, RW_READER);

	mutex_enter(&hca->cache_allocation);
	c_alloc = cache_allocation;
	mutex_exit(&hca->cache_allocation);

	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
	    &where)) == NULL) {
		/* Are we above the cache limit? */
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);
			cache_misses_above_the_limit++;

			/* Allocate and register the buffer directly */
			goto error_alloc;
		}

		rw_exit(&hca->avl_rw_lock);
		rw_enter(&hca->avl_rw_lock, RW_WRITER);

		/* Recheck to make sure no other thread added the entry in */
		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
		    &cas, &where)) == NULL) {
			/* Allocate an avl tree entry */
			rcas = (cache_avl_struct_t *)
			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);

			bzero(rcas, sizeof (cache_avl_struct_t));
			rcas->elements = 0;
			rcas->r.forw = &rcas->r;
			rcas->r.back = &rcas->r;
			rcas->len = len;
			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
			avl_insert(&hca->avl_tree, rcas, where);
		}
	}

	mutex_enter(&rcas->node_lock);

	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
		rib_total_buffers--;
		cache_hits++;
		reply_buf = rcas->r.forw;
		remque(reply_buf);
		rcas->elements--;
		mutex_exit(&rcas->node_lock);
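		/*
		 * Cache hit: the buffer leaves the cache, so the global
		 * allocation accounting is reduced below by its length.
		 */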
		rw_exit(&hca->avl_rw_lock);
		mutex_enter(&hca->cache_allocation);
		cache_allocation -= len;
		mutex_exit(&hca->cache_allocation);
	} else {
		/* Are we above the cache limit? */
		mutex_exit(&rcas->node_lock);
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);
			cache_misses_above_the_limit++;
			/* Allocate and register the buffer directly */
			goto error_alloc;
		}
		rw_exit(&hca->avl_rw_lock);
		cache_misses++;
		/* Allocate a reply_buf entry */
		reply_buf = (rib_lrc_entry_t *)
		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
		bzero(reply_buf, sizeof (rib_lrc_entry_t));
		reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
		reply_buf->lrc_len = len;
		reply_buf->registered = FALSE;
		reply_buf->avl_node = (void *)rcas;
	}

	return (reply_buf);

error_alloc:
	reply_buf = (rib_lrc_entry_t *)
	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
	bzero(reply_buf, sizeof (rib_lrc_entry_t));
	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
	reply_buf->lrc_len = len;
	reply_buf->registered = FALSE;
	reply_buf->avl_node = NULL;

	return (reply_buf);
}

/*
 * Return a pre-registered buffer back to the cache (without
 * deregistering the buffer).
 */
static void
rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
{
	cache_avl_struct_t	cas, *rcas;
	avl_index_t		where = NULL;
	rib_hca_t		*hca = (ctoqp(conn))->hca;

	if (!hca->avl_init)
		goto error_free;

	cas.len = reg_buf->lrc_len;
	rw_enter(&hca->avl_rw_lock, RW_READER);
	if ((rcas = (cache_avl_struct_t *)
	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
		rw_exit(&hca->avl_rw_lock);
		goto error_free;
	} else {
		rib_total_buffers++;
		cas.len = reg_buf->lrc_len;
		mutex_enter(&rcas->node_lock);
		insque(reg_buf, &rcas->r);
		rcas->elements++;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);
		mutex_enter(&hca->cache_allocation);
		cache_allocation += cas.len;
		mutex_exit(&hca->cache_allocation);
	}

	return;

error_free:

	if (reg_buf->registered)
		(void) rib_deregistermem_via_hca(hca,
		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
}

static rdma_stat
rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
	uint_t buflen, struct mrc *buf_handle)
{
	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
	rdma_stat	status;


	/*
	 * Note: ALL buffer pools use the same memory type RDMARW.
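	 * On success the memory region handle and the local/remote keys
	 * are returned to the caller through buf_handle.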
	 */
	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
	} else {
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
	return (status);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
{

	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
	return (RDMA_SUCCESS);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
{

	(void) ibt_deregister_mr(hca->hca_hdl,
	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
	return (RDMA_SUCCESS);
}

/*
 * Check if the IP interface named by `lifrp' is RDMA-capable.
 */
static boolean_t
rpcib_rdma_capable_interface(struct lifreq *lifrp)
{
	char ifname[LIFNAMSIZ];
	char *cp;

	if (lifrp->lifr_type == IFT_IB)
		return (B_TRUE);

	/*
	 * Strip off the logical interface portion before getting
	 * intimate with the name.
	 */
	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
	if ((cp = strchr(ifname, ':')) != NULL)
		*cp = '\0';

	return (strcmp("lo0", ifname) == 0);
}

static int
rpcib_do_ip_ioctl(int cmd, int len, void *arg)
{
	vnode_t *kvp, *vp;
	TIUSER  *tiptr;
	struct  strioctl iocb;
	k_sigset_t smask;
	int	err = 0;

	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) {
		if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE,
		    &tiptr, CRED()) == 0) {
			vp = tiptr->fp->f_vnode;
		} else {
			VN_RELE(kvp);
			return (EPROTO);
		}
	} else {
		return (EPROTO);
	}

	iocb.ic_cmd = cmd;
	iocb.ic_timout = 0;
	iocb.ic_len = len;
	iocb.ic_dp = (caddr_t)arg;
	sigintr(&smask, 0);
	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
	sigunintr(&smask);
	(void) t_kclose(tiptr, 0);
	VN_RELE(kvp);
	return (err);
}

/*
 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
 */
static int
rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
{
	int err;
	struct lifnum lifn;

	bzero(&lifn, sizeof (struct lifnum));
	lifn.lifn_family = AF_UNSPEC;

	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
	if (err != 0)
		return (err);

	/*
	 * Pad the interface count to account for additional interfaces that
	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
	 */
	lifn.lifn_count += 4;

	bzero(lifcp, sizeof (struct lifconf));
	lifcp->lifc_family = AF_UNSPEC;
	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);

	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
	if (err != 0) {
		kmem_free(lifcp->lifc_buf, *bufsizep);
		return (err);
	}
	return (0);
}

static boolean_t
rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
{
	uint_t i, nifs;
	uint_t bufsize;
	struct lifconf lifc;
	struct lifreq *lifrp;
	struct sockaddr_in *sinp;
	struct sockaddr_in6 *sin6p;

	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
	bzero(addrs6, sizeof (rpcib_ipaddrs_t));

	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
		return (B_FALSE);

	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
		kmem_free(lifc.lifc_buf, bufsize);
		return (B_FALSE);
	}

	/*
	 * Worst case is that all of the addresses are IB-capable and have
	 * the same address family, so size our buffers accordingly.
	 */
	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);

	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
		if (!rpcib_rdma_capable_interface(lifrp))
			continue;

		if (lifrp->lifr_addr.ss_family == AF_INET) {
			sinp = addrs4->ri_list;
			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
			    sizeof (struct sockaddr_in));
		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
			sin6p = addrs6->ri_list;
			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
			    sizeof (struct sockaddr_in6));
		}
	}

	kmem_free(lifc.lifc_buf, bufsize);
	return (B_TRUE);
}

/* ARGSUSED */
static int
rpcib_cache_kstat_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE) {
		return (EACCES);
	}

	rpcib_kstat.cache_limit.value.ui64 =
	    (uint64_t)cache_limit;
	rpcib_kstat.cache_allocation.value.ui64 =
	    (uint64_t)cache_allocation;
	rpcib_kstat.cache_hits.value.ui64 =
	    (uint64_t)cache_hits;
	rpcib_kstat.cache_misses.value.ui64 =
	    (uint64_t)cache_misses;
	rpcib_kstat.cache_misses_above_the_limit.value.ui64 =
	    (uint64_t)cache_misses_above_the_limit;
	return (0);
}